From 2824944530f1db4af69142900c1b85a7d2852034 Mon Sep 17 00:00:00 2001 From: tvalentyn Date: Wed, 31 Jul 2024 10:49:45 -0700 Subject: [PATCH] Remove `--impersonate_service_account` whenever PipelineOptions are serialized (#32031) * Remove the impersonate_service_account pipeline option during serialization. * Update Changes.md --- CHANGES.md | 3 ++- .../python/apache_beam/options/pipeline_options.py | 14 ++++++++++++++ website/www/site/content/en/blog/beam-2.49.0.md | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 2bc2ecc49970..b127599ae0aa 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -79,7 +79,7 @@ ## Bugfixes -* Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Fixed incorrect service account impersonation flow for Python pipelines using BigQuery IOs ([#32030](https://github.com/apache/beam/issues/32030)). ## Security Fixes * Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). @@ -526,6 +526,7 @@ as a workaround, a copy of "old" `CountingSource` class should be placed into a ## Known Issues * Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). +* Python pipelines using the `--impersonate_service_account` option with BigQuery IOs might fail on Dataflow ([#32030](https://github.com/apache/beam/issues/32030)). This is fixed in the 2.59.0 release. 
# [2.48.0] - 2023-05-31 diff --git a/sdks/python/apache_beam/options/pipeline_options.py b/sdks/python/apache_beam/options/pipeline_options.py index 6b1dd8bb48c0..0f8457a40a7b 100644 --- a/sdks/python/apache_beam/options/pipeline_options.py +++ b/sdks/python/apache_beam/options/pipeline_options.py @@ -247,6 +247,20 @@ def __init__(self, flags=None, **kwargs): self._all_options[option_name] = getattr( self._visible_options, option_name) + def __getstate__(self): + # The impersonate_service_account option must be used only at submission of + # a Beam job. However, Beam IOs might store pipeline options + # within transform implementation that becomes serialized in RunnerAPI, + # causing this option to be inadvertently used at runtime. + # This serialization hook removes it. + if self.view_as(GoogleCloudOptions).impersonate_service_account: + dict_copy = dict(self.__dict__) + dict_copy['_all_options'] = dict(dict_copy['_all_options']) + dict_copy['_all_options']['impersonate_service_account'] = None + return dict_copy + else: + return self.__dict__ + @classmethod def _add_argparse_args(cls, parser): # type: (_BeamArgumentParser) -> None diff --git a/website/www/site/content/en/blog/beam-2.49.0.md b/website/www/site/content/en/blog/beam-2.49.0.md index a2e7af0e18f8..4dbc08693f5b 100644 --- a/website/www/site/content/en/blog/beam-2.49.0.md +++ b/website/www/site/content/en/blog/beam-2.49.0.md @@ -52,6 +52,7 @@ For more information on changes in 2.49.0, check out the [detailed release notes * Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). * Python SDK's cross-language Bigtable sink mishandles records that don't have an explicit timestamp set: [#28632](https://github.com/apache/beam/issues/28632). To avoid this issue, set explicit timestamps for all records before writing to Bigtable. 
+* Python pipelines using the `--impersonate_service_account` option with BigQuery IOs might fail on Dataflow ([#32030](https://github.com/apache/beam/issues/32030)). This is fixed in the 2.59.0 release. ## List of Contributors