From 8d9bb663670ff4e55127de493e1eb9b40c2b2de0 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Thu, 10 Oct 2024 12:52:11 -0400
Subject: [PATCH] Updated documentation for get_bcast_params() method

Signed-off-by: Constantin M Adam
---
 data-processing-lib/doc/spark-runtime.md       | 8 +++++---
 .../runtime/spark/runtime_configuration.py     | 3 ++-
 .../runtime/spark/transform_runtime.py         | 3 ++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/data-processing-lib/doc/spark-runtime.md b/data-processing-lib/doc/spark-runtime.md
index d30ed11b8..62fc0fe9e 100644
--- a/data-processing-lib/doc/spark-runtime.md
+++ b/data-processing-lib/doc/spark-runtime.md
@@ -41,9 +41,11 @@ of this parameter:
 
 ## Transforms
 
-* [SparkTransformRuntimeConfiguration](../spark/src/data_processing_spark/transform/runtime_configuration.py) allows
-  to configure transform to use PySpark
-
+* [SparkTransformRuntimeConfiguration](../spark/src/data_processing_spark/runtime/spark/runtime_configuration.py)
+  allows configuring a transform to use PySpark. In addition to the features of its base class,
+  [TransformRuntimeConfiguration](../python/src/data_processing/runtime/runtime_configuration.py),
+  this class provides the `get_bcast_params()` method for retrieving very large configuration settings.
+  Before starting the transform execution, the Spark runtime broadcasts these settings to all the workers.
 
 ## Runtime
 
diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py
index 4c00c717e..0f788396e 100644
--- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py
+++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py
@@ -36,7 +36,8 @@ def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
         """Allows retrieving and broadcasting to all the workers very large
         configuration parameters, like the list of document IDs to remove for
         fuzzy dedup, or the list of blocked web domains for block listing. This
-        function is called after spark initialization, and before spark_context.parallelize()
+        function is called by the Spark runtime after Spark initialization, and
+        before spark_context.parallelize().
         :param data_access_factory - creates data_access object to download the large config parameter
         """
         return {}
diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py
index c5bfc94ad..1f3d671a5 100644
--- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py
+++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py
@@ -48,7 +48,8 @@ def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
         """Allows retrieving and broadcasting to all the workers very large
         configuration parameters, like the list of document IDs to remove for
         fuzzy dedup, or the list of blocked web domains for block listing. This
-        function is called after spark initialization, and before spark_context.parallelize().
+        function is called by the Spark runtime after Spark initialization, and
+        before spark_context.parallelize().
         :param data_access_factory - creates data_access object to download the large config parameter
         """
         return {}
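
To illustrate the contract documented above, here is a minimal sketch of a transform runtime configuration that overrides `get_bcast_params()` to ship a large document-ID list to every worker. The `FuzzyDedupRuntimeConfiguration` class name, the `doc_ids/ids_to_remove` path, the `doc_ids_to_remove` key, and the exact data-access read call are illustrative assumptions, not part of this patch; only `SparkTransformRuntimeConfiguration` and `DataAccessFactoryBase` come from the patched modules.

```python
from typing import Any

from data_processing.data_access import DataAccessFactoryBase
from data_processing_spark.runtime.spark.runtime_configuration import (
    SparkTransformRuntimeConfiguration,
)


class FuzzyDedupRuntimeConfiguration(SparkTransformRuntimeConfiguration):
    """Hypothetical configuration for a fuzzy-dedup transform that needs a
    very large list of document IDs on every worker."""

    def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
        # Called by the Spark runtime on the driver, after Spark initialization
        # and before spark_context.parallelize(); the returned dict is then
        # broadcast to all the workers.
        data_access = data_access_factory.create_data_access()
        # Assumed read call: get_file() returning (contents, retries); adjust
        # to the data-access interface actually configured for the transform.
        doc_ids_bytes, _ = data_access.get_file("doc_ids/ids_to_remove")
        return {"doc_ids_to_remove": doc_ids_bytes}
```

The point of this hook is that the download happens once, on the driver, and the Spark runtime then distributes the returned settings to all the workers, rather than each worker fetching the large artifact independently.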