From 9d658cfde64b2c248bab2de805cd6267b0fb0761 Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 19 Sep 2024 21:04:48 +0100
Subject: [PATCH 01/12] pipeline transform

---
 .../src/data_processing/runtime/__init__.py   |   1 +
 .../runtime/base_transform_runtime.py         |  30 ++++
 .../pure_python/runtime_configuration.py      |   9 +-
 .../runtime/pure_python/transform_runtime.py  |   5 +-
 .../runtime/runtime_configuration.py          |  15 +-
 .../runtime/transform_file_processor.py       |   6 +-
 .../runtime/transform_launcher.py             |   1 +
 .../transform/binary_transform.py             |   2 +-
 .../transform/pipeline_transform.py           | 157 ++++++++++++++++++
 .../data_processing/utils/transform_utils.py  |   2 +-
 .../runtime/ray/runtime_configuration.py      |  10 +-
 .../runtime/ray/transform_runtime.py          |   5 +-
 .../runtime/spark/runtime_configuration.py    |  10 +-
 .../runtime/spark/transform_file_processor.py |   2 +-
 .../runtime/spark/transform_runtime.py        |  14 +-
 .../spark/src/doc_id_transform_spark.py       |  32 +---
 16 files changed, 228 insertions(+), 73 deletions(-)
 create mode 100644 data-processing-lib/python/src/data_processing/runtime/base_transform_runtime.py
 create mode 100644 data-processing-lib/python/src/data_processing/transform/pipeline_transform.py

diff --git a/data-processing-lib/python/src/data_processing/runtime/__init__.py b/data-processing-lib/python/src/data_processing/runtime/__init__.py
index 7fb42a33a..7ddf4f60b 100644
--- a/data-processing-lib/python/src/data_processing/runtime/__init__.py
+++ b/data-processing-lib/python/src/data_processing/runtime/__init__.py
@@ -1,3 +1,4 @@
+from data_processing.runtime.base_transform_runtime import BaseTransformRuntime
 from data_processing.runtime.execution_configuration import TransformExecutionConfiguration, runtime_cli_prefix
 from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration
 from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher
diff --git a/data-processing-lib/python/src/data_processing/runtime/base_transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/base_transform_runtime.py
new file mode 100644
index 000000000..dc9575219
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/runtime/base_transform_runtime.py
@@ -0,0 +1,30 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.transform import TransformStatistics
+
+
+class BaseTransformRuntime:
+    """
+    Base Transform runtime, used by the processor to create a Transform-specific environment.
+    Every runtime defines its own implementation of this class.
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        """
+        Create/config this runtime.
+        :param params: parameters, often provided by the CLI arguments as defined by a TableTransformConfiguration.
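+
+        A minimal sketch of how a launcher-side configuration constructs this runtime
+        (mirroring create_transform_runtime(), which this patch adds to
+        TransformRuntimeConfiguration):
+
+            runtime = runtime_class(transform_config.get_transform_params())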
+        """
+        self.params = params
diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py
index 10f9bcf27..be0101174 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py
@@ -26,12 +26,5 @@ def __init__(
         :param transform_config - base configuration class
         :param runtime_class: implementation of the transform runtime
         """
-        self.runtime_class = runtime_class
-        super().__init__(transform_config=transform_config)
+        super().__init__(transform_config=transform_config, runtime_class=runtime_class)
 
-    def create_transform_runtime(self) -> DefaultPythonTransformRuntime:
-        """
-        Create transform runtime with the parameters captured during apply_input_params()
-        :return: transform runtime object
-        """
-        return self.runtime_class(self.transform_config.get_transform_params())
diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py
index 4173154ae..bb7ac09df 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py
@@ -14,9 +14,10 @@
 
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.transform import TransformStatistics
+from data_processing.runtime import BaseTransformRuntime
 
 
-class DefaultPythonTransformRuntime:
+class DefaultPythonTransformRuntime(BaseTransformRuntime):
     """
     Transformer runtime used by the processor to create a Transform-specific environment
     """
@@ -26,7 +27,7 @@ def __init__(self, params: dict[str, Any]):
         Create/config this runtime.
         :param params: parameters, often provided by the CLI arguments as defined by a TableTransformConfiguration.
""" - self.params = params + super().__init__(params) def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] diff --git a/data-processing-lib/python/src/data_processing/runtime/runtime_configuration.py b/data-processing-lib/python/src/data_processing/runtime/runtime_configuration.py index ef85d1363..3b6d16e9b 100644 --- a/data-processing-lib/python/src/data_processing/runtime/runtime_configuration.py +++ b/data-processing-lib/python/src/data_processing/runtime/runtime_configuration.py @@ -15,15 +15,21 @@ from data_processing.transform import AbstractBinaryTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider +from data_processing.runtime import BaseTransformRuntime class TransformRuntimeConfiguration(CLIArgumentProvider): - def __init__(self, transform_config: TransformConfiguration): + def __init__(self, + transform_config: TransformConfiguration, + runtime_class: type[BaseTransformRuntime] + ): """ Initialization :param transform_config - base configuration class + :param runtime_class - base runtime class """ self.transform_config = transform_config + self.runtime_class = runtime_class def add_input_params(self, parser: ArgumentParser) -> None: self.transform_config.add_input_params(parser) @@ -62,3 +68,10 @@ def get_transform_params(self) -> dict[str, Any]: :return: transform parameters """ return self.transform_config.get_transform_params() + + def create_transform_runtime(self) -> BaseTransformRuntime: + """ + Create transform runtime with the parameters captured during apply_input_params() + :return: transform runtime object + """ + return self.runtime_class(self.transform_config.get_transform_params()) diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index d4ec548d8..5cdd6a183 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -45,7 +45,7 @@ def __init__( self.data_access = data_access_factory.create_data_access() # Add data access and statistics to the processor parameters self.transform_params = transform_parameters - self.transform_params["data_access"] = self.data_access + self.transform_params["data_access_factory"] = data_access_factory def process_file(self, f_name: str) -> None: """ @@ -205,7 +205,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats def _publish_stats(self, stats: dict[str, Any]) -> None: """ Publishing execution statistics - :param stats: Statistics + :param stats: dictionary :return: None """ - raise ValueError("must be implemented by subclass") + raise NotImplemented("must be implemented by subclass") diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py index 4c3abbd83..becb4b6c3 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py @@ -36,6 +36,7 @@ def __init__( self.runtime_config = runtime_config self.name = self.runtime_config.get_name() self.data_access_factory = data_access_factory + self.execution_config = None def _get_parser(self) -> argparse.ArgumentParser: """ diff --git 
index 80dff61ea..9b415efca 100644
--- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py
+++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py
@@ -10,7 +10,7 @@
 # limitations under the License.
 ################################################################################
 
-from typing import Any, TypeVar
+from typing import Any
 
 
 class AbstractBinaryTransform:
diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py
new file mode 100644
index 000000000..f4fca7489
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py
@@ -0,0 +1,157 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+from data_processing.transform import AbstractBinaryTransform
+from data_processing.runtime import TransformRuntimeConfiguration, BaseTransformRuntime
+from data_processing.utils import get_logger, UnrecoverableException, TransformUtils
+
+
+class PipelineTransformBase(AbstractBinaryTransform):
+    """
+    Transform that executes a set of base transforms sequentially. Data is passed between
+    participating transforms in memory
+    """
+
+    def __init__(self, config: dict[str, Any], transforms: list[TransformRuntimeConfiguration]):
+        """
+        Initializes pipeline execution for the list of transforms
+        :param config - configuration parameters
+        :param transforms - list of transforms in the pipeline. Note that transforms will
+        be executed sequentially, in the order they are listed.
+        """
+        super().__init__(config)
+        self.logger = get_logger(__name__)
+        if len(transforms) == 0:
+            # Empty pipeline
+            self.logger.error("Pipeline transform with empty list")
+            raise UnrecoverableException("Pipeline transform with empty list")
+        self.data_access_factory = config.get("data_access_factory", None)
+        if self.data_access_factory is None:
+            self.logger.error("pipeline transform - Data access factory is not defined")
+            raise UnrecoverableException("pipeline transform - Data access factory is not defined")
+        self.statistics = config.get("statistics", None)
+        if self.statistics is None:
+            self.logger.error("pipeline transform - Statistics is not defined")
+            raise UnrecoverableException("pipeline transform - Statistics is not defined")
+        participants = []
+        # for every transform in the pipeline
+        for transform in transforms:
+            # create runtime
+            runtime = transform.create_transform_runtime()
+            # get parameters
+            transform_params = self._get_transform_params(runtime)
+            # Create transform
+            tr = transform.get_transform_class()(transform_params)
+            participants.append((tr, runtime))
+        # save participating transforms
+        self.transforms = participants
+
+    def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any]:
+        """
+        Get transform parameters
+        :param runtime - runtime
+        :return: transform params
+        """
+        raise NotImplementedError("must be implemented by subclass")
+
+    def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Converts an input file into 0 or more output files.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :param byte_array: contents of the input file to be transformed.
+        :param file_name: the name of the file containing the given byte_array.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
+                 holding the extension to be used when writing out the new bytes.
+        """
+        # process transforms sequentially
+        data = [(byte_array, file_name)]
+        stats = {}
+        for transform, _ in self.transforms:
+            data, st = self._process_transform(transform=transform, data=data)
+            # Accumulate stats
+            stats |= st
+            if len(data) == 0:
+                # no data returned by this transform
+                return [], stats
+        # all done
+        return data, stats
+
+    @staticmethod
+    def _process_transform(transform: AbstractBinaryTransform, data: list[tuple[bytes, str]]
+                           ) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Process an individual transform. Note here that the predecessor could create multiple data objects
+        :param transform - transform
+        :param data - data to process
+        :return: resulting data and statistics
+        """
+        stats = {}
+        res = []
+        for dt in data:
+            # for every data element
+            src = TransformUtils.get_file_extension(dt[1])
+            out_files, st = transform.transform_binary(byte_array=dt[0], file_name=dt[1])
+            # Accumulate results
+            for ouf in out_files:
+                res.append((ouf[0], src[0] + ouf[1]))
+            # accumulate statistics
+            stats |= st
+        return res, stats
+
+    def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        This is a supporting method for transformers that implement buffering of data, for example coalesce.
+        These transformers can have buffers containing data that were not written to the output immediately.
+        Flush is the hook for them to return back locally stored data and their statistics.
+        The majority of transformers are expected not to use such buffering and can use this default implementation.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
+                 holding the extension to be used when writing out the new bytes.
+        """
+        stats = {}
+        res = []
+        for i, (transform, _) in enumerate(self.transforms):
+            out_files, st = transform.flush_binary()
+            # accumulate statistics
+            stats |= st
+            if len(out_files) > 0 and i < len(self.transforms) - 1:
+                # flush produced output - run it through the rest of the chain
+                data = []
+                for ouf in out_files:
+                    data.append((ouf[0], f"file{ouf[1]}"))
+                for n in range(i + 1, len(self.transforms)):
+                    data, st = self._process_transform(transform=self.transforms[n][0], data=data)
+                    # Accumulate stats
+                    stats |= st
+                    if len(data) == 0:
+                        # no data returned by this transform
+                        break
+                res += data
+            else:
+                res += out_files
+        # Done flushing, compute execution stats
+        for _, runtime in self.transforms:
+            self._compute_execution_stats(runtime=runtime, st=stats)
+        return res, {}
+
+    def _compute_execution_stats(self, runtime: BaseTransformRuntime, st: dict[str, Any]) -> dict[str, Any]:
+        """
+        Compute execution statistics
+        :param runtime - runtime
+        :param st - statistics
+        :return:
+        """
+        raise NotImplementedError("must be implemented by subclass")
diff --git a/data-processing-lib/python/src/data_processing/utils/transform_utils.py b/data-processing-lib/python/src/data_processing/utils/transform_utils.py
index e2d37581c..adfe00afd 100644
--- a/data-processing-lib/python/src/data_processing/utils/transform_utils.py
+++ b/data-processing-lib/python/src/data_processing/utils/transform_utils.py
@@ -96,7 +96,7 @@ def get_file_extension(file_path) -> list[str]:
         """
         Get the file's root and extension from the given file path.
         :param file_path : The path of the file.
-        :return: str: The file extension including the dot ('.') if present, otherwise an empty string.
+        :return: the file's root and extension
         """
         return os.path.splitext(file_path)
 
diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py
index 321937dd4..11f83780f 100644
--- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py
+++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py
@@ -26,12 +26,4 @@ def __init__(
         :param transform_config - base configuration class
         :param runtime_class: implementation of the transform runtime
         """
-        super().__init__(transform_config=transform_config)
-        self.runtime_class = runtime_class
-
-    def create_transform_runtime(self) -> DefaultRayTransformRuntime:
-        """
-        Create transform runtime with the parameters captured during apply_input_params()
-        :return: transform runtime object
-        """
-        return self.runtime_class(self.transform_config.get_transform_params())
+        super().__init__(transform_config=transform_config, runtime_class=runtime_class)
diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py
index 57f071406..6dbb3a73b 100644
--- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py
+++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py
@@ -13,10 +13,11 @@
 from typing import Any
 
 from data_processing.data_access import DataAccessFactoryBase
+from data_processing.runtime import BaseTransformRuntime
 from ray.actor import ActorHandle
 
 
-class DefaultRayTransformRuntime:
+class DefaultRayTransformRuntime(BaseTransformRuntime):
     """
     Transformer runtime used by the processor to create a Transform-specific environment
     """
@@ -26,7 +27,7 @@ def __init__(self, params: dict[str, Any]):
         Create/config this runtime.
         :param params: parameters, often provided by the CLI arguments as defined by a TableTransformConfiguration.
""" - self.params = params + super().__init__(params) def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py index e0804e1e3..4fc0bef95 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py @@ -26,12 +26,4 @@ def __init__( :param transform_config - base configuration class :param runtime_class: implementation of the transform runtime """ - super().__init__(transform_config=transform_config) - self.runtime_class = runtime_class - - def create_transform_runtime(self) -> DefaultSparkTransformRuntime: - """ - Create transform runtime with the parameters captured during apply_input_params() - :return: transform runtime object - """ - return self.runtime_class(self.transform_config.get_transform_params()) + super().__init__(transform_config=transform_config, runtime_class=runtime_class) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index eeb45bfc3..049006b10 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -35,7 +35,7 @@ def __init__( super().__init__( data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() ) - # Add data access ant statistics to the processor parameters + # Add statistics to the processor parameters self.runtime_configuration = runtime_configuration self.transform = None # set up statistics diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index f16b09520..fe36159ec 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -14,9 +14,10 @@ from data_processing.data_access import DataAccessFactoryBase from data_processing.transform import TransformStatistics +from data_processing.runtime import BaseTransformRuntime -class DefaultSparkTransformRuntime: +class DefaultSparkTransformRuntime(BaseTransformRuntime): """ Transformer runtime used by processor to to create Transform specific environment """ @@ -24,9 +25,9 @@ class DefaultSparkTransformRuntime: def __init__(self, params: dict[str, Any]): """ Create/config this runtime. - :param params: parameters, often provided by the CLI arguments as defined by a TableTansformConfiguration. + :param params: parameters, often provided by the CLI arguments as defined by a Transform Configuration. """ - self.params = params + super().__init__(params) def get_transform_config( self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics @@ -35,13 +36,14 @@ def get_transform_config( Get the dictionary of configuration that will be provided to the transform's initializer. 
         This is the opportunity for this runtime to create a new set of configuration based on the
         config/params provided to this instance's initializer. This may include the addition
-        of new configuration data such as ray shared memory, new actors, etc, that might be needed and
+        of new configuration data such as ray shared memory, new actors, etc., that might be needed and
         expected by the transform in its initializer and/or transform() methods.
+        :param partition - Spark partition
         :param data_access_factory - data access factory class being used by the RayOrchestrator.
         :param statistics - reference to statistics actor
         :return: dictionary of transform init params
         """
-        return self.params
+        return self.params | {"partition_index": partition}
 
     def compute_execution_stats(self, stats: TransformStatistics) -> None:
         """
@@ -49,4 +51,4 @@ def compute_execution_stats(self, stats: TransformStatistics) -> None:
         :param stats: output of statistics as aggregated across all calls to all transforms.
         :return: job execution statistics. These are generally reported as metadata by the Ray Orchestrator.
         """
-        pass
\ No newline at end of file
+        return
diff --git a/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py b/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py
index beeb77ce5..7a01b370d 100644
--- a/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py
+++ b/transforms/universal/doc_id/spark/src/doc_id_transform_spark.py
@@ -15,11 +15,9 @@
 
 import pyarrow as pa
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
-from data_processing.data_access import DataAccessFactoryBase
-from data_processing.transform import TransformStatistics
 from data_processing.utils import CLIArgumentProvider, TransformUtils
 from data_processing_spark.runtime.spark import SparkTransformLauncher
-from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime
+from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
 
 
 short_name = "doc_id"
@@ -136,32 +134,6 @@ def apply_input_params(self, args: Namespace) -> bool:
         return True
 
 
-class DocIDSparkTransformRuntime(DefaultSparkTransformRuntime):
-
-    def __init__(self, params: dict[str, Any]):
-        """
-        Create/config this runtime.
-        :param params: parameters, often provided by the CLI arguments as defined by a TableTansformConfiguration.
-        """
-        super().__init__(params)
-
-    def get_transform_config(
-        self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
-    ) -> dict[str, Any]:
-        """
-        Get the dictionary of configuration that will be provided to the transform's initializer.
-        This is the opportunity for this runtime to create a new set of configuration based on the
-        config/params provided to this instance's initializer. This may include the addition
-        of new configuration data such as ray shared memory, new actors, etc, that might be needed and
-        expected by the transform in its initializer and/or transform() methods.
-        :param data_access_factory - data access factory class being used by the RayOrchestrator.
-        :param statistics - reference to statistics actor
-        :return: dictionary of transform init params
-        """
-        return self.params | {"partition_index": partition}
-
-
 class DocIDSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
     """
     Implements the SparkTransformConfiguration for NOOP as required by the PythonTransformLauncher.
@@ -173,7 +145,7 @@ def __init__(self):
         """
         Initialization
         """
-        super().__init__(transform_config=DocIDTransformConfiguration(), runtime_class=DocIDSparkTransformRuntime)
+        super().__init__(transform_config=DocIDTransformConfiguration())
 
 
 if __name__ == "__main__":

From b913df4e4ab4330a3df385ad8be97c6c9556790e Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Fri, 20 Sep 2024 13:06:25 +0100
Subject: [PATCH 02/12] fixed build

---
 data-processing-lib/doc/overview.md           |  2 +-
 .../src/data_processing/runtime/__init__.py   |  4 +-
 .../pure_python/runtime_configuration.py      |  3 +-
 .../runtime/pure_python/transform_runtime.py  |  3 +-
 .../runtime/transform_launcher.py             |  2 +-
 .../src/data_processing/transform/__init__.py |  3 ++
 .../base_transform_runtime.py                 |  3 --
 .../transform/pipeline_transform.py           | 24 ++++-----
 .../transform/pure_python/__init__.py         |  1 +
 .../pure_python/pipeline_transform.py         | 51 ++++++++++++++++++
 .../runtime_configuration.py                  |  3 +-
 .../input/{ => s3_support}/sample1.parquet    | Bin
 .../data_access/data_access_s3_test.py        |  2 +-
 13 files changed, 74 insertions(+), 27 deletions(-)
 rename data-processing-lib/python/src/data_processing/{runtime => transform}/base_transform_runtime.py (90%)
 create mode 100644 data-processing-lib/python/src/data_processing/transform/pure_python/__init__.py
 create mode 100644 data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py
 rename data-processing-lib/python/src/data_processing/{runtime => transform}/runtime_configuration.py (97%)
 rename data-processing-lib/python/test-data/data_processing/input/{ => s3_support}/sample1.parquet (100%)

diff --git a/data-processing-lib/doc/overview.md b/data-processing-lib/doc/overview.md
index f4571d5b9..c1bc23104 100644
--- a/data-processing-lib/doc/overview.md
+++ b/data-processing-lib/doc/overview.md
@@ -32,7 +32,7 @@ runtime interface expected to be implemented by each runtime ([python](python-r
 * [DataAccessFactory](../python/src/data_processing/data_access/data_access_factory_base.py) - is used to configure
   the input and output data files to be processed and creates the `DataAccess` instance (see below) according to the CLI parameters.
-* [TransformRuntimeConfiguration](../python/src/data_processing/runtime/runtime_configuration.py) - captures
+* [TransformRuntimeConfiguration](../python/src/data_processing/transform/runtime_configuration.py) - captures
   the `TransformConfiguration` and runtime-specific configuration.
 * [DataAccess](../python/src/data_processing/data_access/data_access.py) - is the interface defining data i/o methods and selection. Implementations for local
diff --git a/data-processing-lib/python/src/data_processing/runtime/__init__.py b/data-processing-lib/python/src/data_processing/runtime/__init__.py
index 7ddf4f60b..88a8bf10b 100644
--- a/data-processing-lib/python/src/data_processing/runtime/__init__.py
+++ b/data-processing-lib/python/src/data_processing/runtime/__init__.py
@@ -1,5 +1,3 @@
-from data_processing.runtime.base_transform_runtime import BaseTransformRuntime
 from data_processing.runtime.execution_configuration import TransformExecutionConfiguration, runtime_cli_prefix
-from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration
-from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher
+from data_processing.runtime.transform_file_processor import AbstractTransformFileProcessor
+from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher
diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py
index be0101174..c34bf607a 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/runtime_configuration.py
@@ -10,9 +10,8 @@
 # limitations under the License.
 ################################################################################
 
-from data_processing.runtime import TransformRuntimeConfiguration
 from data_processing.runtime.pure_python import DefaultPythonTransformRuntime
-from data_processing.transform import TransformConfiguration
+from data_processing.transform import TransformConfiguration, TransformRuntimeConfiguration
 
 
 class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py
index bb7ac09df..12cac1278 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py
@@ -13,8 +13,7 @@
 from typing import Any
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.transform import TransformStatistics
-from data_processing.runtime import BaseTransformRuntime
+from data_processing.transform import TransformStatistics, BaseTransformRuntime
 
 
 class DefaultPythonTransformRuntime(BaseTransformRuntime):
diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py
index becb4b6c3..648d48669 100644
--- a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py
+++ b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py
@@ -15,7 +15,7 @@
 import argparse
 
 from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
-from data_processing.runtime import TransformRuntimeConfiguration
+from data_processing.transform import TransformRuntimeConfiguration
 from data_processing.utils import ParamsUtils, get_logger
 
 
diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py
index 6af43ad60..9f97f0528 100644
--- a/data-processing-lib/python/src/data_processing/transform/__init__.py
+++ b/data-processing-lib/python/src/data_processing/transform/__init__.py
@@ -1,4 +1,7 @@
 from data_processing.transform.binary_transform import AbstractBinaryTransform
 from data_processing.transform.table_transform import AbstractTableTransform
 from data_processing.transform.transform_statistics import TransformStatistics
+from data_processing.transform.base_transform_runtime import BaseTransformRuntime
 from data_processing.transform.transform_configuration import TransformConfiguration, get_transform_config
+from data_processing.transform.runtime_configuration import TransformRuntimeConfiguration
+from data_processing.transform.pipeline_transform import AbstractPipelineTransform
diff --git a/data-processing-lib/python/src/data_processing/runtime/base_transform_runtime.py b/data-processing-lib/python/src/data_processing/transform/base_transform_runtime.py
similarity index 90%
rename from data-processing-lib/python/src/data_processing/runtime/base_transform_runtime.py
rename to data-processing-lib/python/src/data_processing/transform/base_transform_runtime.py
index dc9575219..706c1af2f 100644
--- a/data-processing-lib/python/src/data_processing/runtime/base_transform_runtime.py
+++ b/data-processing-lib/python/src/data_processing/transform/base_transform_runtime.py
@@ -12,9 +12,6 @@
 
 from typing import Any
 
-from data_processing.data_access import DataAccessFactoryBase
-from data_processing.transform import TransformStatistics
-
 
 class BaseTransformRuntime:
     """
diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py
index f4fca7489..34feb050d 100644
--- a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py
+++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py
@@ -11,12 +11,11 @@
 ################################################################################
 
 from typing import Any
-from data_processing.transform import AbstractBinaryTransform
-from data_processing.runtime import TransformRuntimeConfiguration, BaseTransformRuntime
+from data_processing.transform import AbstractBinaryTransform, BaseTransformRuntime, TransformRuntimeConfiguration
 from data_processing.utils import get_logger, UnrecoverableException, TransformUtils
 
 
-class PipelineTransformBase(AbstractBinaryTransform):
+class AbstractPipelineTransform(AbstractBinaryTransform):
     """
     Transform that executes a set of base transforms sequentially. Data is passed between
     participating transforms in memory
     """
 
@@ -43,6 +42,7 @@ def __init__(self, config: dict[str, Any], transforms: list[TransformRuntimeConf
         if self.statistics is None:
             self.logger.error("pipeline transform - Statistics is not defined")
             raise UnrecoverableException("pipeline transform - Statistics is not defined")
+        self.transforms = transforms
         participants = []
         # for every transform in the pipeline
         for transform in transforms:
@@ -54,7 +54,7 @@ def __init__(self, config: dict[str, Any], transforms: list[TransformRuntimeConf
             tr = transform.get_transform_class()(transform_params)
             participants.append((tr, runtime))
         # save participating transforms
-        self.transforms = participants
+        self.participants = participants
 
     def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any]:
         """
@@ -77,7 +77,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
         # process transforms sequentially
         data = [(byte_array, file_name)]
         stats = {}
-        for transform, _ in self.transforms:
+        for transform, _ in self.participants:
             data, st = self._process_transform(transform=transform, data=data)
             # Accumulate stats
             stats |= st
@@ -123,17 +123,17 @@ def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
         stats = {}
         res = []
-        for i, (transform, _) in enumerate(self.transforms):
+        for i, (transform, _) in enumerate(self.participants):
             out_files, st = transform.flush_binary()
             # accumulate statistics
             stats |= st
-            if len(out_files) > 0 and i < len(self.transforms) - 1:
+            if len(out_files) > 0 and i < len(self.participants) - 1:
                 # flush produced output - run it through the rest of the chain
                 data = []
                 for ouf in out_files:
                     data.append((ouf[0], f"file{ouf[1]}"))
-                for n in range(i + 1, len(self.transforms)):
-                    data, st = self._process_transform(transform=self.transforms[n][0], data=data)
+                for n in range(i + 1, len(self.participants)):
+                    data, st = self._process_transform(transform=self.participants[n][0], data=data)
                     # Accumulate stats
                     stats |= st
                     if len(data) == 0:
                         # no data returned by this transform
                         break
@@ -143,15 +143,15 @@ def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
                 res += data
             else:
                 res += out_files
         # Done flushing, compute execution stats
-        for _, runtime in self.transforms:
+        for _, runtime in self.participants:
             self._compute_execution_stats(runtime=runtime, st=stats)
         return res, {}
 
-    def _compute_execution_stats(self, runtime: BaseTransformRuntime, st: dict[str, Any]) -> dict[str, Any]:
+    def _compute_execution_stats(self, runtime: BaseTransformRuntime, st: dict[str, Any]) -> None:
         """
         Compute execution statistics
         :param runtime - runtime
         :param st - statistics
-        :return:
+        :return: None
         """
         raise NotImplementedError("must be implemented by subclass")
diff --git a/data-processing-lib/python/src/data_processing/transform/pure_python/__init__.py b/data-processing-lib/python/src/data_processing/transform/pure_python/__init__.py
new file mode 100644
index 000000000..9c7653525
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/transform/pure_python/__init__.py
@@ -0,0 +1 @@
+from data_processing.transform.pure_python.pipeline_transform import PythonPipelineTransform
diff --git a/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py
new file mode 100644
index 000000000..e798e6a36
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py
@@ -0,0 +1,51 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+from data_processing.transform import AbstractPipelineTransform, BaseTransformRuntime, TransformRuntimeConfiguration
+
+
+class PythonPipelineTransform(AbstractPipelineTransform):
+    """
+    Transform that executes a set of base transforms sequentially. Data is passed between
+    participating transforms in memory
+    """
+
+    def __init__(self, config: dict[str, Any], transforms: list[TransformRuntimeConfiguration]):
+        """
+        Initializes pipeline execution for the list of transforms
+        :param config - configuration parameters
+        :param transforms - list of transforms in the pipeline. Note that transforms will
+        be executed sequentially, in the order they are listed.
+        """
+        super().__init__(config, transforms)
+
+    def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any]:
+        """
+        Get transform parameters
+        :param runtime - runtime
+        :return: transform params
+        """
+        return runtime.get_transform_config(data_access_factory=self.data_access_factory,
+                                            statistics=self.statistics, files=[])
+
+    def _compute_execution_stats(self, runtime: BaseTransformRuntime, st: dict[str, Any]) -> None:
+        """
+        Compute execution statistics
+        :param runtime - runtime
+        :param st - statistics
+        :return: None
+        """
+        self.statistics.add_stats(st)
+        runtime.compute_execution_stats(stats=self.statistics)
+        return
diff --git a/data-processing-lib/python/src/data_processing/runtime/runtime_configuration.py b/data-processing-lib/python/src/data_processing/transform/runtime_configuration.py
similarity index 97%
rename from data-processing-lib/python/src/data_processing/runtime/runtime_configuration.py
rename to data-processing-lib/python/src/data_processing/transform/runtime_configuration.py
index 3b6d16e9b..75850f4f8 100644
--- a/data-processing-lib/python/src/data_processing/runtime/runtime_configuration.py
+++ b/data-processing-lib/python/src/data_processing/transform/runtime_configuration.py
@@ -13,9 +13,8 @@
 from argparse import ArgumentParser, Namespace
 from typing import Any
 
-from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
+from data_processing.transform import AbstractBinaryTransform, TransformConfiguration, BaseTransformRuntime
 from data_processing.utils import CLIArgumentProvider
-from data_processing.runtime import BaseTransformRuntime
 
 
 class TransformRuntimeConfiguration(CLIArgumentProvider):
diff --git a/data-processing-lib/python/test-data/data_processing/input/sample1.parquet b/data-processing-lib/python/test-data/data_processing/input/s3_support/sample1.parquet
similarity index 100%
rename from data-processing-lib/python/test-data/data_processing/input/sample1.parquet
rename to data-processing-lib/python/test-data/data_processing/input/s3_support/sample1.parquet
diff --git a/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py b/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py
index 9cff1f6b0..1a4ffb9b2 100644
--- a/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py
+++ b/data-processing-lib/python/test/data_processing_tests/data_access/data_access_s3_test.py
@@ -33,7 +33,7 @@ def _create_and_populate_bucket(d_a: DataAccessS3, input_location: str, n_files:
     d_a.arrS3.s3_client.create_bucket(Bucket="test")
     # upload file
     loc = os.path.abspath(
-        os.path.join(os.path.dirname(__file__), "../../../test-data/data_processing/input/sample1.parquet")
+        os.path.join(os.path.dirname(__file__), "../../../test-data/data_processing/input/s3_support/sample1.parquet")
     )
     with open(loc, "rb") as file:
         bdata = file.read()

From 32dcbca7b5bb157484864303e8b602c4fd4a3d1e Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Fri, 20 Sep 2024 13:14:37 +0100
Subject: [PATCH 03/12] fixed build

---
 .../src/data_processing_ray/runtime/ray/transform_runtime.py  | 2 +-
 .../data_processing_spark/runtime/spark/transform_runtime.py  | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py
index 6dbb3a73b..233206002 100644
--- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py
+++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py
@@ -13,7 +13,7 @@
 from typing import Any
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.runtime import BaseTransformRuntime
+from data_processing.transform import BaseTransformRuntime
 from ray.actor import ActorHandle
 
 
diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py
index fe36159ec..c634a4ab7 100644
--- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py
+++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py
@@ -13,8 +13,7 @@
 from typing import Any
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.transform import TransformStatistics
-from data_processing.runtime import BaseTransformRuntime
+from data_processing.transform import TransformStatistics, BaseTransformRuntime
 
 
 class DefaultSparkTransformRuntime(BaseTransformRuntime):

From 2b7df37162e1bcdb9ba19062cd1d3b4641dc6196 Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Fri, 20 Sep 2024 14:17:39 +0100
Subject: [PATCH 04/12] fixed build

---
 .../transform/resize_transform.py             | 217 ++++++++++++++++++
 .../runtime/ray/runtime_configuration.py      |   3 +-
 .../runtime/spark/runtime_configuration.py    |   3 +-
 3 files changed, 219 insertions(+), 4 deletions(-)
 create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py

diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py
new file mode 100644
index 000000000..7247ee3bc
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py
@@ -0,0 +1,217 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from argparse import ArgumentParser, Namespace
+from typing import Any
+
+import pyarrow as pa
+from data_processing.transform import AbstractTableTransform, TransformConfiguration
+from data_processing.utils import (
+    LOCAL_TO_DISK,
+    MB,
+    CLIArgumentProvider,
+    UnrecoverableException,
+    get_logger,
+)
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.runtime.pure_python import PythonTransformLauncher
+
+
+logger = get_logger(__name__)
+
+max_rows_per_table_key = "max_rows_per_table"
+max_mbytes_per_table_key = "max_mbytes_per_table"
+size_type_key = "size_type"
+shortname = "resize"
+cli_prefix = f"{shortname}_"
+max_rows_per_table_cli_param = f"{cli_prefix}{max_rows_per_table_key}"
+max_mbytes_per_table_cli_param = f"{cli_prefix}{max_mbytes_per_table_key}"
+size_type_cli_param = f"{cli_prefix}{size_type_key}"
+size_type_disk = "disk"
+size_type_memory = "memory"
+size_type_default = size_type_disk
+
+
+class ResizeTransform(AbstractTableTransform):
+    """
+    Implements splitting large files into smaller ones.
+    Two flavours of splitting are supported - based on the number of documents and based on the size
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        """
+        super().__init__(config)
+        self.max_rows_per_table = config.get(max_rows_per_table_key, 0)
+        self.max_bytes_per_table = MB * config.get(max_mbytes_per_table_key, 0)
+        disk_memory = config.get(size_type_key, size_type_default)
+        if size_type_default in disk_memory:
+            self.max_bytes_per_table *= LOCAL_TO_DISK
+
+        self.logger.debug(f"max bytes = {self.max_bytes_per_table}")
+        self.logger.debug(f"max rows = {self.max_rows_per_table}")
+        self.buffer = None
+        if self.max_rows_per_table <= 0 and self.max_bytes_per_table <= 0:
+            raise ValueError("Neither max rows per table nor max table size are defined")
+        if self.max_rows_per_table > 0 and self.max_bytes_per_table > 0:
+            raise ValueError("Both max rows per table and max table size are defined. Only one should be present")
+
+    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        split larger files into the smaller ones
+        :param table: table
+        :param file_name: name of the file
+        :return: resulting set of tables
+        """
+        self.logger.debug(f"got new table with {table.num_rows} rows")
+        if self.buffer is not None:
+            try:
+                self.logger.debug(
+                    f"concatenating buffer with {self.buffer.num_rows} rows to table with {table.num_rows} rows"
+                )
+                # table = pa.concat_tables([self.buffer, table], unicode_promote_options="permissive")
+                table = pa.concat_tables([self.buffer, table])
+                self.buffer = None
+                self.logger.debug(f"concatenated table has {table.num_rows} rows")
+            except Exception as _:  # Can happen if schemas are different
+                # Raise unrecoverable error to stop the execution
+                self.logger.warning(f"table in {file_name} can't be merged with the buffer")
+                self.logger.warning(f"incoming table columns {table.schema.names} ")
+                self.logger.warning(f"buffer columns {self.buffer.schema.names}")
+                raise UnrecoverableException()
+
+        result = []
+        start_row = 0
+        if self.max_rows_per_table > 0:
+            # split file with max documents
+            n_rows = table.num_rows
+            rows_left = n_rows
+            while start_row < n_rows and rows_left >= self.max_rows_per_table:
+                length = n_rows - start_row
+                if length > self.max_rows_per_table:
+                    length = self.max_rows_per_table
+                a_slice = table.slice(offset=start_row, length=length)
+                self.logger.debug(f"created table slice with {a_slice.num_rows} rows, starting with row {start_row}")
+                result.append(a_slice)
+                start_row = start_row + self.max_rows_per_table
+                rows_left = rows_left - self.max_rows_per_table
+        else:
+            # split based on size
+            current_size = 0.0
+            if table.nbytes >= self.max_bytes_per_table:
+                for n in range(table.num_rows):
+                    current_size += table.slice(offset=n, length=1).nbytes
+                    if current_size >= self.max_bytes_per_table:
+                        self.logger.debug(f"capturing slice, current_size={current_size}")
+                        # Reached the size
+                        a_slice = table.slice(offset=start_row, length=(n - start_row))
+                        result.append(a_slice)
+                        start_row = n
+                        current_size = 0.0
+        if start_row < table.num_rows:
+            # buffer remaining chunk for next call
+            self.logger.debug(f"Buffering table starting at row {start_row}")
+            self.buffer = table.slice(offset=start_row, length=(table.num_rows - start_row))
+            self.logger.debug(f"buffered table has {self.buffer.num_rows} rows")
+        self.logger.debug(f"returning {len(result)} tables")
+        return result, {}
+
+    def flush(self) -> tuple[list[pa.Table], dict[str, Any]]:
+        result = []
+        if self.buffer is not None:
+            self.logger.debug(f"flushing buffered table with {self.buffer.num_rows} rows of size {self.buffer.nbytes}")
+            result.append(self.buffer)
+            self.buffer = None
+        else:
+            self.logger.debug(f"Empty buffer. Nothing to flush.")
+        return result, {}
+
+
+class ResizeTransformConfiguration(TransformConfiguration):
+
+    """
+    Provides support for configuring and using the associated Transform class, including
+    configuration with CLI args and combining of metadata.
+    """
+
+    def __init__(self):
+        super().__init__(name=shortname, transform_class=ResizeTransform)
+
+    def add_input_params(self, parser: ArgumentParser) -> None:
+        """
+        Add Transform-specific arguments to the given parser.
+        This will be included in a dictionary used to initialize the ResizeTransform.
+        By convention a common prefix should be used for all transform-specific CLI args
+        (e.g., noop_, pii_, etc.)
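+
+        For example, a hypothetical invocation capping output tables at 125 rows each
+        (the row count here is illustrative only):
+
+            python resize_transform.py --resize_max_rows_per_table 125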
+        """
+        parser.add_argument(
+            f"--{max_rows_per_table_cli_param}",
+            type=int,
+            default=-1,
+            help="Max number of rows per table",
+        )
+        parser.add_argument(
+            f"--{max_mbytes_per_table_cli_param}",
+            type=float,
+            default=-1,
+            help=f"Max table size (MB). Size is measured according to the --{size_type_cli_param} parameter",
+        )
+        parser.add_argument(
+            f"--{size_type_cli_param}",
+            type=str,
+            required=False,
+            default=size_type_default,
+            choices=[size_type_disk, size_type_memory],
+            help=f"Determines how memory is measured when using the --{max_mbytes_per_table_cli_param} option."
+            "\n'memory' measures the in-process memory footprint and \n'disk' makes an estimate of the resulting parquet file size.",
+        )
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+        :return: True if validation passes, or False otherwise
+        """
+        # Capture the args that are specific to this transform
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        self.params = self.params | captured
+        # dargs = vars(args)
+        if self.params.get(max_rows_per_table_key) <= 0 and self.params.get(max_mbytes_per_table_key) <= 0:
+            logger.info("Neither max documents per table nor max table size are defined")
+            return False
+        if self.params.get(max_rows_per_table_key) > 0 and self.params.get(max_mbytes_per_table_key) > 0:
+            logger.info("Both max documents per table and max table size are defined. Only one should be present")
+            return False
+        logger.info(f"Split file parameters are : {self.params}")
+        return True
+
+
+class ResizePythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Implements the PythonTransformRuntimeConfiguration for resize as required by the PythonTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=ResizeTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = PythonTransformLauncher(ResizePythonTransformConfiguration())
+    logger.info("Launching resize transform")
+    launcher.launch()
diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py
index 11f83780f..8c53df641 100644
--- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py
+++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/runtime_configuration.py
@@ -10,8 +10,7 @@
 # limitations under the License.
 ################################################################################
 
-from data_processing.runtime import TransformRuntimeConfiguration
-from data_processing.transform import TransformConfiguration
+from data_processing.transform import TransformConfiguration, TransformRuntimeConfiguration
 from data_processing_ray.runtime.ray import DefaultRayTransformRuntime
diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py
index 4fc0bef95..f4567188e 100644
--- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py
+++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/runtime_configuration.py
@@ -10,8 +10,7 @@
 # limitations under the License.
 ################################################################################
 
-from data_processing.runtime import TransformRuntimeConfiguration
-from data_processing.transform import TransformConfiguration
+from data_processing.transform import TransformConfiguration, TransformRuntimeConfiguration
 from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime

From 2b44b4c0ad85d6475a897276155de7a40e615e48 Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Sat, 21 Sep 2024 19:30:02 +0100
Subject: [PATCH 05/12] made it work

---
 .../runtime/transform_launcher.py             |  7 +-
 .../test_support/transform/__init__.py        | 14 +++-
 .../transform/pipeline_transform.py           | 45 +++++++++++
 .../src/data_processing/transform/__init__.py |  1 +
 .../transform/pipeline_transform.py           | 30 ++++---
 .../pipeline_transform_configuration.py       | 80 +++++++++++++++++++
 .../pure_python/pipeline_transform.py         |  6 +-
 .../transform/transform_configuration.py      |  5 +-
 .../transform/test_noop.py                    |  8 +-
 .../transform/test_resize.py                  | 53 ++++++++++++
 .../transform/test_resize_noop.py             | 54 +++++++++++++
 11 files changed, 278 insertions(+), 25 deletions(-)
 create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py
 create mode 100644 data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py
 create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_resize.py
 create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop.py

diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py
index 648d48669..3344491d2 100644
--- a/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py
+++ b/data-processing-lib/python/src/data_processing/runtime/transform_launcher.py
@@ -57,9 +57,9 @@ def _get_arguments(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
         :return: list of arguments
         """
         # add additional arguments
-        self.runtime_config.add_input_params(parser=parser)
         self.data_access_factory.add_input_params(parser=parser)
         self.execution_config.add_input_params(parser=parser)
+        self.runtime_config.add_input_params(parser=parser)
         return parser.parse_args()
 
     def _get_parameters(self, args: argparse.Namespace) -> bool:
@@ -68,11 +68,10 @@ def _get_parameters(self, args: argparse.Namespace) -> bool:
         and does parameters validation
         :return: True if validation passes or False, if not
         """
-        return (
-            self.runtime_config.apply_input_params(args=args)
+        return (self.runtime_config.apply_input_params(args=args)
             and self.execution_config.apply_input_params(args=args)
             and self.data_access_factory.apply_input_params(args=args)
-        )
+            )
 
     def _submit_for_execution(self) -> int:
         """
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
index 0e90f7ffd..1f665dea1 100644
--- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
@@ -1,6 +1,14 @@
-from .table_transform_test import AbstractTableTransformTest
-from .binary_transform_test import AbstractBinaryTransformTest
-from .noop_transform import (
+from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
+from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
+from data_processing.test_support.transform.noop_transform import (
     NOOPTransform,
     NOOPPythonTransformConfiguration,
 )
+from data_processing.test_support.transform.resize_transform import (
+    ResizeTransform,
+    ResizePythonTransformConfiguration,
+)
+
+from data_processing.test_support.transform.pipeline_transform import (
+    ResizeNOOPPythonTransformConfiguration,
+)
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py
new file mode 100644
index 000000000..591f679b8
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py
@@ -0,0 +1,45 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing.utils import get_logger
+from data_processing.test_support.transform import NOOPPythonTransformConfiguration, ResizePythonTransformConfiguration
+
+logger = get_logger(__name__)
+
+
+class ResizeNOOPPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Implements the PythonTransformRuntimeConfiguration for a resize/NOOP pipeline as
+    required by the PythonTransformLauncher. Neither transform uses a RayRuntime class,
+    so the superclass only needs the base python-only configuration.
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config= + PipelineTransformConfiguration({"transforms": [ResizePythonTransformConfiguration(), + NOOPPythonTransformConfiguration()]}) + ) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(ResizeNOOPPythonTransformConfiguration()) + logger.info("Launching resize/noop transform") + launcher.launch() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 9f97f0528..69ca1323f 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -5,3 +5,4 @@ from data_processing.transform.transform_configuration import TransformConfiguration, get_transform_config from data_processing.transform.runtime_configuration import TransformRuntimeConfiguration from data_processing.transform.pipeline_transform import AbstractPipelineTransform +from data_processing.transform.pipeline_transform_configuration import PipelineTransformConfiguration diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py index 34feb050d..d825d4697 100644 --- a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py @@ -11,7 +11,7 @@ ################################################################################ from typing import Any -from data_processing.transform import AbstractBinaryTransform, BaseTransformRuntime, TransformRuntimeConfiguration +from data_processing.transform import AbstractBinaryTransform, BaseTransformRuntime from data_processing.utils import get_logger, UnrecoverableException, TransformUtils @@ -21,15 +21,16 @@ class AbstractPipelineTransform(AbstractBinaryTransform): participating transforms in memory """ - def __init__(self, config: dict[str, Any], transforms: list[TransformRuntimeConfiguration]): + def __init__(self, config: dict[str, Any]): """ Initializes pipeline execution for the list of transforms - :param config - configuration parameters - :param transforms - list of transforms in the pipeline. Note that transforms will + :param config - configuration parameters - dictionary of transforms in the pipeline. 
+ Note that transforms will be executed be executed """ - super().__init__(config) + super().__init__({}) self.logger = get_logger(__name__) + transforms = config.get("transforms", []) if len(transforms) == 0: # Empty pipeline self.logger.error("Pipeline transform with empty list") @@ -85,7 +86,17 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl # no data returned by this transform return [], stats # all done - return data, stats + return self._convert_output(data), stats + + @staticmethod + def _convert_output(data: list[tuple[bytes, str]]) -> list[tuple[bytes, str]]: + res = [None] * len(data) + i = 0 + for dt in data: + fname = TransformUtils.get_file_extension(dt[1]) + res[i] = (dt[0], fname[1]) + i += 1 + return res @staticmethod def _process_transform(transform: AbstractBinaryTransform, data: list[tuple[bytes, str]] @@ -107,7 +118,7 @@ def _process_transform(transform: AbstractBinaryTransform, data: list[tuple[byte res.append((ouf[0], src[0] + ouf[1])) # accumulate statistics stats |= st - return res, stats + return res, stats def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: """ @@ -139,9 +150,10 @@ def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: if len(data) == 0: # no data returned by this transform break - res += data + res += self._convert_output(data) else: - res += out_files + res += self._convert_output(out_files) + i += 1 # Done flushing, compute execution stats for _, runtime in self.participants: self._compute_execution_stats(runtime=runtime, st=stats) diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py new file mode 100644 index 000000000..8416c2884 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py @@ -0,0 +1,80 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any +from argparse import ArgumentParser, Namespace + +from data_processing.transform import TransformConfiguration +from data_processing.transform.pure_python import PythonPipelineTransform +from data_processing.utils import get_logger + +logger = get_logger(__name__) + + +class PipelineTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self, config: dict[str, Any]): + super().__init__( + name="pipeline", + transform_class=PythonPipelineTransform, + ) + self.params = config + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. 
+        By convention a common prefix should be used for all transform-specific CLI args
+        (e.g., noop_, pii_, etc.)
+        """
+        for t in self.params["transforms"]:
+            t.transform_config.add_input_params(parser=parser)
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+        :return: True, if validation passes, or False otherwise
+        """
+        res = True
+        for t in self.params["transforms"]:
+            res = t.transform_config.apply_input_params(args=args) and res
+        return res
+
+    def get_input_params(self) -> dict[str, Any]:
+        """
+        Aggregate the input parameters captured by all participating transforms.
+        The result is the union of the key/value pairs captured by each transform's configuration.
+        :return: dictionary of input parameters
+        """
+        params = {}
+        for t in self.params["transforms"]:
+            params |= t.transform_config.get_input_params()
+        return params
+
+    def get_transform_metadata(self) -> dict[str, Any]:
+        """
+        Get transform metadata. Before returning, remove all parameter keys accumulated in
+        self.remove_from_metadata. This allows a transform developer to mark any input parameters
+        that should not make it into the metadata, for example parameters containing sensitive
+        information, access keys, secrets, passwords, etc.
+        :return: parameters for metadata
+        """
+        params = {}
+        for t in self.params["transforms"]:
+            params |= t.transform_config.get_transform_metadata()
+        return params
diff --git a/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py
index e798e6a36..d52e3a0bb 100644
--- a/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py
+++ b/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py
@@ -12,7 +12,7 @@
 from typing import Any
 
 from data_processing.transform import AbstractPipelineTransform
-from data_processing.runtime import TransformRuntimeConfiguration, BaseTransformRuntime
+from data_processing.transform import TransformRuntimeConfiguration, BaseTransformRuntime
 
 
 class PythonPipelineTransform(AbstractPipelineTransform):
@@ -21,14 +21,14 @@ class PythonPipelineTransform(AbstractPipelineTransform):
     participating transforms in memory
     """
 
-    def __init__(self, config: dict[str, Any], transforms: list[TransformRuntimeConfiguration]):
+    def __init__(self, config: dict[str, Any]):
         """
         Initializes pipeline execution for the list of transforms
         :param config - configuration parameters
         :param transforms - list of transforms in the pipeline.
Note that transforms will be executed """ - super().__init__(config, transforms) + super().__init__(config) def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any]: """ diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index 033e92f2a..5e4938ce8 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -23,7 +23,10 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + self, + name: str, + transform_class: type[AbstractBinaryTransform], + remove_from_metadata: list[str] = [], ): """ Initialization diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py index 1eb85fe48..caf1c60f6 100644 --- a/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py @@ -10,11 +10,9 @@ # limitations under the License. ################################################################################ -from typing import Tuple - import pyarrow as pa -from data_processing.test_support.transform.noop_transform import NOOPTransform -from data_processing.test_support.transform.table_transform_test import ( +from data_processing.test_support.transform import NOOPTransform +from data_processing.test_support.transform import ( AbstractTableTransformTest, ) @@ -30,7 +28,7 @@ class TestNOOPTransform(AbstractTableTransformTest): The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. """ - def get_test_transform_fixtures(self) -> list[Tuple]: + def get_test_transform_fixtures(self) -> list[tuple]: fixtures = [ (NOOPTransform({"sleep": 0}), [table], [expected_table], expected_metadata_list), (NOOPTransform({"sleep": 0}), [table], [expected_table], expected_metadata_list), diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_resize.py b/data-processing-lib/python/test/data_processing_tests/transform/test_resize.py new file mode 100644 index 000000000..61ec43c50 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_resize.py @@ -0,0 +1,53 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import os + +from data_processing.test_support.transform import ResizePythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonResizeTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + # The following based on 3 identical input files of about 39kbytes, and 200 rows + fixtures = [] + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../transforms/universal/resize/python/test-data")) + launcher = PythonTransformLauncher(ResizePythonTransformConfiguration()) + + # Split into 4 or so files + config = {"resize_max_rows_per_table": 125} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-125")) + + # Merge into 2 or so files + config = {"resize_max_rows_per_table": 300} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-300")) + + # # Merge all into a single table + config = {"resize_max_mbytes_per_table": 1} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-1")) + + # # Merge the 1st 2 and some of the 2nd with the 3rd + config = {"resize_max_mbytes_per_table": 0.05} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.05")) + + # Split into 4 or so files + config = {"resize_max_mbytes_per_table": 0.02} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.02")) + + return fixtures diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop.py new file mode 100644 index 000000000..939b34da0 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os + +from data_processing.test_support.transform import ResizeNOOPPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonResizeNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + # The following based on 3 identical input files of about 39kbytes, and 200 rows + fixtures = [] + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), + "../../../../../transforms/universal/resize/python/test-data")) + launcher = PythonTransformLauncher(ResizeNOOPPythonTransformConfiguration()) + + # Split into 4 or so files + config = {"resize_max_rows_per_table": 125, "noop_sleep_sec": 1} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-125")) + + # Merge into 2 or so files + config = {"resize_max_rows_per_table": 300, "noop_sleep_sec": 1} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-300")) + + # # Merge all into a single table + config = {"resize_max_mbytes_per_table": 1, "noop_sleep_sec": 1} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-1")) + + # # Merge the 1st 2 and some of the 2nd with the 3rd + config = {"resize_max_mbytes_per_table": 0.05, "noop_sleep_sec": 1} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.05")) + + # Split into 4 or so files + config = {"resize_max_mbytes_per_table": 0.02, "noop_sleep_sec": 1} + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.02")) + + return fixtures From d8f531ad7b57606315e2d512256f4895e7ce94ff Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sat, 21 Sep 2024 22:00:28 +0100 Subject: [PATCH 06/12] fixed small bug --- .../src/data_processing/transform/pipeline_transform.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py index d825d4697..ea0de5175 100644 --- a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py @@ -94,7 +94,10 @@ def _convert_output(data: list[tuple[bytes, str]]) -> list[tuple[bytes, str]]: i = 0 for dt in data: fname = TransformUtils.get_file_extension(dt[1]) - res[i] = (dt[0], fname[1]) + ext = fname[1] + if len(ext) <= 1: + ext = fname[0] + res[i] = (dt[0], ext) i += 1 return res From 18e01a312648e5ae01fc71a5f04a3fd20d2119eb Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 22 Sep 2024 09:28:16 +0100 Subject: [PATCH 07/12] small bugs fixes --- .../data_processing/transform/pipeline_transform.py | 11 +++++------ .../launch/ray/ray_test_noop_launch.py | 6 ------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py index ea0de5175..e7eb349b6 100644 --- a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py @@ -56,6 +56,7 @@ def __init__(self, config: dict[str, Any]): participants.append((tr, runtime)) # save participating transforms self.participants = participants + self.file_name = "" def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any]: """ @@ -76,6 +77,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl holding the extension to be used when writing out the new bytes. 
""" # process transforms sequentially + self.file_name = file_name data = [(byte_array, file_name)] stats = {} for transform, _ in self.participants: @@ -94,10 +96,7 @@ def _convert_output(data: list[tuple[bytes, str]]) -> list[tuple[bytes, str]]: i = 0 for dt in data: fname = TransformUtils.get_file_extension(dt[1]) - ext = fname[1] - if len(ext) <= 1: - ext = fname[0] - res[i] = (dt[0], ext) + res[i] = (dt[0], fname[1]) i += 1 return res @@ -145,7 +144,7 @@ def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: # flush produced output - run it through the rest of the chain data = [] for ouf in out_files: - data.append((ouf[0], f"file{ouf[1]}")) + data.append((ouf[0], self.file_name)) for n in range(i + 1, len(self.participants)): data, st = self._process_transform(transform=self.participants[n][0], data=data) # Accumulate stats @@ -155,7 +154,7 @@ def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: break res += self._convert_output(data) else: - res += self._convert_output(out_files) + res += out_files i += 1 # Done flushing, compute execution stats for _, runtime in self.participants: diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index d4cc874f0..e706a4dfa 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -12,7 +12,6 @@ import os -import pyarrow as pa from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -20,11 +19,6 @@ from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration -table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result - - class TestRayNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. 
From 4681ecd7fe026635d4093dae07ed36a4915f3539 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 22 Sep 2024 19:13:45 +0100 Subject: [PATCH 08/12] more tests - ededup and noop --- ...dedup_pipeline_local_python_incremental.py | 49 +++++++++++++++++++ .../src/ededup_pipeline_transform_python.py | 43 ++++++++++++++++ .../test/test_ededup_pipeline_python.py | 35 +++++++++++++ .../python/src/noop_pipeline_local_python.py | 45 +++++++++++++++++ .../src/noop_pipeline_transform_python.py | 43 ++++++++++++++++ .../python/test/test_noop_pipeline_python.py | 47 ++++++++++++++++++ 6 files changed, 262 insertions(+) create mode 100644 transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py create mode 100644 transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py create mode 100644 transforms/universal/ededup/python/test/test_ededup_pipeline_python.py create mode 100644 transforms/universal/noop/python/src/noop_pipeline_local_python.py create mode 100644 transforms/universal/noop/python/src/noop_pipeline_transform_python.py create mode 100644 transforms/universal/noop/python/test/test_noop_pipeline_python.py diff --git a/transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py b/transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py new file mode 100644 index 000000000..170c248db --- /dev/null +++ b/transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from ededup_pipeline_transform_python import EdedupPypelinePythonTransformConfiguration +from ededup_transform_base import ( + doc_column_name_cli_param, + int_column_name_cli_param, +) + + +# create launcher +launcher = PythonTransformLauncher(EdedupPypelinePythonTransformConfiguration()) +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # ededup parameters
+    doc_column_name_cli_param: "contents",
+    int_column_name_cli_param: "document_id",
+}
+sys.argv = ParamsUtils.dict_to_req(d=params)
+
+# launch
+launcher.launch()
diff --git a/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py b/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py
new file mode 100644
index 000000000..a10d3f06e
--- /dev/null
+++ b/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing.utils import get_logger
+from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration

+logger = get_logger(__name__)
+
+
+class EdedupPypelinePythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Implements the PythonTransformRuntimeConfiguration for an ededup pipeline as
+    required by the PythonTransformLauncher. Ededup does not use a RayRuntime class
+    so the superclass only needs the base python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=
+                         PipelineTransformConfiguration({"transforms": [EdedupPythonTransformRuntimeConfiguration()]}))
+
+
+if __name__ == "__main__":
+    launcher = PythonTransformLauncher(EdedupPypelinePythonTransformConfiguration())
+    logger.info("Launching ededup pipeline transform")
+    launcher.launch()
diff --git a/transforms/universal/ededup/python/test/test_ededup_pipeline_python.py b/transforms/universal/ededup/python/test/test_ededup_pipeline_python.py
new file mode 100644
index 000000000..81f09c4e7
--- /dev/null
+++ b/transforms/universal/ededup/python/test/test_ededup_pipeline_python.py
@@ -0,0 +1,35 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
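# An aside (not part of the patch): because PipelineTransformConfiguration delegates
# add_input_params()/apply_input_params() to every child configuration, a one-element
# pipeline such as the ededup wrapper above accepts exactly the same CLI args as the
# wrapped transform. A minimal sketch of driving it programmatically, reusing the
# ParamsUtils pattern from ededup_pipeline_local_python.py (data-access and runtime
# keys omitted for brevity):
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from ededup_pipeline_transform_python import EdedupPypelinePythonTransformConfiguration
from ededup_transform_base import doc_column_name_cli_param, int_column_name_cli_param

sys.argv = ParamsUtils.dict_to_req(d={
    doc_column_name_cli_param: "contents",     # same ededup CLI keys as the local script above
    int_column_name_cli_param: "document_id",
})
PythonTransformLauncher(EdedupPypelinePythonTransformConfiguration()).launch()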
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from ededup_pipeline_transform_python import EdedupPypelinePythonTransformConfiguration
+from ededup_transform_base import doc_column_name_cli_param, int_column_name_cli_param
+
+
+class TestEdedupPypelinePythonTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        # Runs the single-transform ededup pipeline over the standard ededup test data
+        fixtures = []
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        launcher = PythonTransformLauncher(EdedupPypelinePythonTransformConfiguration())
+        config = {doc_column_name_cli_param: "contents", int_column_name_cli_param: "document_id"}
+        return [(launcher, config, basedir + "/input", basedir + "/expected")]
diff --git a/transforms/universal/noop/python/src/noop_pipeline_local_python.py b/transforms/universal/noop/python/src/noop_pipeline_local_python.py
new file mode 100644
index 000000000..c3d2b648d
--- /dev/null
+++ b/transforms/universal/noop/python/src/noop_pipeline_local_python.py
@@ -0,0 +1,45 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from noop_pipeline_transform_python import NOOPPypelinePythonTransformConfiguration
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # Data access.
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # noop params
+    "noop_sleep_sec": 1,
+}
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = PythonTransformLauncher(runtime_config=NOOPPypelinePythonTransformConfiguration())
+    # Launch the transform to process the input
+    launcher.launch()
diff --git a/transforms/universal/noop/python/src/noop_pipeline_transform_python.py b/transforms/universal/noop/python/src/noop_pipeline_transform_python.py
new file mode 100644
index 000000000..381f13149
--- /dev/null
+++ b/transforms/universal/noop/python/src/noop_pipeline_transform_python.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing.utils import get_logger
+from noop_transform_python import NOOPPythonTransformConfiguration
+
+logger = get_logger(__name__)
+
+
+class NOOPPypelinePythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Implements the PythonTransformRuntimeConfiguration for a pipelined NOOP as
+    required by the PythonTransformLauncher. NOOP does not use a RayRuntime class
+    so the superclass only needs the base python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=
+                         PipelineTransformConfiguration({"transforms": [NOOPPythonTransformConfiguration()]}))
+
+
+if __name__ == "__main__":
+    # launcher = NOOPRayLauncher()
+    launcher = PythonTransformLauncher(NOOPPypelinePythonTransformConfiguration())
+    logger.info("Launching noop pipeline transform")
+    launcher.launch()
diff --git a/transforms/universal/noop/python/test/test_noop_pipeline_python.py b/transforms/universal/noop/python/test/test_noop_pipeline_python.py
new file mode 100644
index 000000000..d0fec66a8
--- /dev/null
+++ b/transforms/universal/noop/python/test/test_noop_pipeline_python.py
@@ -0,0 +1,47 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from noop_transform import sleep_cli_param +from noop_pipeline_transform_python import NOOPPypelinePythonTransformConfiguration + + +class TestPythonNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + fixtures = [] + + launcher = PythonTransformLauncher(NOOPPypelinePythonTransformConfiguration()) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + transform_config = {sleep_cli_param: 0} + fixtures.append( + ( + launcher, + transform_config, + input_dir, + expected_dir, + [], # optional list of column names to ignore in comparing test-generated with expected. + ) + ) + + return fixtures From d187dba4db4d3240ff6857372cbada1adc89ee0a Mon Sep 17 00:00:00 2001 From: blublinsky Date: Mon, 23 Sep 2024 10:17:00 +0100 Subject: [PATCH 09/12] add Ray implementation --- .../test_support/transform/noop_transform.py | 5 +- .../transform/pipeline_transform.py | 14 ++- .../transform/resize_transform.py | 5 +- .../transform/pipeline_transform.py | 19 ++-- .../pipeline_transform_configuration.py | 11 ++- .../pure_python/pipeline_transform.py | 20 ++-- .../runtime/ray/transform_statistics.py | 8 ++ .../transform/ray/__init__.py | 1 + .../transform/ray/pipeline_transform.py | 52 ++++++++++ ...tal.py => ededup_pipeline_local_python.py} | 0 .../src/ededup_pipeline_transform_python.py | 11 ++- .../src/noop_pipeline_transform_python.py | 11 +-- .../python/test-data/expected/metadata.json | 90 +++++++++--------- .../python/test-data/expected/test1.parquet | Bin 753 -> 759 bytes .../python/test/test_noop_pipeline_python.py | 2 +- .../noop/ray/src/noop_pipeline_local_ray.py | 47 +++++++++ .../ray/src/noop_pipeline_transform_ray.py | 42 ++++++++ .../noop/ray/test/test_noop_pipeline_ray.py | 48 ++++++++++ 18 files changed, 288 insertions(+), 98 deletions(-) create mode 100644 data-processing-lib/ray/src/data_processing_ray/transform/ray/__init__.py create mode 100644 data-processing-lib/ray/src/data_processing_ray/transform/ray/pipeline_transform.py rename transforms/universal/ededup/python/src/{ededup_pipeline_local_python_incremental.py => ededup_pipeline_local_python.py} (100%) create mode 100644 transforms/universal/noop/ray/src/noop_pipeline_local_ray.py create mode 100644 transforms/universal/noop/ray/src/noop_pipeline_transform_ray.py create mode 100644 transforms/universal/noop/ray/test/test_noop_pipeline_ray.py diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 0dee013a4..e87481a4f 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -15,10 +15,7 @@ from typing import Any 
import pyarrow as pa -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( - PythonTransformRuntimeConfiguration, -) +from data_processing.runtime.pure_python import PythonTransformLauncher, PythonTransformRuntimeConfiguration from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, get_logger diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py index 591f679b8..b7afdc2e7 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/pipeline_transform.py @@ -10,10 +10,8 @@ # limitations under the License. ################################################################################ -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( - PythonTransformRuntimeConfiguration, -) +from data_processing.runtime.pure_python import PythonTransformLauncher, PythonTransformRuntimeConfiguration +from data_processing.transform.pure_python import PythonPipelineTransform from data_processing.transform import PipelineTransformConfiguration from data_processing.utils import get_logger from data_processing.test_support.transform import NOOPPythonTransformConfiguration, ResizePythonTransformConfiguration @@ -32,10 +30,10 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config= - PipelineTransformConfiguration({"transforms": [ResizePythonTransformConfiguration(), - NOOPPythonTransformConfiguration()]}) - ) + super().__init__(transform_config=PipelineTransformConfiguration( + config={"transforms": [ResizePythonTransformConfiguration(), + NOOPPythonTransformConfiguration()]}, + transform_class=PythonPipelineTransform)) if __name__ == "__main__": diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py index 7247ee3bc..96c43830b 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/resize_transform.py @@ -22,10 +22,7 @@ UnrecoverableException, get_logger, ) -from data_processing.runtime.pure_python.runtime_configuration import ( - PythonTransformRuntimeConfiguration, -) -from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher, PythonTransformRuntimeConfiguration logger = get_logger(__name__) diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py index e7eb349b6..5a410ceba 100644 --- a/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform.py @@ -24,9 +24,8 @@ class AbstractPipelineTransform(AbstractBinaryTransform): def __init__(self, config: dict[str, Any]): """ Initializes pipeline execution for the list of transforms - :param config - configuration parameters - dictionary of transforms in the 
pipeline. - Note that transforms will be executed - be executed + :param config - configuration parameters - list of transforms in the pipeline. + Note that transforms will be executed in the order they are defined """ super().__init__({}) self.logger = get_logger(__name__) @@ -95,8 +94,8 @@ def _convert_output(data: list[tuple[bytes, str]]) -> list[tuple[bytes, str]]: res = [None] * len(data) i = 0 for dt in data: - fname = TransformUtils.get_file_extension(dt[1]) - res[i] = (dt[0], fname[1]) + f_name = TransformUtils.get_file_extension(dt[1]) + res[i] = (dt[0], f_name[1]) i += 1 return res @@ -157,15 +156,13 @@ def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: res += out_files i += 1 # Done flushing, compute execution stats - for _, runtime in self.participants: - self._compute_execution_stats(runtime=runtime, st=stats) + self._compute_execution_statistics(stats) return res, {} - def _compute_execution_stats(self, runtime: BaseTransformRuntime, st: dict[str, Any]) -> None: + def _compute_execution_statistics(self, stats: dict[str, Any]) -> None: """ - get transform parameters - :param runtime - runtime - :param st - statistics + Compute execution statistics + :param stats: current statistics from flush :return: None """ raise NotImplemented("must be implemented by subclass") diff --git a/data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py index 8416c2884..8e5bb4097 100644 --- a/data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/pipeline_transform_configuration.py @@ -13,8 +13,7 @@ from typing import Any from argparse import ArgumentParser, Namespace -from data_processing.transform import TransformConfiguration -from data_processing.transform.pure_python import PythonPipelineTransform +from data_processing.transform import TransformConfiguration, AbstractPipelineTransform from data_processing.utils import get_logger logger = get_logger(__name__) @@ -27,10 +26,14 @@ class PipelineTransformConfiguration(TransformConfiguration): configuration with CLI args. 
""" - def __init__(self, config: dict[str, Any]): + def __init__( + self, + config: dict[str, Any], + transform_class: type[AbstractPipelineTransform], + ): super().__init__( name="pipeline", - transform_class=PythonPipelineTransform, + transform_class=transform_class, ) self.params = config diff --git a/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py b/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py index d52e3a0bb..922c71683 100644 --- a/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/pure_python/pipeline_transform.py @@ -12,7 +12,7 @@ from typing import Any from data_processing.transform import AbstractPipelineTransform -from data_processing.transform import TransformRuntimeConfiguration, BaseTransformRuntime +from data_processing.transform import BaseTransformRuntime class PythonPipelineTransform(AbstractPipelineTransform): @@ -24,9 +24,8 @@ class PythonPipelineTransform(AbstractPipelineTransform): def __init__(self, config: dict[str, Any]): """ Initializes pipeline execution for the list of transforms - :param config - configuration parameters - :param transforms - list of transforms in the pipeline. Note that transforms will - be executed + :param config - configuration parameters - list of transforms in the pipeline. + Note that transforms will be executed in the order they are defined """ super().__init__(config) @@ -39,13 +38,12 @@ def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any] return runtime.get_transform_config(data_access_factory=self.data_access_factory, statistics=self.statistics, files=[]) - def _compute_execution_stats(self, runtime: BaseTransformRuntime, st: dict[str, Any]) -> None: + def _compute_execution_statistics(self, stats: dict[str, Any]) -> None: """ - get transform parameters - :param runtime - runtime - :param st - statistics + Compute execution statistics + :param stats: current statistics from flush :return: None """ - self.statistics.add_stats(st) - runtime.compute_execution_stats(stats=self.statistics) - return + self.statistics.add_stats(stats) + for _, runtime in self.participants: + runtime.compute_execution_stats(stats=self.statistics) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py index 2095820c8..b8ae35990 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_statistics.py @@ -74,3 +74,11 @@ def add_stats(self, stats=dict[str, Any]) -> None: self.transform_exceptions_counter.inc(val) if key == "data access retries": self.data_retries_counter.inc(val) + + def update_stats(self, stats=dict[str, Any]) -> None: + """ + Update (overwrite) statistics + :param stats - dictionary creating new statistics + :return: None + """ + self.stats = stats \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/transform/ray/__init__.py b/data-processing-lib/ray/src/data_processing_ray/transform/ray/__init__.py new file mode 100644 index 000000000..8339074af --- /dev/null +++ b/data-processing-lib/ray/src/data_processing_ray/transform/ray/__init__.py @@ -0,0 +1 @@ +from data_processing_ray.transform.ray.pipeline_transform import RayPipelineTransform diff --git 
a/data-processing-lib/ray/src/data_processing_ray/transform/ray/pipeline_transform.py b/data-processing-lib/ray/src/data_processing_ray/transform/ray/pipeline_transform.py new file mode 100644 index 000000000..433f76d7d --- /dev/null +++ b/data-processing-lib/ray/src/data_processing_ray/transform/ray/pipeline_transform.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any +import ray +from data_processing.transform import AbstractPipelineTransform +from data_processing.transform import BaseTransformRuntime + + +class RayPipelineTransform(AbstractPipelineTransform): + """ + Transform that executes a set of base transforms sequentially. Data is passed between + participating transforms in memory + """ + + def __init__(self, config: dict[str, Any]): + """ + Initializes pipeline execution for the list of transforms + :param config - configuration parameters - list of transforms in the pipeline. + Note that transforms will be executed in the order they are defined + """ + super().__init__(config) + + def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any]: + """ + get transform parameters + :param runtime - runtime + :return: transform params + """ + return runtime.get_transform_config(data_access_factory=self.data_access_factory, + statistics=self.statistics, files=[]) + + def _compute_execution_statistics(self, stats: dict[str, Any]) -> None: + """ + Compute execution statistics + :param stats: current statistics from flush + :return: None + """ + current = ray.get(self.statistics.get_execution_stats.remote()) + current |= stats + for _, runtime in self.participants: + current = runtime.compute_execution_stats(stats=current) + ray.get(self.statistics.update_stats.remote(current)) diff --git a/transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py b/transforms/universal/ededup/python/src/ededup_pipeline_local_python.py similarity index 100% rename from transforms/universal/ededup/python/src/ededup_pipeline_local_python_incremental.py rename to transforms/universal/ededup/python/src/ededup_pipeline_local_python.py diff --git a/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py b/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py index a10d3f06e..99e0ca487 100644 --- a/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py +++ b/transforms/universal/ededup/python/src/ededup_pipeline_transform_python.py @@ -10,10 +10,11 @@ # limitations under the License. 
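# An aside (not part of the patch): RayPipelineTransform above can only reach the shared
# statistics through a Ray actor handle, hence the get/merge/update round-trip. A minimal
# self-contained sketch of that pattern (assumes ray is installed; the actor below is a
# stand-in for the library's statistics actor):
import ray


@ray.remote
class Statistics:
    def __init__(self):
        self.stats = {}

    def get_execution_stats(self) -> dict:
        return self.stats

    def update_stats(self, stats: dict) -> None:
        self.stats = stats


ray.init(ignore_reinit_error=True)
statistics = Statistics.remote()
current = ray.get(statistics.get_execution_stats.remote())  # fetch a copy from the actor
current |= {"result_files": 3}                              # merge flush-time results locally
ray.get(statistics.update_stats.remote(current))            # write the merged dict back
print(ray.get(statistics.get_execution_stats.remote()))     # {'result_files': 3}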
################################################################################ -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( +from data_processing.runtime.pure_python import ( + PythonTransformLauncher, PythonTransformRuntimeConfiguration, ) +from data_processing.transform.pure_python import PythonPipelineTransform from data_processing.transform import PipelineTransformConfiguration from data_processing.utils import get_logger from ededup_transform_python import EdedupPythonTransformRuntimeConfiguration @@ -32,8 +33,10 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config= - PipelineTransformConfiguration({"transforms": [EdedupPythonTransformRuntimeConfiguration()]})) + super().__init__( + transform_config=PipelineTransformConfiguration( + config={"transforms": [EdedupPythonTransformRuntimeConfiguration()]}, + transform_class=PythonPipelineTransform)) if __name__ == "__main__": diff --git a/transforms/universal/noop/python/src/noop_pipeline_transform_python.py b/transforms/universal/noop/python/src/noop_pipeline_transform_python.py index 381f13149..2f0d9c936 100644 --- a/transforms/universal/noop/python/src/noop_pipeline_transform_python.py +++ b/transforms/universal/noop/python/src/noop_pipeline_transform_python.py @@ -10,10 +10,8 @@ # limitations under the License. ################################################################################ -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( - PythonTransformRuntimeConfiguration, -) +from data_processing.runtime.pure_python import PythonTransformLauncher, PythonTransformRuntimeConfiguration +from data_processing.transform.pure_python import PythonPipelineTransform from data_processing.transform import PipelineTransformConfiguration from data_processing.utils import get_logger from noop_transform_python import NOOPPythonTransformConfiguration @@ -32,8 +30,9 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config= - PipelineTransformConfiguration({"transforms": [NOOPPythonTransformConfiguration()]})) + super().__init__(transform_config=PipelineTransformConfiguration( + config={"transforms": [NOOPPythonTransformConfiguration()]}, + transform_class=PythonPipelineTransform)) if __name__ == "__main__": diff --git a/transforms/universal/noop/python/test-data/expected/metadata.json b/transforms/universal/noop/python/test-data/expected/metadata.json index eed590d79..ca124d145 100644 --- a/transforms/universal/noop/python/test-data/expected/metadata.json +++ b/transforms/universal/noop/python/test-data/expected/metadata.json @@ -1,46 +1,46 @@ { - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "NOOP", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-01 15:17:56", - "end_time": "2024-03-01 15:17:57", - "status": "success" - }, - "code": [null], - "job_input_params": { - "sleep": 0, - "checkpointing": false, - "max_files": -1, - "number of workers": 1, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 10, - "gpus": 0, - "memory": 14.031964112073183, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 16534, - "result_files": 1, - "result_size": 16534, - "table_processing": 0.012392997741699219, - "nfiles": 1, - "nrows": 5 - }, - "source": 
{ - "name": "test-data/data_processing/ray/noop/input", - "type": "path" - }, - "target": { - "name": "/tmp/NOOP4o9gv2bq", - "type": "path" - } -} + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "noop", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-09-22 21:58:36", + "end_time": "2024-09-22 21:58:37", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "sleep_sec": 1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".parquet" + ], + "num_processors": 0 + }, + "job_output_stats": { + "source_files": 1, + "source_size": 753, + "result_files": 1, + "result_size": 759, + "processing_time": 1.093, + "nfiles": 1, + "nrows": 7, + "source_doc_count": 7, + "result_doc_count": 7 + }, + "source": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/universal/noop/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/borisl/Projects/data-prep-kit/transforms/universal/noop/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/universal/noop/python/test-data/expected/test1.parquet b/transforms/universal/noop/python/test-data/expected/test1.parquet index 7baa6d82f33adfa2fecb41f053f7e9a250daea06..043fb7aad775ee3768b9ef74b9f91c341b0b010c 100644 GIT binary patch delta 267 zcmey!`kl2tz%j^BltuIhljt)>A5lJ01|bFpjjj5B750QOGh8|Z0f06}MBTN2u(-qKd0@ zfYe0pHO_0aB2F|+KRDN$N-J+WMp6fg#ZAqVNhED delta 261 zcmey)`jNFhz%j^BltuIhljsXZA5lJ01|bH9J&Z2A3=9kzB_##LR{Ht{`Pr#O0(OE7 zJbGzGqHZD#68ZUhNr^?mvXVN9Ny&PtY3X{&`MC=d)wgvor2O>zX3tYb`I bJT;l0>8h}ao`If`q>K#E_)bO!29VzYgB?Jt diff --git a/transforms/universal/noop/python/test/test_noop_pipeline_python.py b/transforms/universal/noop/python/test/test_noop_pipeline_python.py index d0fec66a8..acb1b3f06 100644 --- a/transforms/universal/noop/python/test/test_noop_pipeline_python.py +++ b/transforms/universal/noop/python/test/test_noop_pipeline_python.py @@ -20,7 +20,7 @@ from noop_pipeline_transform_python import NOOPPypelinePythonTransformConfiguration -class TestPythonNOOPTransform(AbstractTransformLauncherTest): +class TestPythonNOOPPipelineTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. diff --git a/transforms/universal/noop/ray/src/noop_pipeline_local_ray.py b/transforms/universal/noop/ray/src/noop_pipeline_local_ray.py new file mode 100644 index 000000000..aca0f6cd3 --- /dev/null +++ b/transforms/universal/noop/ray/src/noop_pipeline_local_ray.py @@ -0,0 +1,47 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+import os
+import sys
+
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from data_processing.utils import ParamsUtils
+from noop_pipeline_transform_ray import NOOPPypelineRayTransformConfiguration
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # where to run
+    "run_locally": True,
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # noop params
+    "noop_sleep_sec": 1,
+}
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(runtime_config=NOOPPypelineRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/noop/ray/src/noop_pipeline_transform_ray.py b/transforms/universal/noop/ray/src/noop_pipeline_transform_ray.py
new file mode 100644
index 000000000..00803049a
--- /dev/null
+++ b/transforms/universal/noop/ray/src/noop_pipeline_transform_ray.py
@@ -0,0 +1,42 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing_ray.runtime.ray import RayTransformLauncher, RayTransformRuntimeConfiguration
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing_ray.transform.ray import RayPipelineTransform
+from data_processing.utils import get_logger
+from noop_transform_ray import NOOPRayTransformConfiguration
+
+logger = get_logger(__name__)
+
+
+class NOOPPypelineRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the NOOP pipeline, as required
+    by the RayTransformLauncher. It wraps the base NOOP Ray configuration in a
+    single-step pipeline executed by RayPipelineTransform.
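+
+    The transforms listed in the configuration run in the order given, with the
+    output tables of one step passed to the next in memory.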
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=PipelineTransformConfiguration( + config={"transforms": [NOOPRayTransformConfiguration()]}, + transform_class=RayPipelineTransform)) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(NOOPPypelineRayTransformConfiguration()) + logger.info("Launching resize/noop transform") + launcher.launch() diff --git a/transforms/universal/noop/ray/test/test_noop_pipeline_ray.py b/transforms/universal/noop/ray/test/test_noop_pipeline_ray.py new file mode 100644 index 000000000..bc87a1af5 --- /dev/null +++ b/transforms/universal/noop/ray/test/test_noop_pipeline_ray.py @@ -0,0 +1,48 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from noop_transform import sleep_cli_param +from noop_pipeline_transform_ray import NOOPPypelineRayTransformConfiguration + + +class TestRayNOOPPipelineTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + fixtures = [] + + launcher = RayTransformLauncher(NOOPPypelineRayTransformConfiguration()) + input_dir = os.path.join(src_file_dir, "../test-data/input") + expected_dir = os.path.join(src_file_dir, "../test-data/expected") + runtime_config = {"run_locally": True} + transform_config = {sleep_cli_param: 0} + fixtures.append( + ( + launcher, + transform_config | runtime_config, + input_dir, + expected_dir, + [], # optional list of column names to ignore in comparing test-generated with expected. 
+ ) + ) + + return fixtures From 26978f18130f3e6c2b2be90d3f97eb79b9045621 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Mon, 23 Sep 2024 10:52:54 +0100 Subject: [PATCH 10/12] add ededup Ray sample --- .../ray/src/ededup_pipeline_local_ray.py | 55 +++++++++++++++++++ .../ray/src/ededup_pipeline_transform_ray.py | 42 ++++++++++++++ .../ray/test/test_ededup_pipeline_ray.py | 43 +++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 transforms/universal/ededup/ray/src/ededup_pipeline_local_ray.py create mode 100644 transforms/universal/ededup/ray/src/ededup_pipeline_transform_ray.py create mode 100644 transforms/universal/ededup/ray/test/test_ededup_pipeline_ray.py diff --git a/transforms/universal/ededup/ray/src/ededup_pipeline_local_ray.py b/transforms/universal/ededup/ray/src/ededup_pipeline_local_ray.py new file mode 100644 index 000000000..d8c3b09c8 --- /dev/null +++ b/transforms/universal/ededup/ray/src/ededup_pipeline_local_ray.py @@ -0,0 +1,55 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from ededup_pipeline_transform_ray import EdedupPypelineRayTransformConfiguration +from ededup_transform_base import doc_column_name_cli_param, int_column_name_cli_param +from ededup_transform_ray import hash_cpu_cli_params, num_hashes_cli_params + + +# create launcher +launcher = RayTransformLauncher(EdedupPypelineRayTransformConfiguration()) +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.5} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 2,
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # ededup parameters
+    hash_cpu_cli_params: 0.5,
+    num_hashes_cli_params: 2,
+    doc_column_name_cli_param: "contents",
+    int_column_name_cli_param: "document_id",
+}
+sys.argv = ParamsUtils.dict_to_req(d=params)
+
+# launch
+launcher.launch()
diff --git a/transforms/universal/ededup/ray/src/ededup_pipeline_transform_ray.py b/transforms/universal/ededup/ray/src/ededup_pipeline_transform_ray.py
new file mode 100644
index 000000000..5b2e1216d
--- /dev/null
+++ b/transforms/universal/ededup/ray/src/ededup_pipeline_transform_ray.py
@@ -0,0 +1,42 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing_ray.runtime.ray import RayTransformLauncher, RayTransformRuntimeConfiguration
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing_ray.transform.ray import RayPipelineTransform
+from data_processing.utils import get_logger
+from ededup_transform_ray import EdedupRayTransformRuntimeConfiguration
+
+logger = get_logger(__name__)
+
+
+class EdedupPypelineRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the ededup pipeline, as required
+    by the RayTransformLauncher. It wraps the base ededup Ray configuration in a
+    single-step pipeline executed by RayPipelineTransform.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=PipelineTransformConfiguration(
+            config={"transforms": [EdedupRayTransformRuntimeConfiguration()]},
+            transform_class=RayPipelineTransform))
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(EdedupPypelineRayTransformConfiguration())
+    logger.info("Launching ededup pipeline transform")
+    launcher.launch()
diff --git a/transforms/universal/ededup/ray/test/test_ededup_pipeline_ray.py b/transforms/universal/ededup/ray/test/test_ededup_pipeline_ray.py
new file mode 100644
index 000000000..d9460ef93
--- /dev/null
+++ b/transforms/universal/ededup/ray/test/test_ededup_pipeline_ray.py
@@ -0,0 +1,43 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from ededup_pipeline_transform_ray import EdedupPypelineRayTransformConfiguration
+from ededup_transform_base import doc_column_name_cli_param, int_column_name_cli_param
+from ededup_transform_ray import hash_cpu_cli_params, num_hashes_cli_params
+
+
+class TestRayEdedupPipelineTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        config = {
+            "run_locally": True,
+            # ededup parameters; the local data access configuration is supplied by the
+            # test harness from the fixture directories, so it is not specified here.
+            hash_cpu_cli_params: 0.5,
+            num_hashes_cli_params: 2,
+            doc_column_name_cli_param: "contents",
+            int_column_name_cli_param: "document_id",
+        }
+        launcher = RayTransformLauncher(EdedupPypelineRayTransformConfiguration())
+        fixtures = [(launcher, config, basedir + "/input", basedir + "/expected")]
+        return fixtures

From 12e5ef00020045568d6ff1ef5d143d32872535a2 Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Mon, 23 Sep 2024 12:49:47 +0100
Subject: [PATCH 11/12] completed Ray tests

---
 .../transform/test_noop.py                    | 38 ++--
 .../transform/test_resize.py                  |  3 +-
 ...e_noop.py => test_resize_noop_pipeline.py} |  0
 .../test_support/transform/__init__.py        |  2 +
 .../transform/pipeline_transform.py           | 43 ++++
 .../transform/resize_transform.py             | 214 ++++++++++++++++++
 .../launch/ray/ray_test_resize.py             | 55 +++++
 .../ray/ray_test_resize_noop_pipeline.py      | 55 +++++
 8 files changed, 393 insertions(+), 17 deletions(-)
 rename data-processing-lib/python/test/data_processing_tests/transform/{test_resize_noop.py => test_resize_noop_pipeline.py} (100%)
 create mode 100644 data-processing-lib/ray/src/data_processing_ray/test_support/transform/pipeline_transform.py
 create mode 100644 data-processing-lib/ray/src/data_processing_ray/test_support/transform/resize_transform.py
 create mode 100644 data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize.py
 create mode 100644 data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize_noop_pipeline.py

diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py
index caf1c60f6..0e5b0396a 100644
--- a/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py
+++ b/data-processing-lib/python/test/data_processing_tests/transform/test_noop.py
@@ -9,28 +9,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
################################################################################ - -import pyarrow as pa -from data_processing.test_support.transform import NOOPTransform -from data_processing.test_support.transform import ( - AbstractTableTransformTest, +import os +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, ) +from data_processing.test_support.transform.noop_transform import NOOPPythonTransformConfiguration, sleep_cli_param - -table = pa.Table.from_pydict({"name": pa.array(["Tom", "Dick", "Harry"]), "age": pa.array([0, 1, 2])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 3}, {}] # transform() result # flush() result - - -class TestNOOPTransform(AbstractTableTransformTest): +class TestPythonNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. """ def get_test_transform_fixtures(self) -> list[tuple]: - fixtures = [ - (NOOPTransform({"sleep": 0}), [table], [expected_table], expected_metadata_list), - (NOOPTransform({"sleep": 0}), [table], [expected_table], expected_metadata_list), - ] + src_file_dir = os.path.abspath(os.path.dirname(__file__)) + fixtures = [] + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), + "../../../../../transforms/universal/noop/python/test-data")) + launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration()) + transform_config = {sleep_cli_param: 0} + fixtures.append( + ( + launcher, + transform_config, + basedir + "/input", + basedir + "/expected", + [], # optional list of column names to ignore in comparing test-generated with expected. 
+ ) + ) + return fixtures diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_resize.py b/data-processing-lib/python/test/data_processing_tests/transform/test_resize.py index 61ec43c50..d90a5fa8e 100644 --- a/data-processing-lib/python/test/data_processing_tests/transform/test_resize.py +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_resize.py @@ -27,7 +27,8 @@ class TestPythonResizeTransform(AbstractTransformLauncherTest): def get_test_transform_fixtures(self) -> list[tuple]: # The following based on 3 identical input files of about 39kbytes, and 200 rows fixtures = [] - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../../../transforms/universal/resize/python/test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), + "../../../../../transforms/universal/resize/python/test-data")) launcher = PythonTransformLauncher(ResizePythonTransformConfiguration()) # Split into 4 or so files diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop_pipeline.py similarity index 100% rename from data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop.py rename to data-processing-lib/python/test/data_processing_tests/transform/test_resize_noop_pipeline.py diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py index a6cd700f7..7ba7d4c16 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py @@ -1 +1,3 @@ from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration +from data_processing_ray.test_support.transform.resize_transform import ResizeRayTransformConfiguration +from data_processing_ray.test_support.transform.pipeline_transform import ResizeNOOPRayTransformConfiguration diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/pipeline_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/pipeline_transform.py new file mode 100644 index 000000000..32db42606 --- /dev/null +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/pipeline_transform.py @@ -0,0 +1,43 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+from data_processing_ray.runtime.ray import RayTransformLauncher, RayTransformRuntimeConfiguration
+from data_processing_ray.transform.ray import RayPipelineTransform
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing.utils import get_logger
+from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration, ResizeRayTransformConfiguration
+
+logger = get_logger(__name__)
+
+
+class ResizeNOOPRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for a two-step resize/NOOP
+    pipeline, as required by the RayTransformLauncher. The resize and NOOP
+    transforms are executed in order by RayPipelineTransform.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=PipelineTransformConfiguration(
+            config={"transforms": [ResizeRayTransformConfiguration(),
+                                   NOOPRayTransformConfiguration()]},
+            transform_class=RayPipelineTransform))
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(ResizeNOOPRayTransformConfiguration())
+    logger.info("Launching resize/noop transform")
+    launcher.launch()
diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/resize_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/resize_transform.py
new file mode 100644
index 000000000..19546950d
--- /dev/null
+++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/resize_transform.py
@@ -0,0 +1,214 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from argparse import ArgumentParser, Namespace
+from typing import Any
+
+import pyarrow as pa
+from data_processing.transform import AbstractTableTransform, TransformConfiguration
+from data_processing.utils import (
+    LOCAL_TO_DISK,
+    MB,
+    CLIArgumentProvider,
+    UnrecoverableException,
+    get_logger,
+)
+from data_processing_ray.runtime.ray import RayTransformLauncher, RayTransformRuntimeConfiguration
+
+
+logger = get_logger(__name__)
+
+max_rows_per_table_key = "max_rows_per_table"
+max_mbytes_per_table_key = "max_mbytes_per_table"
+size_type_key = "size_type"
+shortname = "resize"
+cli_prefix = f"{shortname}_"
+max_rows_per_table_cli_param = f"{cli_prefix}{max_rows_per_table_key}"
+max_mbytes_per_table_cli_param = f"{cli_prefix}{max_mbytes_per_table_key}"
+size_type_cli_param = f"{cli_prefix}{size_type_key}"
+size_type_disk = "disk"
+size_type_memory = "memory"
+size_type_default = size_type_disk
+
+
+class ResizeTransform(AbstractTableTransform):
+    """
+    Implements splitting large files into smaller ones.
+ Two flavours of splitting are supported - based on the amount of documents and based on the size + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + """ + super().__init__(config) + self.max_rows_per_table = config.get(max_rows_per_table_key, 0) + self.max_bytes_per_table = MB * config.get(max_mbytes_per_table_key, 0) + disk_memory = config.get(size_type_key, size_type_default) + if size_type_default in disk_memory: + self.max_bytes_per_table *= LOCAL_TO_DISK + + self.logger.debug(f"max bytes = {self.max_bytes_per_table}") + self.logger.debug(f"max rows = {self.max_rows_per_table}") + self.buffer = None + if self.max_rows_per_table <= 0 and self.max_bytes_per_table <= 0: + raise ValueError("Neither max rows per table nor max table size are defined") + if self.max_rows_per_table > 0 and self.max_bytes_per_table > 0: + raise ValueError("Both max rows per table and max table size are defined. Only one should be present") + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + """ + split larger files into the smaller ones + :param table: table + :param file_name: name of the file + :return: resulting set of tables + """ + self.logger.debug(f"got new table with {table.num_rows} rows") + if self.buffer is not None: + try: + self.logger.debug( + f"concatenating buffer with {self.buffer.num_rows} rows to table with {table.num_rows} rows" + ) + # table = pa.concat_tables([self.buffer, table], unicode_promote_options="permissive") + table = pa.concat_tables([self.buffer, table]) + self.buffer = None + self.logger.debug(f"concatenated table has {table.num_rows} rows") + except Exception as _: # Can happen if schemas are different + # Raise unrecoverable error to stop the execution + self.logger.warning(f"table in {file_name} can't be merged with the buffer") + self.logger.warning(f"incoming table columns {table.schema.names} ") + self.logger.warning(f"buffer columns {self.buffer.schema.names}") + raise UnrecoverableException() + + result = [] + start_row = 0 + if self.max_rows_per_table > 0: + # split file with max documents + n_rows = table.num_rows + rows_left = n_rows + while start_row < n_rows and rows_left >= self.max_rows_per_table: + length = n_rows - start_row + if length > self.max_rows_per_table: + length = self.max_rows_per_table + a_slice = table.slice(offset=start_row, length=length) + self.logger.debug(f"created table slice with {a_slice.num_rows} rows, starting with row {start_row}") + result.append(a_slice) + start_row = start_row + self.max_rows_per_table + rows_left = rows_left - self.max_rows_per_table + else: + # split based on size + current_size = 0.0 + if table.nbytes >= self.max_bytes_per_table: + for n in range(table.num_rows): + current_size += table.slice(offset=n, length=1).nbytes + if current_size >= self.max_bytes_per_table: + self.logger.debug(f"capturing slice, current_size={current_size}") + # Reached the size + a_slice = table.slice(offset=start_row, length=(n - start_row)) + result.append(a_slice) + start_row = n + current_size = 0.0 + if start_row < table.num_rows: + # buffer remaining chunk for next call + self.logger.debug(f"Buffering table starting at row {start_row}") + self.buffer = table.slice(offset=start_row, length=(table.num_rows - start_row)) + self.logger.debug(f"buffered table has {self.buffer.num_rows} rows") + self.logger.debug(f"returning {len(result)} tables") + return result, {} + + def flush(self) -> 
tuple[list[pa.Table], dict[str, Any]]:
+        """
+        Flush the table, if any, still buffered from previous transform() calls.
+        :return: the buffered table, if present, and empty metadata
+        """
+        result = []
+        if self.buffer is not None:
+            self.logger.debug(f"flushing buffered table with {self.buffer.num_rows} rows of size {self.buffer.nbytes}")
+            result.append(self.buffer)
+            self.buffer = None
+        else:
+            self.logger.debug("Empty buffer, nothing to flush.")
+        return result, {}
+
+
+class ResizeTransformConfiguration(TransformConfiguration):
+    """
+    Provides support for configuring and using the associated Transform class,
+    including configuration with CLI args and combining of metadata.
+    """
+
+    def __init__(self):
+        super().__init__(name=shortname, transform_class=ResizeTransform)
+
+    def add_input_params(self, parser: ArgumentParser) -> None:
+        """
+        Add Transform-specific arguments to the given parser.
+        This will be included in a dictionary used to initialize the ResizeTransform.
+        By convention a common prefix should be used for all transform-specific CLI args
+        (e.g., noop_, pii_, etc.)
+        """
+        parser.add_argument(
+            f"--{max_rows_per_table_cli_param}",
+            type=int,
+            default=-1,
+            help="Max number of rows per table",
+        )
+        parser.add_argument(
+            f"--{max_mbytes_per_table_cli_param}",
+            type=float,
+            default=-1,
+            help=f"Max table size (MB). Size is measured according to the --{size_type_cli_param} parameter",
+        )
+        parser.add_argument(
+            f"--{size_type_cli_param}",
+            type=str,
+            required=False,
+            default=size_type_default,
+            choices=[size_type_disk, size_type_memory],
+            help=f"Determines how memory is measured when using the --{max_mbytes_per_table_cli_param} option."
+            "\n'memory' measures the in-process memory footprint and \n'disk' makes an estimate of the resulting parquet file size.",
+        )
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+        :return: True if validation passes, or False otherwise
+        """
+        # Capture the args that are specific to this transform
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        self.params = self.params | captured
+        if self.params.get(max_rows_per_table_key) <= 0 and self.params.get(max_mbytes_per_table_key) <= 0:
+            logger.info("Neither max documents per table nor max table size are defined")
+            return False
+        if self.params.get(max_rows_per_table_key) > 0 and self.params.get(max_mbytes_per_table_key) > 0:
+            logger.info("Both max documents per table and max table size are defined. Only one should be present")
+            return False
+        logger.info(f"Split file parameters are : {self.params}")
+        return True
+
+
+class ResizeRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformConfiguration for resize as required by the RayTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=ResizeTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(ResizeRayTransformConfiguration())
+    logger.info("Launching resize transform")
+    launcher.launch()
diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize.py
new file mode 100644
index 000000000..4dbd121b9
--- /dev/null
+++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize.py
@@ -0,0 +1,55 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os + +from data_processing_ray.test_support.transform import ResizeRayTransformConfiguration +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestRayResizeTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + # The following based on 3 identical input files of about 39kbytes, and 200 rows + fixtures = [] + common_config = {"runtime_num_workers": 1, "run_locally": True} + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), + "../../../../../../transforms/universal/resize/ray/test-data")) + launcher = RayTransformLauncher(ResizeRayTransformConfiguration()) + + # Split into 4 or so files + config = {"resize_max_rows_per_table": 125} | common_config + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-125")) + + # Merge into 2 or so files + config = {"resize_max_rows_per_table": 300} | common_config + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-300")) + + # # Merge all into a single table + config = {"resize_max_mbytes_per_table": 1} | common_config + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-1")) + + # # Merge the 1st 2 and some of the 2nd with the 3rd + config = {"resize_max_mbytes_per_table": 0.05} | common_config + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.05")) + + # Split into 4 or so files + config = {"resize_max_mbytes_per_table": 0.02} | common_config + fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.02")) + + return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize_noop_pipeline.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize_noop_pipeline.py new file mode 100644 index 000000000..276fdfc98 --- /dev/null +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_resize_noop_pipeline.py @@ -0,0 +1,55 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+import os
+
+from data_processing_ray.test_support.transform import ResizeNOOPRayTransformConfiguration
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+
+
+class TestRayResizeNOOPPipelineTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        # The following based on 3 identical input files of about 39kbytes, and 200 rows
+        fixtures = []
+        common_config = {"runtime_num_workers": 1, "run_locally": True}
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                               "../../../../../../transforms/universal/resize/ray/test-data"))
+        launcher = RayTransformLauncher(ResizeNOOPRayTransformConfiguration())
+
+        # Split into 4 or so files
+        config = {"resize_max_rows_per_table": 125, "noop_sleep_sec": 1} | common_config
+        fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-125"))
+
+        # Merge into 2 or so files
+        config = {"resize_max_rows_per_table": 300, "noop_sleep_sec": 1} | common_config
+        fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-rows-300"))
+
+        # Merge all into a single table
+        config = {"resize_max_mbytes_per_table": 1, "noop_sleep_sec": 1} | common_config
+        fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-1"))
+
+        # Merge the 1st 2 and some of the 2nd with the 3rd
+        config = {"resize_max_mbytes_per_table": 0.05, "noop_sleep_sec": 1} | common_config
+        fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.05"))
+
+        # Split into 4 or so files
+        config = {"resize_max_mbytes_per_table": 0.02, "noop_sleep_sec": 1} | common_config
+        fixtures.append((launcher, config, basedir + "/input", basedir + "/expected-mbytes-0.02"))
+
+        return fixtures

From 5dc6f9396a89b932b775ade61ceb48ea5eb82ada Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Mon, 23 Sep 2024 16:20:36 +0100
Subject: [PATCH 12/12] Add Spark support and tests

---
 .../transform/spark/__init__.py               |  1 +
 .../transform/spark/pipeline_transform.py     | 50 +++++++++++++++++++
 .../spark/src/noop_pipeline_local_spark.py    | 45 +++++++++++++++++
 .../src/noop_pipeline_transform_spark.py      | 42 ++++++++++++++++
 .../spark/test/test_noop_pipeline_spark.py    | 34 +++++++++++++
 5 files changed, 172 insertions(+)
 create mode 100644 data-processing-lib/spark/src/data_processing_spark/transform/spark/__init__.py
 create mode 100644 data-processing-lib/spark/src/data_processing_spark/transform/spark/pipeline_transform.py
 create mode 100644 transforms/universal/noop/spark/src/noop_pipeline_local_spark.py
 create mode 100644 transforms/universal/noop/spark/src/noop_pipeline_transform_spark.py
 create mode 100644 transforms/universal/noop/spark/test/test_noop_pipeline_spark.py

diff --git a/data-processing-lib/spark/src/data_processing_spark/transform/spark/__init__.py b/data-processing-lib/spark/src/data_processing_spark/transform/spark/__init__.py
new file mode 100644
index 000000000..9a72e3503
--- /dev/null
+++ b/data-processing-lib/spark/src/data_processing_spark/transform/spark/__init__.py
@@ -0,0 +1 @@
+from data_processing_spark.transform.spark.pipeline_transform import SparkPipelineTransform
diff --git 
a/data-processing-lib/spark/src/data_processing_spark/transform/spark/pipeline_transform.py b/data-processing-lib/spark/src/data_processing_spark/transform/spark/pipeline_transform.py
new file mode 100644
index 000000000..c49551c6f
--- /dev/null
+++ b/data-processing-lib/spark/src/data_processing_spark/transform/spark/pipeline_transform.py
@@ -0,0 +1,50 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+
+from data_processing.runtime import BaseTransformRuntime
+from data_processing.transform import AbstractPipelineTransform
+
+
+class SparkPipelineTransform(AbstractPipelineTransform):
+    """
+    Transform that executes a set of base transforms sequentially. Data is passed between
+    participating transforms in memory.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initializes pipeline execution for the list of transforms.
+        :param config: configuration parameters - the list of transforms in the pipeline.
+        Note that transforms will be executed in the order they are defined.
+        """
+        self.partition = config.get("partition_index", 0)
+        super().__init__(config)
+
+    def _get_transform_params(self, runtime: BaseTransformRuntime) -> dict[str, Any]:
+        """
+        Get transform parameters.
+        :param runtime: runtime
+        :return: transform params
+        """
+        return runtime.get_transform_config(partition=self.partition,
+                                            data_access_factory=self.data_access_factory,
+                                            statistics=self.statistics)
+
+    def _compute_execution_statistics(self, stats: dict[str, Any]) -> None:
+        """
+        Compute execution statistics.
+        :param stats: current statistics from flush
+        :return: None
+        """
+        self.statistics.add_stats(stats)
+        for _, runtime in self.participants:
+            runtime.compute_execution_stats(stats=self.statistics)
\ No newline at end of file
diff --git a/transforms/universal/noop/spark/src/noop_pipeline_local_spark.py b/transforms/universal/noop/spark/src/noop_pipeline_local_spark.py
new file mode 100644
index 000000000..1d7eea850
--- /dev/null
+++ b/transforms/universal/noop/spark/src/noop_pipeline_local_spark.py
@@ -0,0 +1,45 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing_spark.runtime.spark import SparkTransformLauncher
+from data_processing.utils import ParamsUtils
+from noop_pipeline_transform_spark import NOOPPypelineSparkTransformConfiguration
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # noop params
+    "noop_sleep_sec": 1,
+}
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = SparkTransformLauncher(runtime_config=NOOPPypelineSparkTransformConfiguration())
+    # Launch Spark to process the input
+    launcher.launch()
diff --git a/transforms/universal/noop/spark/src/noop_pipeline_transform_spark.py b/transforms/universal/noop/spark/src/noop_pipeline_transform_spark.py
new file mode 100644
index 000000000..4c3d6718e
--- /dev/null
+++ b/transforms/universal/noop/spark/src/noop_pipeline_transform_spark.py
@@ -0,0 +1,42 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing_spark.runtime.spark import SparkTransformLauncher, SparkTransformRuntimeConfiguration
+from data_processing.transform import PipelineTransformConfiguration
+from data_processing_spark.transform.spark import SparkPipelineTransform
+from data_processing.utils import get_logger
+from noop_transform_spark import NOOPSparkTransformConfiguration
+
+logger = get_logger(__name__)
+
+
+class NOOPPypelineSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+    """
+    Implements the SparkTransformRuntimeConfiguration for the NOOP pipeline, as required
+    by the SparkTransformLauncher. It wraps the base NOOP Spark configuration in a
+    single-step pipeline executed by SparkPipelineTransform.
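+
+    SparkPipelineTransform additionally reads a "partition_index" value from its
+    configuration, so the step runtimes can be parameterized per Spark partition.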
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=PipelineTransformConfiguration( + config={"transforms": [NOOPSparkTransformConfiguration()]}, + transform_class=SparkPipelineTransform)) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = SparkTransformLauncher(NOOPPypelineSparkTransformConfiguration()) + logger.info("Launching resize/noop transform") + launcher.launch() diff --git a/transforms/universal/noop/spark/test/test_noop_pipeline_spark.py b/transforms/universal/noop/spark/test/test_noop_pipeline_spark.py new file mode 100644 index 000000000..03759b9e6 --- /dev/null +++ b/transforms/universal/noop/spark/test/test_noop_pipeline_spark.py @@ -0,0 +1,34 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher +from noop_pipeline_transform_spark import NOOPPypelineSparkTransformConfiguration + + +class TestSparkNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../test-data" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + fixtures = [] + launcher = SparkTransformLauncher(NOOPPypelineSparkTransformConfiguration()) + fixtures.append((launcher, {"noop_sleep_sec": 1}, basedir + "/input", basedir + "/expected")) + return fixtures