Merge pull request #140 from IBM/runtime-reorg
Restructure the repository to distinguish/separate runtime libraries
daw3rd authored May 17, 2024
2 parents b933ef0 + 7c49c4c commit 30d1538
Showing 94 changed files with 173 additions and 130 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build-library.yml
@@ -18,6 +18,6 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build data-processing-lib
- name: Build data-processing-lib/ray
run: |
make -C data-processing-lib DOCKER=docker venv build
make -C data-processing-lib/ray DOCKER=docker venv build
4 changes: 2 additions & 2 deletions .github/workflows/deploy-library.yml
@@ -13,7 +13,7 @@ permissions:

jobs:
build-package:
name: Build and check packages
name: Build Ray data processing library
runs-on: ubuntu-latest
steps:
- name: Checkout
@@ -23,7 +23,7 @@ jobs:
fetch-depth: 0
- name: Build Package for pypi
run: |
make -C data-processing-lib build
make -C data-processing-lib/ray build
publish-test-pypi:
name: Publish packages to test.pypi.org
# disabled
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -14,9 +14,9 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test data-processing-lib
- name: Test data-processing-lib/ray
run: |
make -C data-processing-lib DOCKER=docker venv test
make -C data-processing-lib/ray DOCKER=docker venv test
test-code:
runs-on: ubuntu-latest
steps:
12 changes: 6 additions & 6 deletions .make.defaults
@@ -44,7 +44,7 @@ DOCKER_IMAGE?=${DOCKER_REGISTRY_ENDPOINT}/$(DOCKER_NAME):$(DOCKER_IMAGE_VERSION)
include $(REPOROOT)/.make.versions

KIND_CLUSTER_NAME=dataprep

DPK_RAY_LIB_DIR=$(REPOROOT)/data-processing-lib/ray
#######################################################################################
# Lists all targets and optional help text found in the target.
# Adapted from https://stackoverflow.com/a/65243296/45375
@@ -175,9 +175,9 @@ __check_defined = \
rm -rf data-processing-lib
mkdir data-processing-lib
# Copy with -p so docker caching works when copying this into the image
cp -p -R $(REPOROOT)/data-processing-lib/src data-processing-lib
cp -p $(REPOROOT)/data-processing-lib/pyproject.toml data-processing-lib
cp -p $(REPOROOT)/data-processing-lib/README.md data-processing-lib
cp -p -R $(DPK_RAY_LIB_DIR)/src data-processing-lib
cp -p $(DPK_RAY_LIB_DIR)/pyproject.toml data-processing-lib
cp -p $(DPK_RAY_LIB_DIR)/README.md data-processing-lib
$(MAKE) DOCKER_IMAGE=$(DOCKER_IMAGE) .defaults.image
rm -rf data-processing-lib

@@ -188,11 +188,11 @@ __check_defined = \
@echo Installing source from data processing library for venv
source venv/bin/activate; \
pip install pytest; \
pip uninstall -y data-prep-toolkit; \
pip uninstall -y data-prep-toolkit; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
pip install $${extra_url} -e $(REPOROOT)/data-processing-lib/; \
pip install $${extra_url} -e $(DPK_RAY_LIB_DIR); \
if [ $$? -eq 0 ]; then \
echo Installed source from data processing library for `which $(PYTHON)`; \
else \
2 changes: 1 addition & 1 deletion Makefile
@@ -45,7 +45,7 @@ test::

lib-release:
@# Help: Publish data-prep-kit $(DPK_LIB_VERSION) and data-prep-kit-kfp $(DPK_LIB_KFP_VERSION) libraries to pypi
@$(MAKE) -C data-processing-lib build publish
@$(MAKE) -C $(DPK_RAY_LIB_DIR) build publish
@$(MAKE) -C kfp/kfp_support_lib build publish
@echo ""
@echo "This modified files in the repo. Please be sure to commit/push back to the repository."
2 changes: 1 addition & 1 deletion README.md
@@ -65,7 +65,7 @@ Features of the toolkit:
- It aims to accelerate unstructured data prep for the "long tail" of LLM use cases.
- It offers a growing set of module implementations across multiple runtimes, targeting laptop-scale to datacenter-scale processing.
- It provides a growing set of sample pipelines developed for real enterprise use cases.
- It provides the [Data processing library](data-processing-lib) to enable contribution of new custom modules targeting new use cases.
- It provides the [Data processing library](data-processing-lib/ray) to enable contribution of new custom modules targeting new use cases.
- It uses [Kube Flow Pipelines](https://www.kubeflow.org/docs/components/pipelines/v1/introduction/)-based [workflow automation](kfp/doc/simple_transform_pipeline.md) for no-code data prep.

Data modalities supported:
98 changes: 40 additions & 58 deletions data-processing-lib/Makefile
@@ -1,63 +1,45 @@
# Use make help, to see the available rules
REPOROOT=../
include ../.make.defaults
include ../.make.versions

TAG := "v${DPK_LIB_VERSION}"


clean::
@# Help: Clean up the distribution build and the venv
rm -rf dist venv
rm -rf src/*egg-info

.check-env::
@echo "Checks passed"

update-toml:: .check-env
@# Help: Copy the Makefile distribution version into the pyproject.toml
sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
mv tt.toml pyproject.toml
#################################################################################################################
#
# This is the top level makefile, which is intended to be able to process a common set of rules on all
# sub-projects underneath this directory. Currently, the common/standardized set of rules is as follows
# and is supported by .make.defaults
#
# setup:
# clean:
# build:
# test:
#
# When finally getting to a makefile that requires a rule implementation, for example to test the build,
# that makefile should override/implement the rule to meet its needs. Such a rule may continue to recurse
# using "$(MAKE) <rule>-recurse", for example "$(MAKE) test-recurse".
#
# Each rule is called recursively on sub-directories and if a similar inclusion is done in the sub-Makefiles,
# the rules will be applied/executed recursively in their sub-directories.
#
#################################################################################################################

REPOROOT=..

# Get some common rules for the whole repo
include $(REPOROOT)/.make.defaults

########## ########## ########## ########## ########## ########## ########## ##########
# Global rules that are generally to be implemented in the sub-directories and can
# be overridden there (the double colon on the rule makes them overridable).

clean::
@# Help: Recursively $@ in all subdirs
$(MAKE) RULE=$@ .recurse

setup::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

build::
@# Help: Recursively $@ in all subdirs
$(MAKE) RULE=$@ .recurse

build:: update-toml venv
@# Help: Build the distribution for publishing to a pypi
rm -r dist || true
rm -rf src/*egg-info || true
${PIP} install --upgrade build
${PYTHON} -m build

publish:: .check-env update-toml
@# Help: Publish project to pypi
${PYTHON} -m twine check dist/*
${PYTHON} -m twine upload --verbose --non-interactive dist/*
#@echo "create a git tag to reference published version"
#@git tag ${TAG}
#@git push origin ${TAG}

venv:: pyproject.toml
@# Help: Create the virtual environment using pyproject.toml
rm -r dist venv || true
rm -rf src/*egg-info || true
rm makeenv || true
$(PYTHON) -m venv venv
source venv/bin/activate; \
pip install --upgrade pip; \
pip install -e .; \
pip install pytest pytest-cov moto==5.0.5 markupsafe==2.0.1


# Here we run each test directory of tests and each ray launched test separately, because
# it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
test::
@# Help: Use the already-built virtual environment to run pytest on the test directory.
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

4 changes: 2 additions & 2 deletions data-processing-lib/doc/advanced-transform-tutorial.md
@@ -38,7 +38,7 @@ found [here](../../transforms/universal/ededup/src/ededup_transform.py)

First, let's define the transform class. To do this we extend
the base abstract/interface class
[AbstractTableTransform](../src/data_processing/transform/table_transform.py),
[AbstractTableTransform](../ray/src/data_processing/transform/table_transform.py),
which requires definition of the following:

* an initializer (i.e. `init()`) that accepts a dictionary of configuration
@@ -140,7 +140,7 @@ If there is no metadata then simply return an empty dictionary.

First, let's define the transform runtime class. To do this we extend
the base abstract/interface class
[DefaultTableTransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py),
[DefaultTableTransformRuntime](../ray/src/data_processing/runtime/ray/transform_runtime.py),
which requires definition of the following:

* an initializer (i.e. `init()`) that accepts a dictionary of configuration
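
Putting the two excerpts above together, a transform written against the relocated library might look roughly like the following. This is only a sketch: the import path mirrors the new `data-processing-lib/ray` source tree referenced above, and the exact base-class signatures (the initializer, `transform()`, and `flush()`) should be confirmed against the library source.

```python
from typing import Any

import pyarrow as pa
import pyarrow.compute as pc

# Import path follows data-processing-lib/ray/src/... as reorganized in this PR;
# verify the exact module and class names against the library source.
from data_processing.transform.table_transform import AbstractTableTransform


class DropEmptyTextTransform(AbstractTableTransform):
    """Illustrative transform: drop rows whose 'text' column is empty."""

    def __init__(self, config: dict[str, Any]):
        # The initializer receives the dictionary of configuration keys/values
        # described above (typically populated from transform-specific CLI args).
        super().__init__(config)
        self.column = config.get("column", "text")

    def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
        # Convert one input Table into zero or more output Tables plus metadata.
        mask = pc.greater(pc.utf8_length(table[self.column]), 0)
        filtered = table.filter(mask)
        return [filtered], {"rows_in": table.num_rows, "rows_out": filtered.num_rows}

    def flush(self) -> tuple[list[pa.Table], dict[str, Any]]:
        # Nothing is buffered across calls, so there is nothing to emit at the end.
        return [], {}
```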
30 changes: 15 additions & 15 deletions data-processing-lib/doc/architecture.md
@@ -13,27 +13,27 @@ process many input files in parallel using a distributed network of RayWorkers.

The architecture includes the following core components:

* [RayLauncher](../src/data_processing/runtime/ray/transform_launcher.py) accepts and validates
* [RayLauncher](../ray/src/data_processing/runtime/ray/transform_launcher.py) accepts and validates
CLI parameters to establish the Ray Orchestrator with the proper configuration.
It uses the following components, all of which can/do define CLI configuration parameters:
* [Transform Orchestrator Configuration](../src/data_processing/runtime/ray/execution_configuration.py) is responsible
* [Transform Orchestrator Configuration](../ray/src/data_processing/runtime/ray/execution_configuration.py) is responsible
for defining and validating infrastructure parameters
(e.g., number of workers, memory and cpu, local or remote cluster, etc.). This class has very simple state
(several dictionaries) and is fully pickleable. As a result, the framework uses its instance as a
parameter in remote functions/actors invocation.
* [DataAccessFactory](../src/data_processing/data_access/data_access_factory.py) - provides the
* [DataAccessFactory](../ray/src/data_processing/data_access/data_access_factory.py) - provides the
configuration for the type of DataAccess to use when reading/writing the input/output data for
the transforms. Similar to Transform Orchestrator Configuration, this is a pickleable
instance that is passed between Launcher, Orchestrator and Workers.
* [TransformConfiguration](../src/data_processing/runtime/ray/transform_runtime.py) - defines specifics
* [TransformConfiguration](../ray/src/data_processing/runtime/ray/transform_runtime.py) - defines specifics
of the transform implementation including transform implementation class, its short name, any transform-
specific CLI parameters, and an optional TransformRuntime class, discussed below.

After all parameters are validated, the Ray cluster is started and the DataAccessFactory, TransformOrchestratorConfiguration
and TransformConfiguration are given to the Ray Orchestrator, via Ray remote() method invocation.
The Launcher waits for the Ray Orchestrator to complete.

* [Ray Orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py) is responsible for overall management of
* [Ray Orchestrator](../ray/src/data_processing/runtime/ray/transform_orchestrator.py) is responsible for overall management of
the data processing job. It creates the actors, determines the set of input data and distributes the
references to the data files to be processed by the workers. More specifically, it performs the following:

@@ -56,31 +56,31 @@ It uses the following components, all of which can/do define CLI configuration parameters:
and build and save it in the form of execution metadata (`metadata.json`). Finally, it will return the execution
result to the Launcher.

* [Ray worker](../src/data_processing/runtime/ray/transform_table_processor.py) is responsible for
* [Ray worker](../ray/src/data_processing/runtime/ray/transform_table_processor.py) is responsible for
reading files (as [PyArrow Tables](https://levelup.gitconnected.com/deep-dive-into-pyarrow-understanding-its-features-and-benefits-2cce8b1466c8))
assigned by the orchestrator, applying the transform to the input table and writing out the
resulting table(s). Metadata produced by each table transformation is aggregated into
Transform Statistics (below).

* [Transform Statistics](../src/data_processing/runtime/ray/transform_statistics.py) is a general
* [Transform Statistics](../ray/src/data_processing/runtime/ray/transform_statistics.py) is a general
purpose data collector actor aggregating the numeric metadata from different places of
the framework (especially metadata produced by the transform).
These statistics are reported as metadata (`metadata.json`) by the orchestrator upon completion.
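
The per-file cycle each worker performs can be pictured with the simplified, framework-free sketch below (plain `pyarrow`, no Ray). It is only an illustration of the read → transform → write → aggregate-metadata flow described above, not the library's actual worker code.

```python
import os
from collections import Counter

import pyarrow.parquet as pq


def process_assigned_files(file_paths, transform, output_dir):
    """Illustration only: read each file, apply the transform, write results,
    and aggregate numeric metadata the way Transform Statistics does."""
    stats = Counter()
    for path in file_paths:
        table = pq.read_table(path)                       # file -> PyArrow Table
        out_tables, metadata = transform.transform(table)
        for i, out in enumerate(out_tables):
            name = f"{i}_{os.path.basename(path)}"
            pq.write_table(out, os.path.join(output_dir, name))
        stats.update(metadata)                            # numeric metadata accumulates
    tail_tables, tail_metadata = transform.flush()        # emit anything buffered
    for i, out in enumerate(tail_tables):
        pq.write_table(out, os.path.join(output_dir, f"flush_{i}.parquet"))
    stats.update(tail_metadata)
    return dict(stats)                                    # reported like metadata.json
```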

## Core Components
Some of the core components used by the architecture are defined here:

* [CLIProvider](../src/data_processing/utils/cli_utils.py) - provides a general purpose
* [CLIProvider](../ray/src/data_processing/utils/cli_utils.py) - provides a general purpose
mechanism for defining, validating and sharing CLI parameters.
It is used by the DataAccessFactory and Transform Configuration (below).
* Data Access is an abstraction layer for the different types of data access supported by the framework. The main components
of this layer are:
* [Data Access](../src/data_processing/data_access/data_access.py) is the basic interface for the data access, and enables the identification of
* [Data Access](../ray/src/data_processing/data_access/data_access.py) is the basic interface for the data access, and enables the identification of
input files to process, associated output files, checkpointing and general file reading/writing.
Currently, the framework implements several concrete implementations of the Data Access, including
[local data support](../src/data_processing/data_access/data_access_local.py) and
[s3](../src/data_processing/data_access/data_access_s3.py). Additional Data Access implementations can be added as required.
* [Data Access Factory](../src/data_processing/data_access/data_access_factory.py) is an implementation of the
[local data support](../ray/src/data_processing/data_access/data_access_local.py) and
[s3](../ray/src/data_processing/data_access/data_access_s3.py). Additional Data Access implementations can be added as required.
* [Data Access Factory](../ray/src/data_processing/data_access/data_access_factory.py) is an implementation of the
[factory design pattern](https://www.pentalog.com/blog/design-patterns/factory-method-design-pattern/) for creation
of the data access instances. Data Access Factory, as a CLIProvider, enables the definition of CLI
parameters that configure the instance of Data Access to be created. Data Access factory has very simple state
@@ -92,14 +92,14 @@ Some of the core components used by the architecture are defined here:
A brief discussion of the Transform components are provided here.
For a more complete discussion, see the [tutorials](transform-tutorials.md).

* [Transform](../src/data_processing/transform/table_transform.py) - defines the methods required
* [Transform](../ray/src/data_processing/transform/table_transform.py) - defines the methods required
of any transform implementation - `transform()` and `flush()` - which make up the bulk of any transform implementation
and convert one Table to 0 or more new Tables. In general, this is not tied to the above Ray infrastructure
and so can usually be used independently of Ray.
* [TransformRuntime ](../src/data_processing/runtime/ray/transform_runtime.py) - this class only needs to be
* [TransformRuntime ](../ray/src/data_processing/runtime/ray/transform_runtime.py) - this class only needs to be
extended/implemented when additional Ray components (actors, shared memory objects, etc.) are used
by the transform. The main method `get_transform_config()` is used to enable these extensions.
* [TransformConfiguration](../src/data_processing/runtime/ray/transform_runtime.py) - this is the bootstrap
* [TransformConfiguration](../ray/src/data_processing/runtime/ray/transform_runtime.py) - this is the bootstrap
class provided to the Launcher that enables the instantiation of the Transform and the TransformRuntime within
the architecture. It is a CLIProvider, which allows it to define transform-specific CLI configuration
that is made available to the Transform's initializer.
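
As an illustration of how these pieces fit together, the sketch below shows the rough shape of a transform configuration. The base-class name and import path follow the file referenced above, but the constructor arguments and the `add_input_params`/`apply_input_params` hooks are assumptions inferred from the CLIProvider description, not verified signatures.

```python
from argparse import ArgumentParser, Namespace

# Assumed import path and class name, following the file referenced above
# (data-processing-lib/ray/src/data_processing/runtime/ray/transform_runtime.py);
# the constructor arguments and the add/apply hooks below are also assumptions.
from data_processing.runtime.ray.transform_runtime import TransformConfiguration

from drop_empty_text_transform import DropEmptyTextTransform  # hypothetical module for the transform sketched earlier


class DropEmptyTextConfiguration(TransformConfiguration):
    """Bootstrap class handed to the launcher: names the transform, points at its
    implementation class, and contributes transform-specific CLI parameters."""

    def __init__(self):
        super().__init__(name="drop_empty_text", transform_class=DropEmptyTextTransform)
        self.params = {}

    def add_input_params(self, parser: ArgumentParser) -> None:
        # A hypothetical transform-specific flag, surfaced next to the framework's own flags.
        parser.add_argument("--drop_empty_text_column", type=str, default="text",
                            help="Name of the column checked for empty values")

    def apply_input_params(self, args: Namespace) -> bool:
        # Validated CLI values become the configuration dictionary passed to the
        # transform's initializer.
        self.params["column"] = args.drop_empty_text_column
        return True
```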
4 changes: 2 additions & 2 deletions data-processing-lib/doc/overview.md
@@ -10,9 +10,9 @@ This might include operations such as de-duplication, merging, and splitting.
The framework uses a plugin-model for the primary functions. The key ones for
developers of data transformation are:

* [Transformation](../src/data_processing/transform/table_transform.py) - a simple, easily-implemented interface that defines
* [Transformation](../ray/src/data_processing/transform/table_transform.py) - a simple, easily-implemented interface that defines
the specifics of a given data transformation.
* [Transform Configuration](../src/data_processing/runtime/ray/transform_runtime.py) - defines
* [Transform Configuration](../ray/src/data_processing/runtime/ray/transform_runtime.py) - defines
the transform short name, its implementation class, and command line configuration
parameters.
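
A transform implementation can be exercised directly on an in-memory table for a quick, framework-free check before wiring it into the Ray runtime; the sketch below uses the hypothetical transform from the earlier example.

```python
import pyarrow as pa

# Hypothetical transform module/class from the earlier sketch in this document.
from drop_empty_text_transform import DropEmptyTextTransform

table = pa.table({"text": ["hello", "", "world"], "id": [1, 2, 3]})
transform = DropEmptyTextTransform({"column": "text"})

out_tables, metadata = transform.transform(table)
out_tables += transform.flush()[0]

print(metadata)                  # e.g. {'rows_in': 3, 'rows_out': 2}
print(out_tables[0].num_rows)    # 2 rows survive the empty-text filter
```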

