diff --git a/.make.defaults b/.make.defaults index f9f58500f..e1bd5275a 100644 --- a/.make.defaults +++ b/.make.defaults @@ -235,6 +235,10 @@ __check_defined = \ cp -p -R ${LIB_PATH}/src ${LIB_NAME} cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME} cp -p -R ${LIB_PATH}/README.md ${LIB_NAME} + if [ -e ${LIB_PATH}/requirements.txt ]; then \ + cp -p ${LIB_PATH}/requirements.txt ${LIB_NAME}; \ + fi + # Build and image using the local Dockerfile and make the data-processing-lib/python # available in the current directory for use by the Dockerfile (i.e. to install the library). @@ -591,8 +595,9 @@ MINIO_ADMIN_PWD= localminiosecretkey # Updates the versions references to our repo source as defined in .make.versions .PHONY: .defaults.__update-toml-lib-dep-versions .defaults.__update-toml-lib-dep-versions: +ifeq ($(USE_REPO_LIB_SRC), 1) @# Help: Update pyproject.toml to depend on lib versions defined in .make.versions - @if [ -e pyproject.toml ]; then \ + if [ -e pyproject.toml ]; then \ cat pyproject.toml | sed \ -e 's/"data-prep-toolkit-ray\([=><~][=]\).*"/"data-prep-toolkit-ray\1$(DPK_LIB_VERSION)"/' \ -e 's/"data-prep-toolkit-spark\([=><~][=]\).*"/"data-prep-toolkit-spark\1$(DPK_LIB_VERSION)"/' \ @@ -603,7 +608,7 @@ MINIO_ADMIN_PWD= localminiosecretkey > tt.toml; \ mv tt.toml pyproject.toml; \ fi - @if [ -e requirements.txt ]; then \ + if [ -e requirements.txt ]; then \ cat requirements.txt | sed \ -e 's/data-prep-toolkit-ray\([=><~][=]\).*/data-prep-toolkit-ray\1$(DPK_LIB_VERSION)/' \ -e 's/data-prep-toolkit-transforms\([=><~][=]\).*/data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)/' \ @@ -615,6 +620,7 @@ MINIO_ADMIN_PWD= localminiosecretkey > tt.txt; \ mv tt.txt requirements.txt; \ fi +endif # Build the distribution, usually in preparation for publishing using ith the .defaults.publish-dist target .PHONY: .defaults.build-dist diff --git a/.make.versions b/.make.versions index dd599aa04..4346291cc 100644 --- a/.make.versions +++ b/.make.versions @@ -25,9 +25,9 @@ 
DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_ # publish docker images with latest tag ifeq ($(DPK_VERSION_SUFFIX), ) - DOCKER_IMAGE_VERSION=$(DPK_VERSION) + DOCKER_IMAGE_VERSION?=$(DPK_VERSION) else - DOCKER_IMAGE_VERSION=latest + DOCKER_IMAGE_VERSION?=latest endif # Data prep lab wheel version @@ -39,82 +39,6 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -# Begin transform versions/tags -BLOCKLIST_VERSION=$(DPK_VERSION) - -DOC_ID_PYTHON_VERSION=$(DPK_VERSION) -DOC_ID_RAY_VERSION=$(DPK_VERSION) -DOC_ID_SPARK_VERSION=$(DPK_VERSION) - -EDEDUP_PYTHON_VERSION=$(DPK_VERSION) -EDEDUP_RAY_VERSION=$(DPK_VERSION) - -FDEDUP_RAY_VERSION=$(DPK_VERSION) - -FILTER_PYTHON_VERSION=$(DPK_VERSION) -FILTER_RAY_VERSION=$(DPK_VERSION) -FILTER_SPARK_VERSION=$(DPK_VERSION) - -NOOP_PYTHON_VERSION=$(DPK_VERSION) -NOOP_RAY_VERSION=$(DPK_VERSION) -NOOP_SPARK_VERSION=$(DPK_VERSION) - -PROFILER_PYTHON_VERSION=$(DPK_VERSION) -PROFILER_RAY_VERSION=$(DPK_VERSION) -PROFILER_SPARK_VERSION=$(DPK_VERSION) - -RESIZE_PYTHON_VERSION=$(DPK_VERSION) -RESIZE_RAY_VERSION=$(DPK_VERSION) -RESIZE_SPARK_VERSION=$(DPK_VERSION) - -LANG_ID_PYTHON_VERSION=$(DPK_VERSION) -LANG_ID_RAY_VERSION=$(DPK_VERSION) - -TOKENIZATION_RAY_VERSION=$(DPK_VERSION) -TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION) - -MALWARE_RAY_VERSION=$(DPK_VERSION) -MALWARE_PYTHON_VERSION=$(DPK_VERSION) - -PROGLANG_SELECT_PYTHON_VERSION=$(DPK_VERSION) -PROGLANG_SELECT_RAY_VERSION=$(DPK_VERSION) - -DOC_QUALITY_PYTHON_VERSION=$(DPK_VERSION) -DOC_QUALITY_RAY_VERSION=$(DPK_VERSION) - -CODE_QUALITY_RAY_VERSION=$(DPK_VERSION) -CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION) - -CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -CODE2PARQUET_RAY_VERSION=$(DPK_VERSION) -INGEST_TO_PARQUET_VERSION=$(DPK_VERSION) -REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION) - -PDF2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -PDF2PARQUET_RAY_VERSION=$(DPK_VERSION) - 
-DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION) -DOC_CHUNK_RAY_VERSION=$(DPK_VERSION) - -TEXT_ENCODER_PYTHON_VERSION=$(DPK_VERSION) -TEXT_ENCODER_RAY_VERSION=$(DPK_VERSION) - -HEADER_CLEANSER_PYTHON_VERSION=$(DPK_VERSION) -HEADER_CLEANSER_RAY_VERSION=$(DPK_VERSION) - -LICENSE_SELECT_PYTHON_VERSION=$(DPK_VERSION) -LICENSE_SELECT_RAY_VERSION=$(DPK_VERSION) - -PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) - -HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION) - -DPK_TRANSFORMS_VERSION=$(DPK_VERSION) - -SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION=$(DPK_VERSION) -SYNTACTIC_CONCEPT_EXTRACTOR_RAY_VERSION=$(DPK_VERSION) - - ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. diff --git a/README.md b/README.md index aeec4ef70..b4d372356 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ The goal is to offer high-level APIs for developers to quickly get started in wo - [Scaling transforms from laptop to cluster](#laptop_cluster) - [Repository Use and Navigation](doc/repo.md) - [How to Contribute](CONTRIBUTING.md) -- [Papers and Talks](#talks_papers) +- [Talks and Papers](#talks_papers) +- [Citations](#citations) ## 📖 About @@ -131,7 +132,7 @@ The matrix below shows the the combination of modules and supported runtimes. 
Al | **Data Ingestion** | | | | | | [Code (from zip) to Parquet](transforms/code/code2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [PDF to Parquet](transforms/language/pdf2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | -| [HTML to Parquet](transforms/universal/html2parquet/python/README.md) | :white_check_mark: | | | | +| [HTML to Parquet](transforms/language/html2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | | | **Universal (Code & Language)** | | | | | | [Exact dedup filter](transforms/universal/ededup/ray/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [Fuzzy dedup filter](transforms/universal/fdedup/ray/README.md) | | :white_check_mark: | | :white_check_mark: | @@ -220,3 +221,23 @@ You can run transforms via docker image or using virtual environments. This [doc 5. Talk on "Hands on session for fine tuning LLMs" [Video](https://www.youtube.com/watch?v=VEHIA3E64DM) 6. 
Talk on "Build your own data preparation module using data-prep-kit" [Video](https://www.youtube.com/watch?v=0WUMG6HIgMg) +## Citations + +If you use Data Prep Kit in your research, please cite our paper: + +```bash +@misc{wood2024dataprepkitgettingdataready, + title={Data-Prep-Kit: getting your data ready for LLM application development}, + author={David Wood and Boris Lublinsky and Alexy Roytman and Shivdeep Singh + and Abdulhamid Adebayo and Revital Eres and Mohammad Nassar and Hima Patel + and Yousaf Shah and Constantin Adam and Petros Zerfos and Nirmit Desai + and Daiki Tsuzuku and Takuya Goto and Michele Dolfi and Saptha Surendran + and Paramesvaran Selvam and Sungeun An and Yuan Chi Chang and Dhiraj Joshi + and Hajar Emami-Gohari and Xuan-Hong Dang and Yan Koyfman and Shahrokh Daijavad}, + year={2024}, + eprint={2409.18164}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2409.18164}, +} +``` \ No newline at end of file diff --git a/transforms/code/Makefile b/transforms/code/Makefile index b5d5c7bbe..17afe2785 100644 --- a/transforms/code/Makefile +++ b/transforms/code/Makefile @@ -27,10 +27,26 @@ image:: @# Help: Recursively make $@ in all subdirs @$(MAKE) RULE=$@ .recurse +test-image:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + publish:: @# Help: Recursively make $@ in all subdirs @$(MAKE) RULE=$@ .recurse +kind-load-image:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +docker-load-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +docker-save-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + set-versions: @# Help: Recursively $@ in all subdirs @$(MAKE) RULE=$@ .recurse diff --git a/transforms/code/code2parquet/kfp_ray/Makefile b/transforms/code/code2parquet/kfp_ray/Makefile index 6b9e640d1..847a743b8 100644 --- a/transforms/code/code2parquet/kfp_ray/Makefile +++ 
b/transforms/code/code2parquet/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/code2parquet/python/Makefile b/transforms/code/code2parquet/python/Makefile index d0403e601..e27e402c7 100644 --- a/transforms/code/code2parquet/python/Makefile +++ b/transforms/code/code2parquet/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=code2parquet - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/code/code2parquet/ray/Makefile b/transforms/code/code2parquet/ray/Makefile index bc1580987..42383457f 100644 --- a/transforms/code/code2parquet/ray/Makefile +++ b/transforms/code/code2parquet/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. 
+ +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=code2parquet +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/code/code2parquet/transform.config b/transforms/code/code2parquet/transform.config new file mode 100644 index 000000000..2049a2261 --- /dev/null +++ b/transforms/code/code2parquet/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=code2parquet + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +CODE2PARQUET_RAY_VERSION=$(CODE2PARQUET_PYTHON_VERSION) +CODE2PARQUET_SPARK_VERSION=$(CODE2PARQUET_PYTHON_VERSION) + diff --git a/transforms/code/code_quality/kfp_ray/Makefile b/transforms/code/code_quality/kfp_ray/Makefile index a22efcf8e..1cab0d878 100644 --- a/transforms/code/code_quality/kfp_ray/Makefile +++ b/transforms/code/code_quality/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/code/code_quality/python/Makefile b/transforms/code/code_quality/python/Makefile index 1b50d41b8..cd9811f79 100644 --- a/transforms/code/code_quality/python/Makefile +++ b/transforms/code/code_quality/python/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=code_quality +# Include the common configuration for this transform +include ../transform.config # Use default rule inherited from makefile.common clean:: .transforms.clean diff --git a/transforms/code/code_quality/ray/Makefile b/transforms/code/code_quality/ray/Makefile index 720cf9c00..5a744e861 100644 --- a/transforms/code/code_quality/ray/Makefile +++ b/transforms/code/code_quality/ray/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=code_quality +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} diff --git a/transforms/code/code_quality/transform.config b/transforms/code/code_quality/transform.config new file mode 100644 index 000000000..4ebec625a --- /dev/null +++ b/transforms/code/code_quality/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=code_quality + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION) +CODE_QUALITY_RAY_VERSION=$(CODE_QUALITY_PYTHON_VERSION) +CODE_QUALITY_SPARK_VERSION=$(CODE_QUALITY_PYTHON_VERSION) + diff --git a/transforms/code/header_cleanser/kfp_ray/Makefile b/transforms/code/header_cleanser/kfp_ray/Makefile.disable-cicd similarity index 91% rename from transforms/code/header_cleanser/kfp_ray/Makefile rename to transforms/code/header_cleanser/kfp_ray/Makefile.disable-cicd index 05a343384..411cc97f1 100644 --- a/transforms/code/header_cleanser/kfp_ray/Makefile +++ b/transforms/code/header_cleanser/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/code/header_cleanser/python/Makefile b/transforms/code/header_cleanser/python/Makefile index 1e3fa68fd..0a91a14d6 100644 --- a/transforms/code/header_cleanser/python/Makefile +++ b/transforms/code/header_cleanser/python/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able 
# know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=header_cleanser +# Include the common configuration for this transform +include ../transform.config # Use default rule inherited from makefile.common clean:: .transforms.clean diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 6521c8662..16f8cf69c 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -17,9 +17,11 @@ COPY --chown=ray:users pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # Install system dependencies, including libgomp1 +USER root RUN sudo apt-get update && sudo apt-get install -y \ libgomp1 \ && sudo rm -rf /var/lib/apt/lists/* +User ray # copy source data COPY ./src/header_cleanser_transform_ray.py . 
@@ -36,4 +38,4 @@ ENV PYTHONPATH /home/ray ARG BUILD_DATE ARG GIT_COMMIT LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT \ No newline at end of file +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/code/header_cleanser/ray/Makefile b/transforms/code/header_cleanser/ray/Makefile index d223bc1cb..9d83c71d0 100644 --- a/transforms/code/header_cleanser/ray/Makefile +++ b/transforms/code/header_cleanser/ray/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=header_cleanser +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} diff --git a/transforms/code/header_cleanser/transform.config b/transforms/code/header_cleanser/transform.config new file mode 100644 index 000000000..e1da13d0c --- /dev/null +++ b/transforms/code/header_cleanser/transform.config @@ -0,0 +1,18 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=header_cleanser + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +HEADER_CLEANSER_PYTHON_VERSION=$(DPK_VERSION) +HEADER_CLEANSER_RAY_VERSION=$(HEADER_CLEANSER_PYTHON_VERSION) diff --git a/transforms/code/license_select/kfp_ray/Makefile.disable-cicd b/transforms/code/license_select/kfp_ray/Makefile.disable-cicd index 9f21f3d58..28e244faa 100644 --- a/transforms/code/license_select/kfp_ray/Makefile.disable-cicd +++ b/transforms/code/license_select/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/license_select/python/Makefile b/transforms/code/license_select/python/Makefile index 7077c801f..2f3825fda 100644 --- a/transforms/code/license_select/python/Makefile +++ b/transforms/code/license_select/python/Makefile @@ -1,10 +1,22 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. 
data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=license_select -# $(REPOROOT)/.make.versions file contains the versions +# Include the common configuration for this transform +include ../transform.config + DOCKER_IMAGE_VERSION=${LICENSE_SELECT_PYTHON_VERSION} # Use default rule inherited from makefile.common diff --git a/transforms/code/license_select/ray/Makefile b/transforms/code/license_select/ray/Makefile index 25fe6ab8a..d69cf00ca 100644 --- a/transforms/code/license_select/ray/Makefile +++ b/transforms/code/license_select/ray/Makefile @@ -1,15 +1,24 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + BASE_IMAGE=$(RAY_BASE_IMAGE) -TRANSFORM_NAME=license_select -# $(REPOROOT)/.make.versions file contains the versions + DOCKER_IMAGE_VERSION=${LICENSE_SELECT_RAY_VERSION} # Use default rule inherited from makefile.common diff --git a/transforms/code/license_select/transform.config b/transforms/code/license_select/transform.config new file mode 100644 index 000000000..bba10d3e5 --- /dev/null +++ b/transforms/code/license_select/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=license_select + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+LICENSE_SELECT_PYTHON_VERSION=$(DPK_VERSION) +LICENSE_SELECT_RAY_VERSION=$(LICENSE_SELECT_PYTHON_VERSION) +LICENSE_SELECT_SPARK_VERSION=$(LICENSE_SELECT_PYTHON_VERSION) + diff --git a/transforms/code/malware/kfp_ray/Makefile b/transforms/code/malware/kfp_ray/Makefile index 7b423d8bd..0446e2d29 100644 --- a/transforms/code/malware/kfp_ray/Makefile +++ b/transforms/code/malware/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/code/malware/python/Makefile b/transforms/code/malware/python/Makefile index 99174e9a1..bd523b629 100644 --- a/transforms/code/malware/python/Makefile +++ b/transforms/code/malware/python/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=malware +# Include the common configuration for this transform +include ../transform.config OS := $(shell uname -s) ifeq ($(OS),Darwin) diff --git a/transforms/code/malware/ray/Makefile b/transforms/code/malware/ray/Makefile index 99515c036..a92cbd529 100644 --- a/transforms/code/malware/ray/Makefile +++ b/transforms/code/malware/ray/Makefile @@ -1,12 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=malware +# Include the common configuration for this transform +include ../transform.config + BASE_IMAGE=${RAY_BASE_IMAGE} OS := $(shell uname -s) ifeq ($(OS),Darwin) diff --git a/transforms/code/malware/transform.config b/transforms/code/malware/transform.config new file mode 100644 index 000000000..be0b6651d --- /dev/null +++ b/transforms/code/malware/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=malware + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +MALWARE_PYTHON_VERSION=$(DPK_VERSION) +MALWARE_RAY_VERSION=$(MALWARE_PYTHON_VERSION) +MALWARE_SPARK_VERSION=$(MALWARE_PYTHON_VERSION) + diff --git a/transforms/code/proglang_select/kfp_ray/Makefile b/transforms/code/proglang_select/kfp_ray/Makefile index abbf75c8c..b8a21bca8 100644 --- a/transforms/code/proglang_select/kfp_ray/Makefile +++ b/transforms/code/proglang_select/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/proglang_select/python/Makefile b/transforms/code/proglang_select/python/Makefile index 2cec4f6db..7d64e0a90 100644 --- a/transforms/code/proglang_select/python/Makefile +++ b/transforms/code/proglang_select/python/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. 
However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=proglang_select +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/code/proglang_select/ray/Makefile b/transforms/code/proglang_select/ray/Makefile index 82db54db7..20315a234 100644 --- a/transforms/code/proglang_select/ray/Makefile +++ b/transforms/code/proglang_select/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=proglang_select +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/code/proglang_select/transform.config b/transforms/code/proglang_select/transform.config new file mode 100644 index 000000000..c32cb9775 --- /dev/null +++ b/transforms/code/proglang_select/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=proglang_select + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +PROGLANG_SELECT_PYTHON_VERSION=$(DPK_VERSION) +PROGLANG_SELECT_RAY_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) +PROGLANG_SELECT_SPARK_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) + diff --git a/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd b/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd index ef3765e31..5b2425357 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd +++ b/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/repo_level_ordering/ray/Makefile b/transforms/code/repo_level_ordering/ray/Makefile index 83f8692de..8d2f784fb 100644 --- a/transforms/code/repo_level_ordering/ray/Makefile +++ b/transforms/code/repo_level_ordering/ray/Makefile @@ -1,15 +1,23 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. 
data-prep-toolkit==0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -BASE_IMAGE=$(RAY_BASE_IMAGE) +# Include the common configuration for this transform +include ../transform.config -TRANSFORM_NAME=repo_level_order +BASE_IMAGE=$(RAY_BASE_IMAGE) venv:: .transforms.ray-venv diff --git a/transforms/code/repo_level_ordering/transform.config b/transforms/code/repo_level_ordering/transform.config new file mode 100644 index 000000000..0d82c6377 --- /dev/null +++ b/transforms/code/repo_level_ordering/transform.config @@ -0,0 +1,19 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=repo_level_order + +################################################################################ +# This defines the transform's version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION) + + diff --git a/transforms/code/syntactic_concept_extractor/python/Makefile b/transforms/code/syntactic_concept_extractor/python/Makefile index c0cc96637..87d5b46bb 100644 --- a/transforms/code/syntactic_concept_extractor/python/Makefile +++ b/transforms/code/syntactic_concept_extractor/python/Makefile @@ -3,14 +3,12 @@ REPOROOT=../../../..
# Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=syntactic_concept_extractor - +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + # values possible mach-arm64, x86_64 export RUNTIME_HOST_ARCH=x86_64 diff --git a/transforms/code/syntactic_concept_extractor/python/pyproject.toml b/transforms/code/syntactic_concept_extractor/python/pyproject.toml index 91402f09b..8d9c40c72 100644 --- a/transforms/code/syntactic_concept_extractor/python/pyproject.toml +++ b/transforms/code/syntactic_concept_extractor/python/pyproject.toml @@ -8,106 +8,10 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] +dynamic = ["dependencies"] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "parameterized", - "pandas", - "aiolimiter==1.1.0", - "altair==5.3.0", - "annotated-types==0.7.0", - "anyio==4.4.0", - "appnope==0.1.4", - "asttokens==2.4.1", - "attrs==23.2.0", - "blinker==1.8.2", - "cachetools==5.3.3", - "certifi==2024.6.2", - "charset-normalizer==3.3.2", - "click==8.1.7", - "comm==0.2.2", - "contourpy==1.2.1", - "cycler==0.12.1", - "debugpy==1.8.1", - "decorator==5.1.1", - "Deprecated==1.2.14", - "executing==2.0.1", - "fonttools==4.53.0", - "gitdb==4.0.11", - "GitPython==3.1.43", - "h11==0.14.0", - "htbuilder==0.6.2", - "httpcore==1.0.5", - "httpx==0.27.0", - "httpx-sse==0.4.0", - "ibm-generative-ai==3.0.0", - "idna==3.7", - "ipykernel==6.29.4", - "ipython==8.25.0", - "jedi==0.19.1", - "Jinja2==3.1.4", - "jsonschema==4.22.0", - "jsonschema-specifications==2023.12.1", - "jupyter_client==8.6.2", - "jupyter_core==5.7.2", - "kiwisolver==1.4.5", - "markdown-it-py==3.0.0", - "MarkupSafe==2.1.5", - "matplotlib==3.9.0", - 
"matplotlib-inline==0.1.7", - "mdurl==0.1.2", - "more-itertools==10.3.0", - "nest-asyncio==1.6.0", - "networkx==3.3", - "numpy==1.26.4", - "packaging==24.0", - "pandas==2.2.2", - "parso==0.8.4", - "pexpect==4.9.0", - "pillow==10.3.0", - "platformdirs==4.2.2", - "prompt_toolkit==3.0.45", - "protobuf==5.27.2", - "psutil==5.9.8", - "ptyprocess==0.7.0", - "pure-eval==0.2.2", - "pyarrow==16.1.0", - "pydantic==2.7.4", - "pydantic_core==2.18.4", - "pydeck==0.9.1", - "Pygments==2.18.0", - "pyparsing==3.1.2", - "python-dateutil==2.9.0.post0", - "pytz==2024.1", - "pyzmq==26.0.3", - "referencing==0.35.1", - "regex==2024.5.15", - "requests==2.32.3", - "rich==13.7.1", - "rpds-py==0.18.1", - "seaborn==0.13.2", - "six==1.16.0", - "smmap==5.0.1", - "sniffio==1.3.1", - "st-annotated-text==4.0.1", - "stack-data==0.6.3", - "streamlit==1.36.0", - "tenacity==8.4.2", - "toml==0.10.2", - "toolz==0.12.1", - "tornado==6.4", - "traitlets==5.14.3", - "tree-sitter==0.21.3", - "tree-sitter-cpp==0.22.1", - "tree-sitter-java==0.21.0", - "tree-sitter-languages==1.10.2", - "tree-sitter-php==0.22.5", - "typing_extensions==4.12.2", - "tzdata==2024.1", - "urllib3==2.2.2", - "uuid", - "wcwidth==0.2.13", - "wrapt==1.16.0", - ] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] diff --git a/transforms/code/syntactic_concept_extractor/python/requirements.txt b/transforms/code/syntactic_concept_extractor/python/requirements.txt new file mode 100644 index 000000000..aa44489f2 --- /dev/null +++ b/transforms/code/syntactic_concept_extractor/python/requirements.txt @@ -0,0 +1,98 @@ + data-prep-toolkit==0.2.2.dev0 + parameterized + pandas + aiolimiter==1.1.0 + altair==5.3.0 + annotated-types==0.7.0 + anyio==4.4.0 + appnope==0.1.4 + asttokens==2.4.1 + attrs==23.2.0 + blinker==1.8.2 + cachetools==5.3.3 + certifi==2024.6.2 + charset-normalizer==3.3.2 + click==8.1.7 + comm==0.2.2 + contourpy==1.2.1 + 
cycler==0.12.1 + debugpy==1.8.1 + decorator==5.1.1 + Deprecated==1.2.14 + executing==2.0.1 + fonttools==4.53.0 + gitdb==4.0.11 + GitPython==3.1.43 + h11==0.14.0 + htbuilder==0.6.2 + httpcore==1.0.5 + httpx==0.27.0 + httpx-sse==0.4.0 + ibm-generative-ai==3.0.0 + idna==3.7 + ipykernel==6.29.4 + ipython==8.25.0 + jedi==0.19.1 + Jinja2==3.1.4 + jsonschema==4.22.0 + jsonschema-specifications==2023.12.1 + jupyter_client==8.6.2 + jupyter_core==5.7.2 + kiwisolver==1.4.5 + markdown-it-py==3.0.0 + MarkupSafe==2.1.5 + matplotlib==3.9.0 + matplotlib-inline==0.1.7 + mdurl==0.1.2 + more-itertools==10.3.0 + nest-asyncio==1.6.0 + networkx==3.3 + numpy==1.26.4 + packaging==24.0 + pandas==2.2.2 + parso==0.8.4 + pexpect==4.9.0 + pillow==10.3.0 + platformdirs==4.2.2 + prompt_toolkit==3.0.45 + protobuf==5.27.2 + psutil==5.9.8 + ptyprocess==0.7.0 + pure-eval==0.2.2 + pyarrow==16.1.0 + pydantic==2.7.4 + pydantic_core==2.18.4 + pydeck==0.9.1 + Pygments==2.18.0 + pyparsing==3.1.2 + python-dateutil==2.9.0.post0 + pytz==2024.1 + pyzmq==26.0.3 + referencing==0.35.1 + regex==2024.5.15 + requests==2.32.3 + rich==13.7.1 + rpds-py==0.18.1 + seaborn==0.13.2 + six==1.16.0 + smmap==5.0.1 + sniffio==1.3.1 + st-annotated-text==4.0.1 + stack-data==0.6.3 + streamlit==1.36.0 + tenacity==8.4.2 + toml==0.10.2 + toolz==0.12.1 + tornado==6.4 + traitlets==5.14.3 + tree-sitter==0.21.3 + tree-sitter-cpp==0.22.1 + tree-sitter-java==0.21.0 + tree-sitter-languages==1.10.2 + tree-sitter-php==0.22.5 + typing_extensions==4.12.2 + tzdata==2024.1 + urllib3==2.2.2 + uuid + wcwidth==0.2.13 + wrapt==1.16.0 \ No newline at end of file diff --git a/transforms/code/syntactic_concept_extractor/ray/Makefile b/transforms/code/syntactic_concept_extractor/ray/Makefile index ecc3a34dc..bca844f7a 100644 --- a/transforms/code/syntactic_concept_extractor/ray/Makefile +++ b/transforms/code/syntactic_concept_extractor/ray/Makefile @@ -7,7 +7,8 @@ REPOROOT=../../../.. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=syntactic_concept_extractor +# Include the common configuration for this transform +include ../transform.config # values possible mach-arm64, x86_64 export RUNTIME_HOST_ARCH=x86_64 @@ -15,8 +16,6 @@ export RUNTIME_HOST_ARCH=x86_64 BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv -venv:: .transforms.python-venv - test:: .transforms.ray-test clean:: .transforms.clean diff --git a/transforms/code/syntactic_concept_extractor/ray/pyproject.toml b/transforms/code/syntactic_concept_extractor/ray/pyproject.toml index e0bf538ee..537b28b21 100644 --- a/transforms/code/syntactic_concept_extractor/ray/pyproject.toml +++ b/transforms/code/syntactic_concept_extractor/ray/pyproject.toml @@ -12,7 +12,6 @@ dependencies = [ "dpk-syntactic-concept-extractor-transform-python==0.2.2.dev0", "data-prep-toolkit-ray==0.2.2.dev0", "data-prep-toolkit==0.2.2.dev0", - "protobuf==5.27.2" ] [build-system] diff --git a/transforms/code/syntactic_concept_extractor/transform.config b/transforms/code/syntactic_concept_extractor/transform.config new file mode 100644 index 000000000..4e678e6eb --- /dev/null +++ b/transforms/code/syntactic_concept_extractor/transform.config @@ -0,0 +1,19 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=syntactic_concept_extractor + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. 
+# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION=$(DPK_VERSION) +SYNTACTIC_CONCEPT_EXTRACTOR_RAY_VERSION=$(SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION) + diff --git a/transforms/language/doc_chunk/kfp_ray/Makefile b/transforms/language/doc_chunk/kfp_ray/Makefile index 189b36ea5..30e912e33 100644 --- a/transforms/language/doc_chunk/kfp_ray/Makefile +++ b/transforms/language/doc_chunk/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/doc_chunk/python/Makefile b/transforms/language/doc_chunk/python/Makefile index a6fbe35dc..2f2a7e789 100644 --- a/transforms/language/doc_chunk/python/Makefile +++ b/transforms/language/doc_chunk/python/Makefile @@ -1,14 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_chunk +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/doc_chunk/ray/Makefile b/transforms/language/doc_chunk/ray/Makefile index 6b9b4ae6a..b4f394f84 100644 --- a/transforms/language/doc_chunk/ray/Makefile +++ b/transforms/language/doc_chunk/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_chunk +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/doc_chunk/transform.config b/transforms/language/doc_chunk/transform.config new file mode 100644 index 000000000..f433f360b --- /dev/null +++ b/transforms/language/doc_chunk/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=doc_chunk + +################################################################################ +# This defines the transform's version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION) +DOC_CHUNK_RAY_VERSION=$(DOC_CHUNK_PYTHON_VERSION) +DOC_CHUNK_SPARK_VERSION=$(DOC_CHUNK_PYTHON_VERSION) + diff --git a/transforms/language/doc_quality/kfp_ray/Makefile b/transforms/language/doc_quality/kfp_ray/Makefile index 004f17616..9f5e93615 100644 --- a/transforms/language/doc_quality/kfp_ray/Makefile +++ b/transforms/language/doc_quality/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/doc_quality/python/Makefile b/transforms/language/doc_quality/python/Makefile index 684ce47ae..f0f309400 100644 --- a/transforms/language/doc_quality/python/Makefile +++ b/transforms/language/doc_quality/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g.
data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=doc_quality - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.test-src test-image diff --git a/transforms/language/doc_quality/ray/Makefile b/transforms/language/doc_quality/ray/Makefile index d462543a1..dd278af88 100644 --- a/transforms/language/doc_quality/ray/Makefile +++ b/transforms/language/doc_quality/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_quality -# $(REPOROOT)/.make.versions file contains the versions +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/language/doc_quality/transform.config b/transforms/language/doc_quality/transform.config new file mode 100644 index 000000000..2ece0e071 --- /dev/null +++ b/transforms/language/doc_quality/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=doc_quality + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +DOC_QUALITY_PYTHON_VERSION=$(DPK_VERSION) +DOC_QUALITY_RAY_VERSION=$(DOC_QUALITY_PYTHON_VERSION) +DOC_QUALITY_SPARK_VERSION=$(DOC_QUALITY_PYTHON_VERSION) + diff --git a/transforms/language/html2parquet/python/Makefile b/transforms/language/html2parquet/python/Makefile index 0e552d5be..284bb8e8a 100644 --- a/transforms/language/html2parquet/python/Makefile +++ b/transforms/language/html2parquet/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. 
+ +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME= html2parquet - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/language/html2parquet/ray/Makefile b/transforms/language/html2parquet/ray/Makefile index 30c908259..1667be8b9 100644 --- a/transforms/language/html2parquet/ray/Makefile +++ b/transforms/language/html2parquet/ray/Makefile @@ -1,15 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
+include $(REPOROOT)/transforms/.make.transforms -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME= html2parquet +# Include the common configuration for this transform +include ../transform.config -include $(REPOROOT)/transforms/.make.transforms BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv @@ -59,3 +66,6 @@ kind-load-image:: .transforms.kind-load-image docker-load-image: .defaults.docker-load-image docker-save-image: .defaults.docker-save-image + + + diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index dc2111e9e..dc796d602 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,2 @@ dpk-html2parquet-transform-python==0.2.2.dev0 data-prep-toolkit-ray==0.2.2.dev0 -trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/html2parquet/transform.config b/transforms/language/html2parquet/transform.config new file mode 100644 index 000000000..10847c6af --- /dev/null +++ b/transforms/language/html2parquet/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=html2parquet + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +HTML2PARQUET_RAY_VERSION=$(HTML2PARQUET_PYTHON_VERSION) +HTML2PARQUET_SPARK_VERSION=$(HTML2PARQUET_PYTHON_VERSION) + diff --git a/transforms/language/lang_id/kfp_ray/Makefile b/transforms/language/lang_id/kfp_ray/Makefile index b8f11ffc8..fd2c42d8e 100644 --- a/transforms/language/lang_id/kfp_ray/Makefile +++ b/transforms/language/lang_id/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/lang_id/python/Makefile b/transforms/language/lang_id/python/Makefile index 441f6093d..972ccb729 100644 --- a/transforms/language/lang_id/python/Makefile +++ b/transforms/language/lang_id/python/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
-# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=lang_id include $(REPOROOT)/transforms/.make.transforms + +# Include the common configuration for this transform +include ../transform.config diff --git a/transforms/language/lang_id/ray/Makefile b/transforms/language/lang_id/ray/Makefile index 6b0e307d7..1339af964 100644 --- a/transforms/language/lang_id/ray/Makefile +++ b/transforms/language/lang_id/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit==0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=lang_id +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/language/lang_id/transform.config b/transforms/language/lang_id/transform.config new file mode 100644 index 000000000..3a969f41d --- /dev/null +++ b/transforms/language/lang_id/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=lang_id + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +LANG_ID_PYTHON_VERSION=$(DPK_VERSION) +LANG_ID_RAY_VERSION=$(LANG_ID_PYTHON_VERSION) +LANG_ID_SPARK_VERSION=$(LANG_ID_PYTHON_VERSION) + diff --git a/transforms/language/pdf2parquet/kfp_ray/Makefile b/transforms/language/pdf2parquet/kfp_ray/Makefile index 24154bffa..66edd91fc 100644 --- a/transforms/language/pdf2parquet/kfp_ray/Makefile +++ b/transforms/language/pdf2parquet/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/pdf2parquet/python/Makefile b/transforms/language/pdf2parquet/python/Makefile index 0e06a5900..b18b068ac 100644 --- a/transforms/language/pdf2parquet/python/Makefile +++ b/transforms/language/pdf2parquet/python/Makefile @@ -1,14 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. 
However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=pdf2parquet +# Include the common configuration for this transform +include ../transform.config RUN_ARGS=" --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ --data_files_to_use \"['.pdf','.zip']\" " diff --git a/transforms/language/pdf2parquet/ray/Makefile b/transforms/language/pdf2parquet/ray/Makefile index fba43ea15..ced1f45f1 100644 --- a/transforms/language/pdf2parquet/ray/Makefile +++ b/transforms/language/pdf2parquet/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=pdf2parquet +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/pdf2parquet/transform.config b/transforms/language/pdf2parquet/transform.config new file mode 100644 index 000000000..1bda1908e --- /dev/null +++ b/transforms/language/pdf2parquet/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. 
+# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=pdf2parquet + +################################################################################ +# This defines the transform's version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +PDF2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +PDF2PARQUET_RAY_VERSION=$(PDF2PARQUET_PYTHON_VERSION) +PDF2PARQUET_SPARK_VERSION=$(PDF2PARQUET_PYTHON_VERSION) + diff --git a/transforms/language/pii_redactor/kfp_ray/Makefile b/transforms/language/pii_redactor/kfp_ray/Makefile index 77844a79e..370f85cb0 100644 --- a/transforms/language/pii_redactor/kfp_ray/Makefile +++ b/transforms/language/pii_redactor/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/pii_redactor/python/Makefile b/transforms/language/pii_redactor/python/Makefile index 28fd33fff..50161da6e 100644 --- a/transforms/language/pii_redactor/python/Makefile +++ b/transforms/language/pii_redactor/python/Makefile @@ -1,16 +1,22 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able to # know where they are running from. REPOROOT=../../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse.
However, feel free -# to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions -TRANSFORM_NAME=pii_redactor +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/language/pii_redactor/ray/Makefile b/transforms/language/pii_redactor/ray/Makefile index 3a67b90b8..e52494534 100644 --- a/transforms/language/pii_redactor/ray/Makefile +++ b/transforms/language/pii_redactor/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=pii_redactor +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/language/pii_redactor/transform.config b/transforms/language/pii_redactor/transform.config new file mode 100644 index 000000000..c06adf82c --- /dev/null +++ b/transforms/language/pii_redactor/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=pii_redactor + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) +PII_REDACTOR_RAY_VERSION=$(PII_REDACTOR_PYTHON_VERSION) +PII_REDACTOR_SPARK_VERSION=$(PII_REDACTOR_PYTHON_VERSION) + diff --git a/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd b/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd index 70613cc01..36bd47560 100644 --- a/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd +++ b/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/text_encoder/python/Makefile b/transforms/language/text_encoder/python/Makefile index c9e8b8c1b..564bb405b 100644 --- a/transforms/language/text_encoder/python/Makefile +++ b/transforms/language/text_encoder/python/Makefile @@ -1,14 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=text_encoder +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/text_encoder/ray/Makefile b/transforms/language/text_encoder/ray/Makefile index b95b299c4..85cf45cac 100644 --- a/transforms/language/text_encoder/ray/Makefile +++ b/transforms/language/text_encoder/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=text_encoder +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/text_encoder/transform.config b/transforms/language/text_encoder/transform.config new file mode 100644 index 000000000..df5754fb8 --- /dev/null +++ b/transforms/language/text_encoder/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=text_encoder + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +TEXT_ENCODER_PYTHON_VERSION=$(DPK_VERSION) +TEXT_ENCODER_RAY_VERSION=$(TEXT_ENCODER_PYTHON_VERSION) +TEXT_ENCODER_SPARK_VERSION=$(TEXT_ENCODER_PYTHON_VERSION) + diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index 5268889d0..29506aaf1 100644 --- a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -2,6 +2,11 @@ ifndef T_SET T_SET=all endif +# Defines the version of the wheel for the package transforms +# If you change this value, you will need to run "make set-versions" to +# apply the new version number to the toml files. 
+DPK_TRANSFORMS_VERSION=$(DPK_VERSION) + venv: $(MAKE) .defaults.create-venv diff --git a/transforms/universal/doc_id/kfp_ray/Makefile b/transforms/universal/doc_id/kfp_ray/Makefile index 94fc75145..f170326e2 100644 --- a/transforms/universal/doc_id/kfp_ray/Makefile +++ b/transforms/universal/doc_id/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/doc_id/python/Makefile b/transforms/universal/doc_id/python/Makefile index 1f7d0d353..26da1fc8f 100644 --- a/transforms/universal/doc_id/python/Makefile +++ b/transforms/universal/doc_id/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=doc_id - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/doc_id/ray/Makefile b/transforms/universal/doc_id/ray/Makefile index d7844f2f9..79787406b 100644 --- a/transforms/universal/doc_id/ray/Makefile +++ b/transforms/universal/doc_id/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_id +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/doc_id/spark/Makefile b/transforms/universal/doc_id/spark/Makefile index 954786dac..9303d021f 100644 --- a/transforms/universal/doc_id/spark/Makefile +++ b/transforms/universal/doc_id/spark/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_id +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/doc_id/transform.config b/transforms/universal/doc_id/transform.config new file mode 100644 index 000000000..d3715f3b2 --- /dev/null +++ b/transforms/universal/doc_id/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=doc_id + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+DOC_ID_PYTHON_VERSION=$(DPK_VERSION) +DOC_ID_RAY_VERSION=$(DOC_ID_PYTHON_VERSION) +DOC_ID_SPARK_VERSION=$(DOC_ID_PYTHON_VERSION) + diff --git a/transforms/universal/ededup/kfp_ray/Makefile b/transforms/universal/ededup/kfp_ray/Makefile index 456cf76d1..f0c5cc217 100644 --- a/transforms/universal/ededup/kfp_ray/Makefile +++ b/transforms/universal/ededup/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -47,4 +50,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/ededup/python/Makefile b/transforms/universal/ededup/python/Makefile index 92f3fac27..348edc74d 100644 --- a/transforms/universal/ededup/python/Makefile +++ b/transforms/universal/ededup/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=ededup - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/ededup/ray/Makefile b/transforms/universal/ededup/ray/Makefile index f828e107e..1ff055e29 100644 --- a/transforms/universal/ededup/ray/Makefile +++ b/transforms/universal/ededup/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=ededup +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/ededup/transform.config b/transforms/universal/ededup/transform.config new file mode 100644 index 000000000..12f5357f1 --- /dev/null +++ b/transforms/universal/ededup/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=ededup + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +EDEDUP_PYTHON_VERSION=$(DPK_VERSION) +EDEDUP_RAY_VERSION=$(EDEDUP_PYTHON_VERSION) +EDEDUP_SPARK_VERSION=$(EDEDUP_PYTHON_VERSION) + diff --git a/transforms/universal/fdedup/kfp_ray/Makefile b/transforms/universal/fdedup/kfp_ray/Makefile index f6b215984..55f7851f6 100644 --- a/transforms/universal/fdedup/kfp_ray/Makefile +++ b/transforms/universal/fdedup/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index 15173ba00..f5f06c3c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -1,14 +1,24 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=fdedup +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} + venv:: .transforms.ray-venv test:: .transforms.ray-test diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config new file mode 100644 index 000000000..774716e15 --- /dev/null +++ b/transforms/universal/fdedup/transform.config @@ -0,0 +1,18 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=fdedup + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+FDEDUP_RAY_VERSION=$(DPK_VERSION) + diff --git a/transforms/universal/filter/kfp_ray/Makefile b/transforms/universal/filter/kfp_ray/Makefile index bd26792be..c48298d22 100644 --- a/transforms/universal/filter/kfp_ray/Makefile +++ b/transforms/universal/filter/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/universal/filter/python/Makefile b/transforms/universal/filter/python/Makefile index 1ea1151ce..9a01deea1 100644 --- a/transforms/universal/filter/python/Makefile +++ b/transforms/universal/filter/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=filter - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/filter/ray/Makefile b/transforms/universal/filter/ray/Makefile index 5960a9670..0c0af0004 100644 --- a/transforms/universal/filter/ray/Makefile +++ b/transforms/universal/filter/ray/Makefile @@ -1,13 +1,21 @@ - # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=filter +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/filter/spark/Makefile b/transforms/universal/filter/spark/Makefile index 329da35a2..72bc78a15 100644 --- a/transforms/universal/filter/spark/Makefile +++ b/transforms/universal/filter/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -# This is included in the image name, if defined -TRANSFORM_NAME=filter +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/filter/transform.config b/transforms/universal/filter/transform.config new file mode 100644 index 000000000..70f2ada5b --- /dev/null +++ b/transforms/universal/filter/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=filter + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+FILTER_PYTHON_VERSION=$(DPK_VERSION) +FILTER_RAY_VERSION=$(FILTER_PYTHON_VERSION) +FILTER_SPARK_VERSION=$(FILTER_PYTHON_VERSION) + diff --git a/transforms/universal/hap/python/Makefile b/transforms/universal/hap/python/Makefile index c7c15dba7..2363e51c2 100644 --- a/transforms/universal/hap/python/Makefile +++ b/transforms/universal/hap/python/Makefile @@ -1,15 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -#TRANSFORM_RUNTIME_SRC_FILE=hap_transform_python.py -TRANSFORM_NAME=hap - -HAP_PYTHON_VERSION= $(DPK_VERSION) +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/universal/hap/transform.config b/transforms/universal/hap/transform.config new file mode 100644 index 000000000..6aa7018b3 --- /dev/null +++ b/transforms/universal/hap/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=hap + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +HAP_PYTHON_VERSION=$(DPK_VERSION) +HAP_RAY_VERSION=$(HAP_PYTHON_VERSION) +HAP_SPARK_VERSION=$(HAP_PYTHON_VERSION) + diff --git a/transforms/universal/noop/kfp_ray/Makefile b/transforms/universal/noop/kfp_ray/Makefile index d1198e5a2..fc541f367 100644 --- a/transforms/universal/noop/kfp_ray/Makefile +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -1,7 +1,11 @@ REPOROOT=${CURDIR}/../../../../ + WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/universal/noop/python/Makefile b/transforms/universal/noop/python/Makefile index 80797bcc9..5e6121b04 100644 --- a/transforms/universal/noop/python/Makefile +++ b/transforms/universal/noop/python/Makefile @@ -1,15 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=noop - +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/universal/noop/ray/Makefile b/transforms/universal/noop/ray/Makefile index 0b70f6662..ad7ff3320 100644 --- a/transforms/universal/noop/ray/Makefile +++ b/transforms/universal/noop/ray/Makefile @@ -1,15 +1,24 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=noop +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} + venv:: .transforms.ray-venv test:: .transforms.ray-test diff --git a/transforms/universal/noop/spark/Makefile b/transforms/universal/noop/spark/Makefile index 726fd9e6a..ebc72992e 100644 --- a/transforms/universal/noop/spark/Makefile +++ b/transforms/universal/noop/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=noop +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/noop/transform.config b/transforms/universal/noop/transform.config new file mode 100644 index 000000000..49c9b2cbf --- /dev/null +++ b/transforms/universal/noop/transform.config @@ -0,0 +1,21 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=noop + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+NOOP_PYTHON_VERSION=$(DPK_VERSION) +NOOP_RAY_VERSION=$(NOOP_PYTHON_VERSION) +NOOP_SPARK_VERSION=$(NOOP_PYTHON_VERSION) + diff --git a/transforms/universal/profiler/kfp_ray/Makefile b/transforms/universal/profiler/kfp_ray/Makefile index 2fbd17653..e4f6b860b 100644 --- a/transforms/universal/profiler/kfp_ray/Makefile +++ b/transforms/universal/profiler/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/universal/profiler/python/Makefile b/transforms/universal/profiler/python/Makefile index 61c807a23..983250184 100644 --- a/transforms/universal/profiler/python/Makefile +++ b/transforms/universal/profiler/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=profiler - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/profiler/ray/Makefile b/transforms/universal/profiler/ray/Makefile index 8cec28968..12d75c4c3 100644 --- a/transforms/universal/profiler/ray/Makefile +++ b/transforms/universal/profiler/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=profiler +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/profiler/spark/Makefile b/transforms/universal/profiler/spark/Makefile index cb90b4020..39b16cac6 100644 --- a/transforms/universal/profiler/spark/Makefile +++ b/transforms/universal/profiler/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=profiler +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/profiler/transform.config b/transforms/universal/profiler/transform.config new file mode 100644 index 000000000..c86cd6415 --- /dev/null +++ b/transforms/universal/profiler/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=profiler + +################################################################################ +# This defines the transform's version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+PROFILER_PYTHON_VERSION=$(DPK_VERSION) +PROFILER_RAY_VERSION=$(PROFILER_PYTHON_VERSION) +PROFILER_SPARK_VERSION=$(PROFILER_PYTHON_VERSION) + diff --git a/transforms/universal/resize/kfp_ray/Makefile b/transforms/universal/resize/kfp_ray/Makefile index a0e2faf37..8c7e592af 100644 --- a/transforms/universal/resize/kfp_ray/Makefile +++ b/transforms/universal/resize/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/resize/python/Makefile b/transforms/universal/resize/python/Makefile index 7de0032e3..66453c846 100644 --- a/transforms/universal/resize/python/Makefile +++ b/transforms/universal/resize/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=resize - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/resize/ray/Makefile b/transforms/universal/resize/ray/Makefile index 1a2f2496f..dd229b3f4 100644 --- a/transforms/universal/resize/ray/Makefile +++ b/transforms/universal/resize/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. - +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=resize +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/resize/spark/Makefile b/transforms/universal/resize/spark/Makefile index f02e9db3f..18d72d31d 100644 --- a/transforms/universal/resize/spark/Makefile +++ b/transforms/universal/resize/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). 
This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=resize +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/resize/transform.config b/transforms/universal/resize/transform.config new file mode 100644 index 000000000..4b7171a4e --- /dev/null +++ b/transforms/universal/resize/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=resize + +################################################################################ +# This defines the transform's version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+RESIZE_PYTHON_VERSION=$(DPK_VERSION) +RESIZE_RAY_VERSION=$(RESIZE_PYTHON_VERSION) +RESIZE_SPARK_VERSION=$(RESIZE_PYTHON_VERSION) + diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile index 09656297a..c43105ff1 100644 --- a/transforms/universal/tokenization/kfp_ray/Makefile +++ b/transforms/universal/tokenization/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/tokenization/python/Makefile b/transforms/universal/tokenization/python/Makefile index d23661983..8f4f7fbf5 100644 --- a/transforms/universal/tokenization/python/Makefile +++ b/transforms/universal/tokenization/python/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=tokenization +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/universal/tokenization/ray/Makefile b/transforms/universal/tokenization/ray/Makefile index 3d5a46d09..0a4e3a370 100644 --- a/transforms/universal/tokenization/ray/Makefile +++ b/transforms/universal/tokenization/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=tokenization +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/tokenization/transform.config b/transforms/universal/tokenization/transform.config new file mode 100644 index 000000000..04f517d42 --- /dev/null +++ b/transforms/universal/tokenization/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=tokenization + +################################################################################ +# This defines the transform's version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION) +TOKENIZATION_RAY_VERSION=$(TOKENIZATION_PYTHON_VERSION) +TOKENIZATION_SPARK_VERSION=$(TOKENIZATION_PYTHON_VERSION) +