diff --git a/.make.defaults b/.make.defaults index f9f58500f..e1bd5275a 100644 --- a/.make.defaults +++ b/.make.defaults @@ -235,6 +235,10 @@ __check_defined = \ cp -p -R ${LIB_PATH}/src ${LIB_NAME} cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME} cp -p -R ${LIB_PATH}/README.md ${LIB_NAME} + if [ -e ${LIB_PATH}/requirements.txt ]; then \ + cp -p ${LIB_PATH}/requirements.txt ${LIB_NAME}; \ + fi + # Build and image using the local Dockerfile and make the data-processing-lib/python # available in the current directory for use by the Dockerfile (i.e. to install the library). @@ -591,8 +595,9 @@ MINIO_ADMIN_PWD= localminiosecretkey # Updates the versions references to our repo source as defined in .make.versions .PHONY: .defaults.__update-toml-lib-dep-versions .defaults.__update-toml-lib-dep-versions: +ifeq ($(USE_REPO_LIB_SRC), 1) @# Help: Update pyproject.toml to depend on lib versions defined in .make.versions - @if [ -e pyproject.toml ]; then \ + if [ -e pyproject.toml ]; then \ cat pyproject.toml | sed \ -e 's/"data-prep-toolkit-ray\([=><~][=]\).*"/"data-prep-toolkit-ray\1$(DPK_LIB_VERSION)"/' \ -e 's/"data-prep-toolkit-spark\([=><~][=]\).*"/"data-prep-toolkit-spark\1$(DPK_LIB_VERSION)"/' \ @@ -603,7 +608,7 @@ MINIO_ADMIN_PWD= localminiosecretkey > tt.toml; \ mv tt.toml pyproject.toml; \ fi - @if [ -e requirements.txt ]; then \ + if [ -e requirements.txt ]; then \ cat requirements.txt | sed \ -e 's/data-prep-toolkit-ray\([=><~][=]\).*/data-prep-toolkit-ray\1$(DPK_LIB_VERSION)/' \ -e 's/data-prep-toolkit-transforms\([=><~][=]\).*/data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)/' \ @@ -615,6 +620,7 @@ MINIO_ADMIN_PWD= localminiosecretkey > tt.txt; \ mv tt.txt requirements.txt; \ fi +endif # Build the distribution, usually in preparation for publishing using ith the .defaults.publish-dist target .PHONY: .defaults.build-dist diff --git a/.make.versions b/.make.versions index dd599aa04..4346291cc 100644 --- a/.make.versions +++ b/.make.versions @@ -25,9 +25,9 @@ 
DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_ # publish docker images with latest tag ifeq ($(DPK_VERSION_SUFFIX), ) - DOCKER_IMAGE_VERSION=$(DPK_VERSION) + DOCKER_IMAGE_VERSION?=$(DPK_VERSION) else - DOCKER_IMAGE_VERSION=latest + DOCKER_IMAGE_VERSION?=latest endif # Data prep lab wheel version @@ -39,82 +39,6 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -# Begin transform versions/tags -BLOCKLIST_VERSION=$(DPK_VERSION) - -DOC_ID_PYTHON_VERSION=$(DPK_VERSION) -DOC_ID_RAY_VERSION=$(DPK_VERSION) -DOC_ID_SPARK_VERSION=$(DPK_VERSION) - -EDEDUP_PYTHON_VERSION=$(DPK_VERSION) -EDEDUP_RAY_VERSION=$(DPK_VERSION) - -FDEDUP_RAY_VERSION=$(DPK_VERSION) - -FILTER_PYTHON_VERSION=$(DPK_VERSION) -FILTER_RAY_VERSION=$(DPK_VERSION) -FILTER_SPARK_VERSION=$(DPK_VERSION) - -NOOP_PYTHON_VERSION=$(DPK_VERSION) -NOOP_RAY_VERSION=$(DPK_VERSION) -NOOP_SPARK_VERSION=$(DPK_VERSION) - -PROFILER_PYTHON_VERSION=$(DPK_VERSION) -PROFILER_RAY_VERSION=$(DPK_VERSION) -PROFILER_SPARK_VERSION=$(DPK_VERSION) - -RESIZE_PYTHON_VERSION=$(DPK_VERSION) -RESIZE_RAY_VERSION=$(DPK_VERSION) -RESIZE_SPARK_VERSION=$(DPK_VERSION) - -LANG_ID_PYTHON_VERSION=$(DPK_VERSION) -LANG_ID_RAY_VERSION=$(DPK_VERSION) - -TOKENIZATION_RAY_VERSION=$(DPK_VERSION) -TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION) - -MALWARE_RAY_VERSION=$(DPK_VERSION) -MALWARE_PYTHON_VERSION=$(DPK_VERSION) - -PROGLANG_SELECT_PYTHON_VERSION=$(DPK_VERSION) -PROGLANG_SELECT_RAY_VERSION=$(DPK_VERSION) - -DOC_QUALITY_PYTHON_VERSION=$(DPK_VERSION) -DOC_QUALITY_RAY_VERSION=$(DPK_VERSION) - -CODE_QUALITY_RAY_VERSION=$(DPK_VERSION) -CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION) - -CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -CODE2PARQUET_RAY_VERSION=$(DPK_VERSION) -INGEST_TO_PARQUET_VERSION=$(DPK_VERSION) -REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION) - -PDF2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -PDF2PARQUET_RAY_VERSION=$(DPK_VERSION) - 
-DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION) -DOC_CHUNK_RAY_VERSION=$(DPK_VERSION) - -TEXT_ENCODER_PYTHON_VERSION=$(DPK_VERSION) -TEXT_ENCODER_RAY_VERSION=$(DPK_VERSION) - -HEADER_CLEANSER_PYTHON_VERSION=$(DPK_VERSION) -HEADER_CLEANSER_RAY_VERSION=$(DPK_VERSION) - -LICENSE_SELECT_PYTHON_VERSION=$(DPK_VERSION) -LICENSE_SELECT_RAY_VERSION=$(DPK_VERSION) - -PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) - -HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION) - -DPK_TRANSFORMS_VERSION=$(DPK_VERSION) - -SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION=$(DPK_VERSION) -SYNTACTIC_CONCEPT_EXTRACTOR_RAY_VERSION=$(DPK_VERSION) - - ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. diff --git a/README.md b/README.md index aeec4ef70..b4d372356 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ The goal is to offer high-level APIs for developers to quickly get started in wo - [Scaling transforms from laptop to cluster](#laptop_cluster) - [Repository Use and Navigation](doc/repo.md) - [How to Contribute](CONTRIBUTING.md) -- [Papers and Talks](#talks_papers) +- [Talks and Papers](#talks_papers) +- [Citations](#citations) ## 📖 About @@ -131,7 +132,7 @@ The matrix below shows the the combination of modules and supported runtimes. 
Al | **Data Ingestion** | | | | | | [Code (from zip) to Parquet](transforms/code/code2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [PDF to Parquet](transforms/language/pdf2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | -| [HTML to Parquet](transforms/universal/html2parquet/python/README.md) | :white_check_mark: | | | | +| [HTML to Parquet](transforms/language/html2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | | | **Universal (Code & Language)** | | | | | | [Exact dedup filter](transforms/universal/ededup/ray/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [Fuzzy dedup filter](transforms/universal/fdedup/ray/README.md) | | :white_check_mark: | | :white_check_mark: | @@ -220,3 +221,23 @@ You can run transforms via docker image or using virtual environments. This [doc 5. Talk on "Hands on session for fine tuning LLMs" [Video](https://www.youtube.com/watch?v=VEHIA3E64DM) 6. 
Talk on "Build your own data preparation module using data-prep-kit" [Video](https://www.youtube.com/watch?v=0WUMG6HIgMg) +## Citations + +If you use Data Prep Kit in your research, please cite our paper: + +```bibtex +@misc{wood2024dataprepkitgettingdataready, + title={Data-Prep-Kit: getting your data ready for LLM application development}, + author={David Wood and Boris Lublinsky and Alexy Roytman and Shivdeep Singh + and Abdulhamid Adebayo and Revital Eres and Mohammad Nassar and Hima Patel + and Yousaf Shah and Constantin Adam and Petros Zerfos and Nirmit Desai + and Daiki Tsuzuku and Takuya Goto and Michele Dolfi and Saptha Surendran + and Paramesvaran Selvam and Sungeun An and Yuan Chi Chang and Dhiraj Joshi + and Hajar Emami-Gohari and Xuan-Hong Dang and Yan Koyfman and Shahrokh Daijavad}, + year={2024}, + eprint={2409.18164}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2409.18164}, +} +``` \ No newline at end of file diff --git a/transforms/code/code2parquet/kfp_ray/Makefile b/transforms/code/code2parquet/kfp_ray/Makefile index 6b9e640d1..847a743b8 100644 --- a/transforms/code/code2parquet/kfp_ray/Makefile +++ b/transforms/code/code2parquet/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/code2parquet/python/Makefile b/transforms/code/code2parquet/python/Makefile index d0403e601..e27e402c7 100644 --- a/transforms/code/code2parquet/python/Makefile +++ b/transforms/code/code2parquet/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. 
+ +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=code2parquet - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/code/code2parquet/ray/Makefile b/transforms/code/code2parquet/ray/Makefile index bc1580987..42383457f 100644 --- a/transforms/code/code2parquet/ray/Makefile +++ b/transforms/code/code2parquet/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=code2parquet +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/code/code2parquet/transform.config b/transforms/code/code2parquet/transform.config new file mode 100644 index 000000000..2049a2261 --- /dev/null +++ b/transforms/code/code2parquet/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=code2parquet + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +CODE2PARQUET_RAY_VERSION=$(CODE2PARQUET_PYTHON_VERSION) +CODE2PARQUET_SPARK_VERSION=$(CODE2PARQUET_PYTHON_VERSION) + diff --git a/transforms/code/code_quality/kfp_ray/Makefile b/transforms/code/code_quality/kfp_ray/Makefile index a22efcf8e..1cab0d878 100644 --- a/transforms/code/code_quality/kfp_ray/Makefile +++ b/transforms/code/code_quality/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/code/code_quality/python/Makefile b/transforms/code/code_quality/python/Makefile index 1b50d41b8..cd9811f79 100644 --- a/transforms/code/code_quality/python/Makefile +++ b/transforms/code/code_quality/python/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=code_quality +# Include the common configuration for this transform +include ../transform.config # Use default rule inherited from makefile.common clean:: .transforms.clean diff --git a/transforms/code/code_quality/ray/Makefile b/transforms/code/code_quality/ray/Makefile index 720cf9c00..5a744e861 100644 --- a/transforms/code/code_quality/ray/Makefile +++ b/transforms/code/code_quality/ray/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=code_quality +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} diff --git a/transforms/code/code_quality/transform.config b/transforms/code/code_quality/transform.config new file mode 100644 index 000000000..4ebec625a --- /dev/null +++ b/transforms/code/code_quality/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=code_quality + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION) +CODE_QUALITY_RAY_VERSION=$(CODE_QUALITY_PYTHON_VERSION) +CODE_QUALITY_SPARK_VERSION=$(CODE_QUALITY_PYTHON_VERSION) + diff --git a/transforms/code/header_cleanser/kfp_ray/Makefile b/transforms/code/header_cleanser/kfp_ray/Makefile.disable-cicd similarity index 91% rename from transforms/code/header_cleanser/kfp_ray/Makefile rename to transforms/code/header_cleanser/kfp_ray/Makefile.disable-cicd index 05a343384..411cc97f1 100644 --- a/transforms/code/header_cleanser/kfp_ray/Makefile +++ b/transforms/code/header_cleanser/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/code/header_cleanser/python/Makefile b/transforms/code/header_cleanser/python/Makefile index 1e3fa68fd..0a91a14d6 100644 --- a/transforms/code/header_cleanser/python/Makefile +++ b/transforms/code/header_cleanser/python/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able 
# know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=header_cleanser +# Include the common configuration for this transform +include ../transform.config # Use default rule inherited from makefile.common clean:: .transforms.clean diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 6521c8662..16f8cf69c 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -17,9 +17,11 @@ COPY --chown=ray:users pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # Install system dependencies, including libgomp1 +USER root RUN sudo apt-get update && sudo apt-get install -y \ libgomp1 \ && sudo rm -rf /var/lib/apt/lists/* +USER ray # copy source data COPY ./src/header_cleanser_transform_ray.py . 
@@ -36,4 +38,4 @@ ENV PYTHONPATH /home/ray ARG BUILD_DATE ARG GIT_COMMIT LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT \ No newline at end of file +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/code/header_cleanser/ray/Makefile b/transforms/code/header_cleanser/ray/Makefile index d223bc1cb..9d83c71d0 100644 --- a/transforms/code/header_cleanser/ray/Makefile +++ b/transforms/code/header_cleanser/ray/Makefile @@ -1,10 +1,21 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=header_cleanser +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} diff --git a/transforms/code/header_cleanser/transform.config b/transforms/code/header_cleanser/transform.config new file mode 100644 index 000000000..e1da13d0c --- /dev/null +++ b/transforms/code/header_cleanser/transform.config @@ -0,0 +1,18 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=header_cleanser + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +HEADER_CLEANSER_PYTHON_VERSION=$(DPK_VERSION) +HEADER_CLEANSER_RAY_VERSION=$(HEADER_CLEANSER_PYTHON_VERSION) diff --git a/transforms/code/license_select/kfp_ray/Makefile.disable-cicd b/transforms/code/license_select/kfp_ray/Makefile.disable-cicd index 9f21f3d58..28e244faa 100644 --- a/transforms/code/license_select/kfp_ray/Makefile.disable-cicd +++ b/transforms/code/license_select/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/license_select/python/Makefile b/transforms/code/license_select/python/Makefile index 7077c801f..2f3825fda 100644 --- a/transforms/code/license_select/python/Makefile +++ b/transforms/code/license_select/python/Makefile @@ -1,10 +1,22 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. 
data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=license_select -# $(REPOROOT)/.make.versions file contains the versions +# Include the common configuration for this transform +include ../transform.config + DOCKER_IMAGE_VERSION=${LICENSE_SELECT_PYTHON_VERSION} # Use default rule inherited from makefile.common diff --git a/transforms/code/license_select/ray/Makefile b/transforms/code/license_select/ray/Makefile index 25fe6ab8a..d69cf00ca 100644 --- a/transforms/code/license_select/ray/Makefile +++ b/transforms/code/license_select/ray/Makefile @@ -1,15 +1,24 @@ - -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + BASE_IMAGE=$(RAY_BASE_IMAGE) -TRANSFORM_NAME=license_select -# $(REPOROOT)/.make.versions file contains the versions + DOCKER_IMAGE_VERSION=${LICENSE_SELECT_RAY_VERSION} # Use default rule inherited from makefile.common diff --git a/transforms/code/license_select/transform.config b/transforms/code/license_select/transform.config new file mode 100644 index 000000000..bba10d3e5 --- /dev/null +++ b/transforms/code/license_select/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=license_select + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+LICENSE_SELECT_PYTHON_VERSION=$(DPK_VERSION) +LICENSE_SELECT_RAY_VERSION=$(LICENSE_SELECT_PYTHON_VERSION) +LICENSE_SELECT_SPARK_VERSION=$(LICENSE_SELECT_PYTHON_VERSION) + diff --git a/transforms/code/malware/kfp_ray/Makefile b/transforms/code/malware/kfp_ray/Makefile index 7b423d8bd..0446e2d29 100644 --- a/transforms/code/malware/kfp_ray/Makefile +++ b/transforms/code/malware/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/code/malware/python/Makefile b/transforms/code/malware/python/Makefile index 99174e9a1..bd523b629 100644 --- a/transforms/code/malware/python/Makefile +++ b/transforms/code/malware/python/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=malware +# Include the common configuration for this transform +include ../transform.config OS := $(shell uname -s) ifeq ($(OS),Darwin) diff --git a/transforms/code/malware/ray/Makefile b/transforms/code/malware/ray/Makefile index 99515c036..a92cbd529 100644 --- a/transforms/code/malware/ray/Makefile +++ b/transforms/code/malware/ray/Makefile @@ -1,12 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=malware +# Include the common configuration for this transform +include ../transform.config + BASE_IMAGE=${RAY_BASE_IMAGE} OS := $(shell uname -s) ifeq ($(OS),Darwin) diff --git a/transforms/code/malware/transform.config b/transforms/code/malware/transform.config new file mode 100644 index 000000000..be0b6651d --- /dev/null +++ b/transforms/code/malware/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=malware + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +MALWARE_PYTHON_VERSION=$(DPK_VERSION) +MALWARE_RAY_VERSION=$(MALWARE_PYTHON_VERSION) +MALWARE_SPARK_VERSION=$(MALWARE_PYTHON_VERSION) + diff --git a/transforms/code/proglang_select/kfp_ray/Makefile b/transforms/code/proglang_select/kfp_ray/Makefile index abbf75c8c..b8a21bca8 100644 --- a/transforms/code/proglang_select/kfp_ray/Makefile +++ b/transforms/code/proglang_select/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/proglang_select/python/Makefile b/transforms/code/proglang_select/python/Makefile index 2cec4f6db..7d64e0a90 100644 --- a/transforms/code/proglang_select/python/Makefile +++ b/transforms/code/proglang_select/python/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. 
However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=proglang_select +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/code/proglang_select/ray/Makefile b/transforms/code/proglang_select/ray/Makefile index 82db54db7..20315a234 100644 --- a/transforms/code/proglang_select/ray/Makefile +++ b/transforms/code/proglang_select/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=proglang_select +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/code/proglang_select/transform.config b/transforms/code/proglang_select/transform.config new file mode 100644 index 000000000..c32cb9775 --- /dev/null +++ b/transforms/code/proglang_select/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=proglang_select + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +PROGLANG_SELECT_PYTHON_VERSION=$(DPK_VERSION) +PROGLANG_SELECT_RAY_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) +PROGLANG_SELECT_SPARK_VERSION=$(PROGLANG_SELECT_PYTHON_VERSION) + diff --git a/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd b/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd index ef3765e31..5b2425357 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd +++ b/transforms/code/repo_level_ordering/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/code/repo_level_ordering/ray/Makefile b/transforms/code/repo_level_ordering/ray/Makefile index 83f8692de..8d2f784fb 100644 --- a/transforms/code/repo_level_ordering/ray/Makefile +++ b/transforms/code/repo_level_ordering/ray/Makefile @@ -1,15 +1,23 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. 
data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -BASE_IMAGE=$(RAY_BASE_IMAGE) +# Include the common configuration for this transform +include ../transform.config -TRANSFORM_NAME=repo_level_order +BASE_IMAGE=$(RAY_BASE_IMAGE) venv:: .transforms.ray-venv diff --git a/transforms/code/repo_level_ordering/transform.config b/transforms/code/repo_level_ordering/transform.config new file mode 100644 index 000000000..0d82c6377 --- /dev/null +++ b/transforms/code/repo_level_ordering/transform.config @@ -0,0 +1,19 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=repo_level_order + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION) + + diff --git a/transforms/language/doc_chunk/kfp_ray/Makefile b/transforms/language/doc_chunk/kfp_ray/Makefile index 189b36ea5..30e912e33 100644 --- a/transforms/language/doc_chunk/kfp_ray/Makefile +++ b/transforms/language/doc_chunk/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/doc_chunk/python/Makefile b/transforms/language/doc_chunk/python/Makefile index a6fbe35dc..2f2a7e789 100644 --- a/transforms/language/doc_chunk/python/Makefile +++ b/transforms/language/doc_chunk/python/Makefile @@ -1,14 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_chunk +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/doc_chunk/ray/Makefile b/transforms/language/doc_chunk/ray/Makefile index 6b9b4ae6a..b4f394f84 100644 --- a/transforms/language/doc_chunk/ray/Makefile +++ b/transforms/language/doc_chunk/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_chunk +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/doc_chunk/transform.config b/transforms/language/doc_chunk/transform.config new file mode 100644 index 000000000..f433f360b --- /dev/null +++ b/transforms/language/doc_chunk/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=doc_chunk + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION) +DOC_CHUNK_RAY_VERSION=$(DOC_CHUNK_PYTHON_VERSION) +DOC_CHUNK_SPARK_VERSION=$(DOC_CHUNK_PYTHON_VERSION) + diff --git a/transforms/language/doc_quality/kfp_ray/Makefile b/transforms/language/doc_quality/kfp_ray/Makefile index 004f17616..9f5e93615 100644 --- a/transforms/language/doc_quality/kfp_ray/Makefile +++ b/transforms/language/doc_quality/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/doc_quality/python/Makefile b/transforms/language/doc_quality/python/Makefile index 684ce47ae..f0f309400 100644 --- a/transforms/language/doc_quality/python/Makefile +++ b/transforms/language/doc_quality/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. 
data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=doc_quality - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.test-src test-image diff --git a/transforms/language/doc_quality/ray/Makefile b/transforms/language/doc_quality/ray/Makefile index d462543a1..dd278af88 100644 --- a/transforms/language/doc_quality/ray/Makefile +++ b/transforms/language/doc_quality/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_quality -# $(REPOROOT)/.make.versions file contains the versions +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/language/doc_quality/transform.config b/transforms/language/doc_quality/transform.config new file mode 100644 index 000000000..2ece0e071 --- /dev/null +++ b/transforms/language/doc_quality/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=doc_quality + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +DOC_QUALITY_PYTHON_VERSION=$(DPK_VERSION) +DOC_QUALITY_RAY_VERSION=$(DOC_QUALITY_PYTHON_VERSION) +DOC_QUALITY_SPARK_VERSION=$(DOC_QUALITY_PYTHON_VERSION) + diff --git a/transforms/language/html2parquet/python/Makefile b/transforms/language/html2parquet/python/Makefile index 0e552d5be..284bb8e8a 100644 --- a/transforms/language/html2parquet/python/Makefile +++ b/transforms/language/html2parquet/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. 
+ +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME= html2parquet - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/language/html2parquet/ray/Makefile b/transforms/language/html2parquet/ray/Makefile index 30c908259..1667be8b9 100644 --- a/transforms/language/html2parquet/ray/Makefile +++ b/transforms/language/html2parquet/ray/Makefile @@ -1,15 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
+include $(REPOROOT)/transforms/.make.transforms -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME= html2parquet +# Include the common configuration for this transform +include ../transform.config -include $(REPOROOT)/transforms/.make.transforms BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv @@ -59,3 +66,6 @@ kind-load-image:: .transforms.kind-load-image docker-load-image: .defaults.docker-load-image docker-save-image: .defaults.docker-save-image + + + diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index dc2111e9e..dc796d602 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,2 @@ dpk-html2parquet-transform-python==0.2.2.dev0 data-prep-toolkit-ray==0.2.2.dev0 -trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/html2parquet/transform.config b/transforms/language/html2parquet/transform.config new file mode 100644 index 000000000..10847c6af --- /dev/null +++ b/transforms/language/html2parquet/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=html2parquet + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +HTML2PARQUET_RAY_VERSION=$(HTML2PARQUET_PYTHON_VERSION) +HTML2PARQUET_SPARK_VERSION=$(HTML2PARQUET_PYTHON_VERSION) + diff --git a/transforms/language/lang_id/kfp_ray/Makefile b/transforms/language/lang_id/kfp_ray/Makefile index b8f11ffc8..fd2c42d8e 100644 --- a/transforms/language/lang_id/kfp_ray/Makefile +++ b/transforms/language/lang_id/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/lang_id/python/Makefile b/transforms/language/lang_id/python/Makefile index 441f6093d..972ccb729 100644 --- a/transforms/language/lang_id/python/Makefile +++ b/transforms/language/lang_id/python/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
+include $(REPOROOT)/transforms/.make.transforms -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=lang_id +# Include the common configuration for this transform +include ../transform.config include $(REPOROOT)/transforms/.make.transforms diff --git a/transforms/language/lang_id/ray/Makefile b/transforms/language/lang_id/ray/Makefile index 6b0e307d7..1339af964 100644 --- a/transforms/language/lang_id/ray/Makefile +++ b/transforms/language/lang_id/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=lang_id +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/language/lang_id/transform.config b/transforms/language/lang_id/transform.config new file mode 100644 index 000000000..3a969f41d --- /dev/null +++ b/transforms/language/lang_id/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=lang_id + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +LANG_ID_PYTHON_VERSION=$(DPK_VERSION) +LANG_ID_RAY_VERSION=$(LANG_ID_PYTHON_VERSION) +LANG_ID_SPARK_VERSION=$(LANG_ID_PYTHON_VERSION) + diff --git a/transforms/language/pdf2parquet/kfp_ray/Makefile b/transforms/language/pdf2parquet/kfp_ray/Makefile index 24154bffa..66edd91fc 100644 --- a/transforms/language/pdf2parquet/kfp_ray/Makefile +++ b/transforms/language/pdf2parquet/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/pdf2parquet/python/Makefile b/transforms/language/pdf2parquet/python/Makefile index 0e06a5900..b18b068ac 100644 --- a/transforms/language/pdf2parquet/python/Makefile +++ b/transforms/language/pdf2parquet/python/Makefile @@ -1,14 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. 
However, feel free # to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=pdf2parquet +# Include the common configuration for this transform +include ../transform.config RUN_ARGS=" --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ --data_files_to_use \"['.pdf','.zip']\" " diff --git a/transforms/language/pdf2parquet/ray/Makefile b/transforms/language/pdf2parquet/ray/Makefile index fba43ea15..ced1f45f1 100644 --- a/transforms/language/pdf2parquet/ray/Makefile +++ b/transforms/language/pdf2parquet/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=pdf2parquet +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/pdf2parquet/transform.config b/transforms/language/pdf2parquet/transform.config new file mode 100644 index 000000000..1bda1908e --- /dev/null +++ b/transforms/language/pdf2parquet/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. 
+# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=pdf2parquet + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +PDF2PARQUET_PYTHON_VERSION=$(DPK_VERSION) +PDF2PARQUET_RAY_VERSION=$(PDF2PARQUET_PYTHON_VERSION) +PDF2PARQUET_SPARK_VERSION=$(PDF2PARQUET_PYTHON_VERSION) + diff --git a/transforms/language/pii_redactor/kfp_ray/Makefile b/transforms/language/pii_redactor/kfp_ray/Makefile index 77844a79e..370f85cb0 100644 --- a/transforms/language/pii_redactor/kfp_ray/Makefile +++ b/transforms/language/pii_redactor/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/pii_redactor/python/Makefile b/transforms/language/pii_redactor/python/Makefile index 28fd33fff..50161da6e 100644 --- a/transforms/language/pii_redactor/python/Makefile +++ b/transforms/language/pii_redactor/python/Makefile @@ -1,16 +1,22 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. 
However, feel free -# to override/redefine the rules below. - -# $(REPOROOT)/.make.versions file contains the versions -TRANSFORM_NAME=pii_redactor +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/language/pii_redactor/ray/Makefile b/transforms/language/pii_redactor/ray/Makefile index 3a67b90b8..e52494534 100644 --- a/transforms/language/pii_redactor/ray/Makefile +++ b/transforms/language/pii_redactor/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=pii_redactor +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/language/pii_redactor/transform.config b/transforms/language/pii_redactor/transform.config new file mode 100644 index 000000000..c06adf82c --- /dev/null +++ b/transforms/language/pii_redactor/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=pii_redactor + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) +PII_REDACTOR_RAY_VERSION=$(PII_REDACTOR_PYTHON_VERSION) +PII_REDACTOR_SPARK_VERSION=$(PII_REDACTOR_PYTHON_VERSION) + diff --git a/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd b/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd index 70613cc01..36bd47560 100644 --- a/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd +++ b/transforms/language/text_encoder/kfp_ray/Makefile.disable-cicd @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/language/text_encoder/python/Makefile b/transforms/language/text_encoder/python/Makefile index c9e8b8c1b..564bb405b 100644 --- a/transforms/language/text_encoder/python/Makefile +++ b/transforms/language/text_encoder/python/Makefile @@ -1,14 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=text_encoder +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/text_encoder/ray/Makefile b/transforms/language/text_encoder/ray/Makefile index b95b299c4..85cf45cac 100644 --- a/transforms/language/text_encoder/ray/Makefile +++ b/transforms/language/text_encoder/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=text_encoder +# Include the common configuration for this transform +include ../transform.config LINUX_WITH_CPU_TORCH?=true OS := $(shell uname -s) diff --git a/transforms/language/text_encoder/transform.config b/transforms/language/text_encoder/transform.config new file mode 100644 index 000000000..df5754fb8 --- /dev/null +++ b/transforms/language/text_encoder/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=text_encoder + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +TEXT_ENCODER_PYTHON_VERSION=$(DPK_VERSION) +TEXT_ENCODER_RAY_VERSION=$(TEXT_ENCODER_PYTHON_VERSION) +TEXT_ENCODER_SPARK_VERSION=$(TEXT_ENCODER_PYTHON_VERSION) + diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index 5268889d0..29506aaf1 100644 --- a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -2,6 +2,11 @@ ifndef T_SET T_SET=all endif +# Defines the version of the wheel for the package transforms +# If you change this value, you will need to run "make set-versions" to +# apply the new version number to the toml files. 
+DPK_TRANSFORMS_VERSION=$(DPK_VERSION) + venv: $(MAKE) .defaults.create-venv diff --git a/transforms/universal/doc_id/kfp_ray/Makefile b/transforms/universal/doc_id/kfp_ray/Makefile index 94fc75145..f170326e2 100644 --- a/transforms/universal/doc_id/kfp_ray/Makefile +++ b/transforms/universal/doc_id/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/doc_id/python/Makefile b/transforms/universal/doc_id/python/Makefile index 1f7d0d353..26da1fc8f 100644 --- a/transforms/universal/doc_id/python/Makefile +++ b/transforms/universal/doc_id/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=doc_id - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/doc_id/ray/Makefile b/transforms/universal/doc_id/ray/Makefile index d7844f2f9..79787406b 100644 --- a/transforms/universal/doc_id/ray/Makefile +++ b/transforms/universal/doc_id/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_id +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/doc_id/spark/Makefile b/transforms/universal/doc_id/spark/Makefile index 954786dac..9303d021f 100644 --- a/transforms/universal/doc_id/spark/Makefile +++ b/transforms/universal/doc_id/spark/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=doc_id +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/doc_id/transform.config b/transforms/universal/doc_id/transform.config new file mode 100644 index 000000000..d3715f3b2 --- /dev/null +++ b/transforms/universal/doc_id/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=doc_id + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+DOC_ID_PYTHON_VERSION=$(DPK_VERSION) +DOC_ID_RAY_VERSION=$(DOC_ID_PYTHON_VERSION) +DOC_ID_SPARK_VERSION=$(DOC_ID_PYTHON_VERSION) + diff --git a/transforms/universal/ededup/kfp_ray/Makefile b/transforms/universal/ededup/kfp_ray/Makefile index 456cf76d1..f0c5cc217 100644 --- a/transforms/universal/ededup/kfp_ray/Makefile +++ b/transforms/universal/ededup/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -47,4 +50,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/ededup/python/Makefile b/transforms/universal/ededup/python/Makefile index 92f3fac27..348edc74d 100644 --- a/transforms/universal/ededup/python/Makefile +++ b/transforms/universal/ededup/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=ededup - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/ededup/ray/Makefile b/transforms/universal/ededup/ray/Makefile index f828e107e..1ff055e29 100644 --- a/transforms/universal/ededup/ray/Makefile +++ b/transforms/universal/ededup/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=ededup +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/ededup/transform.config b/transforms/universal/ededup/transform.config new file mode 100644 index 000000000..12f5357f1 --- /dev/null +++ b/transforms/universal/ededup/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=ededup + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +EDEDUP_PYTHON_VERSION=$(DPK_VERSION) +EDEDUP_RAY_VERSION=$(EDEDUP_PYTHON_VERSION) +EDEDUP_SPARK_VERSION=$(EDEDUP_PYTHON_VERSION) + diff --git a/transforms/universal/fdedup/kfp_ray/Makefile b/transforms/universal/fdedup/kfp_ray/Makefile index f6b215984..55f7851f6 100644 --- a/transforms/universal/fdedup/kfp_ray/Makefile +++ b/transforms/universal/fdedup/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index 15173ba00..f5f06c3c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -1,14 +1,24 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=fdedup +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} + venv:: .transforms.ray-venv test:: .transforms.ray-test diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config new file mode 100644 index 000000000..774716e15 --- /dev/null +++ b/transforms/universal/fdedup/transform.config @@ -0,0 +1,18 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=fdedup + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+FDEDUP_RAY_VERSION=$(DPK_VERSION) + diff --git a/transforms/universal/filter/kfp_ray/Makefile b/transforms/universal/filter/kfp_ray/Makefile index bd26792be..c48298d22 100644 --- a/transforms/universal/filter/kfp_ray/Makefile +++ b/transforms/universal/filter/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/universal/filter/python/Makefile b/transforms/universal/filter/python/Makefile index 1ea1151ce..9a01deea1 100644 --- a/transforms/universal/filter/python/Makefile +++ b/transforms/universal/filter/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=filter - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/filter/ray/Makefile b/transforms/universal/filter/ray/Makefile index 5960a9670..0c0af0004 100644 --- a/transforms/universal/filter/ray/Makefile +++ b/transforms/universal/filter/ray/Makefile @@ -1,13 +1,21 @@ - # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=filter +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/filter/spark/Makefile b/transforms/universal/filter/spark/Makefile index 329da35a2..72bc78a15 100644 --- a/transforms/universal/filter/spark/Makefile +++ b/transforms/universal/filter/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -# This is included in the image name, if defined -TRANSFORM_NAME=filter +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/filter/transform.config b/transforms/universal/filter/transform.config new file mode 100644 index 000000000..70f2ada5b --- /dev/null +++ b/transforms/universal/filter/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=filter + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+FILTER_PYTHON_VERSION=$(DPK_VERSION) +FILTER_RAY_VERSION=$(FILTER_PYTHON_VERSION) +FILTER_SPARK_VERSION=$(FILTER_PYTHON_VERSION) + diff --git a/transforms/universal/hap/python/Makefile b/transforms/universal/hap/python/Makefile index c7c15dba7..2363e51c2 100644 --- a/transforms/universal/hap/python/Makefile +++ b/transforms/universal/hap/python/Makefile @@ -1,15 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -#TRANSFORM_RUNTIME_SRC_FILE=hap_transform_python.py -TRANSFORM_NAME=hap - -HAP_PYTHON_VERSION= $(DPK_VERSION) +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/universal/hap/transform.config b/transforms/universal/hap/transform.config new file mode 100644 index 000000000..6aa7018b3 --- /dev/null +++ b/transforms/universal/hap/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=hap + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +HAP_PYTHON_VERSION=$(DPK_VERSION) +HAP_RAY_VERSION=$(HAP_PYTHON_VERSION) +HAP_SPARK_VERSION=$(HAP_PYTHON_VERSION) + diff --git a/transforms/universal/noop/kfp_ray/Makefile b/transforms/universal/noop/kfp_ray/Makefile index d1198e5a2..fc541f367 100644 --- a/transforms/universal/noop/kfp_ray/Makefile +++ b/transforms/universal/noop/kfp_ray/Makefile @@ -1,7 +1,11 @@ REPOROOT=${CURDIR}/../../../../ + WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/universal/noop/python/Makefile b/transforms/universal/noop/python/Makefile index 80797bcc9..5e6121b04 100644 --- a/transforms/universal/noop/python/Makefile +++ b/transforms/universal/noop/python/Makefile @@ -1,15 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=noop - +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/universal/noop/ray/Makefile b/transforms/universal/noop/ray/Makefile index 0b70f6662..ad7ff3320 100644 --- a/transforms/universal/noop/ray/Makefile +++ b/transforms/universal/noop/ray/Makefile @@ -1,15 +1,24 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=noop +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} + venv:: .transforms.ray-venv test:: .transforms.ray-test diff --git a/transforms/universal/noop/spark/Makefile b/transforms/universal/noop/spark/Makefile index 726fd9e6a..ebc72992e 100644 --- a/transforms/universal/noop/spark/Makefile +++ b/transforms/universal/noop/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=noop +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/noop/transform.config b/transforms/universal/noop/transform.config new file mode 100644 index 000000000..49c9b2cbf --- /dev/null +++ b/transforms/universal/noop/transform.config @@ -0,0 +1,21 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=noop + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+NOOP_PYTHON_VERSION=$(DPK_VERSION) +NOOP_RAY_VERSION=$(NOOP_PYTHON_VERSION) +NOOP_SPARK_VERSION=$(NOOP_PYTHON_VERSION) + diff --git a/transforms/universal/profiler/kfp_ray/Makefile b/transforms/universal/profiler/kfp_ray/Makefile index 2fbd17653..e4f6b860b 100644 --- a/transforms/universal/profiler/kfp_ray/Makefile +++ b/transforms/universal/profiler/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') diff --git a/transforms/universal/profiler/python/Makefile b/transforms/universal/profiler/python/Makefile index 61c807a23..983250184 100644 --- a/transforms/universal/profiler/python/Makefile +++ b/transforms/universal/profiler/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=profiler - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/profiler/ray/Makefile b/transforms/universal/profiler/ray/Makefile index 8cec28968..12d75c4c3 100644 --- a/transforms/universal/profiler/ray/Makefile +++ b/transforms/universal/profiler/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=profiler +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/profiler/spark/Makefile b/transforms/universal/profiler/spark/Makefile index cb90b4020..39b16cac6 100644 --- a/transforms/universal/profiler/spark/Makefile +++ b/transforms/universal/profiler/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=profiler +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/profiler/transform.config b/transforms/universal/profiler/transform.config new file mode 100644 index 000000000..c86cd6415 --- /dev/null +++ b/transforms/universal/profiler/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=profiler + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+PROFILER_PYTHON_VERSION=$(DPK_VERSION) +PROFILER_RAY_VERSION=$(PROFILER_PYTHON_VERSION) +PROFILER_SPARK_VERSION=$(PROFILER_PYTHON_VERSION) + diff --git a/transforms/universal/resize/kfp_ray/Makefile b/transforms/universal/resize/kfp_ray/Makefile index a0e2faf37..8c7e592af 100644 --- a/transforms/universal/resize/kfp_ray/Makefile +++ b/transforms/universal/resize/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/resize/python/Makefile b/transforms/universal/resize/python/Makefile index 7de0032e3..66453c846 100644 --- a/transforms/universal/resize/python/Makefile +++ b/transforms/universal/resize/python/Makefile @@ -1,16 +1,22 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -TRANSFORM_NAME=resize - include $(REPOROOT)/transforms/.make.transforms +# Include the common configuration for this transform +include ../transform.config + venv:: .transforms.python-venv test:: .transforms.python-test diff --git a/transforms/universal/resize/ray/Makefile b/transforms/universal/resize/ray/Makefile index 1a2f2496f..dd229b3f4 100644 --- a/transforms/universal/resize/ray/Makefile +++ b/transforms/universal/resize/ray/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit==0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. - +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=resize +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/resize/spark/Makefile b/transforms/universal/resize/spark/Makefile index f02e9db3f..18d72d31d 100644 --- a/transforms/universal/resize/spark/Makefile +++ b/transforms/universal/resize/spark/Makefile @@ -1,13 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). 
This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit==0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. - include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=resize +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.spark-venv diff --git a/transforms/universal/resize/transform.config b/transforms/universal/resize/transform.config new file mode 100644 index 000000000..4b7171a4e --- /dev/null +++ b/transforms/universal/resize/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=resize + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+RESIZE_PYTHON_VERSION=$(DPK_VERSION) +RESIZE_RAY_VERSION=$(RESIZE_PYTHON_VERSION) +RESIZE_SPARK_VERSION=$(RESIZE_PYTHON_VERSION) + diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile index 09656297a..c43105ff1 100644 --- a/transforms/universal/tokenization/kfp_ray/Makefile +++ b/transforms/universal/tokenization/kfp_ray/Makefile @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows +# Include the common configuration for this transform +include ../transform.config + SRC_DIR=${CURDIR}/../ray/ PYTHON_WF := $(shell find ./ -name '*_wf.py') @@ -48,4 +51,4 @@ workflow-test: workflow-build workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done \ No newline at end of file + done diff --git a/transforms/universal/tokenization/python/Makefile b/transforms/universal/tokenization/python/Makefile index d23661983..8f4f7fbf5 100644 --- a/transforms/universal/tokenization/python/Makefile +++ b/transforms/universal/tokenization/python/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit==0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=tokenization +# Include the common configuration for this transform +include ../transform.config venv:: .transforms.python-venv diff --git a/transforms/universal/tokenization/ray/Makefile b/transforms/universal/tokenization/ray/Makefile index 3d5a46d09..0a4e3a370 100644 --- a/transforms/universal/tokenization/ray/Makefile +++ b/transforms/universal/tokenization/ray/Makefile @@ -1,12 +1,21 @@ # Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit==0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free # to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -TRANSFORM_NAME=tokenization +# Include the common configuration for this transform +include ../transform.config BASE_IMAGE=${RAY_BASE_IMAGE} venv:: .transforms.ray-venv diff --git a/transforms/universal/tokenization/transform.config b/transforms/universal/tokenization/transform.config new file mode 100644 index 000000000..04f517d42 --- /dev/null +++ b/transforms/universal/tokenization/transform.config @@ -0,0 +1,20 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. 
+TRANSFORM_NAME=tokenization + +################################################################################ +# This defines the transforms' version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION) +TOKENIZATION_RAY_VERSION=$(TOKENIZATION_PYTHON_VERSION) +TOKENIZATION_SPARK_VERSION=$(TOKENIZATION_PYTHON_VERSION) +