Skip to content

Commit

Permalink
Merge branch 'dev' into fuzzy-dedup-modifications
Browse files Browse the repository at this point in the history
  • Loading branch information
cmadam committed Oct 9, 2024
2 parents cf70213 + efc1162 commit 36e5894
Show file tree
Hide file tree
Showing 97 changed files with 1,071 additions and 217 deletions.
10 changes: 8 additions & 2 deletions .make.defaults
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,10 @@ __check_defined = \
cp -p -R ${LIB_PATH}/src ${LIB_NAME}
cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME}
cp -p -R ${LIB_PATH}/README.md ${LIB_NAME}
if [ -e ${LIB_PATH}/requirements.txt ]; then \
cp -p ${LIB_PATH}/requirements.txt ${LIB_NAME}; \
fi


# Build an image using the local Dockerfile and make the data-processing-lib/python
# available in the current directory for use by the Dockerfile (i.e. to install the library).
Expand Down Expand Up @@ -591,8 +595,9 @@ MINIO_ADMIN_PWD= localminiosecretkey
# Updates the version references to our repo source as defined in .make.versions
.PHONY: .defaults.__update-toml-lib-dep-versions
.defaults.__update-toml-lib-dep-versions:
ifeq ($(USE_REPO_LIB_SRC), 1)
@# Help: Update pyproject.toml to depend on lib versions defined in .make.versions
@if [ -e pyproject.toml ]; then \
if [ -e pyproject.toml ]; then \
cat pyproject.toml | sed \
-e 's/"data-prep-toolkit-ray\([=><~][=]\).*"/"data-prep-toolkit-ray\1$(DPK_LIB_VERSION)"/' \
-e 's/"data-prep-toolkit-spark\([=><~][=]\).*"/"data-prep-toolkit-spark\1$(DPK_LIB_VERSION)"/' \
Expand All @@ -603,7 +608,7 @@ MINIO_ADMIN_PWD= localminiosecretkey
> tt.toml; \
mv tt.toml pyproject.toml; \
fi
@if [ -e requirements.txt ]; then \
if [ -e requirements.txt ]; then \
cat requirements.txt | sed \
-e 's/data-prep-toolkit-ray\([=><~][=]\).*/data-prep-toolkit-ray\1$(DPK_LIB_VERSION)/' \
-e 's/data-prep-toolkit-transforms\([=><~][=]\).*/data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)/' \
Expand All @@ -615,6 +620,7 @@ MINIO_ADMIN_PWD= localminiosecretkey
> tt.txt; \
mv tt.txt requirements.txt; \
fi
endif

# Build the distribution, usually in preparation for publishing using the .defaults.publish-dist target
.PHONY: .defaults.build-dist
Expand Down
76 changes: 2 additions & 74 deletions .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_

# publish docker images with latest tag
ifeq ($(DPK_VERSION_SUFFIX), )
DOCKER_IMAGE_VERSION=$(DPK_VERSION)
DOCKER_IMAGE_VERSION?=$(DPK_VERSION)
else
DOCKER_IMAGE_VERSION=latest
DOCKER_IMAGE_VERSION?=latest
endif

# Data prep lab wheel version
Expand All @@ -39,78 +39,6 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION)
KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION)
KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION)

# Begin transform versions/tags
BLOCKLIST_VERSION=$(DPK_VERSION)

DOC_ID_PYTHON_VERSION=$(DPK_VERSION)
DOC_ID_RAY_VERSION=$(DPK_VERSION)
DOC_ID_SPARK_VERSION=$(DPK_VERSION)

EDEDUP_PYTHON_VERSION=$(DPK_VERSION)
EDEDUP_RAY_VERSION=$(DPK_VERSION)

FDEDUP_RAY_VERSION=$(DPK_VERSION)

FILTER_PYTHON_VERSION=$(DPK_VERSION)
FILTER_RAY_VERSION=$(DPK_VERSION)
FILTER_SPARK_VERSION=$(DPK_VERSION)

NOOP_PYTHON_VERSION=$(DPK_VERSION)
NOOP_RAY_VERSION=$(DPK_VERSION)
NOOP_SPARK_VERSION=$(DPK_VERSION)

PROFILER_PYTHON_VERSION=$(DPK_VERSION)
PROFILER_RAY_VERSION=$(DPK_VERSION)
PROFILER_SPARK_VERSION=$(DPK_VERSION)

RESIZE_PYTHON_VERSION=$(DPK_VERSION)
RESIZE_RAY_VERSION=$(DPK_VERSION)
RESIZE_SPARK_VERSION=$(DPK_VERSION)

LANG_ID_PYTHON_VERSION=$(DPK_VERSION)
LANG_ID_RAY_VERSION=$(DPK_VERSION)

TOKENIZATION_RAY_VERSION=$(DPK_VERSION)
TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION)

MALWARE_RAY_VERSION=$(DPK_VERSION)
MALWARE_PYTHON_VERSION=$(DPK_VERSION)

PROGLANG_SELECT_PYTHON_VERSION=$(DPK_VERSION)
PROGLANG_SELECT_RAY_VERSION=$(DPK_VERSION)

DOC_QUALITY_PYTHON_VERSION=$(DPK_VERSION)
DOC_QUALITY_RAY_VERSION=$(DPK_VERSION)

CODE_QUALITY_RAY_VERSION=$(DPK_VERSION)
CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION)

CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
CODE2PARQUET_RAY_VERSION=$(DPK_VERSION)
INGEST_TO_PARQUET_VERSION=$(DPK_VERSION)
REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION)

PDF2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
PDF2PARQUET_RAY_VERSION=$(DPK_VERSION)

DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION)
DOC_CHUNK_RAY_VERSION=$(DPK_VERSION)

TEXT_ENCODER_PYTHON_VERSION=$(DPK_VERSION)
TEXT_ENCODER_RAY_VERSION=$(DPK_VERSION)

HEADER_CLEANSER_PYTHON_VERSION=$(DPK_VERSION)
HEADER_CLEANSER_RAY_VERSION=$(DPK_VERSION)

LICENSE_SELECT_PYTHON_VERSION=$(DPK_VERSION)
LICENSE_SELECT_RAY_VERSION=$(DPK_VERSION)

PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION)

HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION)

DPK_TRANSFORMS_VERSION=$(DPK_VERSION)

################## ################## ################## ################## ################## ##################
# Begin versions that the repo depends on.

Expand Down
3 changes: 3 additions & 0 deletions transforms/code/code2parquet/kfp_ray/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows

# Include the common configuration for this transform
include ../transform.config

SRC_DIR=${CURDIR}/../ray/

PYTHON_WF := $(shell find ./ -name '*_wf.py')
Expand Down
16 changes: 11 additions & 5 deletions transforms/code/code2parquet/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit==0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.

# $(REPOROOT)/.make.versions file contains the versions

TRANSFORM_NAME=code2parquet

include $(REPOROOT)/transforms/.make.transforms

# Include the common configuration for this transform
include ../transform.config

venv:: .transforms.python-venv

test:: .transforms.python-test
Expand Down
11 changes: 10 additions & 1 deletion transforms/code/code2parquet/ray/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit==0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code2parquet
# Include the common configuration for this transform
include ../transform.config

BASE_IMAGE=${RAY_BASE_IMAGE}
venv:: .transforms.ray-venv
Expand Down
20 changes: 20 additions & 0 deletions transforms/code/code2parquet/transform.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# Shared configuration for this transform. It is intended to be included by
# the Makefiles provided within the transform's directory tree, so it must
# only use syntax that is compatible with all of them.
#
################################################################################
# This defines the name of the transform. It is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=code2parquet

################################################################################
# This defines the transform's version numbers as would be used
# when publishing the wheel. In general, only the micro version
# number should be advanced relative to the DPK_VERSION.
#
# DPK_VERSION is not set in this file; it must be defined by whatever
# includes it. The RAY and SPARK versions follow the PYTHON version unless
# they are assigned a different value here.
#
# If you change the version numbers, be sure to run "make set-versions" to
# update version numbers across the transform (e.g., pyproject.toml).
CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
CODE2PARQUET_RAY_VERSION=$(CODE2PARQUET_PYTHON_VERSION)
CODE2PARQUET_SPARK_VERSION=$(CODE2PARQUET_PYTHON_VERSION)

5 changes: 4 additions & 1 deletion transforms/code/code_quality/kfp_ray/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows

# Include the common configuration for this transform
include ../transform.config

SRC_DIR=${CURDIR}/../ray/

PYTHON_WF := $(shell find ./ -name '*_wf.py')
Expand Down Expand Up @@ -48,4 +51,4 @@ workflow-test: workflow-build
workflow-upload: workflow-build
@for file in $(YAML_WF); do \
$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \
done
done
17 changes: 14 additions & 3 deletions transforms/code/code_quality/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@

# Define the root of the local git clone for the common rules to be able
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit==0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code_quality
# Include the common configuration for this transform
include ../transform.config

# Use default rule inherited from makefile.common
clean:: .transforms.clean
Expand Down
17 changes: 14 additions & 3 deletions transforms/code/code_quality/ray/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@

# Define the root of the local git clone for the common rules to be able
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit==0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code_quality
# Include the common configuration for this transform
include ../transform.config

BASE_IMAGE=${RAY_BASE_IMAGE}

Expand Down
20 changes: 20 additions & 0 deletions transforms/code/code_quality/transform.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# Shared configuration for this transform. It is intended to be included by
# the Makefiles provided within the transform's directory tree, so it must
# only use syntax that is compatible with all of them.
#
################################################################################
# This defines the name of the transform. It is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=code_quality

################################################################################
# This defines the transform's version numbers as would be used
# when publishing the wheel. In general, only the micro version
# number should be advanced relative to the DPK_VERSION.
#
# DPK_VERSION is not set in this file; it must be defined by whatever
# includes it. The RAY and SPARK versions follow the PYTHON version unless
# they are assigned a different value here.
#
# If you change the version numbers, be sure to run "make set-versions" to
# update version numbers across the transform (e.g., pyproject.toml).
CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION)
CODE_QUALITY_RAY_VERSION=$(CODE_QUALITY_PYTHON_VERSION)
CODE_QUALITY_SPARK_VERSION=$(CODE_QUALITY_PYTHON_VERSION)

Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows

# Include the common configuration for this transform
include ../transform.config

SRC_DIR=${CURDIR}/../ray/

PYTHON_WF := $(shell find ./ -name '*_wf.py')
Expand Down Expand Up @@ -48,4 +51,4 @@ workflow-test: workflow-build
workflow-upload: workflow-build
@for file in $(YAML_WF); do \
$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \
done
done
17 changes: 14 additions & 3 deletions transforms/code/header_cleanser/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@

# Define the root of the local git clone for the common rules to be able
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit==0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=header_cleanser
# Include the common configuration for this transform
include ../transform.config

# Use default rule inherited from makefile.common
clean:: .transforms.clean
Expand Down
4 changes: 3 additions & 1 deletion transforms/code/header_cleanser/ray/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ COPY --chown=ray:users pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# Install system dependencies, including libgomp1
# Run the package installation as root, then drop back to the ray user.
USER root
# NOTE(review): sudo is presumably redundant while running as root; it is kept
# so the install command itself is unchanged.
RUN sudo apt-get update && sudo apt-get install -y \
    libgomp1 \
    && sudo rm -rf /var/lib/apt/lists/*
# Instruction keywords are written in upper case, consistent with the rest of
# this Dockerfile (USER, RUN, COPY, ENV, ARG, LABEL).
USER ray

# copy source data
COPY ./src/header_cleanser_transform_ray.py .
Expand All @@ -36,4 +38,4 @@ ENV PYTHONPATH /home/ray
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
LABEL git-commit=$GIT_COMMIT
17 changes: 14 additions & 3 deletions transforms/code/header_cleanser/ray/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@

# Define the root of the local git clone for the common rules to be able
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit==0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=header_cleanser
# Include the common configuration for this transform
include ../transform.config

BASE_IMAGE=${RAY_BASE_IMAGE}

Expand Down
Loading

0 comments on commit 36e5894

Please sign in to comment.