Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/dev' into dev-pankaj
Browse files Browse the repository at this point in the history
Signed-off-by: Pankaj Thorat <[email protected]>
  • Loading branch information
pankajskku committed Oct 10, 2024
2 parents 73286ff + efc1162 commit ac795c2
Show file tree
Hide file tree
Showing 98 changed files with 1,094 additions and 223 deletions.
10 changes: 8 additions & 2 deletions .make.defaults
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,10 @@ __check_defined = \
cp -p -R ${LIB_PATH}/src ${LIB_NAME}
cp -p -R ${LIB_PATH}/pyproject.toml ${LIB_NAME}
cp -p -R ${LIB_PATH}/README.md ${LIB_NAME}
if [ -e ${LIB_PATH}/requirements.txt ]; then \
cp -p ${LIB_PATH}/requirements.txt ${LIB_NAME}; \
fi


# Build an image using the local Dockerfile and make the data-processing-lib/python
# available in the current directory for use by the Dockerfile (i.e. to install the library).
Expand Down Expand Up @@ -591,8 +595,9 @@ MINIO_ADMIN_PWD= localminiosecretkey
# Updates the versions references to our repo source as defined in .make.versions
.PHONY: .defaults.__update-toml-lib-dep-versions
.defaults.__update-toml-lib-dep-versions:
ifeq ($(USE_REPO_LIB_SRC), 1)
@# Help: Update pyproject.toml to depend on lib versions defined in .make.versions
@if [ -e pyproject.toml ]; then \
if [ -e pyproject.toml ]; then \
cat pyproject.toml | sed \
-e 's/"data-prep-toolkit-ray\([=><~][=]\).*"/"data-prep-toolkit-ray\1$(DPK_LIB_VERSION)"/' \
-e 's/"data-prep-toolkit-spark\([=><~][=]\).*"/"data-prep-toolkit-spark\1$(DPK_LIB_VERSION)"/' \
Expand All @@ -603,7 +608,7 @@ MINIO_ADMIN_PWD= localminiosecretkey
> tt.toml; \
mv tt.toml pyproject.toml; \
fi
@if [ -e requirements.txt ]; then \
if [ -e requirements.txt ]; then \
cat requirements.txt | sed \
-e 's/data-prep-toolkit-ray\([=><~][=]\).*/data-prep-toolkit-ray\1$(DPK_LIB_VERSION)/' \
-e 's/data-prep-toolkit-transforms\([=><~][=]\).*/data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)/' \
Expand All @@ -615,6 +620,7 @@ MINIO_ADMIN_PWD= localminiosecretkey
> tt.txt; \
mv tt.txt requirements.txt; \
fi
endif

# Build the distribution, usually in preparation for publishing using the .defaults.publish-dist target
.PHONY: .defaults.build-dist
Expand Down
80 changes: 2 additions & 78 deletions .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_

# publish docker images with latest tag
ifeq ($(DPK_VERSION_SUFFIX), )
DOCKER_IMAGE_VERSION=$(DPK_VERSION)
DOCKER_IMAGE_VERSION?=$(DPK_VERSION)
else
DOCKER_IMAGE_VERSION=latest
DOCKER_IMAGE_VERSION?=latest
endif

# Data prep lab wheel version
Expand All @@ -39,82 +39,6 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION)
KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION)
KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION)

# Begin transform versions/tags
BLOCKLIST_VERSION=$(DPK_VERSION)

DOC_ID_PYTHON_VERSION=$(DPK_VERSION)
DOC_ID_RAY_VERSION=$(DPK_VERSION)
DOC_ID_SPARK_VERSION=$(DPK_VERSION)

EDEDUP_PYTHON_VERSION=$(DPK_VERSION)
EDEDUP_RAY_VERSION=$(DPK_VERSION)

FDEDUP_RAY_VERSION=$(DPK_VERSION)

FILTER_PYTHON_VERSION=$(DPK_VERSION)
FILTER_RAY_VERSION=$(DPK_VERSION)
FILTER_SPARK_VERSION=$(DPK_VERSION)

NOOP_PYTHON_VERSION=$(DPK_VERSION)
NOOP_RAY_VERSION=$(DPK_VERSION)
NOOP_SPARK_VERSION=$(DPK_VERSION)

PROFILER_PYTHON_VERSION=$(DPK_VERSION)
PROFILER_RAY_VERSION=$(DPK_VERSION)
PROFILER_SPARK_VERSION=$(DPK_VERSION)

RESIZE_PYTHON_VERSION=$(DPK_VERSION)
RESIZE_RAY_VERSION=$(DPK_VERSION)
RESIZE_SPARK_VERSION=$(DPK_VERSION)

LANG_ID_PYTHON_VERSION=$(DPK_VERSION)
LANG_ID_RAY_VERSION=$(DPK_VERSION)

TOKENIZATION_RAY_VERSION=$(DPK_VERSION)
TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION)

MALWARE_RAY_VERSION=$(DPK_VERSION)
MALWARE_PYTHON_VERSION=$(DPK_VERSION)

PROGLANG_SELECT_PYTHON_VERSION=$(DPK_VERSION)
PROGLANG_SELECT_RAY_VERSION=$(DPK_VERSION)

DOC_QUALITY_PYTHON_VERSION=$(DPK_VERSION)
DOC_QUALITY_RAY_VERSION=$(DPK_VERSION)

CODE_QUALITY_RAY_VERSION=$(DPK_VERSION)
CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION)

CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
CODE2PARQUET_RAY_VERSION=$(DPK_VERSION)
INGEST_TO_PARQUET_VERSION=$(DPK_VERSION)
REPO_LVL_ORDER_RAY_VERSION=$(DPK_VERSION)

PDF2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
PDF2PARQUET_RAY_VERSION=$(DPK_VERSION)

DOC_CHUNK_PYTHON_VERSION=$(DPK_VERSION)
DOC_CHUNK_RAY_VERSION=$(DPK_VERSION)

TEXT_ENCODER_PYTHON_VERSION=$(DPK_VERSION)
TEXT_ENCODER_RAY_VERSION=$(DPK_VERSION)

HEADER_CLEANSER_PYTHON_VERSION=$(DPK_VERSION)
HEADER_CLEANSER_RAY_VERSION=$(DPK_VERSION)

LICENSE_SELECT_PYTHON_VERSION=$(DPK_VERSION)
LICENSE_SELECT_RAY_VERSION=$(DPK_VERSION)

PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION)

HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION)

DPK_TRANSFORMS_VERSION=$(DPK_VERSION)

SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION=$(DPK_VERSION)
SYNTACTIC_CONCEPT_EXTRACTOR_RAY_VERSION=$(DPK_VERSION)


################## ################## ################## ################## ################## ##################
# Begin versions that the repo depends on.

Expand Down
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ The goal is to offer high-level APIs for developers to quickly get started in wo
- [Scaling transforms from laptop to cluster](#laptop_cluster)
- [Repository Use and Navigation](doc/repo.md)
- [How to Contribute](CONTRIBUTING.md)
- [Papers and Talks](#talks_papers)
- [Talks and Papers](#talks_papers)
- [Citations](#citations)

## &#x1F4D6; About <a name = "about"></a>

Expand Down Expand Up @@ -131,7 +132,7 @@ The matrix below shows the the combination of modules and supported runtimes. Al
| **Data Ingestion** | | | | |
| [Code (from zip) to Parquet](transforms/code/code2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [PDF to Parquet](transforms/language/pdf2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [HTML to Parquet](transforms/universal/html2parquet/python/README.md) | :white_check_mark: | | | |
| [HTML to Parquet](transforms/language/html2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | |
| **Universal (Code & Language)** | | | | |
| [Exact dedup filter](transforms/universal/ededup/ray/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: |
| [Fuzzy dedup filter](transforms/universal/fdedup/ray/README.md) | | :white_check_mark: | | :white_check_mark: |
Expand Down Expand Up @@ -220,3 +221,23 @@ You can run transforms via docker image or using virtual environments. This [doc
5. Talk on "Hands on session for fine tuning LLMs" [Video](https://www.youtube.com/watch?v=VEHIA3E64DM)
6. Talk on "Build your own data preparation module using data-prep-kit" [Video](https://www.youtube.com/watch?v=0WUMG6HIgMg)

## Citations <a name = "citations"></a>

If you use Data Prep Kit in your research, please cite our paper:

```bibtex
@misc{wood2024dataprepkitgettingdataready,
title={Data-Prep-Kit: getting your data ready for LLM application development},
author={David Wood and Boris Lublinsky and Alexy Roytman and Shivdeep Singh
and Abdulhamid Adebayo and Revital Eres and Mohammad Nassar and Hima Patel
and Yousaf Shah and Constantin Adam and Petros Zerfos and Nirmit Desai
and Daiki Tsuzuku and Takuya Goto and Michele Dolfi and Saptha Surendran
and Paramesvaran Selvam and Sungeun An and Yuan Chi Chang and Dhiraj Joshi
and Hajar Emami-Gohari and Xuan-Hong Dang and Yan Koyfman and Shahrokh Daijavad},
year={2024},
eprint={2409.18164},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2409.18164},
}
```
3 changes: 3 additions & 0 deletions transforms/code/code2parquet/kfp_ray/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows

# Include the common configuration for this transform
include ../transform.config

SRC_DIR=${CURDIR}/../ray/

PYTHON_WF := $(shell find ./ -name '*_wf.py')
Expand Down
16 changes: 11 additions & 5 deletions transforms/code/code2parquet/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit=0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.

# $(REPOROOT)/.make.versions file contains the versions

TRANSFORM_NAME=code2parquet

include $(REPOROOT)/transforms/.make.transforms

# Include the common configuration for this transform
include ../transform.config

venv:: .transforms.python-venv

test:: .transforms.python-test
Expand Down
11 changes: 10 additions & 1 deletion transforms/code/code2parquet/ray/Makefile
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit=0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code2parquet
# Include the common configuration for this transform
include ../transform.config

BASE_IMAGE=${RAY_BASE_IMAGE}
venv:: .transforms.ray-venv
Expand Down
20 changes: 20 additions & 0 deletions transforms/code/code2parquet/transform.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform. It is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=code2parquet

################################################################################
# This defines the transform's version numbers as would be used
# when publishing the wheel. In general, only the micro version
# number should be advanced relative to the DPK_VERSION.
#
# The ray and spark versions below are defined in terms of the python version,
# so by default overriding the python version changes all three.
# NOTE(review): DPK_VERSION is not defined in this file; presumably it is
# provided by the including Makefile (via .make.versions) - confirm.
#
# If you change the version numbers, be sure to run "make set-versions" to
# update version numbers across the transform (e.g., pyproject.toml).
CODE2PARQUET_PYTHON_VERSION=$(DPK_VERSION)
CODE2PARQUET_RAY_VERSION=$(CODE2PARQUET_PYTHON_VERSION)
CODE2PARQUET_SPARK_VERSION=$(CODE2PARQUET_PYTHON_VERSION)

5 changes: 4 additions & 1 deletion transforms/code/code_quality/kfp_ray/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows

# Include the common configuration for this transform
include ../transform.config

SRC_DIR=${CURDIR}/../ray/

PYTHON_WF := $(shell find ./ -name '*_wf.py')
Expand Down Expand Up @@ -48,4 +51,4 @@ workflow-test: workflow-build
workflow-upload: workflow-build
@for file in $(YAML_WF); do \
$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \
done
done
17 changes: 14 additions & 3 deletions transforms/code/code_quality/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@

# Define the root of the local git clone for the common rules to be able
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit=0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code_quality
# Include the common configuration for this transform
include ../transform.config

# Use default rule inherited from makefile.common
clean:: .transforms.clean
Expand Down
17 changes: 14 additions & 3 deletions transforms/code/code_quality/ray/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@

# Define the root of the local git clone for the common rules to be able
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit=0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code_quality
# Include the common configuration for this transform
include ../transform.config

BASE_IMAGE=${RAY_BASE_IMAGE}

Expand Down
20 changes: 20 additions & 0 deletions transforms/code/code_quality/transform.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform. It is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=code_quality

################################################################################
# This defines the transform's version numbers as would be used
# when publishing the wheel. In general, only the micro version
# number should be advanced relative to the DPK_VERSION.
#
# The ray and spark versions below are defined in terms of the python version,
# so by default overriding the python version changes all three.
# NOTE(review): DPK_VERSION is not defined in this file; presumably it is
# provided by the including Makefile (via .make.versions) - confirm.
#
# If you change the version numbers, be sure to run "make set-versions" to
# update version numbers across the transform (e.g., pyproject.toml).
CODE_QUALITY_PYTHON_VERSION=$(DPK_VERSION)
CODE_QUALITY_RAY_VERSION=$(CODE_QUALITY_PYTHON_VERSION)
CODE_QUALITY_SPARK_VERSION=$(CODE_QUALITY_PYTHON_VERSION)

Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ REPOROOT=${CURDIR}/../../../../
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows

# Include the common configuration for this transform
include ../transform.config

SRC_DIR=${CURDIR}/../ray/

PYTHON_WF := $(shell find ./ -name '*_wf.py')
Expand Down Expand Up @@ -48,4 +51,4 @@ workflow-test: workflow-build
workflow-upload: workflow-build
@for file in $(YAML_WF); do \
$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \
done
done
17 changes: 14 additions & 3 deletions transforms/code/header_cleanser/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@

# Define the root of the local git clone for the common rules to be able
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
# 1 if requirements reference the latest code in the data processing library
# in this repo (that is not yet published to pypi). This is the default setting.
# 0 if the transforms DPK dependencies are on wheels published to
# pypi (e.g. data-prep-toolkit=0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=header_cleanser
# Include the common configuration for this transform
include ../transform.config

# Use default rule inherited from makefile.common
clean:: .transforms.clean
Expand Down
Loading

0 comments on commit ac795c2

Please sign in to comment.