Fuzzy dedup #699

Status: Open. Kibnelson wants to merge 86 commits into dev (the base branch) from fuzzy-dedup.
+5,638 −2,003

Commits (86):
47f4526 added folder_transform (blublinsky)
5fd20a1 added folder_transform (blublinsky)
38b4725 added folder_transform (blublinsky)
a3abf21 added folder_transform (blublinsky)
d93a06c Merge branch 'folder_transform' into fuzzy-dedup (cmadam)
af8475d Fuzzy dedup pure python implementation (cmadam)
7f9b503 Fuzzy dedup spark implementation (cmadam)
3349521 added folder_transform (blublinsky)
0553edf added folder_transform (blublinsky)
a53412e added folder_transform (blublinsky)
9c3ace7 added folder_transform (blublinsky)
7091a2e added noop testing (blublinsky)
680c78a Fuzzy dedup ray implementation (Kibnelson)
0c31dc0 Fixed bug in ray to distribute docs to remove file to all workers (Kibnelson)
47d8fdf Merge with updated folder_transform branch (cmadam)
6ee6695 added folder_transform (blublinsky)
e7260ba added folder_transform (blublinsky)
5856f3f added folder_transform (blublinsky)
6519686 added folder_transform (blublinsky)
c728224 added noop testing (blublinsky)
6e2863a added noop Ray testing (blublinsky)
3c9be57 added noop Spark testing (blublinsky)
371a712 more data access simplifications (blublinsky)
680f313 Renamed/refactored fuzzy dedup python orchestrator (cmadam)
c29d3bf Rewrote cluster_analysis_transform as a folder_transform (cmadam)
aada59e Wrote get_duplicate_list_transform as a folder_transform (cmadam)
2019d56 Added text preprocessing (cmadam)
9362803 Added python test data (cmadam)
ddbd602 Added project admin tools (cmadam)
4dac838 Bug fix (cmadam)
fbc2b58 Add op modes for data cleaning: filter (non)dupl and annotate (cmadam)
828ec41 Python and spark transforms for cluster analysis (cmadam)
a20fe76 Merge folder_transform (cmadam)
bc6b81c Sync spark Makefile with dpk (cmadam)
4d486d3 Spark orchestration for fuzzy dedup (cmadam)
19e0844 Bug fix (cmadam)
2ce3d8c Added spark test data (cmadam)
5e4022c Setting input test data for ray (cmadam)
c14bdaa Bug fix (cmadam)
1215ac5 Ray orchestration for fuzzy dedup (cmadam)
5966972 Merge with the latest dev branch (cmadam)
caf79a3 Added python test with expected data files (Kibnelson)
8fd9676 Added python tests and expected outputs for the tests (Kibnelson)
d07a23a Update versions in pyproject.toml (cmadam)
ec2168c Updated ray test data (cmadam)
fd0f52c Updated ray tests (cmadam)
954dffd Spark test data and tests (cmadam)
77d85fd Adjust to file naming changes (cmadam)
310d813 Create python Dockerfile (cmadam)
7d97cef Ray bug fixes (cmadam)
87902ac Fix spark image to support testing (cmadam)
c847924 Removed file copy utils (cmadam)
ba9b07c Add fdedup to kfp black list until we get kfp integration (cmadam)
f187948 Freeze polars version to 1.9.0 for now (cmadam)
84b9104 Fixed duplicate_list_location bug (cmadam)
08ff006 Allow input of s3 credentials on command line (cmadam)
d0c6f8a Added license (cmadam)
63e11eb Use str2bool for use_s3 argument (cmadam)
bf550fd Add overwrite output path argument (cmadam)
272be36 Add separate data access objects for reading and writing files (cmadam)
ee411e1 Define 2 data access objects for data and duplicate list (cmadam)
3a30501 get fdedeup/python test-image to pass, and clean up req in ray version (daw3rd)
80ae8df Added an option to run either word or char shingle (Kibnelson)
c531809 Use captured_arg_keys to list the arguments of each transform (cmadam)
fe43110 Ray implementation for get_duplicate_list_transform (cmadam)
82a1860 Bug fix: jaccard threshold type must be float (cmadam)
61ed40f Get fuzzy dedup ray image ready for kfp (cmadam)
a8ede00 kfp implementation for fuzzy dedup (cmadam)
524236d Merge word/char shingles (cmadam)
96edea4 Added params to captured_arg_keys (cmadam)
24163af Add shingle type option (word or char) to kfp (cmadam)
3a43c3d Utility to calculate number of bands and length of a band (cmadam)
83c05f9 Merge branch 'dev' into fuzzy-dedup (cmadam)
2f61be7 Set correct version for pyproject (cmadam)
cd5eb05 Change the name of the utils Makefile (cmadam)
6cc18cd Copy whl file to the context folder (cmadam)
9f33620 Use keyword args in compute_common_params (cmadam)
528457c Use dynamic dependencies (cmadam)
fffb630 Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 (cmadam)
5547d7f Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 (cmadam)
09e56e0 Remove pyproject.toml dependencies (cmadam)
d3eac50 Fix bug in number of actors calculation (cmadam)
fa5959b Cleanup main entry point and local implementation of python transforms (cmadam)
c4f889b Cleanup main entry point and local implementation of ray transforms (cmadam)
f3c5be0 Cleanup main entry point and local implementation of spark transforms (cmadam)
4941d5b Cleanup main entry point and local implementation of spark transforms (cmadam)
Files changed:

transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
442 changes: 262 additions & 180 deletions (large diff, not rendered by default)
New file, +1 line (file path not shown in the rendered diff):

```text
venv/
```
New file, +44 lines (the python transform Dockerfile; file path not shown in the rendered diff):

```dockerfile
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest
ARG DPK_WHEEL_FILE_NAME

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root README.md README.md
COPY --chown=dpk:root requirements.txt requirements.txt

RUN pip install --no-cache-dir -e .

# copy source data
COPY src/ src/

# copy source data
COPY ./src/fdedup_transform_python.py fdedup_transform_python.py
COPY ./src/fdedup_transform_python.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
```
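For orientation (an inference from the comments above and commit 6cc18cd, not something stated in the diff): the `make image` target is expected to stage the `data-processing-dist` wheel into the docker context before the build, with `DPK_WHEEL_FILE_NAME` passed in as a build argument; the manual equivalent would be something like `docker build --build-arg DPK_WHEEL_FILE_NAME=<wheel> -t <tag> .`, where the wheel name and tag are placeholders.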
New file, +64 lines (the python transform Makefile; file path not shown in the rendered diff):

```makefile
# Define the root of the local git clone for the common rules to be able
# know where they are running from.
REPOROOT=../../../..

# Set this, before including .make.defaults, to
#   1 if requirements reference the latest code in the data processing library
#     in this repo (that is not yet published to pypi). This is the default setting.
#   0 if the transforms DPK dependencies are on wheels published to
#     pypi (e.g. data-prep-toolkit=0.2.1)
#USE_REPO_LIB_SRC=1

# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

# Include the common configuration for this transform
include ../transform.config

venv:: .transforms.python-venv

test:: .transforms.python-test

clean:: .transforms.clean

image:: .transforms.python-image

test-src:: .transforms.test-src

setup:: .transforms.setup

build:: build-dist image

publish: publish-image

publish-image:: .transforms.publish-image-python

setup:: .transforms.setup

# distribution versions is the same as image version.
set-versions:
	$(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions

build-dist:: .defaults.build-dist

publish-dist:: .defaults.publish-dist

test-image:: .transforms.python-test-image

run-cli-sample: .transforms.run-cli-python-sample

run-local-sample: .transforms.run-local-sample

run-local-python-sample: .transforms.run-local-python-sample

#run-s3-ray-sample: .transforms.run-s3-ray-sample

minio-start: .minio-start

kind-load-image:: .transforms.kind-load-image

docker-load-image: .defaults.docker-load-image

docker-save-image: .defaults.docker-save-image
```
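Nearly every target here delegates to the shared rules in `transforms/.make.transforms`, so this Makefile mostly just binds the transform to the repo-wide conventions. Based on those conventions (a reading of the included rules, not something stated in this diff), the typical local workflow would be `make venv`, `make test`, `make image`, and `make test-image`.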
New file, +11 lines (README.md):

```markdown
# Fuzzy Dedup

Please see the set of
[transform project conventions](../../../README.md)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary

The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see
[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details.
```

Review comment on this file: "Forgive me if this is a duplicate comment as I thought I had submitted once already, but..."
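To make the MinHash/LSH connection concrete: signatures are split into b bands of r minhashes each, and two documents become duplicate candidates when they agree on all r values in at least one band. For a pair with Jaccard similarity s this happens with probability 1 - (1 - s^r)^b, which is what makes the choice of b and r a tunable similarity threshold (the PR adds a utility for exactly this in commit 3a43c3d). The sketch below is illustrative only and is not the repo's utility; the values b=14 and r=8 are assumptions borrowed from the `cluster_num_bands` parameter in the local sample further down.

```python
# Illustrative sketch (not the repo's band-calculation utility): how the
# number of bands (b) and the band length (r) shape LSH candidate selection.
def candidate_probability(s: float, b: int, r: int) -> float:
    # Two docs with Jaccard similarity s agree on one r-row band with
    # probability s**r, so they share at least one of b bands with
    # probability 1 - (1 - s**r)**b.
    return 1.0 - (1.0 - s**r) ** b

# Assumed example values: 14 bands of 8 rows. Pairs well above the 0.7
# Jaccard threshold are almost always candidates; pairs well below it
# rarely are.
for s in (0.5, 0.7, 0.9):
    print(f"s={s}: P(candidate) = {candidate_probability(s, b=14, r=8):.3f}")
```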
Modified file, pyproject.toml (repurposed from the Spark toolkit template; the scrape dropped the diff markers, so the +/− assignments below are reconstructed from context):

```diff
@@ -1,31 +1,21 @@
 [project]
-name = "data_prep_toolkit_spark"
+name = "dpk_fdedup_transform_python"
 version = "0.2.2.dev2"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 requires-python = ">=3.10,<3.13"
-description = "Data Preparation Toolkit Library for Spark"
+description = "Fuzzy Dedup Transform for Python"
 license = {text = "Apache-2.0"}
 readme = {file = "README.md", content-type = "text/markdown"}
 authors = [
-    { name = "David Wood", email = "[email protected]" },
-    { name = "Boris Lublinsky", email = "blublinsk@ibm.com" },
+    { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" },
+    { name = "Constantin Adam", email = "cmadam@us.ibm.com" },
 ]
-dependencies = [
-    "data-prep-toolkit==0.2.2.dev2",
-    "pyspark>=3.5.2",
-    "psutil>=6.0.0",
-    "PyYAML>=6.0.2"
-]
-
-[project_urls]
-Repository = "https://github.com/IBM/data-prep-kit"
-Issues = "https://github.com/IBM/data-prep-kit/issues"
-Documentation = "https://ibm.github.io/data-prep-kit/"
-"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop"
+dynamic = ["dependencies"]

 [build-system]
 requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
 build-backend = "setuptools.build_meta"
+[tool.setuptools.dynamic]
+dependencies = {file = ["requirements.txt"]}

 [project.optional-dependencies]
 dev = [

@@ -44,7 +34,7 @@ dev = [
 package_dir = ["src","test"]

 [options.packages.find]
-where = ["src/data_processing_spark"]
+where = ["src/"]

 [tool.pytest.ini_options]
 # Currently we use low coverage since we have to run tests separately (see makefile)
```
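The switch to `dynamic = ["dependencies"]` together with the `[tool.setuptools.dynamic]` table (commit 528457c, "Use dynamic dependencies") is standard setuptools dynamic-metadata usage: the dependency list is read from `requirements.txt` at build time rather than being duplicated inside `pyproject.toml`.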
New file, +10 lines (requirements.txt):

```text
data-prep-toolkit==0.2.2.dev2
pyyaml>=6.0.2
boto3>=1.34.69
kubernetes>=30.1.0
polars==1.9.0
disjoint-set>=0.8.0
scipy>=1.14.1, <2.0.0
numpy<1.29.0
sentencepiece>=0.2.0
mmh3>=4.1.0
```
New file, +112 lines (the Murmur_MH MinHash utility; file path not shown in the rendered diff):

```python
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################


import logging
import os
from typing import List, Set

import mmh3
import numpy as np


class Murmur_MH:
    def __init__(self, num_perm=64, seed=42, hashfunc=None):
        self.seed = seed
        self.num_perm = num_perm  # the number of buckets, i.e. the vector length after self.minhash() call
        self.permutations = self._init_permutations(seed, num_perm)

    def _init_permutations(self, seed, num_perm):
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        max_int = np.uint64((1 << 64) - 1)
        # initialize pseudo random number generator with given seed value
        gen = np.random.RandomState(seed)
        # get self.num_perm pseudo random numbers between 2 and max_int (excl)
        permutations = np.array(
            [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)],
            dtype=np.uint64,
        ).T
        # make all even pseudo random numbers odd by adding 1
        permutations[permutations % 2 == 0] += 1
        return permutations

    def minhash(self, shingles: List[str]):
        """return np.array of minhash"""
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
        return (
            np.right_shift(
                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
                32,
            )
            .astype(np.uint32)
            .min(axis=0)
        )

    def minhash2(self, shingles: List[str], doc_len: int):
        """
        for each shingle (i.e. a group of k-words) it generates a digest value based on
        mmh3-hash function (32-bit)

        return tuple (A, B)
        A = an array of values = np.array of minhash
        B = document_length = number of characters"""
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
        return (
            np.right_shift(
                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
                32,
            )
            .astype(np.uint32)
            .min(axis=0),
            doc_len,
        )

    def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int):
        """
        for each shingle (i.e. a group of k-words) it generates a digest value based on
        mmh3-hash function (32-bit)

        return tuple (A, B)
        A = an array of values = np.array of minhash
        B = document_length = number of characters"""
        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
        return (
            np.right_shift(
                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
                32,
            )
            .astype(np.uint32)
            .min(axis=0)
            .tolist(),
            doc_len,
            doc_id,
        )

    @staticmethod
    def jaccard(mh1: np.array, mh2: np.array) -> float:
        """
        The Jaccard similarity measures the similarity between two sets of data
        to see which members are shared and distinct.

        The Jaccard similarity is calculated by dividing the number of observations
        in both sets by the number of observations in either set.

        Developed by Paul Jaccard, the index ranges from 0 to 1.
        The closer to 1, the more similar the two sets of data.

        As a document is represented by a set, we use Jaccard similarity to see
        how similar two documents are.
        """
        assert len(mh1) == len(mh2)
        return np.count_nonzero(mh1 == mh2) / len(mh1)
```
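A minimal usage sketch of the class above. The shingling helper and the sample strings are made up for illustration (the transform's real preprocessing lives elsewhere in this PR, and supports both word and char shingles per commit 80ae8df); only `Murmur_MH` itself comes from the file shown.

```python
from typing import List

# Hypothetical helper for the example: overlapping k-word shingles.
def word_shingles(text: str, k: int = 3) -> List[str]:
    words = text.split()
    return [" ".join(words[i : i + k]) for i in range(len(words) - k + 1)]

mh = Murmur_MH(num_perm=64, seed=42)
sig1 = mh.minhash(word_shingles("the quick brown fox jumps over the lazy dog"))
sig2 = mh.minhash(word_shingles("the quick brown fox leaps over the lazy dog"))

# The fraction of matching signature slots estimates the Jaccard
# similarity of the two shingle sets.
print(Murmur_MH.jaccard(sig1, sig2))
```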
transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
50 changes: 50 additions & 0 deletions

```python
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import os
import sys

from cluster_analysis_transform_python import (
    ClusterAnalysisPythonTransformConfiguration,
)
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils


# create parameters
input_folder = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands")
)
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove"))
local_conf = {
    "input_folder": input_folder,
    "output_folder": output_folder,
}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
params = {
    # Data access. Only required parameters are specified
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    # execution info
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
    "cluster_num_bands": 14,
    "cluster_num_segments": 2,
    "cluster_jaccard_similarity_threshold": 0.7,
}

if __name__ == "__main__":
    # Set the simulated command line args
    sys.argv = ParamsUtils.dict_to_req(d=params)
    # create launcher
    launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration())
    # Launch python to process the input
    launcher.launch()
```
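A note on how this sample runs (inferred from the code above, not stated in the diff): `ParamsUtils.dict_to_req` converts the `params` dict into simulated command-line arguments before the launcher parses them, so running this script should be equivalent to invoking the transform with flags derived from those keys, such as `--cluster_num_bands`, `--cluster_num_segments`, and `--cluster_jaccard_similarity_threshold`.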
Review comment: "Some more words here to provide a gentle introduction would be nice. In addition, you need to describe all of the configuration keys. See doc_chunk for a template."

Reply: "I am still working on the documentation."