Merge pull request #638 from ian-cho/dev

adding hap transform. Additional work on readme.md and proper default values for input parameters will be done in a separate PR
IBM · Oct 1, 2024 · 00d1fea · 00d1fea
2 parents 6e21f64 + 7284c81
commit 00d1fea
Show file tree

Hide file tree

Showing 18 changed files with 864 additions and 0 deletions.
diff --git a/.github/workflows/test-universal-hap.yml b/.github/workflows/test-universal-hap.yml
@@ -0,0 +1,124 @@
+#
+# DO NOT EDIT THIS FILE: it is generated from test-transform.template,  Edit there and run make to change these files
+#
+name: Test - transforms/universal/hap
+
+on:
+    workflow_dispatch: # allows the workflow to be started manually
+    push:
+        branches:
+            - "dev"
+            - "releases/**"
+        tags:
+            - "*" # any tag
+        paths:
+            - "transforms/universal/hap/**"
+            - "data-processing-lib/**"
+            - "!transforms/universal/hap/**/kfp_ray/**" # Excluded here: this is/will be tested in a separate workflow
+            - "!data-processing-lib/**/test/**"
+            - "!data-processing-lib/**/test-data/**"
+            - "!**.md" # patterns starting with "!" exclude matching files from triggering a run
+            - "!**/doc/**"
+            - "!**/images/**"
+            - "!**.gitignore"
+    pull_request:
+        branches:
+            - "dev"
+            - "releases/**"
+        paths:
+            - "transforms/universal/hap/**"
+            - "data-processing-lib/**"
+            - "!transforms/universal/hap/**/kfp_ray/**" # Excluded here: this is/will be tested in a separate workflow
+            - "!data-processing-lib/**/test/**"
+            - "!data-processing-lib/**/test-data/**"
+            - "!**.md" # same exclusions as for the push trigger
+            - "!**/doc/**"
+            - "!**/images/**"
+            - "!**.gitignore"
+
+jobs:
+    check_if_push_image:
+        # Decide whether the Docker images should be pushed to the remote repository.
+        # publish_images is 'true' only when running in the IBM/data-prep-kit repository and either
+        # the ref is refs/heads/dev on a non-pull_request event, or the ref is a tag (part of the release process).
+        # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
+        runs-on: ubuntu-22.04
+        outputs:
+            publish_images: ${{ steps.version.outputs.publish_images }} # 'true' or 'false'; read by the "Publish images" step of test-image
+        steps:
+            - id: version
+              run: |
+                  publish_images='false'
+                  if  [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
+                  then
+                    publish_images='true'
+                  fi
+                  if  [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
+                  then
+                    publish_images='true'
+                  fi
+                  echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
+    test-src:
+        runs-on: ubuntu-22.04
+        steps:
+            - name: Checkout
+              uses: actions/checkout@v4
+            - name: Free up space in github runner
+              # Reclaim disk space by deleting large preinstalled toolchains and all local Docker images, as suggested in https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
+              run: |
+                  df -h
+                  sudo rm -rf "/usr/local/share/boost"
+                  sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+                  sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
+                  sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
+                  df -h
+            - name: Test transform source in transforms/universal/hap # runs `make test-src`; only prints a notice when the transform has no Makefile
+              run: |
+                  if [ -e "transforms/universal/hap/Makefile" ]; then
+                      make -C transforms/universal/hap DOCKER=docker test-src
+                  else
+                      echo "transforms/universal/hap/Makefile not found - source testing disabled for this transform."
+                  fi
+    test-image:
+        # Build and test the transform's Docker image, then publish it when
+        # check_if_push_image reported publish_images == 'true'.
+        needs: [check_if_push_image]
+        runs-on: ubuntu-22.04
+        timeout-minutes: 120
+        env:
+            # Registry credentials exposed to every step of this job.
+            # NOTE(review): presumably consumed by `make publish` - confirm in the shared Makefiles.
+            DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
+            DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
+        steps:
+            - name: Checkout
+              uses: actions/checkout@v4
+            - name: Free up space in github runner
+              # Reclaim disk space by deleting large preinstalled toolchains and all local Docker images, as suggested in https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
+              run: |
+                  df -h
+                  sudo rm -rf "/usr/local/share/boost"
+                  sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+                  sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
+                  sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
+                  df -h
+            - name: Test transform image in transforms/universal/hap
+              # When the transform has a spark/ directory, the data-processing-lib/spark image is built first.
+              run: |
+                  if [ -e "transforms/universal/hap/Makefile" ]; then
+                      if [ -d "transforms/universal/hap/spark" ]; then
+                          make -C data-processing-lib/spark DOCKER=docker image
+                      fi
+                      make -C transforms/universal/hap DOCKER=docker test-image
+                  else
+                      echo "transforms/universal/hap/Makefile not found - testing disabled for this transform."
+                  fi
+            - name: Print space
+              # Report the remaining disk space and list the Docker images present after the build.
+              run: |
+                  df -h
+                  docker images
+            - name: Publish images
+              if: needs.check_if_push_image.outputs.publish_images == 'true'
+              run: |
+                  if [ -e "transforms/universal/hap/Makefile" ]; then
+                      make -C transforms/universal/hap publish
+                  else
+                      echo "transforms/universal/hap/Makefile not found - publishing disabled for this transform."
+                  fi
diff --git a/transforms/universal/hap/Makefile b/transforms/universal/hap/Makefile
@@ -0,0 +1,70 @@
+REPOROOT=../../..
+# Use make help, to see the available rules
+include $(REPOROOT)/.make.defaults
+
+# Every standard target below forwards itself to the sub-directories by
+# invoking the .recurse target with RULE set to the target name.
+# NOTE(review): .recurse is not defined in this file; presumably it comes
+# from .make.defaults - confirm.
+
+setup::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+clean::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+build::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+venv::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+image::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+set-versions:
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+publish::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test-image::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test-src::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+kind-load-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+docker-load-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+docker-save-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+# The workflow-* targets are delegated to the kfp_ray sub-directory only.
+.PHONY: workflow-venv
+workflow-venv:
+	$(MAKE) -C kfp_ray workflow-venv
+
+.PHONY: workflow-test
+workflow-test:
+	$(MAKE) -C kfp_ray workflow-test
+
+.PHONY: workflow-upload
+workflow-upload:
+	$(MAKE) -C kfp_ray workflow-upload
+
+.PHONY: workflow-build
+workflow-build:
+	$(MAKE) -C kfp_ray workflow-build
diff --git a/transforms/universal/hap/python/Dockerfile b/transforms/universal/hap/python/Dockerfile
@@ -0,0 +1,42 @@
+# Image for the Python runtime of the hap transform.
+FROM docker.io/python:3.10.14-slim-bullseye
+
+RUN pip install --upgrade --no-cache-dir pip
+
+# install pytest
+RUN pip install --no-cache-dir pytest
+
+# Create a user and use it to run the transform
+RUN useradd -ms /bin/bash dpk
+USER dpk
+WORKDIR /home/dpk
+
+# Copy and install data processing libraries
+# These are expected to be placed in the docker context before this is run (see the make image).
+COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
+RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
+
+# END OF STEPS destined for a data-prep-kit base image
+
+# Install the transform itself in editable mode from its project files.
+COPY --chown=dpk:root src/ src/
+COPY --chown=dpk:root pyproject.toml pyproject.toml
+COPY --chown=dpk:root requirements.txt requirements.txt
+RUN pip install --no-cache-dir -e .
+
+# copy transform main() entry point to the image
+COPY ./src/hap_transform_python.py .
+
+# copy some of the samples in
+COPY ./src/hap_local.py local/
+
+# copy test
+COPY test/ test/
+COPY test-data/ test-data/
+
+# Set environment
+# Use the key=value form; the whitespace-separated "ENV key value" form is legacy and discouraged by Docker.
+ENV PYTHONPATH=/home/dpk
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT
diff --git a/transforms/universal/hap/python/Makefile b/transforms/universal/hap/python/Makefile
@@ -0,0 +1,62 @@
+# Define the root of the local git clone for the common rules to be able
+# know where they are running from.
+REPOROOT=../../../..
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse.  However, feel free
+# to override/redefine the rules below.
+include $(REPOROOT)/transforms/.make.transforms
+
+#TRANSFORM_RUNTIME_SRC_FILE=hap_transform_python.py
+TRANSFORM_NAME=hap
+
+HAP_PYTHON_VERSION= $(DPK_VERSION)
+
+venv::	.transforms.python-venv
+
+# The command must live in a recipe line: written after the "::" it would be
+# parsed as a list of prerequisites (pip, install, -r, requirements.txt).
+install::
+	pip install -r requirements.txt
+
+test::	.transforms.python-test
+
+clean:: .transforms.clean
+
+image:: .transforms.python-image
+
+test-src:: .transforms.test-src
+
+setup:: .transforms.setup
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-python
+
+# distribution versions is the same as image version.
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(HAP_PYTHON_VERSION) TOML_VERSION=$(HAP_PYTHON_VERSION) .transforms.set-versions
+
+build-dist:: set-versions .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+test-image:: .transforms.python-test-image
+
+run-cli-sample: .transforms.run-cli-python-sample
+
+run-local-sample: .transforms.run-local-sample
+
+run-local-python-sample: .transforms.run-local-python-sample
+
+#run-s3-ray-sample: .transforms.run-s3-ray-sample
+
+minio-start:	.minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+load-image:: .transforms.load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/hap/python/README.md b/transforms/universal/hap/python/README.md
@@ -0,0 +1,54 @@
+# HAP Annotation
+Please see the set of [transform project conventions](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) for details on general project conventions, transform configuration, testing and IDE set up.
+
+## Prerequisite
+This transform requires NLTK; see `requirements.txt` for its dependencies.
+
+## Summary
+The hap transform maps a non-empty input table to an output table with an added `hap_score` column. Each row in the table represents a document, and the hap transform performs the following three steps to calculate the hap score for each document:
+
+* Sentence splitting: we use NLTK to split the document into sentence pieces.
+* Hap annotation: each sentence is assigned a hap score between 0 and 1, where 1 represents hap and 0 represents non-hap.
+* Aggregation: the document hap score is determined by selecting the maximum hap score among its sentences.
+
+
+## Configuration and command line Options
+The configuration values accepted by [HAPTransformConfiguration](src/hap_transform.py)
+are set with the following command line options:
+
+* --model_name_or_path - specifies the HAP model, which should be compatible with HuggingFace's `AutoModelForSequenceClassification`
+* --batch_size - modify it based on the infrastructure capacity.
+* --max_length - the maximum length for the tokenizer.
+
+
+
+## input format
+The input is in .parquet format and contains the following columns:
+
+| doc_id  |   doc_text | 
+|:------|:------|
+| 1  |    GSC is very much a little Swiss Army knife for...   |
+| 2  |    Here are only a few examples. And no, I'm not ...   |
+
+## output format
+The output is in .parquet format and contains all the input columns plus an additional `hap_score` column:
+
+| doc_id  |   doc_text | hap_score   |
+|:------|:------|:-------------|
+| 1  |    GSC is very much a little Swiss Army knife for... | 0.002463     |
+| 2  |    Here are only a few examples. And no, I'm not ... | 0.989713     |
+
+## How to run
+Place your input Parquet file in the `test-data/input/` directory. A sample file, `test1.parquet`, is available in this directory. Once done, run the script.
+
+```shell
+python hap_local_python.py
+```
+
+You will obtain the output file `test1.parquet` in the output directory.
+
+
+
+
+
+