Merge pull request #638 from ian-cho/dev
adding hap transform. Additional work on readme.md and proper default values for input parameters will be done in a separate PR
touma-I authored Oct 1, 2024
2 parents 6e21f64 + 7284c81 commit 00d1fea
Showing 18 changed files with 864 additions and 0 deletions.
124 changes: 124 additions & 0 deletions .github/workflows/test-universal-hap.yml
@@ -0,0 +1,124 @@
#
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files
#
name: Test - transforms/universal/hap

on:
    workflow_dispatch:
    push:
        branches:
            - "dev"
            - "releases/**"
        tags:
            - "*"
        paths:
            - "transforms/universal/hap/**"
            - "data-processing-lib/**"
            - "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow
            - "!data-processing-lib/**/test/**"
            - "!data-processing-lib/**/test-data/**"
            - "!**.md"
            - "!**/doc/**"
            - "!**/images/**"
            - "!**.gitignore"
    pull_request:
        branches:
            - "dev"
            - "releases/**"
        paths:
            - "transforms/universal/hap/**"
            - "data-processing-lib/**"
            - "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow
            - "!data-processing-lib/**/test/**"
            - "!data-processing-lib/**/test-data/**"
            - "!**.md"
            - "!**/doc/**"
            - "!**/images/**"
            - "!**.gitignore"

jobs:
    check_if_push_image:
        # check whether the Docker images should be pushed to the remote repository
        # The images are pushed if it is a merge to dev branch or a new tag is created.
        # The latter being part of the release process.
        # The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
        runs-on: ubuntu-22.04
        outputs:
            publish_images: ${{ steps.version.outputs.publish_images }}
        steps:
            - id: version
              run: |
                  publish_images='false'
                  if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
                  then
                      publish_images='true'
                  fi
                  if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
                  then
                      publish_images='true'
                  fi
                  echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
    test-src:
        runs-on: ubuntu-22.04
        steps:
            - name: Checkout
              uses: actions/checkout@v4
            - name: Free up space in github runner
              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
              run: |
                  df -h
                  sudo rm -rf "/usr/local/share/boost"
                  sudo rm -rf "$AGENT_TOOLSDIRECTORY"
                  sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
                  sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
                  df -h
            - name: Test transform source in transforms/universal/hap
              run: |
                  if [ -e "transforms/universal/hap/Makefile" ]; then
                      make -C transforms/universal/hap DOCKER=docker test-src
                  else
                      echo "transforms/universal/hap/Makefile not found - source testing disabled for this transform."
                  fi
    test-image:
        needs: [check_if_push_image]
        runs-on: ubuntu-22.04
        timeout-minutes: 120
        env:
            DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
            DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
        steps:
            - name: Checkout
              uses: actions/checkout@v4
            - name: Free up space in github runner
              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
              run: |
                  df -h
                  sudo rm -rf /opt/ghc
                  sudo rm -rf "/usr/local/share/boost"
                  sudo rm -rf "$AGENT_TOOLSDIRECTORY"
                  sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
                  sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
                  df -h
            - name: Test transform image in transforms/universal/hap
              run: |
                  if [ -e "transforms/universal/hap/Makefile" ]; then
                      if [ -d "transforms/universal/hap/spark" ]; then
                          make -C data-processing-lib/spark DOCKER=docker image
                      fi
                      make -C transforms/universal/hap DOCKER=docker test-image
                  else
                      echo "transforms/universal/hap/Makefile not found - testing disabled for this transform."
                  fi
            - name: Print space
              # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
              run: |
                  df -h
                  docker images
            - name: Publish images
              if: needs.check_if_push_image.outputs.publish_images == 'true'
              run: |
                  if [ -e "transforms/universal/hap/Makefile" ]; then
                      make -C transforms/universal/hap publish
                  else
                      echo "transforms/universal/hap/Makefile not found - publishing disabled for this transform."
                  fi
70 changes: 70 additions & 0 deletions transforms/universal/hap/Makefile
@@ -0,0 +1,70 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

setup::
	@# Help: Recursively make $@ all subdirs
	$(MAKE) RULE=$@ .recurse

clean::
	@# Help: Recursively make $@ all subdirs
	$(MAKE) RULE=$@ .recurse

build::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse
venv::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse

image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

set-versions:
	@# Help: Recursively $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

publish::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-src::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

kind-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-save-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:
	$(MAKE) -C kfp_ray workflow-venv

.PHONY: workflow-test
workflow-test:
	$(MAKE) -C kfp_ray workflow-test

.PHONY: workflow-upload
workflow-upload:
	$(MAKE) -C kfp_ray workflow-upload

.PHONY: workflow-build
workflow-build:
	$(MAKE) -C kfp_ray workflow-build
42 changes: 42 additions & 0 deletions transforms/universal/hap/python/Dockerfile
@@ -0,0 +1,42 @@
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy transform main() entry point to the image
COPY ./src/hap_transform_python.py .

# copy some of the samples in
COPY ./src/hap_local.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
62 changes: 62 additions & 0 deletions transforms/universal/hap/python/Makefile
@@ -0,0 +1,62 @@
# Define the root of the local git clone so that the common rules
# know where they are running from.
REPOROOT=../../../..
# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

#TRANSFORM_RUNTIME_SRC_FILE=hap_transform_python.py
TRANSFORM_NAME=hap

HAP_PYTHON_VERSION= $(DPK_VERSION)

venv:: .transforms.python-venv

install::
	pip install -r requirements.txt

test:: .transforms.python-test

clean:: .transforms.clean

image:: .transforms.python-image

test-src:: .transforms.test-src

setup:: .transforms.setup

build:: build-dist image

publish: publish-image

publish-image:: .transforms.publish-image-python

setup:: .transforms.setup

# The distribution version is the same as the image version.
set-versions:
	$(MAKE) TRANSFORM_PYTHON_VERSION=$(HAP_PYTHON_VERSION) TOML_VERSION=$(HAP_PYTHON_VERSION) .transforms.set-versions

build-dist:: set-versions .defaults.build-dist

publish-dist:: .defaults.publish-dist

test-image:: .transforms.python-test-image

run-cli-sample: .transforms.run-cli-python-sample

run-local-sample: .transforms.run-local-sample

run-local-python-sample: .transforms.run-local-python-sample

#run-s3-ray-sample: .transforms.run-s3-ray-sample

minio-start: .minio-start

kind-load-image:: .transforms.kind-load-image

load-image:: .transforms.load-image

docker-load-image: .defaults.docker-load-image

docker-save-image: .defaults.docker-save-image
54 changes: 54 additions & 0 deletions transforms/universal/hap/python/README.md
@@ -0,0 +1,54 @@
# HAP Annotation
Please see the set of [transform project conventions](https://github.com/ian-cho/data-prep-kit/blob/dev/transforms/README.md) for details on general project conventions, transform configuration, testing, and IDE setup.

## Prerequisite
This transform requires NLTK; please refer to `requirements.txt` for the full list of dependencies.

## Summary
The hap (hate, abuse, and profanity) transform maps a non-empty input table to an output table with an added `hap_score` column. Each row of the table represents a document, and the transform performs the following three steps to calculate the hap score for each document (a minimal sketch of these steps appears after the list):

* Sentence splitting: the document is split into sentences using NLTK.
* HAP annotation: each sentence is assigned a hap score between 0 and 1, where 1 represents HAP and 0 represents non-HAP.
* Aggregation: the document hap score is the maximum hap score among its sentences.
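
A minimal sketch of these three steps, assuming NLTK's `punkt` sentence tokenizer and a caller-supplied sentence scorer (the actual implementation lives in [hap_transform.py](src/hap_transform.py); the function names here are illustrative only):

```python
# Illustrative sketch only, not the transform's actual code.
import nltk

nltk.download("punkt", quiet=True)  # sentence tokenizer model used for splitting


def document_hap_score(doc_text: str, score_sentences) -> float:
    """score_sentences maps a list of sentences to hap scores in [0, 1]."""
    sentences = nltk.sent_tokenize(doc_text)   # 1. sentence splitting
    scores = score_sentences(sentences)        # 2. per-sentence hap annotation
    return max(scores, default=0.0)            # 3. aggregation: max over sentences
```

An example `score_sentences` built on a HuggingFace classifier is sketched under the configuration options below.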


## Configuration and command line options
The set of dictionary keys holding [HAPTransformConfiguration](src/hap_transform.py) configuration values is as follows (an illustrative use of these options is sketched after the list):

* --model_name_or_path - specifies the HAP model, which should be compatible with HuggingFace's `AutoModelForSequenceClassification`.
* --batch_size - batch size for inference; adjust it to the capacity of your infrastructure.
* --max_length - maximum sequence length used by the tokenizer.
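
As an illustration of how these options fit together, here is a hedged sketch of a sentence scorer built with HuggingFace `transformers`; the helper name `build_sentence_scorer` and the assumption that class index 1 is the HAP class are placeholders, not the transform's actual internals:

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def build_sentence_scorer(model_name_or_path: str, batch_size: int = 128, max_length: int = 512):
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path).eval()

    def score_sentences(sentences):
        scores = []
        # score sentences in chunks of --batch_size, truncating each to --max_length tokens
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            enc = tokenizer(batch, padding=True, truncation=True,
                            max_length=max_length, return_tensors="pt")
            with torch.no_grad():
                probs = torch.softmax(model(**enc).logits, dim=-1)
            scores.extend(probs[:, 1].tolist())  # assumes class index 1 is the HAP class
        return scores

    return score_sentences
```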



## Input format
The input is in .parquet format and contains the following columns:

| doc_id | doc_text |
|:------|:------|
| 1 | GSC is very much a little Swiss Army knife for... |
| 2 | Here are only a few examples. And no, I'm not ... |

## Output format
The output is in .parquet format and adds a `hap_score` column to the columns of the input:

| doc_id | doc_text | hap_score |
|:------|:------|:-------------|
| 1 | GSC is very much a little Swiss Army knife for... | 0.002463 |
| 2 | Here are only a few examples. And no, I'm not ... | 0.989713 |

## How to run
Place your input Parquet file in the `test-data/input/` directory. A sample file, `test1.parquet`, is available in this directory. Once done, run the script.

```shell
python hap_local_python.py
```

You will obtain the output file `test1.parquet` in the output directory.
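
To sanity-check the result, the annotated file can be read back with pandas (assuming `pandas` with a Parquet engine such as `pyarrow` is installed; adjust the path to your output folder):

```python
import pandas as pd

df = pd.read_parquet("output/test1.parquet")  # output path assumed for this example
print(df[["doc_id", "hap_score"]].head())
print("max hap_score:", df["hap_score"].max())
```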





