-
Notifications
You must be signed in to change notification settings - Fork 129
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #638 from ian-cho/dev
adding hap transform. Additional work on readme.md and proper default values for input parameters will be done in a separate PR
- Loading branch information
Showing
18 changed files
with
864 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
# | ||
# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files | ||
# | ||
name: Test - transforms/universal/hap | ||
|
||
on: | ||
workflow_dispatch: | ||
push: | ||
branches: | ||
- "dev" | ||
- "releases/**" | ||
tags: | ||
- "*" | ||
paths: | ||
- "transforms/universal/hap/**" | ||
- "data-processing-lib/**" | ||
- "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow | ||
- "!data-processing-lib/**/test/**" | ||
- "!data-processing-lib/**/test-data/**" | ||
- "!**.md" | ||
- "!**/doc/**" | ||
- "!**/images/**" | ||
- "!**.gitignore" | ||
pull_request: | ||
branches: | ||
- "dev" | ||
- "releases/**" | ||
paths: | ||
- "transforms/universal/hap/**" | ||
- "data-processing-lib/**" | ||
- "!transforms/universal/hap/**/kfp_ray/**" # This is/will be tested in separate workflow | ||
- "!data-processing-lib/**/test/**" | ||
- "!data-processing-lib/**/test-data/**" | ||
- "!**.md" | ||
- "!**/doc/**" | ||
- "!**/images/**" | ||
- "!**.gitignore" | ||
|
||
jobs: | ||
check_if_push_image: | ||
# check whether the Docker images should be pushed to the remote repository | ||
# The images are pushed if it is a merge to dev branch or a new tag is created. | ||
# The latter being part of the release process. | ||
# The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file. | ||
runs-on: ubuntu-22.04 | ||
outputs: | ||
publish_images: ${{ steps.version.outputs.publish_images }} | ||
steps: | ||
- id: version | ||
run: | | ||
publish_images='false' | ||
if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; | ||
then | ||
publish_images='true' | ||
fi | ||
if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ; | ||
then | ||
publish_images='true' | ||
fi | ||
echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT" | ||
test-src: | ||
runs-on: ubuntu-22.04 | ||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v4 | ||
- name: Free up space in github runner | ||
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 | ||
run: | | ||
df -h | ||
sudo rm -rf "/usr/local/share/boost" | ||
sudo rm -rf "$AGENT_TOOLSDIRECTORY" | ||
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup | ||
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true | ||
df -h | ||
- name: Test transform source in transforms/universal/hap | ||
run: | | ||
if [ -e "transforms/universal/hap/Makefile" ]; then | ||
make -C transforms/universal/hap DOCKER=docker test-src | ||
else | ||
echo "transforms/universal/hap/Makefile not found - source testing disabled for this transform." | ||
fi | ||
test-image: | ||
needs: [check_if_push_image] | ||
runs-on: ubuntu-22.04 | ||
timeout-minutes: 120 | ||
env: | ||
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }} | ||
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }} | ||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v4 | ||
- name: Free up space in github runner | ||
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 | ||
run: | | ||
df -h | ||
sudo rm -rf /opt/ghc | ||
sudo rm -rf "/usr/local/share/boost" | ||
sudo rm -rf "$AGENT_TOOLSDIRECTORY" | ||
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup | ||
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true | ||
df -h | ||
- name: Test transform image in transforms/universal/hap | ||
run: | | ||
if [ -e "transforms/universal/hap/Makefile" ]; then | ||
if [ -d "transforms/universal/hap/spark" ]; then | ||
make -C data-processing-lib/spark DOCKER=docker image | ||
fi | ||
make -C transforms/universal/hap DOCKER=docker test-image | ||
else | ||
echo "transforms/universal/hap/Makefile not found - testing disabled for this transform." | ||
fi | ||
- name: Print space | ||
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 | ||
run: | | ||
df -h | ||
docker images | ||
- name: Publish images | ||
if: needs.check_if_push_image.outputs.publish_images == 'true' | ||
run: | | ||
if [ -e "transforms/universal/hap/Makefile" ]; then | ||
make -C transforms/universal/hap publish | ||
else | ||
echo "transforms/universal/hap/Makefile not found - publishing disabled for this transform." | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
REPOROOT=../../.. | ||
# Use make help, to see the available rules | ||
include $(REPOROOT)/.make.defaults | ||
|
||
setup:: | ||
@# Help: Recursively make $@ all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
clean:: | ||
@# Help: Recursively make $@ all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
build:: | ||
@# Help: Recursively make $@ in subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
venv:: | ||
@# Help: Recursively make $@ in subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
set-versions: | ||
@# Help: Recursively $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
publish:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
test-image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
test:: | ||
@# Help: Recursively make $@ in all subdirs | ||
@$(MAKE) RULE=$@ .recurse | ||
|
||
test-src:: | ||
@# Help: Recursively make $@ in all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
kind-load-image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
docker-load-image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
docker-save-image:: | ||
@# Help: Recursively make $@ in all subdirs | ||
$(MAKE) RULE=$@ .recurse | ||
|
||
.PHONY: workflow-venv | ||
workflow-venv: | ||
$(MAKE) -C kfp_ray workflow-venv | ||
|
||
.PHONY: workflow-test | ||
workflow-test: | ||
$(MAKE) -C kfp_ray workflow-test | ||
|
||
.PHONY: workflow-upload | ||
workflow-upload: | ||
$(MAKE) -C kfp_ray workflow-upload | ||
|
||
.PHONY: workflow-build | ||
workflow-build: | ||
$(MAKE) -C kfp_ray workflow-build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
FROM docker.io/python:3.10.14-slim-bullseye | ||
|
||
RUN pip install --upgrade --no-cache-dir pip | ||
|
||
# install pytest | ||
RUN pip install --no-cache-dir pytest | ||
|
||
# Create a user and use it to run the transform | ||
RUN useradd -ms /bin/bash dpk | ||
USER dpk | ||
WORKDIR /home/dpk | ||
|
||
# Copy and install data processing libraries | ||
# These are expected to be placed in the docker context before this is run (see the make image). | ||
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ | ||
RUN cd data-processing-lib-python && pip install --no-cache-dir -e . | ||
|
||
# END OF STEPS destined for a data-prep-kit base image | ||
|
||
COPY --chown=dpk:root src/ src/ | ||
COPY --chown=dpk:root pyproject.toml pyproject.toml | ||
COPY --chown=dpk:root requirements.txt requirements.txt | ||
RUN pip install --no-cache-dir -e . | ||
|
||
# copy transform main() entry point to the image | ||
COPY ./src/hap_transform_python.py . | ||
|
||
# copy some of the samples in | ||
COPY ./src/hap_local.py local/ | ||
|
||
# copy test | ||
COPY test/ test/ | ||
COPY test-data/ test-data/ | ||
|
||
# Set environment: add /home/dpk to the Python module search path | ||
ENV PYTHONPATH=/home/dpk | ||
|
||
# Put these at the end since they seem to upset the docker cache. | ||
ARG BUILD_DATE | ||
ARG GIT_COMMIT | ||
LABEL build-date=$BUILD_DATE | ||
LABEL git-commit=$GIT_COMMIT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Define the root of the local git clone for the common rules to be able | ||
# know where they are running from. | ||
REPOROOT=../../../.. | ||
# Include a library of common .transform.* targets which most | ||
# transforms should be able to reuse. However, feel free | ||
# to override/redefine the rules below. | ||
include $(REPOROOT)/transforms/.make.transforms | ||
|
||
#TRANSFORM_RUNTIME_SRC_FILE=hap_transform_python.py | ||
TRANSFORM_NAME=hap | ||
|
||
HAP_PYTHON_VERSION= $(DPK_VERSION) | ||
|
||
venv:: .transforms.python-venv | ||
|
||
# Install the Python dependencies listed in requirements.txt. | ||
# The command must be a recipe line; on the rule line its words are parsed as prerequisites. | ||
install:: | ||
	pip install -r requirements.txt | ||
|
||
test:: .transforms.python-test | ||
|
||
clean:: .transforms.clean | ||
|
||
image:: .transforms.python-image | ||
|
||
test-src:: .transforms.test-src | ||
|
||
setup:: .transforms.setup | ||
|
||
build:: build-dist image | ||
|
||
publish: publish-image | ||
|
||
publish-image:: .transforms.publish-image-python | ||
|
||
setup:: .transforms.setup | ||
|
||
# distribution versions is the same as image version. | ||
set-versions: | ||
$(MAKE) TRANSFORM_PYTHON_VERSION=$(HAP_PYTHON_VERSION) TOML_VERSION=$(HAP_PYTHON_VERSION) .transforms.set-versions | ||
|
||
build-dist:: set-versions .defaults.build-dist | ||
|
||
publish-dist:: .defaults.publish-dist | ||
|
||
test-image:: .transforms.python-test-image | ||
|
||
run-cli-sample: .transforms.run-cli-python-sample | ||
|
||
run-local-sample: .transforms.run-local-sample | ||
|
||
run-local-python-sample: .transforms.run-local-python-sample | ||
|
||
#run-s3-ray-sample: .transforms.run-s3-ray-sample | ||
|
||
minio-start: .minio-start | ||
|
||
kind-load-image:: .transforms.kind-load-image | ||
|
||
load-image:: .transforms.load-image | ||
|
||
docker-load-image: .defaults.docker-load-image | ||
|
||
docker-save-image: .defaults.docker-save-image |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# HAP Annotation | ||
Please see the set of [transform project conventions](https://github.com/ian-cho/data-prep-kit/blob/dev/transforms/README.md) for details on general project conventions, transform configuration, testing and IDE set up. | ||
|
||
## Prerequisite | ||
This transform requires NLTK; please refer to `requirements.txt` for the full list of dependencies. | ||
|
||
## Summary | ||
The hap transform maps a non-empty input table to an output table with an added `hap_score` column. Each row in the table represents a document, and the hap transform performs the following three steps to calculate the hap score for each document: | ||
|
||
* Sentence splitting: we use NLTK to split the document into sentence pieces. | ||
* Hap annotation: each sentence is assigned a hap score between 0 and 1, where 1 represents hap and 0 represents non-hap. | ||
* Aggregation: the document hap score is determined by selecting the maximum hap score among its sentences. | ||
|
||
|
||
## Configuration and command line Options | ||
The configuration keys (and corresponding command line options) defined by [HAPTransformConfiguration](src/hap_transform.py) | ||
are as follows: | ||
|
||
* --model_name_or_path - specifies the HAP model, which should be compatible with HuggingFace's `AutoModelForSequenceClassification` | ||
* --batch_size - the batch size; adjust it based on the infrastructure capacity. | ||
* --max_length - the maximum length for the tokenizer. | ||
|
||
|
||
|
||
## input format | ||
The input is in .parquet format and contains the following columns: | ||
|
||
| doc_id | doc_text | | ||
|:------|:------| | ||
| 1 | GSC is very much a little Swiss Army knife for... | | ||
| 2 | Here are only a few examples. And no, I'm not ... | | ||
|
||
## output format | ||
The output is in .parquet format and contains all of the input columns plus one additional column, `hap_score`: | ||
|
||
| doc_id | doc_text | hap_score | | ||
|:------|:------|:-------------| | ||
| 1 | GSC is very much a little Swiss Army knife for... | 0.002463 | | ||
| 2 | Here are only a few examples. And no, I'm not ... | 0.989713 | | ||
|
||
## How to run | ||
Place your input Parquet file in the `test-data/input/` directory. A sample file, `test1.parquet`, is available in this directory. Once done, run the script. | ||
|
||
```python | ||
python hap_local_python.py | ||
``` | ||
|
||
You will obtain the output file `test1.parquet` in the output directory. | ||
|
||
|
||
|
||
|
||
|
||
|
Oops, something went wrong.