Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ray-based version of hap transform #685

Merged
merged 5 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions transforms/universal/hap/ray/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Image for the Ray runtime of the HAP transform.
# The base image can be overridden at build time with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
FROM ${BASE_IMAGE}

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/
RUN cd data-processing-lib-ray && pip install --no-cache-dir -e .
COPY --chown=ray:users python-transform/ python-transform/
RUN cd python-transform && pip install --no-cache-dir -e .

# Install this transform.
# pyproject.toml declares `readme = README.md` and reads its dynamic dependencies
# from requirements.txt, so both files must be in the image before `pip install`;
# otherwise the declared dependencies are not picked up from this package.
COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
COPY --chown=ray:users README.md README.md
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY --chown=ray:users ./src/hap_transform_ray.py .

# copy some of the samples in
COPY --chown=ray:users ./src/hap_local_ray.py local/

# copy test
COPY --chown=ray:users test/ test/
COPY --chown=ray:users test-data/ test-data/

# Set environment
# NOTE(review): assumes the base image's working directory is /home/ray, so the
# entry point copied above is importable - confirm against the base image.
ENV PYTHONPATH=/home/ray

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
58 changes: 58 additions & 0 deletions transforms/universal/hap/ray/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Define the root of the local git clone for the common rules to be able
# to know where they are running from.
REPOROOT=../../../..
# Include a library of common .transform.* targets which most
# transforms should be able to reuse. However, feel free
# to override/redefine the rules below.
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=hap

BASE_IMAGE=${RAY_BASE_IMAGE}
HAP_PYTHON_VERSION= $(DPK_VERSION)

# These targets are commands, not files. Declaring them phony keeps same-named
# files or directories in the project (e.g. test/) from affecting them.
.PHONY: venv install test clean image test-src setup test-image build \
	publish publish-image set-versions build-dist publish-dist \
	run-cli-sample run-local-sample run-s3-sample minio-start \
	kind-load-image docker-load-image docker-save-image

venv:: .transforms.ray-venv

# The command must be a recipe line (TAB-indented). Written on the rule header,
# its words would be parsed as prerequisites and nothing would be executed.
install::
	pip install -r requirements.txt

test:: .transforms.ray-test

clean:: .transforms.clean

image:: .transforms.ray-image

test-src:: .transforms.test-src

setup:: .transforms.setup

test-image:: .transforms.ray-test-image

build:: build-dist image

publish: publish-image

publish-image:: .transforms.publish-image-ray

# distribution versions is the same as image version.
set-versions:
	$(MAKE) TRANSFORM_PYTHON_VERSION=$(HAP_PYTHON_VERSION) TOML_VERSION=$(HAP_PYTHON_VERSION) .transforms.set-versions

build-dist:: set-versions .defaults.build-dist

publish-dist:: .defaults.publish-dist

run-cli-sample: .transforms.run-cli-ray-sample

run-local-sample: .transforms.run-local-ray-sample

run-s3-sample: .transforms.run-s3-ray-sample

minio-start: .minio-start

kind-load-image:: .transforms.kind-load-image

docker-load-image: .defaults.docker-load-image

docker-save-image: .defaults.docker-save-image
20 changes: 20 additions & 0 deletions transforms/universal/hap/ray/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Hate, Abuse, and Profanity (HAP) Annotation
# HAP Transform for Ray
Please see the set of
[transform project conventions](../../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary
This project wraps the [hap transform](../python) with a Ray runtime.

## Configuration and command line Options

Configuration and command line options are the same as for the base python transform.

## Running

### Launcher Command Line Options
In addition to the options available to the transform, as defined [here](../python/README.md),
the set of
[ray launcher options](../../../../data-processing-lib/doc/ray-launcher-options.md) is available.
50 changes: 50 additions & 0 deletions transforms/universal/hap/ray/output/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "hap",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-03 21:38:20",
"end_time": "2024-10-03 21:38:29",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
"annotation_column": "hap_score",
"doc_text_column": "contents",
"inference_engine": "CPU",
"max_length": 512,
"batch_size": 128,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".parquet"
],
"num_processors": 0
},
"job_output_stats": {
"source_files": 2,
"source_size": 12124594,
"transform execution exception": 1,
"result_files": 1,
"result_size": 79822,
"processing_time": 6.932,
"source_doc_count": 50,
"result_doc_count": 50
},
"source": {
"name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/output",
"type": "path"
}
}
Binary file not shown.
48 changes: 48 additions & 0 deletions transforms/universal/hap/ray/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Packaging metadata for the Ray runtime wrapper of the HAP transform.
[project]
name = "dpk_hap_transform_ray"
version = "0.2.2.dev0"
requires-python = ">=3.10"
description = "HAP Ray Transform"
license = {text = "Apache-2.0"}
# README.md must be present next to this file when the package is built.
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Ian Cho", email = "[email protected]" },
]
# Dependencies are not listed here; they are read from requirements.txt
# (see [tool.setuptools.dynamic] below).
dynamic = ["dependencies"]


[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.dynamic]
# requirements.txt must be present next to this file when the package is built.
dependencies = {file = ["requirements.txt"]}

# Extra packages for development and testing, installed via the "dev" extra.
[project.optional-dependencies]
dev = [
"twine",
"pytest>=7.3.2",
"pytest-dotenv>=0.5.2",
"pytest-env>=1.0.0",
"pre-commit>=3.3.2",
"pytest-cov>=4.1.0",
"pytest-mock>=3.10.0",
"moto==5.0.5",
"markupsafe==2.0.1",
]



# NOTE(review): `[options]` and `[options.packages.find]` look like setup.cfg
# section names. Setuptools configuration in pyproject.toml normally lives under
# `[tool.setuptools...]`, so these two tables are presumably ignored - confirm
# before relying on them for package discovery.
[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src/"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

[tool.coverage.run]
include = ["src/*"]
6 changes: 6 additions & 0 deletions transforms/universal/hap/ray/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
data-prep-toolkit-ray==0.2.2.dev0
dpk-hap-transform-python==0.2.2.dev0
nltk==3.9.1
transformers==4.38.2
torch==2.4.1
pandas==2.2.2
Empty file.
60 changes: 60 additions & 0 deletions transforms/universal/hap/ray/src/hap_local_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################


import ast
import os
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from hap_transform_ray import HAPRayTransformConfiguration


# Sample driver: runs the HAP transform under the Ray launcher against the
# test data shipped next to this source tree.

# Input and output folders are resolved relative to this file's directory.
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
local_conf = dict(input_folder=input_folder, output_folder=output_folder)
code_location = dict(github="github", commit_hash="12345", path="path")

# Launcher / runtime parameters
params = dict(
    # where to run
    run_locally=True,
    data_local_config=ParamsUtils.convert_to_ast(local_conf),
    runtime_pipeline_id="pipeline_id",
    runtime_job_id="job_id",
    runtime_code_location=ParamsUtils.convert_to_ast(code_location),
)

# HAP transform parameters
hap_params = dict(
    model_name_or_path="ibm-granite/granite-guardian-hap-38m",
    annotation_column="hap_score",
    doc_text_column="contents",
    inference_engine="CPU",
    max_length=512,
    batch_size=128,
)


if __name__ == "__main__":
    # Simulate the command line from both parameter sets (transform params win on key clashes)
    sys.argv = ParamsUtils.dict_to_req(d={**params, **hap_params})
    # Create the launcher and launch the ray actor(s) to process the input
    RayTransformLauncher(HAPRayTransformConfiguration()).launch()
64 changes: 64 additions & 0 deletions transforms/universal/hap/ray/src/hap_s3_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import os
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from hap_transform_ray import HAPRayTransformConfiguration


# Sample driver: runs the HAP transform under the Ray launcher against an
# S3-compatible endpoint.
# NOTE(review): the credentials and URL below look like local MinIO defaults
# (see the `minio-start` make target) - confirm before pointing this elsewhere.

# create parameters
s3_cred = {
    "access_key": "localminioaccesskey",
    "secret_key": "localminiosecretkey",
    "url": "http://localhost:9000",
}

s3_conf = {
    "input_folder": "test/hap/input",
    "output_folder": "test/hap/output",
}
worker_options = {"num_cpus": 0.8}
code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
params = {
    # where to run
    "run_locally": True,
    # Data access. Only required parameters are specified
    "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred),
    "data_s3_config": ParamsUtils.convert_to_ast(s3_conf),
    # orchestrator
    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
    "runtime_num_workers": 3,
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    "runtime_creation_delay": 0,
    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
}


hap_params = {
    "model_name_or_path": "ibm-granite/granite-guardian-hap-38m",
    "annotation_column": "hap_score",
    "doc_text_column": "contents",
    "inference_engine": "CPU",
    "max_length": 512,
    "batch_size": 128,
}


# Only launch when executed as a script, so that importing this module does not
# overwrite sys.argv or start a Ray job (same shape as hap_local_ray.py).
if __name__ == "__main__":
    # create launcher
    launcher = RayTransformLauncher(HAPRayTransformConfiguration())
    # Set the simulated command line args
    sys.argv = ParamsUtils.dict_to_req(d=params | hap_params)
    # launch
    launcher.launch()
39 changes: 39 additions & 0 deletions transforms/universal/hap/ray/src/hap_transform_ray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

from data_processing.utils import get_logger
from data_processing_ray.runtime.ray import RayTransformLauncher
from data_processing_ray.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
from hap_transform import HAPTransformConfiguration


# Module-level logger, named after this module; used by the __main__ entry point below.
logger = get_logger(__name__)


class HAPRayTransformConfiguration(RayTransformRuntimeConfiguration):
    """
    Ray runtime configuration for the HAP transform, in the form expected by
    the RayTransformLauncher.
    """

    def __init__(self):
        """
        Build the Ray runtime configuration around a new HAPTransformConfiguration.
        Takes no arguments.
        """
        hap_config = HAPTransformConfiguration()
        super().__init__(transform_config=hap_config)


if __name__ == "__main__":
    # Script entry point: build the Ray launcher for the HAP configuration,
    # announce the launch, then hand control to the launcher.
    hap_launcher = RayTransformLauncher(HAPRayTransformConfiguration())
    logger.info("Launching hap transform")
    hap_launcher.launch()

Loading