Skip to content

multi_step version of fuzzy dedup #2054

multi_step version of fuzzy dedup

multi_step version of fuzzy dedup #2054

Workflow file for this run

name: Build, Test and (Optionally) Push images
on:
workflow_dispatch:
push:
branches:
- "dev"
- "releases/**"
tags:
- "*"
paths-ignore:
- "**.md"
- "examples/**"
- "**/doc/**"
- "**/.gitignore"
- "**/.dockerignore"
pull_request:
branches:
- "dev"
- "releases/**"
paths-ignore:
- "**.md"
- "examples/**"
- "**/doc/**"
- "**/.gitignore"
- "**/.dockerignore"
env:
KFP_BLACK_LIST: "doc_chunk,pdf2parquet,pii_redactor, fdedup_multi_step"
jobs:
check_if_push_images:
# check whether the Docker images should be pushed to the remote repository
# The images are pushed if it is a merge to dev branch or a new tag is created.
# The latter being part of the release process.
# The images tag is derived from the value of the DOCKER_IMAGE_VERSION variable set in the .make.versions file.
runs-on: ubuntu-22.04
outputs:
publish_images: ${{ steps.version.outputs.publish_images }}
steps:
- id: version
run: |
publish_images='false'
if [[ ${GITHUB_REF} == refs/heads/dev && ${GITHUB_EVENT_NAME} != 'pull_request' && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
then
publish_images='true'
fi
if [[ ${GITHUB_REF} == refs/tags/* && ${GITHUB_REPOSITORY} == IBM/data-prep-kit ]] ;
then
publish_images='true'
fi
echo "publish_images=$publish_images" >> "$GITHUB_OUTPUT"
test-make:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test top-level recursive make targets.
run: |
make -n clean test build publish set-versions
test-python-lib:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test data-processing-lib/python
run: |
make -C data-processing-lib/python DOCKER=docker venv test
test-ray-lib:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test data-processing-lib/ray
run: |
make -C data-processing-lib/ray DOCKER=docker venv test
test-spark-lib:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test data-processing-lib/spark
run: |
make -C data-processing-lib/spark DOCKER=docker venv test
test-code:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test Code Transforms
run: |
make -C transforms/code DOCKER=docker test-src
test-language:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test Language Transforms
run: |
make -C transforms/language DOCKER=docker test-src
test-universal:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test Universal Transforms
run: |
make -C transforms/universal DOCKER=docker test-src
test-tools:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Test tools
run: |
make -C tools DOCKER=docker venv test
test-kfp-v1:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test KFP libs (shared and v1) and run a workflow
timeout-minutes: 120
run: |
export REPOROOT=$PWD
export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup
source $K8S_SETUP_SCRIPTS/requirements.env
export PATH=$PATH:/tmp/
curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64
chmod 777 /tmp/kind
curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
chmod 700 /tmp/get_helm.sh
HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo
chmod 777 /tmp/helm
curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl
chmod 777 /tmp/kubectl
curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc
chmod +x /tmp/mc
export DEPLOY_KUBEFLOW=1
make -C $K8S_SETUP_SCRIPTS setup
make -C kfp/kfp_support_lib test
make -C transforms workflow-build
source $K8S_SETUP_SCRIPTS/common.sh
while :
do
dir=("code" "universal" "language") && index=$(($RANDOM % ${#dir[@]})) && subdirs=${dir[$index]} && transforms=($(find transforms/$subdirs -type d -maxdepth 1 -mindepth 1 ))
set -- "${transforms[@]}" && transforms=("$@") && size=${#transforms[@]} && index=$(($RANDOM % $size))
transform=$(basename "${transforms[$index]}")
if [ -d ${transforms[$index]}/kfp_ray ] && echo ${KFP_BLACK_LIST} | grep -qv ${transform} ; then
header_text "Running ${transforms[$index]} workflow test"
break
fi
done
make -C ${transforms[$index]} workflow-test
echo "Run ${transforms[$index]} completed"
test-kfp-v2:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test KFP libs (shared and v2) and run a workflow
timeout-minutes: 120
run: |
export REPOROOT=$PWD
export K8S_SETUP_SCRIPTS=$PWD/scripts/k8s-setup
source $K8S_SETUP_SCRIPTS/requirements.env
export PATH=$PATH:/tmp/
curl -Lo /tmp/kind https://kind.sigs.k8s.io/dl/v${KIND_VERSION}/kind-linux-amd64
chmod 777 /tmp/kind
curl -fsSL -o /tmp/get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
chmod 700 /tmp/get_helm.sh
HELM_INSTALL_DIR=/tmp/ /tmp/get_helm.sh -v v${HELM_VERSION} --no-sudo
chmod 777 /tmp/helm
curl -L https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl -o /tmp/kubectl
chmod 777 /tmp/kubectl
curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o /tmp/mc
chmod +x /tmp/mc
export DEPLOY_KUBEFLOW=1
export KFPv2=1
make -C $K8S_SETUP_SCRIPTS setup
make -C kfp/kfp_support_lib test
make -C transforms workflow-build
source $K8S_SETUP_SCRIPTS/common.sh
while :
do
dir=("code" "universal" "language") && index=$(($RANDOM % ${#dir[@]})) && subdirs=${dir[$index]} && transforms=($(find transforms/$subdirs -type d -maxdepth 1 -mindepth 1 ))
set -- "${transforms[@]}" && transforms=("$@") && size=${#transforms[@]} && index=$(($RANDOM % $size))
transform=$(basename "${transforms[$index]}")
if [ -d ${transforms[$index]}/kfp_ray ] && echo ${KFP_BLACK_LIST} | grep -qv ${transform} ; then
header_text "Running ${transforms[$index]} workflow test"
break
fi
done
make -C ${transforms[$index]} workflow-test
header_text "Run ${transforms[$index]} completed"
test-data-processing-lib-images:
needs: [check_if_push_images]
if: needs.check_if_push_images.outputs.publish_images == 'true'
runs-on: ubuntu-22.04
env:
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
timeout-minutes: 30
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test Code Transform Images
run: |
make -C data-processing-lib/spark DOCKER=docker image
- name:
Print space
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
docker images
- name: Publish images
if: needs.check_if_push_images.outputs.publish_images == 'true'
run: |
make -C data-processing-lib/spark publish-image
test-code-images:
needs: [check_if_push_images]
runs-on: ubuntu-22.04
timeout-minutes: 30
env:
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test Code Transform Images
run: |
make -C data-processing-lib DOCKER=docker image
make -C transforms/code DOCKER=docker test-image
- name:
Print space
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
docker images
- name: Publish images
if: needs.check_if_push_images.outputs.publish_images == 'true'
run: |
make -C transforms/code publish
test-language-images:
needs: [check_if_push_images]
runs-on: ubuntu-22.04
timeout-minutes: 120
env:
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test language Transform Images
run: |
make -C data-processing-lib DOCKER=docker image
make -C transforms/language DOCKER=docker test-image
- name: Print space
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: df -h
- name: Publish images
if: needs.check_if_push_images.outputs.publish_images == 'true'
run: make -C transforms/language publish
test-universal-images:
needs: [check_if_push_images]
runs-on: ubuntu-22.04
timeout-minutes: 120
env:
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Test Universal Transform Images
run: |
make -C data-processing-lib/spark DOCKER=docker image
make -C transforms/universal DOCKER=docker test-image
- name:
Print space
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
docker images
- name: Publish images
if: needs.check_if_push_images.outputs.publish_images == 'true'
run: make -C transforms/universal publish
build-kfp-components:
needs: [check_if_push_images]
runs-on: ubuntu-22.04
timeout-minutes: 30
env:
DOCKER_REGISTRY_USER: ${{ secrets.DOCKER_REGISTRY_USER }}
DOCKER_REGISTRY_KEY: ${{ secrets.DOCKER_REGISTRY_KEY }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Free up space in github runner
# Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
run: |
df -h
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/lib/jvm /usr/local/.ghcup
sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
df -h
- name: Build
run: |
make -C kfp/kfp_ray_components DOCKER=docker image
make KFPv2=1 -C kfp/kfp_ray_components DOCKER=docker image
- name: Publish images
if: needs.check_if_push_images.outputs.publish_images == 'true'
run: make -C kfp/kfp_ray_components publish
test-tool-images:
runs-on: ubuntu-22.04
timeout-minutes: 30
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Build and Test Tool images
run: |
make -C tools/ingest2parquet DOCKER=docker test-image