Skip to content

Commit

Permalink
Fix CI and validation scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
guangy10 committed Apr 16, 2024
1 parent 6b455c6 commit 87d6121
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 7 deletions.
37 changes: 37 additions & 0 deletions .ci/scripts/gather_test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,37 @@
}


def parse_args() -> Any:
    """
    Parse the command-line options of this script.

    Returns:
        The argparse namespace. Its ``event`` attribute is one of
        "pull_request", "push" or "periodic".

    Raises:
        SystemExit: raised by argparse when --event is missing or is not
            one of the allowed choices.
    """
    from argparse import ArgumentParser

    # The first positional parameter of ArgumentParser is `prog`, not
    # `description`. Pass the text by keyword so the usage line keeps the
    # real program name and the sentence shows up as the help description.
    parser = ArgumentParser(
        description="Gather all models to test on CI for the target OS"
    )
    parser.add_argument(
        "-e",
        "--event",
        type=str,
        choices=["pull_request", "push", "periodic"],
        required=True,
        help="GitHub CI Event. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on",
    )

    return parser.parse_args()


def model_should_run_on_event(model: str, event: str) -> bool:
    """
    Tell whether `model` is in the set of models tested for a CI `event`.

    Each known event (pull_request, push, periodic) has its own list of
    models. Any other event selects no model at all.
    """
    models_per_event = (
        ("pull_request", ("tinyllamas/stories15M",)),
        ("push", ()),
        ("periodic", ("mistralai/Mistral-7B-v0.1",)),
    )
    for known_event, selected_models in models_per_event:
        if event == known_event:
            return model in selected_models
    return False


def set_output(name: str, val: Any) -> None:
"""
Set the GitHub output so that it can be accessed by other jobs
Expand All @@ -45,6 +76,9 @@ def export_models_for_ci() -> dict[str, dict]:
This gathers all the models that we want to test on GitHub OSS CI
"""

args = parse_args()
event = args.event

# This is the JSON syntax for configuration matrix used by GitHub
# https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
models = {"include": []}
Expand All @@ -53,6 +87,9 @@ def export_models_for_ci() -> dict[str, dict]:
MODEL_REPOS.keys(),
JOB_RUNNERS.keys(),
):
if not model_should_run_on_event(repo_name, event):
continue

record = {
"repo_name": repo_name,
"resources": MODEL_REPOS[repo_name],
Expand Down
86 changes: 86 additions & 0 deletions .github/workflows/periodic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,89 @@ on:
tags:
- ciflow/periodic/*
workflow_dispatch:

jobs:
# Job: run gather_test_models.py for the "periodic" event and expose its
# `models` output to the jobs that declare `needs: gather-models`.
gather-models:
runs-on: ubuntu-22.04
outputs:
# Re-exports the `models` output of the step whose id is `gather-models`.
models: ${{ steps.gather-models.outputs.models }}
steps:
- uses: actions/checkout@v3
with:
submodules: 'false'
- uses: actions/setup-python@v4
with:
python-version: '3.11'
- name: Extract the list of models to test
id: gather-models
run: |
set -eux
PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic"
# Job: for every matrix entry produced by gather-models, install the
# dependencies, download the checkpoint, convert it and run validation.
test-cpu:
  name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }})
  needs: gather-models
  strategy:
    matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
    fail-fast: false
  runs-on: ${{ matrix.runner }}
  env:
    # NOTE(review): scripts/install_et.sh refers to TORCHCHAT_ROOT (different
    # spelling) - confirm that script defines it on its own.
    TORCHAT_ROOT: ${{ github.workspace }}
    REPO_NAME: ${{ matrix.repo_name }}
    # 'false' on the macos-14 runner, 'true' on every other runner; handed to
    # install_et.sh as its first argument.
    ENABLE_ET_PYBIND: ${{ matrix.runner == 'macos-14' && 'false' || 'true' }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        bash ${TORCHAT_ROOT}/scripts/install_et.sh $ENABLE_ET_PYBIND
    - name: Download checkpoints
      run: |
        bash ${TORCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
    - name: Run validation
      run: |
        pushd ${TORCHAT_ROOT}
        export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        bash .ci/scripts/validate.sh ${CHECKPOINT_PATH}
# Job: run the same download / convert / validate sequence on a CUDA runner
# through the reusable linux_job workflow from pytorch/test-infra.
test-cuda:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-cuda (linux, ${{ matrix.repo_name }})
  needs: gather-models
  strategy:
    matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    # matrix.resources is quoted when passed to wget_checkpoint.sh, matching
    # the test-cpu job, so the value always reaches the script as one argument.
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"
      echo "::group::Install required packages"
      pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
      pip install -r ./requirements.txt
      pip list
      echo "::endgroup::"
      echo "::group::Download checkpoint"
      export REPO_NAME=${{ matrix.repo_name }}
      bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
      echo "::endgroup::"
      echo "::group::Convert checkpoint"
      export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
      bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
      echo "::endgroup::"
      echo "::group::Run inference"
      bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} cuda
      echo "::endgroup::"
49 changes: 43 additions & 6 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
name: pull

on:
schedule:
- cron: '0,6,12,18 0 * * *' # Runs at midnight UTC and every 6 hours
pull_request:
push:
branches:
- main
workflow_dispatch:

jobs:
Expand All @@ -21,7 +23,7 @@ jobs:
id: gather-models
run: |
set -eux
PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py
PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request"
test-cpu:
name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }})
needs: gather-models
Expand Down Expand Up @@ -52,6 +54,41 @@ jobs:
- name: Run validation
run: |
pushd ${TORCHAT_ROOT}
export CHECKPOINT_PATH=${TORCHAT_ROOT}/checkpoints/${REPO_NAME}/model.pth
bash ${TORCHAT_ROOT}/.ci/scripts/convert_checkpoint.sh ${REPO_NAME}
bash ${TORCHAT_ROOT}/.ci/scripts/validate.sh ${CHECKPOINT_PATH}
export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
bash .ci/scripts/validate.sh ${CHECKPOINT_PATH}
test-cuda:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
name: test-cuda (linux, ${{ matrix.repo_name }})
needs: gather-models
strategy:
matrix: ${{ fromJSON(needs.gather-models.outputs.models) }}
fail-fast: false
with:
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.1"
script: |
echo "::group::Print machine info"
nvidia-smi
echo "::endgroup::"
echo "::group::Install required packages"
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
pip install -r ./requirements.txt
pip list
echo "::endgroup::"
echo "::group::Download checkpoint"
export REPO_NAME=${{ matrix.repo_name }}
bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
echo "::endgroup::"
echo "::group::Convert checkpoint"
export CHECKPOINT_PATH=./checkpoints/${REPO_NAME}/model.pth
bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
echo "::endgroup::"
echo "::group::Run inference"
bash .ci/scripts/validate.sh ${CHECKPOINT_PATH} cuda
echo "::endgroup::"
3 changes: 2 additions & 1 deletion scripts/install_et.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ install_pip_dependencies() {
echo "Intalling common pip packages"

pip install wheel
pip install cmake
pip install "cmake>=3.19"
pip install ninja
pip install zstd
pushd ${TORCHCHAT_ROOT}
Expand All @@ -26,6 +26,7 @@ install_executorch() {
pushd ${TORCHCHAT_ROOT}/build/src
git clone https://github.com/pytorch/executorch.git
cd executorch
git checkout viable/strict
echo "Install executorch: submodule update"
git submodule sync
git submodule update --init
Expand Down

0 comments on commit 87d6121

Please sign in to comment.