Skip to content

Implement download subcommand, optional positional model name argument #348

Implement download subcommand, optional positional model name argument

Implement download subcommand, optional positional model name argument #348

Workflow file for this run

# Workflow "pull": triggered by pull requests, by pushes to the main branch,
# and manually through workflow_dispatch.
name: pull
on:
  workflow_dispatch:
  push:
    branches:
      - main
  pull_request:
jobs:
# Produces the JSON model matrix that the CPU jobs expand with fromJSON().
gather-models-cpu:
  runs-on: ubuntu-22.04
  outputs:
    # Re-exports the step output so dependent jobs can read it through
    # needs.gather-models-cpu.outputs.models.
    models: ${{ steps.gather-models-cpu.outputs.models }}
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: "false"
    - uses: actions/setup-python@v4
      with:
        python-version: "3.11"
    - id: gather-models-cpu
      name: Extract the list of models to run on CPU
      run: |
        set -eux
        # NOTE(review): the script is presumably what sets the `models` step output - confirm.
        PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py \
          --event "pull_request" \
          --backend "cpu"
# Validates each model from the CPU matrix in "compile" mode.
# Matrix values reach the shell only through `env`, and every expansion is
# quoted, so a value containing spaces or shell metacharacters cannot be
# word-split or interpreted as script text.
test-cpu-compile:
  name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
    fail-fast: false
  runs-on: ${{ matrix.runner }}
  env:
    TORCHCHAT_ROOT: ${{ github.workspace }}
    REPO_NAME: ${{ matrix.repo_name }}
    RESOURCES: ${{ matrix.resources }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        pip install -r requirements.txt
        pip list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        bash "${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh" "${REPO_NAME}" "${RESOURCES}"
    - name: Run validation
      run: |
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        pushd "${TORCHCHAT_ROOT}"
        bash .ci/scripts/convert_checkpoint.sh "${REPO_NAME}"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"
# Validates each model from the CPU matrix in "aoti" mode.
# Matrix values reach the shell only through `env`, and every expansion is
# quoted, so a value containing spaces or shell metacharacters cannot be
# word-split or interpreted as script text.
test-cpu-aoti:
  name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
    fail-fast: false
  runs-on: ${{ matrix.runner }}
  env:
    TORCHCHAT_ROOT: ${{ github.workspace }}
    REPO_NAME: ${{ matrix.repo_name }}
    RESOURCES: ${{ matrix.resources }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        pip install -r requirements.txt
        pip list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        bash "${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh" "${REPO_NAME}" "${RESOURCES}"
    - name: Run validation
      run: |
        pushd "${TORCHCHAT_ROOT}"
        bash .ci/scripts/convert_checkpoint.sh "${REPO_NAME}"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"
# Produces the JSON model matrix that the GPU jobs expand with fromJSON().
gather-models-gpu:
  runs-on: ubuntu-22.04
  outputs:
    # Re-exports the step output so dependent jobs can read it through
    # needs.gather-models-gpu.outputs.models.
    models: ${{ steps.gather-models-gpu.outputs.models }}
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: "false"
    - uses: actions/setup-python@v4
      with:
        python-version: "3.11"
    - id: gather-models-gpu
      name: Extract the list of models to run on GPU
      run: |
        set -eux
        # NOTE(review): the script is presumably what sets the `models` step output - confirm.
        PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py \
          --event "pull_request" \
          --backend "gpu"
# Validates each model from the GPU matrix in "compile" mode on CUDA, via the
# pytorch/test-infra reusable linux_job workflow.
# Matrix values are quoted where they are expanded into the script, matching
# the quoting used by the CPU jobs' call to wget_checkpoint.sh.
test-gpu-compile:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install required packages"
      pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
      pip install -r ./requirements.txt
      pip list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME="${{ matrix.repo_name }}"
      bash .ci/scripts/wget_checkpoint.sh "${REPO_NAME}" "${{ matrix.resources }}"
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh "${REPO_NAME}"
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
      echo "::endgroup::"
# Validates each model from the GPU matrix in "aoti" mode on CUDA, via the
# pytorch/test-infra reusable linux_job workflow.
# Matrix values are quoted where they are expanded into the script, matching
# the quoting used by the CPU jobs' call to wget_checkpoint.sh.
test-gpu-aoti:
  uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
  name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-gpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
    fail-fast: false
  with:
    runner: linux.g5.4xlarge.nvidia.gpu
    gpu-arch-type: cuda
    gpu-arch-version: "12.1"
    script: |
      echo "::group::Print machine info"
      nvidia-smi
      echo "::endgroup::"

      echo "::group::Install required packages"
      pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
      pip install -r ./requirements.txt
      pip list
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoint"
      export REPO_NAME="${{ matrix.repo_name }}"
      bash .ci/scripts/wget_checkpoint.sh "${REPO_NAME}" "${{ matrix.resources }}"
      echo "::endgroup::"

      echo "::group::Convert checkpoint"
      bash .ci/scripts/convert_checkpoint.sh "${REPO_NAME}"
      echo "::endgroup::"

      echo "::group::Run inference"
      bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"
      echo "::endgroup::"
# Exercises eager generation, .pte export and .pte-backed generation for the
# stories15M checkpoint (plain and quantized), then a GGUF export/generate run.
# Each `run:` step starts a fresh shell, so every step exports the variables
# it needs itself.
test-tinystories-executorch:
  strategy:
    matrix:
      runner: [32-core-ubuntu]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        echo "Installing pip packages"
        pip install wheel
        pip install cmake
        pip install ninja
        pip install zstd
        pip install -r requirements.txt

        echo "Executorch: cloning"
        mkdir etorch
        cd etorch
        git clone https://github.com/pytorch/executorch.git
        cd executorch
        echo "Inside: ${PWD}"

        echo "Executorch: submodule update"
        git submodule sync
        git submodule update --init

        echo "Executorch: installing python interface"
        ./install_requirements.sh --pybind xnnpack
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
        python3 -c 'import torchaudio;print(f"torchaudio: {torchaudio.__version__, torchaudio.version.git_version}")'

        # Return to the workspace root (two levels up from etorch/executorch).
        cd ../..
        echo "Inside: ${PWD}"
    - name: Download checkpoints
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
        popd

        mkdir gguf_files
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
        wget -O "${GGUF_PATH}" "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        wget -O "${GGUF_TOKENIZER_PATH}" https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
    - name: Run inference
      run: |
        export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M

        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ${PWD}/output_eager
        cat ${PWD}/output_eager

        python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte > ${PWD}/output_et
        cat ${PWD}/output_et

        echo "Tests complete."
    # Renamed from a second "Run inference" so the two steps can be told apart
    # in the job log.
    - name: Run export + inference (quantization variants)
      run: |
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
        cat ./output_et

        echo "******************************************"
        echo "******* Emb: channel-wise quantized ******"
        echo "******************************************"
        python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
        cat ./output_et

        echo "******************************************"
        echo "******** Emb: group-wise quantized *******"
        echo "******************************************"
        python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
        cat ./output_et

        echo "******************************************"
        echo "******* INT8 channel-wise quantized ******"
        echo "******************************************"
        python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
        cat ./output_et

        echo "******************************************"
        echo "******** INT8 group-wise quantized *******"
        echo "******************************************"
        python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
        cat ./output_et

        echo "******************************************"
        echo "******** ET: a8w4dq INT4 group-wise quantized *******"
        echo "******************************************"
        python export.py --quant '{"linear:a8w4dq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
        python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
        # cat ./output_et

        echo "tests complete"
        echo "******************************************"
    - name: Run GGUF export + inference
      run: |
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
        # MODEL_NAME must be set here: exports made in earlier steps are not
        # visible in this shell, and an empty value would name the file ".pte".
        export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0

        python torchchat.py export --gguf-path ${GGUF_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
        python torchchat.py generate --gguf-path ${GGUF_PATH} --pte-path ${PWD}/${MODEL_NAME}.pte --tokenizer-path ${GGUF_TOKENIZER_PATH} --temperature 0 --max-new-tokens 20 > ${PWD}/output_et
        cat ${PWD}/output_et

        echo "Tests complete."
# Checks that `generate.py` and the `torchchat.py generate` subcommand both run
# against a locally downloaded stories15M checkpoint, then runs
# `torchchat.py generate stories15M` with the model given as a positional name.
# Action versions and the quoted Python version match the other jobs in this
# workflow.
torchchat-command-load-test:
  strategy:
    matrix:
      runner: [macos-14]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        echo "Installing pip packages"
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        pip install -r requirements.txt
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download Stories files
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd
    - name: Test generate
      run: |
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        # Same arguments through both entry points; outputs are printed, not compared.
        python generate.py --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager1
        python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager2
        cat ./output_eager1
        cat ./output_eager2

        echo "Tests complete."
    - name: Test download
      run: |
        python torchchat.py generate stories15M
# Runs eager generation on stories15M for bfloat16, float16 and float32, each
# with no quantization and with the embedding / linear:int8 quantization
# settings.
# Action versions and the quoted Python version match the other jobs in this
# workflow.
test-tinystories-eager:
  strategy:
    matrix:
      runner: [macos-12]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        pip install -r requirements.txt
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download checkpoints
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd
    - name: Run inference
      run: |
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp
        for DTYPE in bfloat16 float16 float32; do
          # if [ $(uname -s) == Darwin ]; then
          #   export DTYPE=float16
          # fi
          python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          echo "******************************************"
          echo "******* Emb: channel-wise quantized ******"
          echo "******************************************"
          python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          echo "******************************************"
          echo "******** Emb: group-wise quantized *******"
          echo "******************************************"
          python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          echo "******************************************"
          echo "******* INT8 channel-wise quantized ******"
          echo "******************************************"
          python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          echo "******************************************"
          echo "******** INT8 group-wise quantized *******"
          echo "******************************************"
          python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          echo "******************************************"
          echo "******** INT4 group-wise quantized *******"
          echo "******************************************"
          # The INT4 run is intentionally disabled; the echoes below record why.
          echo "INT4 should work on MacOS on x86, but cannot be tested"
          echo "because nightlies are too old!"
          # python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          # cat ./output_eager

          echo "tests complete for ${DTYPE}"
        done

        echo "tests complete for all dtypes!"
# Runs stories15M generation on the MPS device through the pytorch/test-infra
# reusable macos_job workflow: unquantized first, then embedding, linear:int8
# and linear:int4 quantization.
test-mps:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  with:
    runner: macos-m1-stable
    script: |
      set -x
      # NS: Remove any previously installed torch first, since this script
      # installs packages as system dependencies rather than into a conda env.
      # Best effort: a failed uninstall is ignored.
      pip uninstall -y torch || true

      # Strict mode is only switched on after the best-effort uninstall above.
      set -eou pipefail

      echo "::group::Print machine info"
      uname -a
      sysctl machdep.cpu.brand_string
      sysctl machdep.cpu.core_count
      echo "::endgroup::"

      echo "::group::Install requirements"
      # Install requirements
      pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
      ls -la
      pwd
      pip install -r requirements.txt
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoints"
      # Subshell: the directory change stays local to the download.
      (
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd
      )
      echo "::endgroup::"

      echo "::group::Run inference"
      export MODEL_PATH=checkpoints/stories15M/stories15M.pt
      export MODEL_NAME=stories15M
      export MODEL_DIR=/tmp

      python generate.py --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
      cat ./output_eager

      echo "************************************************************"
      echo "*** embedding"
      echo "************************************************************"
      python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
      cat ./output_eager
      python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
      cat ./output_eager

      echo "************************************************************"
      echo "*** linear int8"
      echo "************************************************************"
      python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
      cat ./output_eager
      python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
      cat ./output_eager

      echo "************************************************************"
      echo "*** linear int4"
      echo "************************************************************"
      # Only this run sets PYTORCH_ENABLE_MPS_FALLBACK=1.
      PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
      cat ./output_eager
# Builds llama.cpp, requantizes a Q4_0 GGUF file to F32 with its `quantize`
# tool, and passes both files to build.gguf_util.test_by_to_float.
# Action versions and the quoted Python version match the other jobs in this
# workflow.
test-gguf-util:
  strategy:
    matrix:
      runner: [macos-14]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        echo "Installing pip packages"
        pip install gguf
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        pip install -r requirements.txt
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

        git clone https://github.com/ggerganov/llama.cpp.git
        pushd llama.cpp
        make
        popd
    - name: Download GGUF files
      run: |
        mkdir gguf_files
        wget -O gguf_files/llama-2-7b.Q4_0.gguf "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true"
        ./llama.cpp/quantize --allow-requantize gguf_files/llama-2-7b.Q4_0.gguf gguf_files/llama-2-7b.Q4_0.requant_F32.gguf F32
    - name: Load files
      run: |
        # Generate a two-line driver script, print it, then run it.
        touch test.py
        echo "from build.gguf_util import test_by_to_float" >> test.py
        echo "test_by_to_float(\"gguf_files/llama-2-7b.Q4_0.gguf\", \"gguf_files/llama-2-7b.Q4_0.requant_F32.gguf\")" >> test.py
        cat test.py
        python test.py
        echo "Tests complete."
# Runs stories15M generation on the MPS device for float16 and float32 through
# the pytorch/test-infra reusable macos_job workflow, each with no
# quantization and with embedding, linear:int8 and linear:int4 quantization.
test-mps-dtype:
  uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
  with:
    runner: macos-m1-stable
    script: |
      set -x
      # NS: Remove any previously installed torch first, since this script
      # installs packages as system dependencies rather than into a conda env.
      # Best effort: a failed uninstall is ignored.
      pip uninstall -y torch || true

      # Strict mode is only switched on after the best-effort uninstall above.
      set -eou pipefail

      echo "::group::Print machine info"
      uname -a
      sysctl machdep.cpu.brand_string
      sysctl machdep.cpu.core_count
      echo "::endgroup::"

      echo "::group::Install requirements"
      # Install requirements
      pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
      ls -la
      pwd
      pip install -r requirements.txt
      python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      echo "::endgroup::"

      echo "::group::Download checkpoints"
      # Subshell: the directory change stays local to the download.
      (
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        popd
      )
      echo "::endgroup::"

      echo "::group::Run inference"
      export MODEL_PATH=checkpoints/stories15M/stories15M.pt
      export MODEL_NAME=stories15M
      export MODEL_DIR=/tmp
      for DTYPE in float16 float32; do
        # if [ $(uname -s) == Darwin ]; then
        #   export DTYPE=float16
        # fi

        # No quantization.
        python generate.py --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager

        # Embedding quantization: groupsize 0, then groupsize 8.
        python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager
        python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager

        # linear:int8 quantization: groupsize 0, then groupsize 8.
        python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager
        python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager

        # linear:int4 quantization; only this run sets PYTORCH_ENABLE_MPS_FALLBACK=1.
        PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager
      done
# Runs eager and `--compile` generation from a TinyLlama Q4_0 GGUF file, with
# no quantization and with channel-wise / group-wise embedding quantization.
# Action versions and the quoted Python version match the other jobs in this
# workflow.
compile-gguf:
  strategy:
    matrix:
      runner: [macos-14]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        pip install gguf
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        pip install -r requirements.txt
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
    - name: Download GGUF
      run: |
        mkdir gguf_files
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export TOKENIZER_PATH=gguf_files/tokenizer.model
        wget -O "${GGUF_PATH}" "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
        wget -O "${TOKENIZER_PATH}" https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
    - name: Run inference
      run: |
        # Re-exported here because each step runs in its own shell.
        export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
        export TOKENIZER_PATH=gguf_files/tokenizer.model
        export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf
        export MODEL_DIR=/tmp

        echo "******************************************"
        echo "******* Embed: not quantized *************"
        echo "******************************************"
        echo "Running eager"
        python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
        cat ./output_eager
        echo "Running compiled"
        python generate.py --compile --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
        cat ./output_compiled

        echo "******************************************"
        echo "******* Emb: channel-wise quantized ******"
        echo "******************************************"
        echo "Running eager"
        python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
        cat ./output_eager
        echo "Running compiled"
        python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
        cat ./output_compiled

        echo "******************************************"
        echo "******** Emb: group-wise quantized *******"
        echo "******************************************"
        echo "Running eager"
        python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
        cat ./output_eager
        echo "Running compiled"
        python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
        cat ./output_compiled

        echo "tests complete"
        echo "******************************************"
# Builds the ./runner-et CMake project, exports stories15M to a .pte file and
# runs it through the built `runner_et` binary after an eager reference run.
# Action versions and the quoted Python version match the other jobs in this
# workflow.
runner-et:
  strategy:
    matrix:
      runner: [macos-14-xlarge]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        uname -a
        if [ "$(uname -s)" = "Darwin" ]; then
          sysctl machdep.cpu.brand_string
          sysctl machdep.cpu.core_count
        fi
    - name: Install requirements
      run: |
        echo "Installing pip packages"
        pip install -r requirements.txt

        export TORCHCHAT_ROOT="${PWD}"
        export ENABLE_ET_PYBIND=false
        ./scripts/install_et.sh "${ENABLE_ET_PYBIND}"

        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
        python3 -c 'import torchaudio;print(f"torchaudio: {torchaudio.__version__, torchaudio.version.git_version}")'

        cmake -S ./runner-et -B et-build/cmake-out -G Ninja
        cmake --build ./et-build/cmake-out
    - name: Download checkpoints
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
        popd
    - name: Run inference
      run: |
        export MODEL_DIR="${PWD}/checkpoints/stories15M"
        export PROMPT="Once upon a time in a land far away"

        python torchchat.py generate --checkpoint-path "${MODEL_DIR}/stories15M.pt" --temperature 0 --prompt "${PROMPT}" > "${PWD}/output_eager"
        cat "${PWD}/output_eager"

        python torchchat.py export --checkpoint-path "${MODEL_DIR}/stories15M.pt" --output-pte-path "${PWD}/stories15M.pte"
        ./et-build/cmake-out/runner_et "${PWD}/stories15M.pte" -z "${MODEL_DIR}/tokenizer.bin" -i "${PROMPT}" > "${PWD}/output_et"
        cat "${PWD}/output_et"

        echo "Tests complete."
# Builds the runner-aoti CMake project against the installed torch, exports
# stories15M to a shared object and runs it through the built `run` binary
# after an eager reference run.
# NOTE(review): the job fans out over the CPU model matrix, but the steps
# below always use stories15M and never read REPO_NAME - confirm the matrix is
# wanted here for anything beyond runner selection.
runner-aoti:
  name: test-runner-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
  needs: gather-models-cpu
  strategy:
    matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
    fail-fast: false
  runs-on: ${{ matrix.runner }}
  env:
    TORCHCHAT_ROOT: ${{ github.workspace }}
    REPO_NAME: ${{ matrix.repo_name }}
  steps:
    - name: Checkout repo
      uses: actions/checkout@v3
    - name: Setup Python
      uses: actions/setup-python@v4
      with:
        python-version: '3.11'
    - name: Print machine info
      run: |
        echo "$(uname -a)"
    - name: Install dependencies
      run: |
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        pip install -r requirements.txt
        pip list

        cd "${TORCHCHAT_ROOT}/runner-aoti"
        # Quoted so a prefix path containing spaces is passed as one argument.
        cmake -Bbuild -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')"
        cmake --build build
        cd ..
    - name: Download checkpoint
      run: |
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
        popd
    - name: Run inference
      run: |
        export MODEL_DIR="${PWD}/checkpoints/stories15M"
        export PROMPT="Once upon a time in a land far away"

        python torchchat.py generate --checkpoint-path "${MODEL_DIR}/stories15M.pt" --temperature 0 --prompt "${PROMPT}" > "${PWD}/output_eager"
        cat "${PWD}/output_eager"

        python torchchat.py export --checkpoint-path "${MODEL_DIR}/stories15M.pt" --output-dso-path /tmp/model.so
        ./runner-aoti/build/run /tmp/model.so -z "${MODEL_DIR}/tokenizer.bin" -i "${PROMPT}" > "${PWD}/output_aoti"
        cat "${PWD}/output_aoti"

        echo "Tests complete."