# Workflow file for the run "Implement download subcommand, optional positional model name argument" (#348)

	name: pull

	on:
	pull_request:
	push:
	branches:
	- main
	workflow_dispatch:

	jobs:
	gather-models-cpu:
	runs-on: ubuntu-22.04
	outputs:
	models: ${{ steps.gather-models-cpu.outputs.models }}
	steps:
	- uses: actions/checkout@v3
	with:
	submodules: 'false'
	- uses: actions/setup-python@v4
	with:
	python-version: '3.11'
	- name: Extract the list of models to run on CPU
	id: gather-models-cpu
	run: \|
	set -eux
	PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "cpu"
	test-cpu-compile:
	name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	pip install -r requirements.txt
	pip list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
	- name: Run validation
	run: \|
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	pushd ${TORCHCHAT_ROOT}
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"
	test-cpu-aoti:
	name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	pip install -r requirements.txt
	pip list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
	- name: Run validation
	run: \|
	pushd ${TORCHCHAT_ROOT}
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"
	gather-models-gpu:
	runs-on: ubuntu-22.04
	outputs:
	models: ${{ steps.gather-models-gpu.outputs.models }}
	steps:
	- uses: actions/checkout@v3
	with:
	submodules: 'false'
	- uses: actions/setup-python@v4
	with:
	python-version: '3.11'
	- name: Extract the list of models to run on GPU
	id: gather-models-gpu
	run: \|
	set -eux
	PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu"
	test-gpu-compile:
	uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-gpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
	fail-fast: false
	with:
	runner: linux.g5.4xlarge.nvidia.gpu
	gpu-arch-type: cuda
	gpu-arch-version: "12.1"
	script: \|
	echo "::group::Print machine info"
	nvidia-smi
	echo "::endgroup::"

	echo "::group::Install required packages"
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
	pip install -r ./requirements.txt
	pip list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoint"
	export REPO_NAME=${{ matrix.repo_name }}
	bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
	echo "::endgroup::"

	echo "::group::Convert checkpoint"
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	echo "::endgroup::"

	echo "::group::Run inference"
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
	echo "::endgroup::"
	test-gpu-aoti:
	uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
	name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-gpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
	fail-fast: false
	with:
	runner: linux.g5.4xlarge.nvidia.gpu
	gpu-arch-type: cuda
	gpu-arch-version: "12.1"
	script: \|
	echo "::group::Print machine info"
	nvidia-smi
	echo "::endgroup::"

	echo "::group::Install required packages"
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
	pip install -r ./requirements.txt
	pip list
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoint"
	export REPO_NAME=${{ matrix.repo_name }}
	bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
	echo "::endgroup::"

	echo "::group::Convert checkpoint"
	bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
	echo "::endgroup::"

	echo "::group::Run inference"
	bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"
	echo "::endgroup::"
	test-tinystories-executorch:
	strategy:
	matrix:
	runner: [32-core-ubuntu]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	echo "Intalling pip packages"
	pip install wheel
	pip install cmake
	pip install ninja
	pip install zstd
	pip install -r requirements.txt

	echo "Executorch: cloning"
	mkdir etorch
	cd etorch
	git clone https://github.com/pytorch/executorch.git
	cd executorch
	echo "Inside: ${PWD}"

	echo "Executorch: submodule update"
	git submodule sync
	git submodule update --init

	echo "Executorch: installing python interface"
	./install_requirements.sh --pybind xnnpack

	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
	python3 -c 'import torchaudio;print(f"torchaudio: {torchaudio.__version__, torchaudio.version.git_version}")'

	cd ../..
	echo "Inside: ${PWD}"
	- name: Download checkpoints
	run: \|
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
	popd

	mkdir gguf_files
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
	wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
	wget -O ${GGUF_TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

	- name: Run inference
	run: \|
	export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M

	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ${PWD}/output_eager
	cat ${PWD}/output_eager

	python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte > ${PWD}/output_et
	cat ${PWD}/output_et

	echo "Tests complete."

	- name: Run inference
	run: \|
	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp
	python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
	cat ./output_et

	echo "******************************************"
	echo "***** Emb: channel-wise quantized ****"
	echo "******************************************"
	python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
	cat ./output_et

	echo "******************************************"
	echo "****** Emb: group-wise quantized *****"
	echo "******************************************"
	python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
	cat ./output_et

	echo "******************************************"
	echo "***** INT8 channel-wise quantized ****"
	echo "******************************************"
	python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
	cat ./output_et

	echo "******************************************"
	echo "****** INT8 group-wise quantized *****"
	echo "******************************************"
	python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
	cat ./output_et

	echo "******************************************"
	echo "****** ET: a8w4dq INT4 group-wise quantized *****"
	echo "******************************************"
	python export.py --quant '{"linear:a8w4dq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
	python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
	# cat ./output_et

	echo "tests complete"
	echo "******************************************"

	- name: Run GGUF export + inference
	run: \|
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model

	python torchchat.py export --gguf-path ${GGUF_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
	python torchchat.py generate --gguf-path ${GGUF_PATH} --pte-path ${PWD}/${MODEL_NAME}.pte --tokenizer-path ${GGUF_TOKENIZER_PATH} --temperature 0 --max-new-tokens 20 > ${PWD}/output_et
	cat ${PWD}/output_et

	echo "Tests complete."
	torchchat-command-load-test:
	strategy:
	matrix:
	runner: [macos-14]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	echo "Installing pip packages"
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	pip install -r requirements.txt
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

	- name: Download Stories files
	run: \|

	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd

	- name: Test generate
	run: \|

	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp

	python generate.py --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager1
	python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager2
	cat ./output_eager1
	cat ./output_eager2
	echo "Tests complete."

	- name: Test download
	run: \|

	python torchchat.py generate stories15M

	test-tinystories-eager:
	strategy:
	matrix:
	runner: [macos-12]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	pip install -r requirements.txt
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download checkpoints
	run: \|
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd
	- name: Run inference
	run: \|
	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp
	for DTYPE in bfloat16 float16 float32; do
	# if [ $(uname -s) == Darwin ]; then
	# export DTYPE=float16
	# fi
	python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "******************************************"
	echo "***** Emb: channel-wise quantized ****"
	echo "******************************************"
	python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "******************************************"
	echo "****** Emb: group-wise quantized *****"
	echo "******************************************"
	python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "******************************************"
	echo "***** INT8 channel-wise quantized ****"
	echo "******************************************"
	python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "******************************************"
	echo "****** INT8 group-wise quantized *****"
	echo "******************************************"
	python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "******************************************"
	echo "****** INT4 group-wise quantized *****"
	echo "******************************************"

	echo "INT4 should work on MacOS on x86, but cannot be tested"
	echo "because nightlies are too old!"

	# python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	# cat ./output_eager

	echo "tests complete for ${DTYPE}"
	done

	echo "tests complete for all dtypes!"
	test-mps:
	uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	with:
	runner: macos-m1-stable
	script: \|
	set -x
	# NS: Remove previous installation of torch first
	# as this script does not isntall anything into conda env but rather as system dep
	pip uninstall -y torch \|\| true
	set -eou pipefail

	echo "::group::Print machine info"
	uname -a
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	echo "::endgroup::"

	echo "::group::Install requirements"
	# Install requirements
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	ls -la
	pwd
	pip install -r requirements.txt
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoints"
	(
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd
	)
	echo "::endgroup::"

	echo "::group::Run inference"
	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp

	python generate.py --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "************************************************************"
	echo "*** embedding"
	echo "************************************************************"

	python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager
	python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "************************************************************"
	echo "*** linear int8"
	echo "************************************************************"

	python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager
	python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	echo "************************************************************"
	echo "*** linear int4"
	echo "************************************************************"

	PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager
	test-gguf-util:
	strategy:
	matrix:
	runner: [macos-14]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	echo "Intalling pip packages"
	pip install gguf
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	pip install -r requirements.txt
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

	git clone https://github.com/ggerganov/llama.cpp.git
	pushd llama.cpp
	make
	popd

	- name: Download GGUF files
	run: \|
	mkdir gguf_files
	wget -O gguf_files/llama-2-7b.Q4_0.gguf "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true"
	./llama.cpp/quantize --allow-requantize gguf_files/llama-2-7b.Q4_0.gguf gguf_files/llama-2-7b.Q4_0.requant_F32.gguf F32

	- name: Load files
	run: \|
	touch test.py
	echo "from build.gguf_util import test_by_to_float" >> test.py
	echo "test_by_to_float(\"gguf_files/llama-2-7b.Q4_0.gguf\", \"gguf_files/llama-2-7b.Q4_0.requant_F32.gguf\")" >> test.py
	cat test.py
	python test.py

	echo "Tests complete."
	test-mps-dtype:
	uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
	with:
	runner: macos-m1-stable
	script: \|
	set -x
	# NS: Remove previous installation of torch first
	# as this script does not isntall anything into conda env but rather as system dep
	pip uninstall -y torch \|\| true

	set -eou pipefail

	echo "::group::Print machine info"
	uname -a
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	echo "::endgroup::"

	echo "::group::Install requirements"
	# Install requirements
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	ls -la
	pwd
	pip install -r requirements.txt
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	echo "::endgroup::"

	echo "::group::Download checkpoints"
	(
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	popd
	)
	echo "::endgroup::"

	echo "::group::Run inference"
	export MODEL_PATH=checkpoints/stories15M/stories15M.pt
	export MODEL_NAME=stories15M
	export MODEL_DIR=/tmp
	for DTYPE in float16 float32; do
	# if [ $(uname -s) == Darwin ]; then
	# export DTYPE=float16
	# fi

	python generate.py --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager

	PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
	cat ./output_eager
	done
	compile-gguf:
	strategy:
	matrix:
	runner: [macos-14]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	pip install gguf
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	pip install -r requirements.txt
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	- name: Download GGUF
	run: \|
	mkdir gguf_files
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export TOKENIZER_PATH=gguf_files/tokenizer.model

	wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
	wget -O ${TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	- name: Run inference
	run: \|
	export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	export TOKENIZER_PATH=gguf_files/tokenizer.model
	export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf
	export MODEL_DIR=/tmp

	echo "******************************************"
	echo "***** Embed: not quantized ***********"
	echo "******************************************"

	echo "Running eager"
	python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
	cat ./output_eager

	echo "Running compiled"
	python generate.py --compile --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
	cat ./output_compiled

	echo "******************************************"
	echo "***** Emb: channel-wise quantized ****"
	echo "******************************************"

	echo "Running eager"
	python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
	cat ./output_eager

	echo "Running compiled"
	python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
	cat ./output_compiled

	echo "******************************************"
	echo "****** Emb: group-wise quantized *****"
	echo "******************************************"

	echo "Running eager"
	python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
	cat ./output_eager

	echo "Running compiled"
	python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
	cat ./output_compiled

	echo "tests complete"
	echo "******************************************"
	runner-et:
	strategy:
	matrix:
	runner: [macos-14-xlarge]
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v2
	- name: Setup Python
	uses: actions/setup-python@v2
	with:
	python-version: 3.11
	- name: Print machine info
	run: \|
	uname -a
	if [ $(uname -s) == Darwin ]; then
	sysctl machdep.cpu.brand_string
	sysctl machdep.cpu.core_count
	fi
	- name: Install requirements
	run: \|
	echo "Intalling pip packages"
	pip install -r requirements.txt

	export TORCHCHAT_ROOT=${PWD}
	export ENABLE_ET_PYBIND=false
	./scripts/install_et.sh $ENABLE_ET_PYBIND
	python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
	python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
	python3 -c 'import torchaudio;print(f"torchaudio: {torchaudio.__version__, torchaudio.version.git_version}")'
	cmake -S ./runner-et -B et-build/cmake-out -G Ninja
	cmake --build ./et-build/cmake-out
	- name: Download checkpoints
	run: \|
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
	popd
	- name: Run inference
	run: \|
	export MODEL_DIR=${PWD}/checkpoints/stories15M
	export PROMPT="Once upon a time in a land far away"

	python torchchat.py generate --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}" > ${PWD}/output_eager
	cat ${PWD}/output_eager

	python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-pte-path ${PWD}/stories15M.pte

	./et-build/cmake-out/runner_et ${PWD}/stories15M.pte -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}" > ${PWD}/output_et
	cat ${PWD}/output_et

	echo "Tests complete."
	runner-aoti:
	name: test-runner-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
	needs: gather-models-cpu
	strategy:
	matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
	fail-fast: false
	runs-on: ${{ matrix.runner }}
	env:
	TORCHCHAT_ROOT: ${{ github.workspace }}
	REPO_NAME: ${{ matrix.repo_name }}
	steps:
	- name: Checkout repo
	uses: actions/checkout@v3
	- name: Setup Python
	uses: actions/setup-python@v4
	with:
	python-version: '3.11'
	- name: Print machine info
	run: \|
	echo "$(uname -a)"
	- name: Install dependencies
	run: \|
	pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
	pip install -r requirements.txt
	pip list

	cd ${TORCHCHAT_ROOT}/runner-aoti
	cmake -Bbuild -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'`
	cmake --build build
	cd ..
	- name: Download checkpoint
	run: \|
	mkdir -p checkpoints/stories15M
	pushd checkpoints/stories15M
	wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
	wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
	popd
	- name: Run inference
	run: \|
	export MODEL_DIR=${PWD}/checkpoints/stories15M
	export PROMPT="Once upon a time in a land far away"

	python torchchat.py generate --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}" > ${PWD}/output_eager
	cat ${PWD}/output_eager

	python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so

	./runner-aoti/build/run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}" > ${PWD}/output_aoti
	cat ${PWD}/output_aoti

	echo "Tests complete."

# Provide feedback
#
# Saved searches
#
# Use saved searches to filter your results more quickly
#
# Implement download subcommand, optional positional model name argument #348
#
# Workflow file
#
# Implement download subcommand, optional positional model name argument #348
#
# Jobs
#
# Run details
#
# Workflow file for this run