Commit 3b0219f
Merge branch 'main' into xnnpack_dynamic_partitioner
mcr229 authored Apr 16, 2024
2 parents 1d12473 + 804d128
Showing 36 changed files with 1,305 additions and 736 deletions.
76 changes: 76 additions & 0 deletions .flake8
@@ -0,0 +1,76 @@
[flake8]
select = B,C,E,F,P,W,B9,TOR0,TOR1,TOR2
max-line-length = 120
ignore =
# Black conflicts and overlaps.
B950,
E111,
E115,
E117,
E121,
E122,
E123,
E124,
E125,
E126,
E127,
E128,
E129,
E131,
E201,
E202,
E203,
E221,
E222,
E225,
E226,
E227,
E231,
E241,
E251,
E252,
E261,
E262,
E265,
E271,
E272,
E301,
E302,
E303,
E305,
E306,
E501,
E502,
E701,
E702,
E703,
E704,
W291,
W292,
W293,
W391,
W504,

# Too opinionated.
E265,
E266,
E402,
E722,
B001,
P207,
B003,
P208,
C403,
W503,

# Bugbear has opinions: https://github.com/PyCQA/flake8-bugbear#opinionated-warnings
B904,
B905,
B906,
B907,
exclude =
./.git,
*.pyi

max-complexity = 12
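To exercise this configuration locally, a minimal sketch (it assumes flake8 and the flake8-bugbear plugin are installed, and that the TOR0/TOR1/TOR2 codes come from the TorchFix flake8 plugin, which is inferred from the code prefixes rather than stated anywhere in this diff):

# Install the linter plus the plugins providing the B* and TOR* codes.
pip install flake8 flake8-bugbear torchfix
# flake8 discovers the [flake8] section in ./.flake8 automatically.
flake8 .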

9 changes: 5 additions & 4 deletions .github/workflows/compile-gguf.yml
@@ -35,15 +35,16 @@ jobs:
 - name: Download GGUF
 run: |
 mkdir gguf_files
-export GGUF_PATH=gguf_files/llama-2-7b.Q4_0.gguf
+export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
 export TOKENIZER_PATH=gguf_files/tokenizer.model
-wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true"
+wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
 wget -O ${TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
 - name: Run inference
 run: |
-export GGUF_PATH=gguf_files/llama-2-7b.Q4_0.gguf
+export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
 export TOKENIZER_PATH=gguf_files/tokenizer.model
-export MODEL_NAME=llama-2-7b.Q4_0.gguf
+export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf
 export MODEL_DIR=/tmp
 echo "******************************************"
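A quick sanity check on the downloaded artifact is possible because every GGUF file begins with the four-byte ASCII magic "GGUF", so a truncated file or an HTML error page is caught immediately. A sketch, not part of the workflow as merged:

# Print the first four bytes; a valid GGUF model prints "GGUF".
head -c 4 "${GGUF_PATH}" && echo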
115 changes: 115 additions & 0 deletions .github/workflows/compile_t4-dtype.yml
@@ -0,0 +1,115 @@
name: Run compile tests

on:
pull_request:
push:
branches:
- main
workflow_dispatch:

jobs:
test-cuda:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.1"
script: |
echo "::group::Print machine info"
uname -a
if [ $(uname -s) == Darwin ]; then
sysctl machdep.cpu.brand_string
sysctl machdep.cpu.core_count
fi
echo "::endgroup::"
echo "::group::Download checkpoints"
# Install requirements
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
pip install -r requirements.txt
echo "::endgroup::"
echo "::group::Download checkpoints"
mkdir -p checkpoints/stories15M
pushd checkpoints/stories15M
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
popd
echo "::endgroup::"
echo "::group::Run inference"
export MODEL_PATH=checkpoints/stories15M/stories15M.pt
export MODEL_NAME=stories15M
export MODEL_DIR=/tmp
for DTYPE in bfloat16 float16 float32; do
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
echo "******************************************"
echo "******* Emb: channel-wise quantized ******"
echo "******************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
echo "******************************************"
echo "******** Emb: group-wise quantized *******"
echo "******************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
echo "******************************************"
echo "******* INT8 channel-wise quantized ******"
echo "******************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
echo "******************************************"
echo "******** INT8 group-wise quantized *******"
echo "******************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
echo "******************************************"
echo "******** INT4 group-wise quantized *******"
echo "******************************************"
python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
cat ./output_compiled
python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
cat ./output_aoti
done
echo "tests complete"
echo "******************************************"
echo "::endgroup::"
19 changes: 12 additions & 7 deletions .github/workflows/compile_t4.yml
@@ -93,13 +93,18 @@ jobs:
 python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
 cat ./output_aoti
 echo "******************************************"
+echo "******** INT4 group-wise quantized *******"
+echo "******************************************"
+python generate.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+cat ./output_eager
+python generate.py --device cuda --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+cat ./output_compiled
+python export.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+cat ./output_aoti
 echo "tests complete"
 echo "******************************************"
 echo "::endgroup::"
-# echo "********* EAGER vs TORCH.COMPILE *********"
-# echo "******************************************"
-# diff output_eager output_compiled
-# echo "******************************************"
-# echo "********* EAGER vs AOT INDUCTOR *********"
-# echo "******************************************"
-# diff output_eager output_aoti
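The comparison steps removed above spell out the intended invariant: with --temperature 0 decoding is greedy, so the eager, torch.compile, and AOT Inductor outputs should agree token for token (backend floating-point differences are the usual reason such checks stay disabled). Making the check enforceable would be a one-liner, sketched here rather than taken from the merged workflow:

# diff exits non-zero on any mismatch, which fails the CI step.
diff output_eager output_compiled && diff output_eager output_aoti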
9 changes: 9 additions & 0 deletions .github/workflows/periodic.yml
@@ -0,0 +1,9 @@
name: periodic

on:
schedule:
- cron: '0 0,6,12,18 * * *' # Runs at midnight UTC and every 6 hours
push:
tags:
- ciflow/periodic/*
workflow_dispatch:
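For reference, crontab fields read minute, hour, day-of-month, month, day-of-week, so the schedule above fires at minute 0 of hours 00, 06, 12, and 18 UTC every day:

# field order: minute hour day-of-month month day-of-week
# '0 0,6,12,18 * * *' -> minute 0 of hours 00, 06, 12, 18 (UTC), any day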
25 changes: 15 additions & 10 deletions .github/workflows/test_mps-dtype.yml
@@ -52,14 +52,19 @@ jobs:
 python generate.py --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
 cat ./output_eager
-# python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-# cat ./output_eager
-# python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-# cat ./output_eager
-# python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-# cat ./output_eager
-# python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-# cat ./output_eager
-# PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-# cat ./output_eager
+python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+cat ./output_eager
+python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+cat ./output_eager
+python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+cat ./output_eager
+python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+cat ./output_eager
+PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+cat ./output_eager
 done
4 changes: 2 additions & 2 deletions .github/workflows/test_mps.yml
@@ -71,6 +71,6 @@ jobs:
echo "*** linear int4"
echo "************************************************************"
# PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
# cat ./output_eager
PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
cat ./output_eager
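PYTORCH_ENABLE_MPS_FALLBACK=1 tells PyTorch to run any operator that lacks an MPS kernel on the CPU instead of raising NotImplementedError, which is what lets the int4 path complete on this runner (the earlier sections run without it, presumably because their ops all have MPS kernels). It is set per invocation here:

# Ops without MPS kernels silently fall back to CPU: slower, but correct.
PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps \
  --quant '{"linear:int4" : {"groupsize": 32}}' \
  --checkpoint-path ${MODEL_PATH} --temperature 0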
57 changes: 57 additions & 0 deletions .github/workflows/test_torchchat_commands.yml
@@ -0,0 +1,57 @@
name: Run torchchat command tests

on:
push:
branches:
- main
pull_request:
workflow_dispatch:

jobs:
torchchat-command-load-test:
strategy:
matrix:
runner: [macos-14]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout repo
uses: actions/checkout@v2
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.11
- name: Print machine info
run: |
uname -a
if [ $(uname -s) == Darwin ]; then
sysctl machdep.cpu.brand_string
sysctl machdep.cpu.core_count
fi
- name: Install requirements
run: |
echo "Installing pip packages"
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
pip install -r requirements.txt
- name: Download Stories files
run: |
mkdir -p checkpoints/stories15M
pushd checkpoints/stories15M
curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
popd
- name: Test generate
run: |
export MODEL_PATH=checkpoints/stories15M/stories15M.pt
export MODEL_NAME=stories15M
export MODEL_DIR=/tmp
python generate.py --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager1
python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager2
cat ./output_eager1
cat ./output_eager2
echo "Tests complete."