# Creating a Quantized Llama Model in One Shot

Quantizing a model to a lower precision can reduce memory usage and speed up inference. This example demonstrates how to use the SparseML API to quantize a Llama model from 16 bits to 4 bits and save it in the compressed-tensors format for inference with vLLM.

## Step 1: Select a model and dataset

For this example, we will use a TinyLlama model and the Open Platypus dataset; however, these can be swapped out for any Hugging Face-compatible model and dataset.
```python
model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
dataset = "open_platypus"
```
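
As a sketch of what such a swap might look like, the values below use a larger Llama checkpoint and a different dataset. These specific names are assumptions for illustration: any Hugging Face model ID should work, but the dataset must be one SparseML recognizes.

```python
# Hypothetical alternative choices (not taken from this example):
# the model can be any Hugging Face causal-LM checkpoint, and the
# dataset name is assumed to be registered with SparseML.
model = "meta-llama/Llama-2-7b-hf"
dataset = "wikitext"
```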

## Step 2: Configure a `GPTQModifier`

Modifiers in SparseML are used to apply optimizations to models. In this example, we use a `GPTQModifier` to apply the GPTQ algorithm to our model, targeting all `Linear` layers for 4-bit weight quantization. These options may be swapped out for any valid `QuantizationScheme`.
```python
from sparseml.modifiers.quantization.gptq import GPTQModifier

# Apply GPTQ to every Linear layer: 4-bit weights, 16-bit activations
gptq = GPTQModifier(
    targets="Linear",
    scheme="W4A16"
)
```
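
To illustrate swapping in a different scheme, a minimal variation is sketched below. The `W8A16` preset name and the `ignore` argument are assumptions based on common SparseML quantization options, not taken from this example; check the SparseML documentation for the schemes your version supports.

```python
from sparseml.modifiers.quantization.gptq import GPTQModifier

# Hypothetical variation: 8-bit weights with 16-bit activations,
# skipping the output head. The preset name and `ignore` argument
# are assumptions; verify against the SparseML docs.
gptq_w8 = GPTQModifier(
    targets="Linear",
    scheme="W8A16",
    ignore=["lm_head"],
)
```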

## Step 3: One-shot compression

The `oneshot` API applies the created modifier to the target model and dataset. Setting `save_compressed` to `True` runs the model through compressed-tensors compression after the quantization is completed.
```python
from sparseml.transformers import oneshot

oneshot(
    model=model,
    dataset=dataset,
    recipe=gptq,
    save_compressed=True,
    output_dir="llama-compressed-example",
    overwrite_output_dir=True,
    # Calibration settings: 256 samples of up to 256 tokens each
    max_seq_length=256,
    num_calibration_samples=256,
)
```
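
Since the intro mentions inference with vLLM, here is a minimal sketch of loading the saved model. It assumes a vLLM build with compressed-tensors support; the prompt and sampling settings are illustrative only.

```python
from vllm import LLM, SamplingParams

# Load the compressed model saved by oneshot above; assumes a vLLM
# version that can read the compressed-tensors format.
llm = LLM(model="llama-compressed-example")

# Generate a short completion to sanity-check the quantized model.
outputs = llm.generate(
    ["Quantization reduces model size by"],
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```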