From 7088caf766d7d05371342a91e8d572375625f8fc Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:23:28 -0700
Subject: [PATCH 1/6] Better tinygemm warning for T4

---
 test/test_ops.py              | 33 ++++++++++++++++++++++++++++++++-
 torchao/quantization/utils.py |  4 ++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 31000eafc..5a4ea4db5 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -14,6 +14,8 @@
 from torchao.dtypes.floatx import from_scaled_tc_floatx
 from torchao.sparsity.marlin import marlin_24_workspace, pack_to_marlin_24, inject_24
 import pytest
+from unittest.mock import patch
+
 
 if is_fbcode():
     pytest.skip("Skipping the test in fbcode since we don't have TARGET file for kernels")
@@ -274,7 +276,6 @@ def test_dequantize_tensor_core_tiled_layout_correctness_unpack_and_dequant(shap
     assert diff_op_ao < 1e-1
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-# @pytest.mark.skipif(TORCH_VERSION_AT_LEAST_2_5, reason="weight packing is updated in 2.5+")
 @pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
 def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
     n, k = shape
@@ -303,6 +304,36 @@ def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size
         test_utils=test_utils,
     )
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
+def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
+    n, k = shape
+    device = "cuda"
+
+    q = torch.randint(0, 16, shape, dtype=torch.int, device=device)
+    if TORCH_VERSION_AT_LEAST_2_5:
+        q = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8)
+    packed_w = torch._convert_weight_to_int4pack(q, inner_k_tiles)
+    q_groups = k // group_size
+    scales = torch.randn(n, q_groups, dtype=torch.bfloat16, device=device)
+    zeros = torch.randn_like(scales)
+    scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
+
+    # Test the case where CUDA SM version is less than 8.0
+    with patch('torch.cuda.get_device_capability', return_value=(7, 5)):
+        with pytest.raises(NotImplementedError) as excinfo:
+            scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
+        assert "4 bit quantization with tinygemm is not supported on this device" in str(excinfo.value)
+
+    # Test the case where CUDA SM version is 8.0 or higher (original behavior)
+    with patch('torch.cuda.get_device_capability', return_value=(8, 0)):
+        scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
+        # Add assertions to check if scales_and_zeros is correctly packed
+        # For example:
+        assert scales_and_zeros.shape == (n, q_groups, 2)
+        assert scales_and_zeros.dtype == torch.bfloat16
+
+
 MARLIN_24_BATCH_SIZE = [1, 4, 8, 16, 32, 64]
 MARLIN_24_K_CHUNKS = [128]
diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index 0beadfe5d..a96a72405 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -312,6 +312,10 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16
 
 
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
+    min_sm = (8, 0)
+    if torch.cuda.get_device_capability < min_sm:
+        raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}")
+
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
     guard_dtype_size(zeros, "zeros", dtype=dtype)
     return (
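Note (illustration only, not part of the diff): the guard added above compares torch.cuda.get_device_capability, the function object itself, against the (8, 0) tuple because the call parentheses are missing; the follow-up commits below add the parentheses, wrap the check in torch.cuda.is_available(), and settle on a strict < comparison. A minimal standalone sketch of the intended check, using an assumed helper name:

    import torch

    def _tinygemm_supported(min_sm=(8, 0)):
        # Assumption: treat CPU-only environments as fine, matching the later
        # is_available() guard added in this series.
        if not torch.cuda.is_available():
            return True
        # get_device_capability() returns a (major, minor) tuple, e.g. (7, 5) on a T4,
        # so plain tuple comparison orders devices by sm_X.Y.
        return torch.cuda.get_device_capability() >= min_sm
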
From 034ea1efefd4ca94f85063cf9343d397db5d57f7 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:24:07 -0700
Subject: [PATCH 2/6] push

---
 test/test_ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 5a4ea4db5..070f2e432 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -317,7 +317,6 @@ def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size
     q_groups = k // group_size
     scales = torch.randn(n, q_groups, dtype=torch.bfloat16, device=device)
     zeros = torch.randn_like(scales)
-    scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)

From 570a162b630d8417a78a97cc687ebfce42ee8062 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:26:25 -0700
Subject: [PATCH 3/6] push

---
 test/test_ops.py              | 50 ++++++++++-------------------------
 torchao/quantization/utils.py |  2 +-
 2 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 070f2e432..d2caaef91 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -275,35 +275,6 @@ def test_dequantize_tensor_core_tiled_layout_correctness_unpack_and_dequant(shap
 
     assert diff_op_ao < 1e-1
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
-def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
-    n, k = shape
-    device = "cuda"
-
-    q = torch.randint(0, 16, shape, dtype=torch.int, device=device)
-    if TORCH_VERSION_AT_LEAST_2_5:
-        q = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8)
-    packed_w = torch._convert_weight_to_int4pack(q, inner_k_tiles)
-    q_groups = k // group_size
-    scales = torch.randn(n, q_groups, dtype=torch.bfloat16, device=device)
-    zeros = torch.randn_like(scales)
-    scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
-
-    test_utils = [
-        "test_schema",
-        "test_autograd_registration",
-        "test_faketensor",
-    ]
-    # TODO: Figure out why test fails unless torch >= 2.5
-    if TORCH_VERSION_AT_LEAST_2_5:
-        test_utils.append("test_aot_dispatch_dynamic")
-    opcheck(
-        torch.ops.torchao.dequantize_tensor_core_tiled_layout,
-        (packed_w, scales_and_zeros, group_size, inner_k_tiles),
-        test_utils=test_utils,
-    )
-
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
 def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
@@ -324,16 +295,23 @@ def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size
             scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
         assert "4 bit quantization with tinygemm is not supported on this device" in str(excinfo.value)
 
-    # Test the case where CUDA SM version is 8.0 or higher (original behavior)
     with patch('torch.cuda.get_device_capability', return_value=(8, 0)):
         scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
-        # Add assertions to check if scales_and_zeros is correctly packed
-        # For example:
-        assert scales_and_zeros.shape == (n, q_groups, 2)
-        assert scales_and_zeros.dtype == torch.bfloat16
-
-
+    test_utils = [
+        "test_schema",
+        "test_autograd_registration",
+        "test_faketensor",
+    ]
+    # TODO: Figure out why test fails unless torch >= 2.5
+    if TORCH_VERSION_AT_LEAST_2_5:
+        test_utils.append("test_aot_dispatch_dynamic")
+    opcheck(
+        torch.ops.torchao.dequantize_tensor_core_tiled_layout,
+        (packed_w, scales_and_zeros, group_size, inner_k_tiles),
+        test_utils=test_utils,
+    )
+
 MARLIN_24_BATCH_SIZE = [1, 4, 8, 16, 32, 64]
 MARLIN_24_K_CHUNKS = [128]
 MARLIN_24_N_CHUNKS = [512]
diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index a96a72405..6c962df24 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -313,7 +313,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16
 
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
     min_sm = (8, 0)
-    if torch.cuda.get_device_capability < min_sm:
+    if torch.cuda.get_device_capability <= min_sm:
         raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}")
 
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
"test_faketensor", + ] + # TODO: Figure out why test fails unless torch >= 2.5 + if TORCH_VERSION_AT_LEAST_2_5: + test_utils.append("test_aot_dispatch_dynamic") + opcheck( + torch.ops.torchao.dequantize_tensor_core_tiled_layout, + (packed_w, scales_and_zeros, group_size, inner_k_tiles), + test_utils=test_utils, + ) + MARLIN_24_BATCH_SIZE = [1, 4, 8, 16, 32, 64] MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index a96a72405..6c962df24 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -313,7 +313,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16): min_sm = (8, 0) - if torch.cuda.get_device_capability < min_sm: + if torch.cuda.get_device_capability <= min_sm: raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size()) From 2386315a76ecdea4cb56f6ba885b71468b2c70a2 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 17 Oct 2024 20:27:13 -0700 Subject: [PATCH 4/6] p --- torchao/quantization/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index 6c962df24..7c6b41835 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -313,7 +313,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16): min_sm = (8, 0) - if torch.cuda.get_device_capability <= min_sm: + if torch.cuda.get_device_capability() <= min_sm: raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size()) From cb5a4ee1a1ab8d0e828c0c9a42362531f63b4ace Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 17 Oct 2024 20:38:50 -0700 Subject: [PATCH 5/6] push' --- torchao/quantization/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index 7c6b41835..6e07fee37 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -312,9 +312,10 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16): - min_sm = (8, 0) - if torch.cuda.get_device_capability() <= min_sm: - raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") + if torch.cuda.is_available(): + min_sm = (8, 0) + if torch.cuda.get_device_capability() <= min_sm: + raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size()) guard_dtype_size(zeros, "zeros", dtype=dtype) From 052966194a870de5956f05dfe2feae26830786ea Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 17 Oct 2024 20:10:49 -0800 Subject: [PATCH 6/6] Update 
From 052966194a870de5956f05dfe2feae26830786ea Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:10:49 -0800
Subject: [PATCH 6/6] Update utils.py

---
 torchao/quantization/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index 6e07fee37..ded54cd39 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -314,7 +314,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
     if torch.cuda.is_available():
         min_sm = (8, 0)
-        if torch.cuda.get_device_capability() <= min_sm:
+        if torch.cuda.get_device_capability() < min_sm:
            raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}")
 
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
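With the final commit applied, pack_tinygemm_scales_and_zeros raises NotImplementedError only when a CUDA device is present and its compute capability is below sm_8.0; the switch from <= to < in the last commit is what lets sm_8.0 hardware itself through, since the kernel requires sm_8.0 or higher. A rough usage sketch of that end state, mocking the device query similarly to the test in patch 1 (an extra is_available patch is added so the sketch also runs on CPU-only machines, and the tensor shapes are arbitrary illustrations, not values from the patch):

    import torch
    import pytest
    from unittest.mock import patch
    from torchao.quantization.utils import pack_tinygemm_scales_and_zeros

    scales = torch.randn(64, 8, dtype=torch.bfloat16)
    zeros = torch.randn_like(scales)

    # Simulate a T4 (sm_7.5): the new guard should raise.
    with patch("torch.cuda.is_available", return_value=True), \
         patch("torch.cuda.get_device_capability", return_value=(7, 5)):
        with pytest.raises(NotImplementedError):
            pack_tinygemm_scales_and_zeros(scales, zeros)

    # Simulate an sm_8.0 device: packing proceeds exactly as before this series.
    with patch("torch.cuda.is_available", return_value=True), \
         patch("torch.cuda.get_device_capability", return_value=(8, 0)):
        packed = pack_tinygemm_scales_and_zeros(scales, zeros)
        assert packed.dtype == torch.bfloat16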