From 7088caf766d7d05371342a91e8d572375625f8fc Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:23:28 -0700
Subject: [PATCH 1/6] Better tinygemm warning for T4

---
 test/test_ops.py              | 33 ++++++++++++++++++++++++++++++++-
 torchao/quantization/utils.py |  4 ++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 31000eafc..5a4ea4db5 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -14,6 +14,8 @@
 from torchao.dtypes.floatx import from_scaled_tc_floatx
 from torchao.sparsity.marlin import marlin_24_workspace, pack_to_marlin_24, inject_24
 import pytest
+from unittest.mock import patch
+
 
 if is_fbcode():
     pytest.skip("Skipping the test in fbcode since we don't have TARGET file for kernels")
@@ -274,7 +276,6 @@ def test_dequantize_tensor_core_tiled_layout_correctness_unpack_and_dequant(shap
     assert diff_op_ao < 1e-1
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-# @pytest.mark.skipif(TORCH_VERSION_AT_LEAST_2_5, reason="weight packing is updated in 2.5+")
 @pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
 def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
     n, k = shape
@@ -303,6 +304,36 @@ def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size
         test_utils=test_utils,
     )
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
+def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
+    n, k = shape
+    device = "cuda"
+
+    q = torch.randint(0, 16, shape, dtype=torch.int, device=device)
+    if TORCH_VERSION_AT_LEAST_2_5:
+        q = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8)
+    packed_w = torch._convert_weight_to_int4pack(q, inner_k_tiles)
+    q_groups = k // group_size
+    scales = torch.randn(n, q_groups, dtype=torch.bfloat16, device=device)
+    zeros = torch.randn_like(scales)
+    scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
+
+    # Test the case where CUDA SM version is less than 8.0
+    with patch('torch.cuda.get_device_capability', return_value=(7, 5)):
+        with pytest.raises(NotImplementedError) as excinfo:
+            scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
+        assert "4 bit quantization with tinygemm is not supported on this device" in str(excinfo.value)
+
+    # Test the case where CUDA SM version is 8.0 or higher (original behavior)
+    with patch('torch.cuda.get_device_capability', return_value=(8, 0)):
+        scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
+        # Add assertions to check if scales_and_zeros is correctly packed
+        # For example:
+        assert scales_and_zeros.shape == (n, q_groups, 2)
+        assert scales_and_zeros.dtype == torch.bfloat16
+
+
 MARLIN_24_BATCH_SIZE = [1, 4, 8, 16, 32, 64]
 MARLIN_24_K_CHUNKS = [128]
diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index 0beadfe5d..a96a72405 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -312,6 +312,10 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16
 
 
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
+    min_sm = (8, 0)
+    if torch.cuda.get_device_capability < min_sm:
+        raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}")
+
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
     guard_dtype_size(zeros, "zeros", dtype=dtype)
     return (
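Note (illustration only, not part of the diff): the guard added above compares torch.cuda.get_device_capability, the function object itself, against the (8, 0) tuple because the call parentheses are missing; the follow-up commits below add the parentheses, wrap the check in torch.cuda.is_available(), and settle on a strict < comparison. A minimal standalone sketch of the intended check, using an assumed helper name:

    import torch

    def _tinygemm_supported(min_sm=(8, 0)):
        # Assumption: treat CPU-only environments as fine, matching the later
        # is_available() guard added in this series.
        if not torch.cuda.is_available():
            return True
        # get_device_capability() returns a (major, minor) tuple, e.g. (7, 5) on a T4,
        # so plain tuple comparison orders devices by sm_X.Y.
        return torch.cuda.get_device_capability() >= min_sm
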
From 034ea1efefd4ca94f85063cf9343d397db5d57f7 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:24:07 -0700
Subject: [PATCH 2/6] push

---
 test/test_ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 5a4ea4db5..070f2e432 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -317,7 +317,6 @@ def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size
     q_groups = k // group_size
     scales = torch.randn(n, q_groups, dtype=torch.bfloat16, device=device)
     zeros = torch.randn_like(scales)
-    scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)

From 570a162b630d8417a78a97cc687ebfce42ee8062 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:26:25 -0700
Subject: [PATCH 3/6] push

---
 test/test_ops.py              | 50 ++++++++++-------------------------
 torchao/quantization/utils.py |  2 +-
 2 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 070f2e432..d2caaef91 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -275,35 +275,6 @@ def test_dequantize_tensor_core_tiled_layout_correctness_unpack_and_dequant(shap
 
     assert diff_op_ao < 1e-1
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
-def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
-    n, k = shape
-    device = "cuda"
-
-    q = torch.randint(0, 16, shape, dtype=torch.int, device=device)
-    if TORCH_VERSION_AT_LEAST_2_5:
-        q = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8)
-    packed_w = torch._convert_weight_to_int4pack(q, inner_k_tiles)
-    q_groups = k // group_size
-    scales = torch.randn(n, q_groups, dtype=torch.bfloat16, device=device)
-    zeros = torch.randn_like(scales)
-    scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
-
-    test_utils = [
-        "test_schema",
-        "test_autograd_registration",
-        "test_faketensor",
-    ]
-    # TODO: Figure out why test fails unless torch >= 2.5
-    if TORCH_VERSION_AT_LEAST_2_5:
-        test_utils.append("test_aot_dispatch_dynamic")
-    opcheck(
-        torch.ops.torchao.dequantize_tensor_core_tiled_layout,
-        (packed_w, scales_and_zeros, group_size, inner_k_tiles),
-        test_utils=test_utils,
-    )
-
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("shape, inner_k_tiles, group_size", TEST_CONFIGS_DEQUANT, ids=str)
 def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size):
@@ -324,16 +295,23 @@ def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size
             scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
         assert "4 bit quantization with tinygemm is not supported on this device" in str(excinfo.value)
 
-    # Test the case where CUDA SM version is 8.0 or higher (original behavior)
     with patch('torch.cuda.get_device_capability', return_value=(8, 0)):
         scales_and_zeros = pack_tinygemm_scales_and_zeros(scales, zeros)
-        # Add assertions to check if scales_and_zeros is correctly packed
-        # For example:
-        assert scales_and_zeros.shape == (n, q_groups, 2)
-        assert scales_and_zeros.dtype == torch.bfloat16
-
-
+    test_utils = [
+        "test_schema",
+        "test_autograd_registration",
+        "test_faketensor",
+    ]
+    # TODO: Figure out why test fails unless torch >= 2.5
+    if TORCH_VERSION_AT_LEAST_2_5:
+        test_utils.append("test_aot_dispatch_dynamic")
+    opcheck(
+        torch.ops.torchao.dequantize_tensor_core_tiled_layout,
+        (packed_w, scales_and_zeros, group_size, inner_k_tiles),
+        test_utils=test_utils,
+    )
+
 MARLIN_24_BATCH_SIZE = [1, 4, 8, 16, 32, 64]
 MARLIN_24_K_CHUNKS = [128]
 MARLIN_24_N_CHUNKS = [512]
diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index a96a72405..6c962df24 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -313,7 +313,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16
 
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
     min_sm = (8, 0)
-    if torch.cuda.get_device_capability < min_sm:
+    if torch.cuda.get_device_capability <= min_sm:
         raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}")
 
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
"test_faketensor", + ] + # TODO: Figure out why test fails unless torch >= 2.5 + if TORCH_VERSION_AT_LEAST_2_5: + test_utils.append("test_aot_dispatch_dynamic") + opcheck( + torch.ops.torchao.dequantize_tensor_core_tiled_layout, + (packed_w, scales_and_zeros, group_size, inner_k_tiles), + test_utils=test_utils, + ) + MARLIN_24_BATCH_SIZE = [1, 4, 8, 16, 32, 64] MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index a96a72405..6c962df24 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -313,7 +313,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16): min_sm = (8, 0) - if torch.cuda.get_device_capability < min_sm: + if torch.cuda.get_device_capability <= min_sm: raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size()) From 2386315a76ecdea4cb56f6ba885b71468b2c70a2 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 17 Oct 2024 20:27:13 -0700 Subject: [PATCH 4/6] p --- torchao/quantization/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index 6c962df24..7c6b41835 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -313,7 +313,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16): min_sm = (8, 0) - if torch.cuda.get_device_capability <= min_sm: + if torch.cuda.get_device_capability() <= min_sm: raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size()) From cb5a4ee1a1ab8d0e828c0c9a42362531f63b4ace Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 17 Oct 2024 20:38:50 -0700 Subject: [PATCH 5/6] push' --- torchao/quantization/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index 7c6b41835..6e07fee37 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -312,9 +312,10 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16): - min_sm = (8, 0) - if torch.cuda.get_device_capability() <= min_sm: - raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") + if torch.cuda.is_available(): + min_sm = (8, 0) + if torch.cuda.get_device_capability() <= min_sm: + raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}") guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size()) guard_dtype_size(zeros, "zeros", dtype=dtype) From 052966194a870de5956f05dfe2feae26830786ea Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 17 Oct 2024 20:10:49 -0800 Subject: [PATCH 6/6] Update 
From 052966194a870de5956f05dfe2feae26830786ea Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 17 Oct 2024 20:10:49 -0800
Subject: [PATCH 6/6] Update utils.py

---
 torchao/quantization/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index 6e07fee37..ded54cd39 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -314,7 +314,7 @@ def get_groupwise_affine_qparams(w, n_bit=4, groupsize=128, dtype=torch.bfloat16
 def pack_tinygemm_scales_and_zeros(scales, zeros, dtype=torch.bfloat16):
     if torch.cuda.is_available():
         min_sm = (8, 0)
-        if torch.cuda.get_device_capability() <= min_sm:
+        if torch.cuda.get_device_capability() < min_sm:
            raise NotImplementedError(f"4 bit quantization with tinygemm is not supported on this device as it requires sm_{min_sm[0]}.{min_sm[1]} or higher but got {torch.cuda.get_device_capability()}")
 
     guard_dtype_size(scales, "scales", dtype=dtype, size=zeros.size())
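With the final commit applied, pack_tinygemm_scales_and_zeros raises NotImplementedError only when a CUDA device is present and its compute capability is below sm_8.0; the switch from <= to < in the last commit is what lets sm_8.0 hardware itself through, since the kernel requires sm_8.0 or higher. A rough usage sketch of that end state, mocking the device query similarly to the test in patch 1 (an extra is_available patch is added so the sketch also runs on CPU-only machines, and the tensor shapes are arbitrary illustrations, not values from the patch):

    import torch
    import pytest
    from unittest.mock import patch
    from torchao.quantization.utils import pack_tinygemm_scales_and_zeros

    scales = torch.randn(64, 8, dtype=torch.bfloat16)
    zeros = torch.randn_like(scales)

    # Simulate a T4 (sm_7.5): the new guard should raise.
    with patch("torch.cuda.is_available", return_value=True), \
         patch("torch.cuda.get_device_capability", return_value=(7, 5)):
        with pytest.raises(NotImplementedError):
            pack_tinygemm_scales_and_zeros(scales, zeros)

    # Simulate an sm_8.0 device: packing proceeds exactly as before this series.
    with patch("torch.cuda.is_available", return_value=True), \
         patch("torch.cuda.get_device_capability", return_value=(8, 0)):
        packed = pack_tinygemm_scales_and_zeros(scales, zeros)
        assert packed.dtype == torch.bfloat16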