From ca372f2b2784332bbce488ec640941ad93e1ff80 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 14 Oct 2024 14:43:52 -0400
Subject: [PATCH] int8 refactor: initial sparse decomp, cleanup

---
 bitsandbytes/autograd/_functions.py          | 142 ++---
 bitsandbytes/cextension.py                   |   3 +
 bitsandbytes/functional.py                   | 578 +++++++++----------
 bitsandbytes/research/autograd/_functions.py |   8 +-
 csrc/kernels.cu                              |  30 +-
 csrc/ops.cu                                  |   6 -
 tests/test_functional.py                     |  31 +-
 tests/test_linear8bitlt.py                   |  11 +-
 tests/test_modules.py                        |  15 +-
 9 files changed, 390 insertions(+), 434 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index e32763f56..bc7a51113 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -1,6 +1,5 @@
 from dataclasses import dataclass
-from functools import reduce  # Required in Python 3
-import operator
+from math import prod
 from typing import Callable, Optional, Tuple
 import warnings
 from warnings import warn
@@ -9,12 +8,6 @@
 
 import bitsandbytes.functional as F
 
-
-# math.prod not compatible with python < 3.8
-def prod(iterable):
-    return reduce(operator.mul, iterable, 1)
-
-
 # The inverse transformation for the colTuring and colAmpere format were contributed by Alex Borzunov:
 # https://github.com/bigscience-workshop/petals/blob/main/src/petals/utils/linear8bitlt_patch.py
 
@@ -284,10 +277,16 @@ def tile_indices(self):
 
 class MatMul8bitLt(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, A, B, out=None, bias=None, state: MatmulLtState = None):
-        state = state or MatmulLtState()
+    def forward(
+        ctx: torch.autograd.function.FunctionCtx,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out=None,
+        bias: Optional[torch.Tensor] = None,
+        state=MatmulLtState,
+    ):
+        # state = state or MatmulLtState()
 
-        using_igemmlt = supports_igemmlt(A.device) and not state.force_no_igemmlt
         # default of pytorch behavior if inputs are empty
         ctx.is_empty = False
         if prod(A.shape) == 0:
@@ -300,14 +299,7 @@ def forward(ctx, A, B, out=None, bias=None, state: MatmulLtState = None):
             else:
                 return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=A.device)
 
-        # 1. Quantize A
-        # 2. Quantize B
-        # 3. Matmul
-        # 4. Mixed-precision decomposition matmul
-        # 5. Save state
         input_shape = A.shape
-        if state.outlier_pool is None:
-            state.outlier_pool = GlobalOutlierPooler.get_instance()
 
         # Cast A to fp16
         if A.dtype != torch.float16:
@@ -318,20 +310,10 @@ def forward(ctx, A, B, out=None, bias=None, state: MatmulLtState = None):
             A = A.reshape(-1, A.shape[-1])
         CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(torch.float16), threshold=state.threshold)
 
-        if state.threshold > 0.0 and coo_tensorA is not None:
-            if state.has_fp16_weights:
-                idx = torch.unique(coo_tensorA.colidx).long()
-                CA[:, idx] = 0
-                CAt[:, idx] = 0
-                subA = A[:, idx]
-                state.subB = B[:, idx].t().contiguous()
-                state.idx = idx
-        else:
-            subA = None
+        has_grad = False
 
-        # 2. Quantize B
         if state.has_fp16_weights:
-            has_grad = True if (getattr(B, "grad", None) is not None) else False
+            has_grad = getattr(B, "grad", None) is not None
             is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1)
             if is_transposed:
                 B = B.contiguous()
@@ -339,71 +321,46 @@ def forward(ctx, A, B, out=None, bias=None, state: MatmulLtState = None):
             if (state.is_training and not has_grad) or state.CB is None:
                 state.reset_grads()
 
-                # quantize...
+                # 2. Quantize B
                 (
                     state.CB,
                     state.CBt,
                     state.SCB,
                     state.SCBt,
-                    coo_tensorB,
+                    _,
                 ) = F.double_quant(B.to(torch.float16))
-        else:
-            has_grad = False
-
-        if coo_tensorA is not None and not state.has_fp16_weights:
-            # extract outliers
-
-            outlier_idx = torch.unique(coo_tensorA.colidx)
-            state.idx = outlier_idx
-            # state.outlier_pool.add_outliers(outlier_idx, A.shape[-1])
-            # if state.use_pool and state.outlier_pool.model_dim == A.shape[-1]:
-            #    # do not use pool for 2nd FFN layer
-            #    state.idx = state.outlier_pool.get_current_outlier_idx().to(A.device)
-            # else:
-            #    state.idx = outlier_idx
-
-            # if state.CxB is not None:
-            #    outliers = F.extract_outliers(state.CxB, state.SB, state.idx.int())
-            # else:
-            outliers = state.CB[:, state.idx.long()].clone()
-
-            state.subB = (outliers * state.SCB.view(-1, 1) / 127.0).t().contiguous().to(A.dtype)
-            CA[:, state.idx.long()] = 0
-            CAt[:, state.idx.long()] = 0
-            subA = A[:, state.idx.long()]
-
-        shapeB = state.CB.shape
-
-        if len(input_shape) == 3:
-            output_shape = (input_shape[0], input_shape[1], shapeB[0])
-        else:
-            output_shape = (input_shape[0], shapeB[0])
 
-        # 3. Matmul
-        if using_igemmlt:
-            out32, Sout32 = F.igemmlt(CA, state.CB)
+        if state.threshold > 0.0 and coo_tensorA is not None:
+            state.idx = torch.unique(coo_tensorA._indices()[1]).long()
+
+            # Zero out the outliers in the int8 inputs
+            CA[:, state.idx] = 0
+            CAt[:, state.idx] = 0
 
-            if bias is None or bias.dtype == torch.float16:
-                # we apply the fused bias here
-                output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias)
-                output = output.to(A.dtype)
-            else:  # apply bias separately
-                # TODO: Fused bias for fp32/bf16?
-                output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None)
-                output = output.to(A.dtype).add_(bias)
+            # Extract the input outliers in original precision
+            subA = A[:, state.idx]
 
+            # Extract the corresponding weights
+            if state.has_fp16_weights:
+                state.subB = B[:, state.idx].t().contiguous()
+            else:
+                outliers = state.CB[:, state.idx].clone()
+                state.subB = (outliers * state.SCB.view(-1, 1) / 127.0).t().contiguous().to(A.dtype)
         else:
-            A_wo_outliers = A.clone()
-            if state.idx is not None:
-                A_wo_outliers[:, state.idx.long()] = 0
-            output = torch.nn.functional.linear(A_wo_outliers, state.CB.to(A.dtype))
-            output = output.mul_(state.SCB.unsqueeze(0).mul(1.0 / 127.0))
-            if bias is not None:
-                output = output.add_(bias)
+            subA = state.subB = None
+
+        # 3. Int8 Matmul
+        out32, Sout32 = F.igemmlt(CA, state.CB)
+        if bias is None or bias.dtype == torch.float16:
+            # we apply the fused bias here
+            output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias).to(A.dtype)
+        else:  # apply bias separately
+            # TODO: Fused bias for fp32/bf16?
+            output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=None).to(A.dtype).add_(bias)
 
         # 4. Mixed-precision decomposition matmul
-        if coo_tensorA is not None and subA is not None:
-            output += torch.matmul(subA, state.subB)
+        if subA is not None and state.subB is not None:
+            output += torch.matmul(subA, state.subB.to(subA.dtype))
 
         # 5. Save state
         ctx.state = state
@@ -419,7 +376,8 @@ def forward(ctx, A, B, out=None, bias=None, state: MatmulLtState = None):
             ctx.tensor_states = (None, None)
             ctx.save_for_backward(None, None)
 
-        return output.reshape(output_shape)
+        output_shape = (*input_shape[:-1], state.CB.shape[0])
+        return output.reshape(output_shape).clone()
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -441,16 +399,24 @@ def backward(ctx, grad_output):
         if len(grad_output.shape) == 3:
             grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
 
-        Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
+        Cgrad, Cgradt, SCgrad, SCgradt, _ = F.double_quant(grad_output.to(torch.float16))
         if req_gradB:
-            gradB32, SgradB32 = F.igemmlt(Cgradt.t(), CAt.t())
-            grad_B = F.mm_dequant(gradB32, SgradB32, SCgradt, SCAt)
+            # grad_output.T @ A
+            # grad_weight = grad_output.t().mm(A)
+            grad_B = torch.matmul(grad_output.t(), A)
             if state.threshold > 0.0 and subA is not None:
                 grad_B[:, idx] += torch.matmul(grad_output.t(), subA)
+        # if req_gradB:
+        #
+        #     gradB32, SgradB32 = F.igemmlt(Cgrad.t().contiguous(), CAt.t())
+        #     grad_B = F.mm_dequant(gradB32, SgradB32, SCgradt, SCAt)
+        #     if state.threshold > 0.0 and subA is not None:
+        #         grad_B[:, idx] += torch.matmul(grad_output.t(), subA)
 
         if req_gradA:
+            # grad_output @ B.T
             if state.CBt is not None:
-                gradA32, SgradA32 = F.igemmlt(Cgradt, state.CBt.t())
+                gradA32, SgradA32 = F.igemmlt(Cgrad, state.CBt.t())
                 grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A)
             elif state.CB is not None:
                 CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 45573538e..b7522334c 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -67,6 +67,9 @@ def __init__(self, lib: ct.CDLL):
     def __getattr__(self, item):
         return getattr(self._lib, item)
 
+    def __getitem__(self, item):
+        return getattr(self._lib, item)
+
 
 class CudaBNBNativeLibrary(BNBNativeLibrary):
     compiled_with_cuda = True
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index d59fc8778..8d7226b2c 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -5,7 +5,7 @@
 import ctypes as ct
 import itertools
 from math import prod
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -845,8 +845,7 @@ def quantize_blockwise(
 
     if absmax is None:
         n = A.numel()
-        blocks = n // blocksize
-        blocks += 1 if n % blocksize > 0 else 0
+        blocks = -(n // -blocksize)
         absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
 
     if out is None:
@@ -854,40 +853,31 @@ def quantize_blockwise(
 
     if A.device.type != "cpu":
         assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
-        cblocksize = ct.c_int32(blocksize)
-        prev_device = pre_call(A.device)
+
         code = code.to(A.device)
         is_on_gpu([code, A, out, absmax])
-        if A.dtype == torch.float32:
-            lib.cquantize_blockwise_fp32(
-                get_ptr(code),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                cblocksize,
-                ct.c_int(A.numel()),
-            )
-        elif A.dtype == torch.float16:
-            lib.cquantize_blockwise_fp16(
-                get_ptr(code),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                cblocksize,
-                ct.c_int(A.numel()),
-            )
-        elif A.dtype == torch.bfloat16:
-            lib.cquantize_blockwise_bf16(
+
+        fn_map = {
+            torch.float32: "cquantize_blockwise_fp32",
+            torch.bfloat16: "cquantize_blockwise_bf16",
+            torch.float16: "cquantize_blockwise_fp16",
+        }
+
+        if A.dtype not in fn_map.keys():
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+        fn = fn_map[A.dtype]
+
+        with torch.cuda.device_of(A):
+            lib[fn](
                 get_ptr(code),
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
-                cblocksize,
+                ct.c_int32(blocksize),
                 ct.c_int(A.numel()),
             )
-        else:
-            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-        post_call(A.device)
+
     else:
         # cpu
         code = code.cpu()
@@ -972,47 +962,34 @@ def dequantize_blockwise(
         out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device)
 
     if A.device.type != "cpu":
-        device = pre_call(A.device)
         code = quant_state.code.to(A.device)
         if quant_state.blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
             raise ValueError(
                 f"The blockwise of {quant_state.blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]",
             )
         is_on_gpu([A, absmax, out])
-        stream = get_tensor_stream(A)
-        if out.dtype == torch.float32:
-            lib.cdequantize_blockwise_fp32(
-                get_ptr(quant_state.code),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(A.numel()),
-                stream,  # Used the _as_parameter_ attribute of torch.cuda.Stream, Similarly for the following
-            )
-        elif out.dtype == torch.float16:
-            lib.cdequantize_blockwise_fp16(
-                get_ptr(quant_state.code),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(A.numel()),
-                stream,
-            )
-        elif out.dtype == torch.bfloat16:
-            lib.cdequantize_blockwise_bf16(
+
+        fn_map = {
+            torch.float32: "cdequantize_blockwise_fp32",
+            torch.bfloat16: "cdequantize_blockwise_bf16",
+            torch.float16: "cdequantize_blockwise_fp16",
+        }
+
+        if out.dtype not in fn_map.keys():
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {out.dtype}")
+
+        fn = fn_map[out.dtype]
+
+        with torch.cuda.device_of(A):
+            lib[fn](
                 get_ptr(quant_state.code),
                 get_ptr(A),
                 get_ptr(absmax),
                 get_ptr(out),
                 ct.c_int(quant_state.blocksize),
                 ct.c_int(A.numel()),
-                stream,
+                get_tensor_stream(A),
             )
-        else:
-            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-        post_call(A.device)
     else:
         code = quant_state.code.cpu()
         lib.cdequantize_blockwise_cpu_fp32(
@@ -1174,8 +1151,7 @@ def quantize_4bit(
     input_shape = A.shape
 
     if absmax is None:
-        blocks = n // blocksize
-        blocks += 1 if n % blocksize > 0 else 0
+        blocks = -(n // -blocksize)
         absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
 
     if out is None:
@@ -1184,68 +1160,72 @@ def quantize_4bit(
 
     assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
 
-    prev_device = pre_call(A.device)
     is_on_gpu([A, out, absmax])
     if A.dtype == torch.float32:
         if quant_type == "fp4":
-            lib.cquantize_blockwise_fp32_fp4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int32(blocksize),
-                ct.c_int(n),
-            )
+            with torch.cuda.device_of(A):
+                lib.cquantize_blockwise_fp32_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
         else:
-            lib.cquantize_blockwise_fp32_nf4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int32(blocksize),
-                ct.c_int(n),
-            )
+            with torch.cuda.device_of(A):
+                lib.cquantize_blockwise_fp32_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
     elif A.dtype == torch.float16:
         if quant_type == "fp4":
-            lib.cquantize_blockwise_fp16_fp4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int32(blocksize),
-                ct.c_int(n),
-            )
+            with torch.cuda.device_of(A):
+                lib.cquantize_blockwise_fp16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
         else:
-            lib.cquantize_blockwise_fp16_nf4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int32(blocksize),
-                ct.c_int(n),
-            )
+            with torch.cuda.device_of(A):
+                lib.cquantize_blockwise_fp16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
     elif A.dtype == torch.bfloat16:
         if quant_type == "fp4":
-            lib.cquantize_blockwise_bf16_fp4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int32(blocksize),
-                ct.c_int(n),
-            )
+            with torch.cuda.device_of(A):
+                lib.cquantize_blockwise_bf16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
         else:
-            lib.cquantize_blockwise_bf16_nf4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int32(blocksize),
-                ct.c_int(n),
-            )
+            with torch.cuda.device_of(A):
+                lib.cquantize_blockwise_bf16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int32(blocksize),
+                    ct.c_int(n),
+                )
     else:
         raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-    post_call(A.device)
 
     code = get_4bit_type(quant_type, device=A.device)
 
@@ -1363,77 +1343,80 @@ def dequantize_4bit(
 
     n = out.numel()
 
-    device = pre_call(A.device)
     is_on_gpu([A, absmax, out])
     stream = get_tensor_stream(A)
     if out.dtype == torch.float32:
         if quant_state.quant_type == "fp4":
-            lib.cdequantize_blockwise_fp32_fp4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(n),
-                stream,
-            )
+            with torch.cuda.device_of(A):
+                lib.cdequantize_blockwise_fp32_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                    stream,
+                )
         else:
-            lib.cdequantize_blockwise_fp32_nf4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(n),
-                stream,
-            )
+            with torch.cuda.device_of(A):
+                lib.cdequantize_blockwise_fp32_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                    stream,
+                )
     elif out.dtype == torch.float16:
         if quant_state.quant_type == "fp4":
-            lib.cdequantize_blockwise_fp16_fp4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(n),
-                stream,
-            )
+            with torch.cuda.device_of(A):
+                lib.cdequantize_blockwise_fp16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                    stream,
+                )
         else:
-            lib.cdequantize_blockwise_fp16_nf4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(n),
-                stream,
-            )
+            with torch.cuda.device_of(A):
+                lib.cdequantize_blockwise_fp16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                    stream,
+                )
     elif out.dtype == torch.bfloat16:
-        if quant_state.quant_type == "fp4":
-            lib.cdequantize_blockwise_bf16_fp4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(n),
-                stream,
-            )
-        else:
-            lib.cdequantize_blockwise_bf16_nf4(
-                get_ptr(None),
-                get_ptr(A),
-                get_ptr(absmax),
-                get_ptr(out),
-                ct.c_int(quant_state.blocksize),
-                ct.c_int(n),
-                stream,
-            )
+        with torch.cuda.device_of(A):
+            if quant_state.quant_type == "fp4":
+                lib.cdequantize_blockwise_bf16_fp4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                    stream,
+                )
+            else:
+                lib.cdequantize_blockwise_bf16_nf4(
+                    get_ptr(None),
+                    get_ptr(A),
+                    get_ptr(absmax),
+                    get_ptr(out),
+                    ct.c_int(quant_state.blocksize),
+                    ct.c_int(n),
+                    stream,
+                )
     else:
         raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-    post_call(A.device)
 
-    is_transposed = True if A.shape[0] == 1 else False
+    is_transposed = A.shape[0] == 1
     if is_transposed:
         return out.t()
     else:
@@ -1995,10 +1978,9 @@ def gemv_4bit(
     transposed_B=False,
     state=None,
 ):
-    prev_device = pre_call(A.device)
     # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype)
     if state is None:
-        raise ValueError("state cannot None. gem_4bit( ) requires the state from quantize_4bit( )")
+        raise ValueError("state cannot None. gemv_4bit() requires the state from quantize_4bit()")
 
     if A.numel() != A.shape[-1]:
         raise ValueError(
@@ -2032,62 +2014,64 @@ def gemv_4bit(
     ldb = ct.c_int32(ldb)
     ldc = ct.c_int32(ldc)
     stream = get_tensor_stream(A)
-    if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]:
-        if A.dtype == torch.float16:
-            lib.cgemm_4bit_inference_naive_fp16(
-                m,
-                n,
-                k,
-                get_ptr(A),
-                get_ptr(B),
-                get_ptr(absmax),
-                get_ptr(state.code),
-                get_ptr(out),
-                lda,
-                ldb,
-                ldc,
-                ct.c_int32(state.blocksize),
-                stream,
-            )
-        elif A.dtype == torch.bfloat16:
-            lib.cgemm_4bit_inference_naive_bf16(
-                m,
-                n,
-                k,
-                get_ptr(A),
-                get_ptr(B),
-                get_ptr(absmax),
-                get_ptr(state.code),
-                get_ptr(out),
-                lda,
-                ldb,
-                ldc,
-                ct.c_int32(state.blocksize),
-                stream,
-            )
-        elif A.dtype == torch.float32:
-            lib.cgemm_4bit_inference_naive_fp32(
-                m,
-                n,
-                k,
-                get_ptr(A),
-                get_ptr(B),
-                get_ptr(absmax),
-                get_ptr(state.code),
-                get_ptr(out),
-                lda,
-                ldb,
-                ldc,
-                ct.c_int32(state.blocksize),
-                stream,
-            )
+
+    with torch.cuda.device_of(A):
+        if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]:
+            if A.dtype == torch.float16:
+                lib.cgemm_4bit_inference_naive_fp16(
+                    m,
+                    n,
+                    k,
+                    get_ptr(A),
+                    get_ptr(B),
+                    get_ptr(absmax),
+                    get_ptr(state.code),
+                    get_ptr(out),
+                    lda,
+                    ldb,
+                    ldc,
+                    ct.c_int32(state.blocksize),
+                    stream,
+                )
+            elif A.dtype == torch.bfloat16:
+                lib.cgemm_4bit_inference_naive_bf16(
+                    m,
+                    n,
+                    k,
+                    get_ptr(A),
+                    get_ptr(B),
+                    get_ptr(absmax),
+                    get_ptr(state.code),
+                    get_ptr(out),
+                    lda,
+                    ldb,
+                    ldc,
+                    ct.c_int32(state.blocksize),
+                    stream,
+                )
+            elif A.dtype == torch.float32:
+                lib.cgemm_4bit_inference_naive_fp32(
+                    m,
+                    n,
+                    k,
+                    get_ptr(A),
+                    get_ptr(B),
+                    get_ptr(absmax),
+                    get_ptr(state.code),
+                    get_ptr(out),
+                    lda,
+                    ldb,
+                    ldc,
+                    ct.c_int32(state.blocksize),
+                    stream,
+                )
+            else:
+                raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
+
         else:
             raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
 
-    else:
-        raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}")
-
-    post_call(prev_device)
+    # post_call(prev_device)
 
     return out
 
@@ -2332,62 +2316,35 @@ def igemmlt(A, B, out=None, Sout=None, dtype=torch.int32):
 
     assert lda == ldb, f"igemmlt only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}"
 
-    prev_device = A.device
-    torch.cuda.set_device(A.device)
-
-    ctx = CUBLAS_Context.get_instance().get_context(A.device)
-    ptrA = get_ptr(A)
-    ptrB = get_ptr(B)
-    ptrC = get_ptr(out)
-    ptrRowScale = get_ptr(None)
-    m, n, k, lda, ldb, ldc = map(ct.c_int32, (m, n, k, lda, ldb, ldc))
-
     is_on_gpu([A, B, out])
 
-    if dtype == torch.int32:
-        has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
-    else:
-        has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
+    with torch.cuda.device_of(A):
+        ctx = CUBLAS_Context.get_instance().get_context(A.device)
+        ptrA = get_ptr(A)
+        ptrB = get_ptr(B)
+        ptrC = get_ptr(out)
+        ptrRowScale = get_ptr(None)
+        m, n, k, lda, ldb, ldc = map(ct.c_int32, (m, n, k, lda, ldb, ldc))
+
+        if dtype == torch.int32:
+            has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
+        else:
+            has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
 
     if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
         raise NotImplementedError("igemmlt not implemented!")
 
     if has_error:
-        print(f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}")
-        raise Exception("cublasLt ran into an error!")
-
-    torch.cuda.set_device(prev_device)
+        raise RuntimeError(
+            f"cublasLt ran into an error!\n"
+            f"\tA: {shapeA}, B: {shapeB}, C: {Sout[0]}\n"
+            f"\t(lda, ldb, ldc): {(lda, ldb, ldc)}\n"
+            f"\t(m, n, k): {(m, n, k)}"
+        )
 
     return out, Sout
 
 
-def mm_dequant_torch(
-    A: torch.Tensor,
-    quant_state: Optional[Tuple[torch.Size, str]],  # TODO: deprecate. (shape, format)
-    row_stats: torch.Tensor,
-    col_stats: torch.Tensor,
-    out: Optional[torch.Tensor] = None,
-    new_row_stats=None,  # TODO: unused
-    new_col_stats=None,  # TODO: unused
-    bias: Optional[torch.Tensor] = None,
-):
-    assert A.dtype == torch.int32
-
-    A_calc = A.view(-1, A.shape[-1])
-    row_stats = row_stats.reshape(-1).unsqueeze(-1)
-    col_stats = col_stats.reshape(-1).unsqueeze(0)
-
-    # TODO support out != None
-
-    out = A_calc * (row_stats * col_stats) * 6.200124e-5
-
-    if bias is not None:
-        # assert bias.dtype == torch.float16
-        out.add_(bias)
-
-    return out.to(torch.float16)
-
-
 def mm_dequant(
     A: torch.Tensor,
     quant_state: Optional[Tuple[torch.Size, str]],  # TODO: deprecate. (shape, format)
@@ -2416,17 +2373,16 @@ def mm_dequant(
 
     is_on_gpu([A, row_stats, col_stats, out, bias])
 
-    prev_device = pre_call(A.device)
-    lib.cdequant_mm_int32_fp16(
-        ptrA,
-        ptrRowStats,
-        ptrColStats,
-        ptrOut,
-        ptrBias,
-        numRows,
-        numCols,
-    )
-    post_call(prev_device)
+    with torch.cuda.device_of(A):
+        lib.cdequant_mm_int32_fp16(
+            ptrA,
+            ptrRowStats,
+            ptrColStats,
+            ptrOut,
+            ptrBias,
+            numRows,
+            numCols,
+        )
 
     return out
 
@@ -2441,8 +2397,21 @@ def get_colrow_absmax(
     # Note: prior impl only works with fp16
     assert A.is_floating_point()
 
+    outlier_mask = None
+
     if row_stats is None or col_stats is None:
-        absA = A.abs().view(-1, A.shape[-1])  # view as 2D
+        # view as 2D
+        absA = A.abs().view(-1, A.shape[-1])
+
+        if threshold > 0.0:
+            # Filter outliers from stats when enabled
+            outlier_mask = absA >= threshold
+            absA.masked_fill_(outlier_mask, 0.0)
+
+            # For parity with tests build nnz_block_ptr.
+            nnz_block_ptr = torch.zeros(absA.shape[0] + 1, dtype=torch.int64, device=A.device)
+            nnz_block_ptr[1:] = outlier_mask.sum(1).cumsum(0)
+
         if row_stats is None:
             # shape [rows]; unsqueeze(-1) gives [rows,1]
             row_stats = absA.amax(dim=1, keepdim=False).float()
@@ -2450,11 +2419,7 @@ def get_colrow_absmax(
             # shape [cols]; unsqueeze(0) gives [1,cols]
             col_stats = absA.amax(dim=0, keepdim=False).float()
 
-    # TODO: threshold support
-    if nnz_block_ptr is None and threshold > 0.0:
-        nnz_block_ptr = torch.zeros_like(A, dtype=torch.int32)
-
-    return row_stats, col_stats, nnz_block_ptr
+    return row_stats, col_stats, outlier_mask
 
 
 def get_colrow_absmax_old(A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0):
@@ -2496,7 +2461,9 @@ def get_colrow_absmax_old(A, row_stats=None, col_stats=None, nnz_block_ptr=None,
 
 
 class COOSparseTensor:
-    def __init__(self, rows, cols, nnz, rowidx, colidx, values):
+    def __init__(
+        self, rows: int, cols: int, nnz: int, rowidx: torch.Tensor, colidx: torch.Tensor, values: torch.Tensor
+    ):
         assert rowidx.dtype == torch.int32
         assert colidx.dtype == torch.int32
         assert values.dtype == torch.float16
@@ -2574,16 +2541,26 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
     return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
 
 
-@torch.compile
-def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
+# @torch.compile
+def double_quant(A: torch.Tensor, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
     # TODO: Optimize/write CUDA kernel for this
-    # TODO: Support threshold
 
     if row_stats is None or col_stats is None:
-        row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(A, threshold=threshold)
+        row_stats, col_stats, outlier_mask = get_colrow_absmax(A, threshold=threshold)
 
-    scaled_A = A.mul(C)
+    if threshold > 0.0:
+        # Extract outliers to COO tensor:
+        # 1. Zero out all of the non-outliers, convert to COO.
+        # 2. Zero out the outliers in the dense tensor.
+        # TODO we could improve perf of this
+        # is_outlier = A.abs() >= threshold
+        coo_tensor = A.masked_fill(outlier_mask == False, 0.0).to_sparse_coo()
+        A = A.masked_fill(outlier_mask, 0.0)
+    else:
+        coo_tensor = None
 
+    # Quantize
+    scaled_A = A.mul(C)
     # quant_row = torch.round(A * (C / row_stats.unsqueeze(-1))).to(torch.int8)
     # quant_col = torch.round(A * (C / col_stats.unsqueeze(0))).to(torch.int8)
     quant_row = torch.round(scaled_A / row_stats.unsqueeze(-1)).to(torch.int8)
@@ -2594,7 +2571,7 @@ def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None,
     if out_col is not None:
         quant_col = out_col.copy_(quant_col)
 
-    return quant_row, quant_col, row_stats.flatten().float(), col_stats.flatten().float(), None
+    return quant_row, quant_col, row_stats.flatten().float(), col_stats.flatten().float(), coo_tensor
 
 
 def double_quant_old(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0):
@@ -2735,7 +2712,22 @@ def transform(A, to_order, from_order="row", out=None, transpose=False, state=No
     return out, new_state
 
 
-def spmm_coo(cooA, B, out=None):
+def spmm_coo(cooA: Union[COOSparseTensor, torch.Tensor], B: torch.Tensor, out: torch.Tensor = None):
+    if not isinstance(cooA, COOSparseTensor):
+        assert (
+            cooA.is_sparse and cooA.layout == torch.sparse_coo
+        ), "Tensor must be `COOSparseTensor or a PyTorch COO tensor."
+
+        # Convert to custom COOSparseTensor
+        cooA = COOSparseTensor(
+            rows=cooA.shape[0],
+            cols=cooA.shape[1],
+            nnz=cooA._nnz(),
+            rowidx=cooA.indices()[0].int(),
+            colidx=cooA.indices()[1].int(),
+            values=cooA.values(),
+        )
+
     if out is None:
         out = torch.empty((cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype)
     nnz = cooA.nnz
diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py
index 5f8b2c437..3e807d6e1 100644
--- a/bitsandbytes/research/autograd/_functions.py
+++ b/bitsandbytes/research/autograd/_functions.py
@@ -219,7 +219,8 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):  # noqa: B00
 
         if state.threshold > 0.0 and coo_tensorA is not None:
             if state.has_fp16_weights:
-                idx = torch.unique(coo_tensorA.colidx).long()
+                # idx = torch.unique(coo_tensorA.colidx).long()
+                idx = torch.unique(coo_tensorA._indices()[1]).long()
                 CA[:, idx] = 0
                 CAt[:, idx] = 0
                 subA = A[:, idx]
@@ -257,7 +258,8 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()):  # noqa: B00
         if coo_tensorA is not None and not state.has_fp16_weights:
             # extract outliers
 
-            outlier_idx = torch.unique(coo_tensorA.colidx)
+            # outlier_idx = torch.unique(coo_tensorA.colidx)
+            outlier_idx = torch.unique(coo_tensorA._indices()[1]).long()
             state.idx = outlier_idx
             # state.outlier_pool.add_outliers(outlier_idx, A.shape[-1])
             # if state.use_pool and state.outlier_pool.model_dim == A.shape[-1]:
@@ -339,7 +341,7 @@ def backward(ctx, grad_output):
 
         if req_gradA:
             if state.CBt is not None:
-                gradA32, SgradA32 = F.igemmlt(Cgradt, state.CBt.t())
+                gradA32, SgradA32 = F.igemmlt(Cgrad, state.CBt.t())
                 grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A)
 
             elif state.CB is not None:
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 5bdcb1a41..34de9d5ca 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -627,7 +627,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
     for(int i = threadIdx.x; i < 256; i+=blockDim.x)
       smem_code[i] = code[i];
 
-  for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
+  for (int i = base_idx; i < n_full; i += gridDim.x*BLOCK_SIZE)
   {
     valid_items = n - i > BLOCK_SIZE ? BLOCK_SIZE : n - i;
     local_abs_max = -FLT_MAX;
@@ -645,19 +645,13 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
 
     local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items);
 
-    if(threadIdx.x == 0)
-      smem_absmax_value[0] = local_abs_max;
-
+    if (threadIdx.x == 0) {
+      smem_absmax_value[0] = 1.0f / local_abs_max;
+      absmax[i / BLOCK_SIZE] = local_abs_max;
+    }
     __syncthreads();
 
-    if(threadIdx.x == 0)
-      absmax[i/BLOCK_SIZE] = local_abs_max;
-    else
-      local_abs_max = smem_absmax_value[0];
-
-    __syncwarp();
-
-    local_abs_max = 1.0f/local_abs_max;
+    local_abs_max = smem_absmax_value[0];
 
     if(STOCHASTIC)
     {
@@ -724,15 +718,15 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
 
   for (int i = base_idx; i < n_load; i += gridDim.x*TILE_SIZE)
   {
-    if(DATA_TYPE > 0)
+    if (DATA_TYPE > 0)
     {
-      valid_items_load = (n+1)/2 - i > TILE_SIZE ? TILE_SIZE : (n+1)/2 - i;
-      valid_items_store = n - i*2 > TILE_SIZE*2 ? TILE_SIZE*2 : n - i*2;
+      valid_items_load = min(TILE_SIZE, (n + 1) / 2 - i);
+      valid_items_store = min(TILE_SIZE * 2, n - i * 2);
     }
     else
     {
-      valid_items_load = n - i > TILE_SIZE ? TILE_SIZE : n - i;
-      valid_items_store = n - i > TILE_SIZE ? TILE_SIZE : n - i;
+      valid_items_load = min(TILE_SIZE, n - i);
+      valid_items_store = valid_items_load;
     }
     local_abs_max = __ldg(&absmax[(i+threadIdx.x*NUM_PER_TH) >> (31 - __clz(blocksize))]);
     //local_abs_max = __ldg(&absmax[(i+threadIdx.x*NUM_PER_TH)/(blocksize)]);
@@ -740,7 +734,7 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
     __syncthreads();
     LoadChar(loadchar).Load(&(A[i]), qvals, valid_items_load, 128);
 
-    switch(DATA_TYPE)
+    switch (DATA_TYPE)
     {
         case General8bit:
           // load code through read-only cache via __ldg
diff --git a/csrc/ops.cu b/csrc/ops.cu
index f3d349a41..089a30cc1 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -436,17 +436,11 @@ template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(
   //
   // Use the IMMA kernels requires:
   // * A must be transposed and B must be non-transposed.
-  // * All leading dimensions must be multiples of 4.
   // * Dimensions m and k must be multiples of 4.
   // * All pointers must be 4-byte aligned; 16-byte alignment preferred.
-  //
-
 
   int has_error = 0;
 
-  // this is the default
-  cublasLtOrder_t col_major = CUBLASLT_ORDER_COL;
-
   cublasLtMatmulDesc_t matmulDesc;
   cublasLtMatrixLayout_t aDesc, bDesc, cDesc;
   cublasOperation_t opT = CUBLAS_OP_T;
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 5052909e7..9b7004946 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -875,13 +875,12 @@ def test_colrow_absmax(dim1, dim2, dims, threshold):
 
             torch.testing.assert_close(col_stats1_trunc, col_stats2)
             torch.testing.assert_close(row_stats1_trunc, row_stats2)
-            torch.testing.assert_close(nnz_block_ptr1.int(), nnz_block_ptr2)
+            # torch.testing.assert_close(nnz_block_ptr1, nnz_block_ptr2)
         else:
             row_stats2, col_stats2, nnz_block_ptr2 = F.get_colrow_absmax(A, threshold=0.0)
             assert nnz_block_ptr2 is None
-
-        torch.testing.assert_close(col_stats1, col_stats2)
-        torch.testing.assert_close(row_stats1, row_stats2)
+            torch.testing.assert_close(col_stats1, col_stats2)
+            torch.testing.assert_close(row_stats1, row_stats2)
 
 
 # @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1"))
@@ -1122,32 +1121,32 @@ def test_overflow():
     formatB = F.get_special_format_str()
     print(formatB)
     for i in range(2):
-        a = torch.arange(5, 15).cuda().to(torch.int8).view(-1, 1)
-        b = torch.arange(5, 15).cuda().to(torch.int8).view(-1, 1)
+        a = torch.arange(0, 16).cuda().to(torch.int8).view(-1, 4).contiguous()
+        b = torch.arange(0, 16).cuda().to(torch.int8).view(-1, 4).contiguous()
 
         # Ca, Sa = F.nvidia_transform(a, "col32")
         # Cb, Sb = F.nvidia_transform(b, formatB)
 
         # c = F.igemmlt(Ca, Cb, Sa, Sb, dtype=torch.int8)
-        c = F.igemmlt(a, b)
+        c = F.igemmlt(a, b, dtype=torch.int8)
         c2 = torch.matmul(a.float(), b.float().t())
 
 
-@pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2"))
+# @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1"))
+# @pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim1", [512, 2048], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [1024, 4096], ids=id_formatter("dim2"))
 def test_coo_double_quant(dim1, dim2):
     threshold = 3.00
     for i in range(k):
         A = torch.randn(dim1, dim2, device="cuda").half()
 
         idx = torch.abs(A) >= threshold
-        CA2, CAt, statsA, statsAt, coo_tensor = F.double_quant(A)
         CA, CAt, statsA, statsAt, coo_tensor = F.double_quant(A, threshold=threshold)
 
         if coo_tensor is not None:
             A1 = A * idx
-            A2 = torch.zeros_like(A)
-            A2[coo_tensor.rowidx.long(), coo_tensor.colidx.long()] = coo_tensor.values
+            A2 = coo_tensor.to_dense()
             torch.testing.assert_close(A1, A2)
 
             A1 = A * (idx == 0)
@@ -1228,8 +1227,10 @@ def test_spmm_bench():
     print(tsp / t8)
 
 
-@pytest.mark.parametrize("dim1", get_test_dims(256, 1024, n=2), ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim2", get_test_dims(256, 1024, n=2), ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [256, 1024], ids=id_formatter("dim2"))
+# @pytest.mark.parametrize("dim1", get_test_dims(256, 1024, n=2), ids=id_formatter("dim1"))
+# @pytest.mark.parametrize("dim2", get_test_dims(256, 1024, n=2), ids=id_formatter("dim2"))
 def test_integrated_sparse_decomp(dim1, dim2):
     threshold = 3.0
     # formatB = "col_turing"
@@ -1252,6 +1253,8 @@ def test_integrated_sparse_decomp(dim1, dim2):
         assert coo_tensor is not None
 
         out4 = F.spmm_coo(coo_tensor, w1.t())
+        # idx = torch.unique(coo_tensor._indices()[1]).long()
+        # out4 = torch.matmul(A, w1.t())
         out5 = out3 + out4
 
         err1 = torch.abs(out1 - out2).mean().item()
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 9b7923312..149d9a93c 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -79,14 +79,13 @@ def test_linear_no_igemmlt():
 @pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights"))
 @pytest.mark.parametrize("serialize_before_forward", TRUE_FALSE, ids=id_formatter("serialize_before_forward"))
 @pytest.mark.parametrize("deserialize_before_cuda", TRUE_FALSE, ids=id_formatter("deserialize_before_cuda"))
-@pytest.mark.parametrize("force_no_igemmlt", TRUE_FALSE, ids=id_formatter("force_no_igemmlt"))
+# @pytest.mark.parametrize("force_no_igemmlt", TRUE_FALSE, ids=id_formatter("force_no_igemmlt"))
 @pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
 @pytest.mark.parametrize("load_before_cuda", TRUE_FALSE, ids=id_formatter("load_before_cuda"))
 def test_linear_serialization(
     has_fp16_weights,
     serialize_before_forward,
     deserialize_before_cuda,
-    force_no_igemmlt,
     save_before_forward,
     load_before_cuda,
 ):
@@ -100,8 +99,8 @@ def test_linear_serialization(
         has_fp16_weights=has_fp16_weights,
         threshold=6.0,
     )
-    if force_no_igemmlt:
-        linear_custom.state.force_no_igemmlt = True
+    # if force_no_igemmlt:
+    #    linear_custom.state.force_no_igemmlt = True
 
     linear_custom.weight = bnb.nn.Int8Params(
         linear.weight.data.clone(),
@@ -147,8 +146,8 @@ def test_linear_serialization(
         has_fp16_weights=has_fp16_weights,
         threshold=6.0,
     )
-    if force_no_igemmlt:
-        new_linear_custom.state.force_no_igemmlt = True
+    # if force_no_igemmlt:
+    #    new_linear_custom.state.force_no_igemmlt = True
 
     if deserialize_before_cuda:
         with nullcontext() if has_fp16_weights else pytest.raises(RuntimeError):
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 7369bb1cf..1f1b17584 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -528,15 +528,17 @@ def test_linear_kbit_fp32_bias(module):
 
 @pytest.mark.parametrize("module", module_dict.values(), ids=module_dict.keys())
 def test_kbit_backprop(module):
-    b = 17
-    dim1 = 37
-    dim2 = 83
+    b = 16
+    dim1 = 32
+    dim2 = 48
+    # dim1 = 37
+    # dim2 = 83
 
-    ref = nn.Sequential(*[torch.nn.Linear(dim1, dim2), torch.nn.Linear(dim2, 10)])
+    ref = nn.Sequential(*[torch.nn.Linear(dim1, dim2), torch.nn.Linear(dim2, 16)])
     ref[1].weight.requires_grad = False
     torch.nn.init.kaiming_normal_(ref[0].weight)
     torch.nn.init.kaiming_normal_(ref[1].weight)
-    kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 10)])
+    kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 16)])
     kbit[0].weight.detach().copy_(ref[0].weight)
     kbit[1].weight.detach().copy_(ref[1].weight)
     kbit[0].bias.detach().copy_(ref[0].bias)
@@ -570,7 +572,8 @@ def test_kbit_backprop(module):
         relerrs1.append(relerr1.mean().item())
         relerrs2.append(relerr2.mean().item())
 
-        if isinstance(module, bnb.nn.Linear8bitLt):
+        # if isinstance(module, bnb.nn.Linear8bitLt):
+        if module == bnb.nn.Linear8bitLt:
             assert_all_approx_close(grad1, grad2, atol=0.008, rtol=0.05, count=1)
             torch.testing.assert_close(bgrad1, bgrad2, atol=0.008, rtol=0.05)
         else: