NVIDIA · jacobhinkle · Dec 16, 2024 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp
@@ -375,12 +375,6 @@ void KernelExecutor::compile(
         !(compile_params.index_type.value() == PrimDataType::Int32 &&
           arg_index_type == PrimDataType::Int),
         "Compilation with int32 is requested but int64 is required for the arguments");
-    NVF_ERROR(
-        !has_cp_async_bulk ||
-            (compile_params.index_type.value() == PrimDataType::Int32),
-        "Compilation with int64 is requested but int32 is required because ",
-        "of TMA operations.");
-
   } else if (arg_index_type == PrimDataType::Int) {
     // If the given compile option doesn't specify the index type, and
     // the arguments require 64-bit indexing, we need to use 64-bit

diff --git a/runtime/memory.cu b/runtime/memory.cu
@@ -96,7 +96,7 @@ __device__ inline bool electSync(const uint32_t& membermask) {
 template <int dim>
 struct CpAsyncBulkTensorTileG2SIndex {
   const TensorMap* descriptor;
-  Array<int32_t, dim> crds;
+  Array<nvfuser_index_t, dim> crds; // each entry is cast to uint32_t where cpAsyncBulkTensorTileG2S passes it as an asm operand
   uint32_t mbarrier;
 };
 
@@ -108,7 +108,10 @@ __device__ inline void cpAsyncBulkTensorTileG2S(
       "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes"
       " [%0], [%1, {%3}], [%2];"
       :
-      : "r"(smem_addr), "l"(gmem_int_desc), "r"(src.mbarrier), "r"(src.crds[0])
+      : "r"(smem_addr),
+        "l"(gmem_int_desc),
+        "r"(src.mbarrier),
+        "r"((uint32_t)src.crds[0])
       : "memory");
 }
 
@@ -123,8 +126,8 @@ __device__ inline void cpAsyncBulkTensorTileG2S(
       : "r"(smem_addr),
         "l"(gmem_int_desc),
         "r"(src.mbarrier),
-        "r"(src.crds[0]),
-        "r"(src.crds[1])
+        "r"((uint32_t)src.crds[0]),
+        "r"((uint32_t)src.crds[1])
       : "memory");
 }
 
@@ -139,9 +142,9 @@ __device__ inline void cpAsyncBulkTensorTileG2S(
       : "r"(smem_addr),
         "l"(gmem_int_desc),
         "r"(src.mbarrier),
-        "r"(src.crds[0]),
-        "r"(src.crds[1]),
-        "r"(src.crds[2])
+        "r"((uint32_t)src.crds[0]),
+        "r"((uint32_t)src.crds[1]),
+        "r"((uint32_t)src.crds[2])
       : "memory");
 }
 
@@ -156,10 +159,10 @@ __device__ inline void cpAsyncBulkTensorTileG2S(
       : "r"(smem_addr),
         "l"(gmem_int_desc),
         "r"(src.mbarrier),
-        "r"(src.crds[0]),
-        "r"(src.crds[1]),
-        "r"(src.crds[2]),
-        "r"(src.crds[3])
+        "r"((uint32_t)src.crds[0]),
+        "r"((uint32_t)src.crds[1]),
+        "r"((uint32_t)src.crds[2]),
+        "r"((uint32_t)src.crds[3])
       : "memory");
 }
 
@@ -174,11 +177,11 @@ __device__ inline void cpAsyncBulkTensorTileG2S(
       : "r"(smem_addr),
         "l"(gmem_int_desc),
         "r"(src.mbarrier),
-        "r"(src.crds[0]),
-        "r"(src.crds[1]),
-        "r"(src.crds[2]),
-        "r"(src.crds[3]),
-        "r"(src.crds[4])
+        "r"((uint32_t)src.crds[0]),
+        "r"((uint32_t)src.crds[1]),
+        "r"((uint32_t)src.crds[2]),
+        "r"((uint32_t)src.crds[3]),
+        "r"((uint32_t)src.crds[4])
       : "memory");
 }