Merge remote-tracking branch 'origin/main' into update_llvm_49af6502c6dcb4a7f7520178bd14df396f78240c
Showing 13 changed files with 550 additions and 138 deletions.
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,59 @@
// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s
// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=LSC
module @gemm attributes {gpu.container_module} {
  memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01>
  memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00>
  func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %memref = gpu.alloc host_shared () : memref<8x16xf16>
    memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16>
    %memref_0 = gpu.alloc host_shared () : memref<16x16xf16>
    memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16>
    %memref_1 = gpu.alloc host_shared () : memref<8x16xf32>
    gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>)
    gpu.dealloc %memref : memref<8x16xf16>
    gpu.dealloc %memref_0 : memref<16x16xf16>
    return %memref_1 : memref<8x16xf32>
  }
  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      // LSC: spirv.FunctionCall @llvm_genx_lsc_prefetch2d_stateless_i1_i64
      // LSC: spirv.FunctionCall @llvm_genx_lsc_prefetch2d_stateless_i1_i64
      // LSC: spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v64i32_i1_i64
      // LSC: spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v128i32_i1_i64
      // LSC: spirv.FunctionCall @llvm_genx_dpas_nosrc0_v128f32_v128i32_v64i32
      // LSC: spirv.FunctionCall @llvm_genx_lsc_store2d_stateless_i1_i64_v128f32
      // CHECK: %[[BASE:.*]] = spirv.ConvertPtrToU %arg0 : !spirv.ptr<!spirv.array<128 x f16>, CrossWorkgroup> to i64
      // CHECK: %[[BASE1:.*]] = spirv.VectorInsertDynamic %[[BASE]]
      // CHECK: %[[BASE2:.*]] = spirv.Bitcast %[[BASE1]]
      // CHECK: spirv.VectorInsertDynamic
      // CHECK: spirv.VectorInsertDynamic
      // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32
      // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32
      // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_v64i32_i1_v8i32
      // CHECK: spirv.FunctionCall @llvm_genx_raw_send2_v128i32_i1_v8i32
      // CHECK: spirv.FunctionCall @llvm_genx_dpas_nosrc0_v128f32_v128i32_v64i32
      // CHECK: spirv.FunctionCall @llvm_genx_raw_sends2_noresult_i1_v8i32_v128f32
      %0 = xegpu.create_nd_tdesc %arg0[0, 0] {mode = vc} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
      %1 = xegpu.create_nd_tdesc %arg1[0, 0] {mode = vc} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
      %2 = xegpu.create_nd_tdesc %arg2[0, 0] {mode = vc} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
      xegpu.prefetch_nd %0 {mode = vc} : !xegpu.tensor_desc<8x16xf16>
      xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<16x16xf16>

      %3 = xegpu.load_nd %0 {mode = vc, vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
      %4 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
      %5 = xegpu.dpas %3, %4 {mode = vc} : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
      xegpu.store_nd %5, %2 {mode = vc} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
      gpu.return
    }
  }
  func.func @main() attributes {llvm.emit_c_interface} {
    %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16>
    %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16>
    %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32>
    %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32>
    //call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
    return
  }
  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
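Aside (not part of the commit): the vnni_axis packing in the test above is easy to misread, so here is a rough NumPy sketch of one plausible reading of the layout, inferred only from the types in this test rather than from the XeGPU spec. vnni_axis = 1 pairs adjacent elements along the K axis of the A tile (8x16 -> 8x8x2), vnni_axis = 0 pairs adjacent rows of the B tile and moves the pair into the innermost axis (16x16 -> 8x16x2), and xegpu.dpas then produces the same values as a plain f32 GEMM over the unpacked tiles.

import numpy as np

# Tiles with the shapes used in the test above (values are illustrative).
A = np.arange(8 * 16, dtype=np.float16).reshape(8, 16)    # 8x16xf16 tile
B = np.arange(16 * 16, dtype=np.float16).reshape(16, 16)  # 16x16xf16 tile

# vnni_axis = 1: A[m, 2k+j] -> packed[m, k, j], i.e. 8x16 -> 8x8x2.
A_vnni = A.reshape(8, 8, 2)

# vnni_axis = 0: B[2k+j, n] -> packed[k, n, j], i.e. 16x16 -> 8x16x2.
B_vnni = B.reshape(8, 2, 16).transpose(0, 2, 1)

# dpas accumulates in f32 (vector<8x16xf32> result), matching a plain
# f32 GEMM over the unpacked tiles.
C = A.astype(np.float32) @ B.astype(np.float32)
print(A_vnni.shape, B_vnni.shape, C.shape)  # (8, 8, 2) (8, 16, 2) (8, 16)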
File renamed without changes.
File renamed without changes.
File renamed without changes.
test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.using.updateoffset.mlir (111 additions, 0 deletions)
@@ -0,0 +1,111 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
module @gemm attributes {gpu.container_module} {
  memref.global "private" @__constant_1024x1024xf16 : memref<1024x1024xf16> = dense<0.0>
  memref.global "private" @__constant_1024x1024xf16_ : memref<1024x1024xf16> = dense<0.0>
  memref.global "private" @__constant_1024x1024xf32 : memref<1024x1024xf32> = dense<0.0>
  func.func @test(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} {
    %c64 = arith.constant 64 : index
    %c128 = arith.constant 128 : index
    %c1 = arith.constant 1 : index
    %memref = gpu.alloc host_shared () : memref<1024x1024xf16>
    memref.copy %arg0, %memref : memref<1024x1024xf16> to memref<1024x1024xf16>
    %memref_0 = gpu.alloc host_shared () : memref<1024x1024xf16>
    memref.copy %arg1, %memref_0 : memref<1024x1024xf16> to memref<1024x1024xf16>
    %memref_1 = gpu.alloc host_shared () : memref<1024x1024xf32>
    gpu.launch_func @test_kernel::@test_kernel blocks in (%c128, %c64, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<1024x1024xf16>, %memref_0 : memref<1024x1024xf16>, %memref_1 : memref<1024x1024xf32>)
    gpu.dealloc %memref : memref<1024x1024xf16>
    gpu.dealloc %memref_0 : memref<1024x1024xf16>
    return %memref_1 : memref<1024x1024xf32>
  }
  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 128, 64, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %c0 = arith.constant 0 : index
      %c16 = arith.constant 16 : index
      %c8 = arith.constant 8 : index
      %c1024 = arith.constant 1024 : index
      %0 = gpu.block_id x
      %1 = gpu.block_id y
      %2 = arith.muli %0, %c8 : index
      %3 = arith.muli %1, %c16 : index
      %4 = xegpu.create_nd_tdesc %arg2[%2, %3] {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
      %5 = xegpu.load_nd %4 {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
      // each work-group has 1 subgroup; the subgroup calculates an [8x16 = 8x1024 * 1024x16] block
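      // (The kernel launches a 128x64 grid of work-groups per the
      // gpu.launch_func above, so these 8x16 tiles cover the whole
      // 1024x1024 output: 128*8 = 1024 rows, 64*16 = 1024 columns. The
      // scf.for below walks the K dimension in steps of 16, sliding the A
      // descriptor by [0, 16] and the B descriptor by [16, 0] each
      // iteration via xegpu.update_nd_offset.)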
      %7 = xegpu.create_nd_tdesc %arg0[%2, %c0] {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
      %8 = xegpu.create_nd_tdesc %arg1[%c0, %3] {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
      %6:3 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %5, %subA = %7, %subB = %8) -> (vector<8x16xf32>, !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>) {
        %9 = xegpu.load_nd %subA {mode = vc, vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
        %10 = xegpu.load_nd %subB {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
        %11 = xegpu.dpas %9, %10, %arg4 {mode = vc} : vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32>
        %12 = xegpu.update_nd_offset %subA, [%c0, %c16] {mode = vc} : !xegpu.tensor_desc<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
        %13 = xegpu.update_nd_offset %subB, [%c16, %c0] {mode = vc} : !xegpu.tensor_desc<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
        scf.yield %11, %12, %13 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>
      }
      xegpu.store_nd %6#0, %4 {mode = vc} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
      gpu.return
    }
  }
  func.func @main() attributes {llvm.emit_c_interface} {
    %0 = memref.get_global @__constant_1024x1024xf16 : memref<1024x1024xf16>
    %1 = memref.get_global @__constant_1024x1024xf16_ : memref<1024x1024xf16>
    %ref = memref.get_global @__constant_1024x1024xf32 : memref<1024x1024xf32>
    %init = arith.constant 0.0 : f16
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c128 = arith.constant 128 : index
    %c1024 = arith.constant 1024 : index
    // fill the top-left 128x128 block
    // A matrix: row-major, start from 0.0, increase 0.01 per element
    // B matrix: A matrix + 1.0
    scf.for %arg0 = %c0 to %c128 step %c1 {
      scf.for %arg1 = %c0 to %c128 step %c1 {
        %int0 = arith.index_cast %arg0 : index to i16
        %int1 = arith.index_cast %arg1 : index to i16
        %c128_i16 = arith.constant 128 : i16
        %idx0 = arith.muli %int0, %c128_i16 : i16
        %idx1 = arith.addi %int1, %idx0 : i16
        %fp = arith.uitofp %idx1 : i16 to f16
        %cst100 = arith.constant 100.0 : f16
        %val0 = arith.divf %fp, %cst100 : f16
        %cst1 = arith.constant 1.0 : f16
        %val1 = arith.addf %val0, %cst1 : f16
        memref.store %val0, %0[%arg0, %arg1] : memref<1024x1024xf16>
        memref.store %val1, %1[%arg0, %arg1] : memref<1024x1024xf16>
      }
    }
    // calculate the result C matrix
    scf.for %arg0 = %c0 to %c1024 step %c1 {
      scf.for %arg1 = %c0 to %c1024 step %c1 {
        %acc = memref.load %ref[%arg0, %arg1] : memref<1024x1024xf32>
        %res = scf.for %arg2 = %c0 to %c1024 step %c1 iter_args(%arg3 = %acc) -> f32 {
          %a = memref.load %0[%arg0, %arg2] : memref<1024x1024xf16>
          %b = memref.load %1[%arg2, %arg1] : memref<1024x1024xf16>
          %c = arith.mulf %a, %b : f16
          %cc = arith.extf %c : f16 to f32
          %ccc = arith.addf %cc, %arg3 : f32
          scf.yield %ccc : f32
        }
        memref.store %res, %ref[%arg0, %arg1] : memref<1024x1024xf32>
      }
    }

    %2 = call @test(%0, %1) : (memref<1024x1024xf16>, memref<1024x1024xf16>) -> memref<1024x1024xf32>
    %cast = memref.cast %2 : memref<1024x1024xf32> to memref<*xf32>
    //call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
    %cast_ref = memref.cast %ref : memref<1024x1024xf32> to memref<*xf32>
    //call @printMemrefF32(%cast_ref) : (memref<*xf32>) -> ()
    // CHECK: [ALLCLOSE: TRUE]
    call @printAllcloseF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> ()
    return
  }
  func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
  func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
}
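Aside (not part of the commit): a rough NumPy sketch of what @main's initialization loops and CPU reference compute, useful for sanity-checking the expected [ALLCLOSE: TRUE] output. Names are illustrative, and there is one simplification: the MLIR loop multiplies in f16 and widens each product to f32, while this sketch multiplies directly in f32.

import numpy as np

N, BLK = 1024, 128
A = np.zeros((N, N), dtype=np.float16)
B = np.zeros((N, N), dtype=np.float16)

# Top-left 128x128 block: A[i, j] = (i*128 + j) / 100, i.e. row-major,
# increasing by 0.01 per element up to f16 rounding; B is A + 1.0.
# Everything outside the block stays 0, as in the dense<0.0> globals.
i, j = np.meshgrid(np.arange(BLK), np.arange(BLK), indexing="ij")
A[:BLK, :BLK] = ((i * BLK + j) / 100.0).astype(np.float16)
B[:BLK, :BLK] = A[:BLK, :BLK] + np.float16(1.0)

# CPU reference with f32 accumulation, as in the scf.for reduction above.
C_ref = A.astype(np.float32) @ B.astype(np.float32)
print(C_ref[0, 0], C_ref[127, 127])  # spot-check a couple of entries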