
Commit

Merge remote-tracking branch 'origin/main' into update_llvm_49af6502c6dcb4a7f7520178bd14df396f78240c
silee2 committed Nov 20, 2023
2 parents 0aba1b0 + f7d25d2 commit c53a23a
Showing 13 changed files with 550 additions and 138 deletions.
4 changes: 2 additions & 2 deletions include/imex/Conversion/Passes.td
@@ -251,8 +251,8 @@ memref, arith and math.
let constructor = "imex::createConvertGPUXToSPIRVPass()";
let dependentDialects = ["::mlir::spirv::SPIRVDialect"];
let options = [
Option<"enableSimtIntrinsic", "enable-simt-intrinsic","bool", "false",
"Enable XeGPU.simt Ops lowered to intel genISA simt Intrinsics">
Option<"enableVCIntrinsic", "enable-vc-intrinsic","bool", "true",
"Enable XeGPU Ops lowered to intel vc Intrinsics">
];
}
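
As a usage sketch, mirroring the RUN lines in the updated tests further down (the flag name and values are taken from this diff; nothing else is assumed):

    imex-opt -imex-convert-gpu-to-spirv %s                               # VC intrinsics (default, enable-vc-intrinsic=true)
    imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=false' %s   # GenISA intrinsics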

3 changes: 3 additions & 0 deletions include/imex/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.h
@@ -27,6 +27,9 @@ namespace imex {
// XeGPU to VC Intrinsics pattern
void populateXeGPUToVCIntrinsicsPatterns(
mlir::SPIRVTypeConverter &typeConverter, mlir::RewritePatternSet &patterns);
+// XeGPU to genISA Intrinsics pattern
+void populateXeGPUToGenISAPatterns(mlir::SPIRVTypeConverter &typeConverter,
+                                   mlir::RewritePatternSet &patterns);
} // namespace imex

#endif // IMEX_CONVERSION_XEGPUTOSPIRV_H
9 changes: 6 additions & 3 deletions lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -162,8 +162,8 @@ void GPUXToSPIRVPass::runOnOperation() {
});
typeConverter.addConversion(
[&](xegpu::TensorDescType type) -> ::mlir::Type {
-          auto i64Type = ::mlir::IntegerType::get(context, 64);
-          return ::mlir::VectorType::get(2, i64Type);
+          auto i32Type = ::mlir::IntegerType::get(context, 32);
+          return ::mlir::VectorType::get(8, i32Type);
});
typeConverter.addConversion([&](::mlir::VectorType type) -> ::mlir::Type {
unsigned rank = type.getRank();
@@ -198,7 +198,10 @@ void GPUXToSPIRVPass::runOnOperation() {
mlir::populateSCFToSPIRVPatterns(typeConverter, scfToSpirvCtx, patterns);
mlir::cf::populateControlFlowToSPIRVPatterns(typeConverter, patterns);
mlir::populateMathToSPIRVPatterns(typeConverter, patterns);
-  imex::populateXeGPUToVCIntrinsicsPatterns(typeConverter, patterns);
+  if (this->enableVCIntrinsic)
+    imex::populateXeGPUToVCIntrinsicsPatterns(typeConverter, patterns);
+  else
+    imex::populateXeGPUToGenISAPatterns(typeConverter, patterns);

if (failed(applyFullConversion(gpuModule, *target, std::move(patterns))))
return signalPassFailure();
446 changes: 344 additions & 102 deletions lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp

Large diffs are not rendered by default.

55 changes: 24 additions & 31 deletions test/Conversion/XeGPUToSPIRV/gemm_basic.mlir
@@ -1,5 +1,8 @@
-// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s
-// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=LSC
+// RUN: imex-opt -imex-convert-gpu-to-spirv='enable-vc-intrinsic=false' %s | FileCheck %s

+#sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>
+#sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
+#sg_map_fp16_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
module @gemm attributes {gpu.container_module} {
memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01>
memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00>
@@ -15,35 +18,25 @@ module @gemm attributes {gpu.container_module} {
gpu.dealloc %memref_0 : memref<16x16xf16>
return %memref_1 : memref<8x16xf32>
}
-gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
-gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
-// LSC: spirv.FunctionCall @llvm_genx_lsc_prefetch2d_stateless_i1_i64
-// LSC: spirv.FunctionCall @llvm_genx_lsc_prefetch2d_stateless_i1_i64
-// LSC: spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v64i32_i1_i64
-// LSC: spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v128i32_i1_i64
-// LSC: spirv.FunctionCall @llvm_genx_dpas_nosrc0_v128f32_v128i32_v64i32
-// LSC: spirv.FunctionCall @llvm_genx_lsc_store2d_stateless_i1_i64_v128f32
-// CHECK: %[[BASE:.*]] = spirv.ConvertPtrToU %arg0 : !spirv.ptr<!spirv.array<128 x f16>, CrossWorkgroup> to i64
-// CHECK: %[[BASE1:.*]] = spirv.VectorInsertDynamic %[[BASE]]
-// CHECK: %[[BASE2:.*]] = spirv.Bitcast %[[BASE1]]
-// CHECK: spirv.VectorInsertDynamic
-// CHECK: spirv.VectorInsertDynamic
-// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32
-// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32
-// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_v64i32_i1_v8i32
-// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_v128i32_i1_v8i32
-// CHECK: spirv.FunctionCall @llvm_genx_dpas_nosrc0_v128f32_v128i32_v64i32
-// CHECK: spirv.FunctionCall @llvm_genx_raw_sends2_noresult_i1_v8i32_v128f32
-%0 = xegpu.create_nd_tdesc %arg0[0, 0] {mode = vc} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-%1 = xegpu.create_nd_tdesc %arg1[0, 0] {mode = vc} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-%2 = xegpu.create_nd_tdesc %arg2[0, 0] {mode = vc} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-xegpu.prefetch_nd %0 {mode = vc} : !xegpu.tensor_desc<8x16xf16>
-xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<16x16xf16>
-
-%3 = xegpu.load_nd %0 {mode = vc, vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
-%4 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
-%5 = xegpu.dpas %3, %4 {mode = vc} : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
-xegpu.store_nd %5, %2 {mode = vc} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
+gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+// CHECK: %[[a:.*]] = spirv.FunctionCall @llvm.genx.GenISA.LSC2DBlockRead.v8i16
+// CHECK: %[[a0:.*]] = spirv.Bitcast %[[a]]
+// CHECK: %[[b:.*]] = spirv.FunctionCall @llvm.genx.GenISA.LSC2DBlockRead.v16i16
+// CHECK: %[[b0:.*]] = spirv.Bitcast %[[b]]
+// CHECK: %[[A:.*]] = spirv.Bitcast %[[a0]]
+// CHECK: %[[B:.*]] = spirv.Bitcast %[[b0]]
+// CHECK: %[[C:.*]] = spirv.FunctionCall @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v16i16
+// CHECK-SAME: %[[A]], %[[B]]
+// CHECK: spirv.FunctionCall @llvm.genx.GenISA.LSC2DBlockWrite.isVoid
+// CHECK-SAME: %[[C]]
+%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a>
+%1 = xegpu.create_nd_tdesc %arg1[0, 0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b>
+%2 = xegpu.create_nd_tdesc %arg2[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
+%3 = xegpu.load_nd %0 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<4x1x2xf16>
+%4 = xegpu.load_nd %1 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16>
+%5 = xegpu.dpas %3, %4 : vector<4x1x2xf16>, vector<8x1x2xf16> -> vector<8x1xf32>
+xegpu.store_nd %5, %2 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
gpu.return
}
}
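
The much smaller vector shapes in the rewritten kernel follow from the new #xegpu.sg_map attributes: the 2D tile is distributed over the subgroup, so each work item loads and computes only its own fragment. A quick element-count check, inferred from the attribute values above rather than stated in the commit:

    A: 8x16  f16 = 128 elements over wi_layout [2, 8]  (16 lanes) ->  8 per lane -> vector<4x1x2xf16>
    B: 16x16 f16 = 256 elements over wi_layout [1, 16] (16 lanes) -> 16 per lane -> vector<8x1x2xf16>
    C: 8x16  f32 = 128 elements over wi_layout [1, 16] (16 lanes) ->  8 per lane -> vector<8x1xf32>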
59 changes: 59 additions & 0 deletions test/Conversion/XeGPUToSPIRV/gemm_basic.vc.mlir
@@ -0,0 +1,59 @@
// RUN: imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s
// RUN: IMEX_NOT_PREFER_RAWSEND=1 imex-opt -imex-convert-gpu-to-spirv %s | FileCheck %s --check-prefix=LSC
module @gemm attributes {gpu.container_module} {
memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01>
memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00>
func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%memref = gpu.alloc host_shared () : memref<8x16xf16>
memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16>
%memref_0 = gpu.alloc host_shared () : memref<16x16xf16>
memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16>
%memref_1 = gpu.alloc host_shared () : memref<8x16xf32>
gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>)
gpu.dealloc %memref : memref<8x16xf16>
gpu.dealloc %memref_0 : memref<16x16xf16>
return %memref_1 : memref<8x16xf32>
}
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
// LSC: spirv.FunctionCall @llvm_genx_lsc_prefetch2d_stateless_i1_i64
// LSC: spirv.FunctionCall @llvm_genx_lsc_prefetch2d_stateless_i1_i64
// LSC: spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v64i32_i1_i64
// LSC: spirv.FunctionCall @llvm_genx_lsc_load2d_stateless_v128i32_i1_i64
// LSC: spirv.FunctionCall @llvm_genx_dpas_nosrc0_v128f32_v128i32_v64i32
// LSC: spirv.FunctionCall @llvm_genx_lsc_store2d_stateless_i1_i64_v128f32
// CHECK: %[[BASE:.*]] = spirv.ConvertPtrToU %arg0 : !spirv.ptr<!spirv.array<128 x f16>, CrossWorkgroup> to i64
// CHECK: %[[BASE1:.*]] = spirv.VectorInsertDynamic %[[BASE]]
// CHECK: %[[BASE2:.*]] = spirv.Bitcast %[[BASE1]]
// CHECK: spirv.VectorInsertDynamic
// CHECK: spirv.VectorInsertDynamic
// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32
// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_noresult_i1_v8i32
// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_v64i32_i1_v8i32
// CHECK: spirv.FunctionCall @llvm_genx_raw_send2_v128i32_i1_v8i32
// CHECK: spirv.FunctionCall @llvm_genx_dpas_nosrc0_v128f32_v128i32_v64i32
// CHECK: spirv.FunctionCall @llvm_genx_raw_sends2_noresult_i1_v8i32_v128f32
%0 = xegpu.create_nd_tdesc %arg0[0, 0] {mode = vc} : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
%1 = xegpu.create_nd_tdesc %arg1[0, 0] {mode = vc} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
%2 = xegpu.create_nd_tdesc %arg2[0, 0] {mode = vc} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
xegpu.prefetch_nd %0 {mode = vc} : !xegpu.tensor_desc<8x16xf16>
xegpu.prefetch_nd %1 {mode = vc} : !xegpu.tensor_desc<16x16xf16>

%3 = xegpu.load_nd %0 {mode = vc, vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
%4 = xegpu.load_nd %1 {mode = vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
%5 = xegpu.dpas %3, %4 {mode = vc} : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
xegpu.store_nd %5, %2 {mode = vc} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
func.func @main() attributes {llvm.emit_c_interface} {
%0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16>
%1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16>
%2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32>
%cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32>
//call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
return
}
func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
111 changes: 111 additions & 0 deletions test/Integration/Dialect/XeGPU/gemm_1024x1024xf16.using.updateoffset.mlir
@@ -0,0 +1,111 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-llvm.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck
module @gemm attributes {gpu.container_module} {
memref.global "private" @__constant_1024x1024xf16 : memref<1024x1024xf16> = dense<0.0>
memref.global "private" @__constant_1024x1024xf16_ : memref<1024x1024xf16> = dense<0.0>
memref.global "private" @__constant_1024x1024xf32 : memref<1024x1024xf32> = dense<0.0>
func.func @test(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>) -> memref<1024x1024xf32> attributes {llvm.emit_c_interface} {
%c64 = arith.constant 64 : index
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%memref = gpu.alloc host_shared () : memref<1024x1024xf16>
memref.copy %arg0, %memref : memref<1024x1024xf16> to memref<1024x1024xf16>
%memref_0 = gpu.alloc host_shared () : memref<1024x1024xf16>
memref.copy %arg1, %memref_0 : memref<1024x1024xf16> to memref<1024x1024xf16>
%memref_1 = gpu.alloc host_shared () : memref<1024x1024xf32>
gpu.launch_func @test_kernel::@test_kernel blocks in (%c128, %c64, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<1024x1024xf16>, %memref_0 : memref<1024x1024xf16>, %memref_1 : memref<1024x1024xf32>)
gpu.dealloc %memref : memref<1024x1024xf16>
gpu.dealloc %memref_0 : memref<1024x1024xf16>
return %memref_1 : memref<1024x1024xf32>
}
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, SubgroupDispatch, VectorComputeINTEL, VectorAnyINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
gpu.func @test_kernel(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 128, 64, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
%c8 = arith.constant 8 : index
%c1024 = arith.constant 1024 : index
%0 = gpu.block_id x
%1 = gpu.block_id y
%2 = arith.muli %0, %c8 : index
%3 = arith.muli %1, %c16 : index
%4 = xegpu.create_nd_tdesc %arg2[%2, %3] {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
%5 = xegpu.load_nd %4 {mode = vc} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
// each work-group has 1 subgroup. The subgroup calculates a [8x16 = 8x1024 * 1024x16] block
%7 = xegpu.create_nd_tdesc %arg0[%2, %c0] {mode=vc}: memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
%8 = xegpu.create_nd_tdesc %arg1[%c0, %3] {mode=vc}: memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
%6:3 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %5, %subA = %7, %subB = %8) -> (vector<8x16xf32>, !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>) {
%9 = xegpu.load_nd %subA {mode=vc, vnni_axis = 1}: !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
%10 = xegpu.load_nd %subB {mode=vc, vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
%11 = xegpu.dpas %9, %10, %arg4 {mode=vc}: vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32>
%12 = xegpu.update_nd_offset %subA, [%c0, %c16] {mode=vc}: !xegpu.tensor_desc<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
%13 = xegpu.update_nd_offset %subB, [%c16, %c0] {mode=vc}: !xegpu.tensor_desc<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
scf.yield %11, %12, %13: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<16x16xf16>
}
xegpu.store_nd %6#0, %4 {mode = vc}: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
gpu.return
}
}
func.func @main() attributes {llvm.emit_c_interface} {
%0 = memref.get_global @__constant_1024x1024xf16 : memref<1024x1024xf16>
%1 = memref.get_global @__constant_1024x1024xf16_ : memref<1024x1024xf16>
%ref = memref.get_global @__constant_1024x1024xf32 : memref<1024x1024xf32>
%init = arith.constant 0.0 : f16
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c128 = arith.constant 128 : index
%c1024 = arith.constant 1024 : index
// fill the top-left block 128x128
// A matrix: row-major, start from 0.0, increase 0.01 per element
// B matrix: A matrix + 1.0
scf.for %arg0 = %c0 to %c128 step %c1 {
scf.for %arg1 = %c0 to %c128 step %c1 {
%int0 = arith.index_cast %arg0 : index to i16
%int1 = arith.index_cast %arg1 : index to i16
%c128_i16 = arith.constant 128 : i16
%idx0 = arith.muli %int0, %c128_i16 : i16
%idx1 = arith.addi %int1, %idx0 : i16
%fp = arith.uitofp %idx1 : i16 to f16
%cst100 = arith.constant 100.0 : f16
%val0 = arith.divf %fp, %cst100 : f16
%cst1 = arith.constant 1.0 : f16
%val1 = arith.addf %val0, %cst1 : f16
memref.store %val0, %0[%arg0, %arg1] : memref<1024x1024xf16>
memref.store %val1, %1[%arg0, %arg1] : memref<1024x1024xf16>
}
}
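// In formula form (inferred from the loops above): for 0 <= i, j < 128,
//   A[i][j] = (i * 128 + j) / 100.0 and B[i][j] = A[i][j] + 1.0 (both as f16);
// the remaining entries of the 1024x1024 buffers keep their dense<0.0> initial value.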
// calculate the result C matrix
scf.for %arg0 = %c0 to %c1024 step %c1 {
scf.for %arg1 = %c0 to %c1024 step %c1 {
%acc = memref.load %ref[%arg0, %arg1] : memref<1024x1024xf32>
%res = scf.for %arg2 = %c0 to %c1024 step %c1 iter_args(%arg3 = %acc) -> f32 {
%a = memref.load %0[%arg0, %arg2] : memref<1024x1024xf16>
%b = memref.load %1[%arg2, %arg1] : memref<1024x1024xf16>
%c = arith.mulf %a, %b : f16
%cc = arith.extf %c : f16 to f32
%ccc = arith.addf %cc, %arg3 : f32
scf.yield %ccc : f32
}
memref.store %res, %ref[%arg0, %arg1] : memref<1024x1024xf32>
}
}

%2 = call @test(%0, %1) : (memref<1024x1024xf16>, memref<1024x1024xf16>) -> memref<1024x1024xf32>
%cast = memref.cast %2 : memref<1024x1024xf32> to memref<*xf32>
//call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
%cast_ref = memref.cast %ref : memref<1024x1024xf32> to memref<*xf32>
//call @printMemrefF32(%cast_ref) : (memref<*xf32>) -> ()
// CHECK: [ALLCLOSE: TRUE]
call @printAllcloseF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> ()
return
}
func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
}
1 change: 1 addition & 0 deletions test/Integration/Dialect/XeGPU/lit.local.cfg
@@ -1,5 +1,6 @@
local_excludes = [
'gemm_1024x1024xf16.mlir',
+  'gemm_1024x1024xf16.using.updateoffset.mlir',
'gemm_1024x1016x1016_f16_f16_f32.mlir',
'load2d_dpas_store2d.mlir',
'load2d-padding-f32.mlir',
