Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Increase the supported LDS size to 160 KB for gfx950 #116309

Open
wants to merge 1 commit into
base: users/arsenm/gfx950/add-minimum3-maximum3-features
Choose a base branch
from

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Nov 15, 2024

No description provided.

Copy link
Contributor Author

arsenm commented Nov 15, 2024

@llvmbot
Copy link

llvmbot commented Nov 15, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/116309.diff

10 Files Affected:

  • (modified) llvm/docs/AMDGPUUsage.rst (+2)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+2-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+8-4)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUFeatures.td (+1)
  • (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+2)
  • (modified) llvm/test/CodeGen/AMDGPU/extra-lds-size.ll (+7)
  • (added) llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll (+13)
  • (added) llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll (+31)
  • (added) llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll (+26)
  • (added) llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s (+52)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index b85b680b9c82d3..a25b6feddbeddc 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5475,6 +5475,8 @@ The fields used by CP for code objects before V3 also match those specified in
                                                        roundup(lds-size / (64 * 4))
                                                      GFX7-GFX11
                                                        roundup(lds-size / (128 * 4))
+                                                     GFX950
+                                                       roundup(lds-size / (320 * 4))
 
      24      1 bit   ENABLE_EXCEPTION_IEEE_754_FP    Wavefront starts execution
                      _INVALID_OPERATION              with specified exceptions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 35dbf86b7c6f36..a05d4a644d08d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1494,7 +1494,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
   [FeatureFP8Insts,
    FeatureFP8ConversionInsts,
    FeatureCvtFP8VOP1Bug,
-   FeatureGFX950Insts
+   FeatureGFX950Insts,
+   FeatureAddressableLocalMemorySize163840
    ])>;
 
 def FeatureISAVersion9_4_0 : FeatureSet<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d801f2b1591275..90ece275412c7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.DX10Clamp = Mode.DX10Clamp;
 
   unsigned LDSAlignShift;
-  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
-    // LDS is allocated in 64 dword blocks.
-    LDSAlignShift = 8;
-  } else {
+  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
+    // LDS is allocated in 320 dword blocks.
+    LDSAlignShift = 11;
+  } else if (STM.getFeatureBits().test(
+                 FeatureAddressableLocalMemorySize65536)) {
     // LDS is allocated in 128 dword blocks.
     LDSAlignShift = 9;
+  } else {
+    // LDS is allocated in 64 dword blocks.
+    LDSAlignShift = 8;
   }
 
   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index f832a2a55d6229..74d1faeb6f545b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
 
 def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
 def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
+def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
 
 class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
   "wavefrontsize"#!shl(1, ValueLog2),
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 01866fbd9da6e7..501d00b1f308d9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
     return 32768;
   if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
     return 65536;
+  if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
+    return 163840;
   return 0;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index 13640b74a7937b..318ecd16a2ccb3 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -2,6 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
 
@@ -17,6 +19,11 @@
 ; GFX11-MESA: .long 45100
 ; GFX11-MESA-NEXT: .long 1024
 
+; GFX950-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX950-MESA: .long 45100
+; GFX950-MESA-NEXT: .long 512
+
 ; GFX1200-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400
 
 ; GFX1200-MESA: .long 45100
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
new file mode 100644
index 00000000000000..19166b271db775
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; GFX950 supports up to 160 KB of LDS memory.
+; This is a negative test to check when the LDS size exceeds the max usable limit.
+
+; ERROR: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
+@dst = addrspace(3) global [40961 x i32] poison
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+  %gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+  store i32 %val, ptr addrspace(3) %gep
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
new file mode 100644
index 00000000000000..6ebfc9a5e9d4f6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s
+
+; gfx950 supports up to 160 KB of configurable LDS memory.
+; This test checks LDS sizes just above the old size (i.e. 128 KiB) and at the maximum size that can be allocated.
+
+@lds.i32 = addrspace(3) global i32 poison
+@lds.array.size.131076 = addrspace(3) global [32768 x i32] poison
+@lds.array.size.163840 = addrspace(3) global [40959 x i32] poison
+
+; GCN-LABEL: test_lds_array_size_131076:
+; GCN: .amdhsa_group_segment_fixed_size 131076
+; GCN: ; LDSByteSize: 131076 bytes/workgroup
+; MESA: granulated_lds_size = 65
+define amdgpu_kernel void @test_lds_array_size_131076() {
+  %gep = getelementptr inbounds [32768 x i32], ptr addrspace(3) @lds.array.size.131076, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+; GCN-LABEL: test_lds_array_size_163840:
+; GCN: .amdhsa_group_segment_fixed_size 163840
+; GCN: ; LDSByteSize: 163840 bytes/workgroup
+; MESA: granulated_lds_size = 80
+define amdgpu_kernel void @test_lds_array_size_163840() {
+  %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.size.163840 , i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
new file mode 100644
index 00000000000000..22cad8ab5f5360
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s
+
+; GFX950 supports up to 160 KB of configurable LDS memory.
+; This test checks the min and max size of LDS that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i32:
+; PAL: .lds_size:       0x28000
+; PAL: test_lds_i32:
+; PAL: .lds_size:       0x4
+
+
+@lds.i32 = addrspace(3) global i32 poison
+@lds.array.i32 = addrspace(3) global [40959 x i32] poison
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+  %gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+  %val = load i32, ptr addrspace(3) %gep
+  store i32 %val, ptr addrspace(3) @lds.i32
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s
new file mode 100644
index 00000000000000..5b9d42c7fad553
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s
@@ -0,0 +1,52 @@
+;; Test disassembly for gfx950 kernel descriptor.
+
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1.s > 1.o
+; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s
+; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1-disasm.s > 1-disasm.o
+; FIXME: cmp 1.o 1-disasm.o
+; CHECK: .amdhsa_kernel kernel
+; CHECK-NEXT:	.amdhsa_group_segment_fixed_size 163840
+; CHECK-NEXT:	.amdhsa_private_segment_fixed_size 0
+; CHECK-NEXT:	.amdhsa_kernarg_size 0
+; CHECK-NEXT:	.amdhsa_accum_offset 4
+; CHECK-NEXT:	.amdhsa_tg_split 0
+; CHECK-NEXT:	.amdhsa_next_free_vgpr 8
+; CHECK-NEXT:	.amdhsa_reserve_vcc 0
+; CHECK-NEXT:	.amdhsa_reserve_xnack_mask 0
+; CHECK-NEXT:	.amdhsa_next_free_sgpr 8
+; CHECK-NEXT:	.amdhsa_float_round_mode_32 0
+; CHECK-NEXT:	.amdhsa_float_round_mode_16_64 0
+; CHECK-NEXT:	.amdhsa_float_denorm_mode_32 0
+; CHECK-NEXT:	.amdhsa_float_denorm_mode_16_64 3
+; CHECK-NEXT:	.amdhsa_dx10_clamp 1
+; CHECK-NEXT:	.amdhsa_ieee_mode 1
+; CHECK-NEXT:	.amdhsa_fp16_overflow 0
+; CHECK-NEXT:	.amdhsa_enable_private_segment 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_x 1
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_y 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_id_z 0
+; CHECK-NEXT:	.amdhsa_system_sgpr_workgroup_info 0
+; CHECK-NEXT:	.amdhsa_system_vgpr_workitem_id 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_invalid_op 0
+; CHECK-NEXT:	.amdhsa_exception_fp_denorm_src 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_div_zero 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_overflow 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_underflow 0
+; CHECK-NEXT:	.amdhsa_exception_fp_ieee_inexact 0
+; CHECK-NEXT:	.amdhsa_exception_int_div_zero 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_dispatch_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_queue_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_kernarg_segment_ptr 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_dispatch_id 0
+; CHECK-NEXT:	.amdhsa_user_sgpr_private_segment_size 0
+; CHECK-NEXT:	.amdhsa_uses_dynamic_stack 0
+; CHECK-NEXT:.end_amdhsa_kernel
+.amdhsa_kernel kernel
+  .amdhsa_group_segment_fixed_size 163840
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel

@arsenm arsenm marked this pull request as ready for review November 15, 2024 01:37
@arsenm arsenm force-pushed the users/arsenm/gfx950/add-minimum3-maximum3-features branch from b99a4f4 to 1eebc85 Compare November 15, 2024 01:43
@arsenm arsenm force-pushed the users/arsenm/gfx950/increase-lds-size branch from 99a8e96 to 54f5e02 Compare November 15, 2024 01:43
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants