Skip to content

Commit

Permalink
AMDGPU: Increase the LDS size to support to 160 KB for gfx950
Browse files Browse the repository at this point in the history
  • Loading branch information
pravinjagtap authored and arsenm committed Nov 15, 2024
1 parent b99a4f4 commit 99a8e96
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 5 deletions.
2 changes: 2 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5475,6 +5475,8 @@ The fields used by CP for code objects before V3 also match those specified in
roundup(lds-size / (64 * 4))
GFX7-GFX11
roundup(lds-size / (128 * 4))
GFX950
roundup(lds-size / (320 * 4))

24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
_INVALID_OPERATION with specified exceptions
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1494,7 +1494,8 @@ def FeatureISAVersion9_5_Common : FeatureSet<
[FeatureFP8Insts,
FeatureFP8ConversionInsts,
FeatureCvtFP8VOP1Bug,
FeatureGFX950Insts
FeatureGFX950Insts,
FeatureAddressableLocalMemorySize163840
])>;

def FeatureISAVersion9_4_0 : FeatureSet<
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = Mode.DX10Clamp;

unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
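// LDS is allocated in 512 dword (2048 byte) blocks, i.e. 1 << LDSAlignShift.
// NOTE(review): AMDGPUUsage.rst lists a 320 dword granule for GFX950, which
// does not match this shift - confirm which one is intended.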
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
FeatureAddressableLocalMemorySize65536)) {
// LDS is allocated in 128 dword blocks.
LDSAlignShift = 9;
} else {
// LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
}

ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<

def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;

class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
"wavefrontsize"#!shl(1, ValueLog2),
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 32768;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
return 65536;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 163840;
return 0;
}

Expand Down
7 changes: 7 additions & 0 deletions llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s

Expand All @@ -17,6 +19,11 @@
; GFX11-MESA: .long 45100
; GFX11-MESA-NEXT: .long 1024

; GFX950-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200

; GFX950-MESA: .long 45100
; GFX950-MESA-NEXT: .long 512

; GFX1200-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400

; GFX1200-MESA: .long 45100
Expand Down
13 changes: 13 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s

; GFX950 supports up to 160 KB of LDS memory.
; This is a negative test to check when the LDS size exceeds the max usable limit.

; ERROR: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
@dst = addrspace(3) global [40961 x i32] poison

; Kernel that stores %val into element 100 of @dst. Referencing @dst makes
; the kernel use the [40961 x i32] LDS global (40961 * 4 = 163844 bytes),
; which is the size quoted in the diagnostic expected above.
define amdgpu_kernel void @test_lds_limit(i32 %val) {
; Address of element 100 of @dst in addrspace(3).
%gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
store i32 %val, ptr addrspace(3) %gep
ret void
}
31 changes: 31 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s

; gfx950 supports up to 160 KB of configurable LDS memory.
; This test checks an LDS size just above the old 128 KiB size and the maximum size of LDS that can be allocated.

@lds.i32 = addrspace(3) global i32 poison
@lds.array.size.131076 = addrspace(3) global [32768 x i32] poison
@lds.array.size.163840 = addrspace(3) global [40959 x i32] poison

; GCN-LABEL: test_lds_array_size_131076:
; GCN: .amdhsa_group_segment_fixed_size 131076
; GCN: ; LDSByteSize: 131076 bytes/workgroup
; MESA: granulated_lds_size = 65
; Kernel that loads element 20 of @lds.array.size.131076 and stores it to
; @lds.i32. It references both LDS globals: [32768 x i32] (131072 bytes) plus
; one i32 (4 bytes), i.e. the 131076 bytes expected by the checks above.
define amdgpu_kernel void @test_lds_array_size_131076() {
; Address of element 20 of the array in addrspace(3).
%gep = getelementptr inbounds [32768 x i32], ptr addrspace(3) @lds.array.size.131076, i32 0, i32 20
%val = load i32, ptr addrspace(3) %gep
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}

; GCN-LABEL: test_lds_array_size_163840:
; GCN: .amdhsa_group_segment_fixed_size 163840
; GCN: ; LDSByteSize: 163840 bytes/workgroup
; MESA: granulated_lds_size = 80
; Kernel that loads element 20 of @lds.array.size.163840 and stores it to
; @lds.i32. It references both LDS globals: [40959 x i32] (163836 bytes) plus
; one i32 (4 bytes), i.e. the 163840 bytes expected by the checks above.
define amdgpu_kernel void @test_lds_array_size_163840() {
; Address of element 20 of the array in addrspace(3).
%gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.size.163840 , i32 0, i32 20
%val = load i32, ptr addrspace(3) %gep
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s

; GFX950 supports up to 160 KB of configurable LDS memory.
; This test checks the min and max size of LDS that can be allocated.

; PAL: .shader_functions:
; PAL: test_lds_array_i32:
; PAL: .lds_size: 0x28000
; PAL: test_lds_i32:
; PAL: .lds_size: 0x4


@lds.i32 = addrspace(3) global i32 poison
@lds.array.i32 = addrspace(3) global [40959 x i32] poison

; amdgpu_gfx function that stores %val to the single-i32 LDS global @lds.i32
; (4 bytes, matching the 0x4 .lds_size expected above for this function).
define amdgpu_gfx void @test_lds_i32(i32 %val) {
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}

; amdgpu_gfx function that loads element 20 of @lds.array.i32 and stores it
; to @lds.i32. It references both LDS globals: [40959 x i32] (163836 bytes)
; plus one i32 (4 bytes) = 163840 bytes = 0x28000, the .lds_size expected
; above for this function.
define amdgpu_gfx void @test_lds_array_i32() {
; Address of element 20 of the array in addrspace(3).
%gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
%val = load i32, ptr addrspace(3) %gep
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}
52 changes: 52 additions & 0 deletions llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx950.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
;; Test disassembly for gfx950 kernel descriptor.

; RUN: rm -rf %t && split-file %s %t && cd %t

;--- 1.s
; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1.s > 1.o
; RUN: llvm-objdump --disassemble-symbols=kernel.kd 1.o | tail -n +7 | tee 1-disasm.s | FileCheck 1.s
; RUN: llvm-mc --triple=amdgcn-amd-amdhsa -mattr=-xnack -filetype=obj -mcpu=gfx950 < 1-disasm.s > 1-disasm.o
; FIxMe: cmp 1.o 1-disasm.o
; CHECK: .amdhsa_kernel kernel
; CHECK-NEXT: .amdhsa_group_segment_fixed_size 163840
; CHECK-NEXT: .amdhsa_private_segment_fixed_size 0
; CHECK-NEXT: .amdhsa_kernarg_size 0
; CHECK-NEXT: .amdhsa_accum_offset 4
; CHECK-NEXT: .amdhsa_tg_split 0
; CHECK-NEXT: .amdhsa_next_free_vgpr 8
; CHECK-NEXT: .amdhsa_reserve_vcc 0
; CHECK-NEXT: .amdhsa_reserve_xnack_mask 0
; CHECK-NEXT: .amdhsa_next_free_sgpr 8
; CHECK-NEXT: .amdhsa_float_round_mode_32 0
; CHECK-NEXT: .amdhsa_float_round_mode_16_64 0
; CHECK-NEXT: .amdhsa_float_denorm_mode_32 0
; CHECK-NEXT: .amdhsa_float_denorm_mode_16_64 3
; CHECK-NEXT: .amdhsa_dx10_clamp 1
; CHECK-NEXT: .amdhsa_ieee_mode 1
; CHECK-NEXT: .amdhsa_fp16_overflow 0
; CHECK-NEXT: .amdhsa_enable_private_segment 0
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0
; CHECK-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
; CHECK-NEXT: .amdhsa_exception_fp_denorm_src 0
; CHECK-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
; CHECK-NEXT: .amdhsa_exception_fp_ieee_overflow 0
; CHECK-NEXT: .amdhsa_exception_fp_ieee_underflow 0
; CHECK-NEXT: .amdhsa_exception_fp_ieee_inexact 0
; CHECK-NEXT: .amdhsa_exception_int_div_zero 0
; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0
; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0
; CHECK-NEXT:.end_amdhsa_kernel
; Kernel descriptor under test. It requests a 163840-byte fixed group
; segment, the value the expected disassembly above must print back.
.amdhsa_kernel kernel
.amdhsa_group_segment_fixed_size 163840
.amdhsa_next_free_vgpr 0
.amdhsa_next_free_sgpr 0
.amdhsa_accum_offset 4
.end_amdhsa_kernel

0 comments on commit 99a8e96

Please sign in to comment.