diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 681728d36952c1..5caedd0b6d3cfd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -710,6 +710,8 @@ Target Specific Changes AMDGPU Support ^^^^^^^^^^^^^^ +* Initial support for gfx950 + X86 Support ^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 721e8981af6ffc..c2a4addf488df1 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -107,6 +107,7 @@ enum class OffloadArch { GFX940, GFX941, GFX942, + GFX950, GFX10_1_GENERIC, GFX1010, GFX1011, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 59c932468cd891..d56609a2a8f24a 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -125,6 +125,7 @@ static const OffloadArchToStringMap arch_names[] = { GFX(940), // gfx940 GFX(941), // gfx941 GFX(942), // gfx942 + GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, GFX(1010), // gfx1010 GFX(1011), // gfx1011 diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 0897032c4b8546..dbc3fec3657610 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -209,6 +209,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX940: case OffloadArch::GFX941: case OffloadArch::GFX942: + case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: case OffloadArch::GFX1010: case OffloadArch::GFX1011: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 43dc0e62284602..b595d3250d6230 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2305,6 +2305,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX940: case OffloadArch::GFX941: case OffloadArch::GFX942: + case OffloadArch::GFX950: 
case OffloadArch::GFX10_1_GENERIC: case OffloadArch::GFX1010: case OffloadArch::GFX1011: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 8b56ec94f2c4ee..5c324032b51956 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -32,6 +32,7 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s @@ -88,6 +89,7 @@ // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" diff --git 
a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index d354f933c5ad78..d97b2ddb1fc663 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -110,6 +110,7 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx940 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx941 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx942 -DFAMILY=GFX9 +// RUN: %clang -E -dM -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx950 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 -DFAMILY=GFX10 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011 -DFAMILY=GFX10 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1012 -DFAMILY=GFX10 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl index ba578435072985..7c34d3ec6c63a9 100644 --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -95,6 +95,7 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940 %s // RUN: %clang -### -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefix=GFX941 %s // RUN: %clang -### -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefix=GFX942 %s +// RUN: %clang -### -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefix=GFX950 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck 
--check-prefix=GFX1010 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefix=GFX1011 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefix=GFX1012 %s @@ -150,6 +151,7 @@ // GFX940: "-target-cpu" "gfx940" // GFX941: "-target-cpu" "gfx941" // GFX942: "-target-cpu" "gfx942" +// GFX950: "-target-cpu" "gfx950" // GFX1010: "-target-cpu" "gfx1010" // GFX1011: "-target-cpu" "gfx1011" // GFX1012: "-target-cpu" "gfx1012" diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c index 4e675871f1e5bd..642d2df211c21a 100644 --- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c +++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c @@ -48,6 +48,7 @@ // CHECK-SAME: {{^}}, gfx940 // CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 +// CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx1010 // CHECK-SAME: {{^}}, gfx1011 // CHECK-SAME: {{^}}, gfx1012 diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c index 44fe07065b2428..3ea6c02d6b3846 100644 --- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c +++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c @@ -54,6 +54,7 @@ // CHECK-SAME: {{^}}, gfx940 // CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 +// CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx10-1-generic // CHECK-SAME: {{^}}, gfx1010 // CHECK-SAME: {{^}}, gfx1011 diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index c180ca5fcebef3..b85b680b9c82d3 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -399,6 +399,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following work-item IDs + ``gfx950`` ``amdgcn`` dGPU - sramecc - Architected *TBA* + - tgsplit flat + - xnack scratch .. TODO:: + - kernarg preload - Packed + work-item Add product + IDs names. 
+ **GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_ ----------------------------------------------------------------------------------------------------------------------- ``gfx1010`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 5700 @@ -2178,7 +2185,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942`` *reserved* 0x04d Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201`` - *reserved* 0x04f Reserved. + ``EF_AMDGPU_MACH_AMDGCN_GFX950`` 0x04f ``gfx950`` *reserved* 0x050 Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC`` 0x051 ``gfx9-generic`` ``EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC`` 0x052 ``gfx10-1-generic`` diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 6c05ea7208e1f1..fd32a6ec19652b 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -811,7 +811,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f, + EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050, EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index c6db4dfd7f5159..55e7b417428c4e 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -86,18 +86,19 @@ enum GPUKind : uint32_t { GK_GFX940 = 68, GK_GFX941 = 69, GK_GFX942 = 70, - - GK_GFX1010 = 71, - GK_GFX1011 = 72, - GK_GFX1012 = 73, - GK_GFX1013 = 74, - GK_GFX1030 = 75, - GK_GFX1031 = 76, - GK_GFX1032 = 77, - GK_GFX1033 = 78, - GK_GFX1034 = 79, - GK_GFX1035 = 80, - GK_GFX1036 = 81, + GK_GFX950 = 71, + + GK_GFX1010 = 72, + GK_GFX1011 = 73, + GK_GFX1012 = 74, + GK_GFX1013 = 75, + GK_GFX1030 = 76, + GK_GFX1031 = 77, + GK_GFX1032 
= 78, + GK_GFX1033 = 79, + GK_GFX1034 = 80, + GK_GFX1035 = 81, + GK_GFX1036 = 82, GK_GFX1100 = 90, GK_GFX1101 = 91, diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 9dc39936ffd8bb..2ffb2ac5e7e453 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -550,6 +550,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx941"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: return "gfx942"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: + return "gfx950"; // AMDGCN GFX10. case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 130b8798ab4a46..ca0ea03452d3be 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -609,6 +609,7 @@ void ScalarBitSetTraits::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH); diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d7feaef8c4a97d..d028c1f5ca7613 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -360,6 +360,12 @@ def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts", "Additional instructions for GFX940+" >; +def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", + "GFX950Insts", + "true", + "Additional instructions for GFX950+" +>; + def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", "GFX10Insts", "true", @@ -1470,6 +1476,14 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureFlatBufferGlobalAtomicFaddF64Inst ]>; +def FeatureISAVersion9_5_Common : FeatureSet< + 
!listconcat(FeatureISAVersion9_4_Common.Features, + [FeatureFP8Insts, + FeatureFP8ConversionInsts, + FeatureCvtFP8VOP1Bug, + FeatureGFX950Insts + ])>; + def FeatureISAVersion9_4_0 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ @@ -1503,6 +1517,8 @@ def FeatureISAVersion9_4_Generic : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [FeatureRequiresCOV6])>; +def FeatureISAVersion9_5_0 : FeatureSet<FeatureISAVersion9_5_Common.Features>; + def FeatureISAVersion10_Common : FeatureSet< [FeatureGFX10, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 067043d290b760..3403cbab526d46 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -204,6 +204,10 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel, FeatureISAVersion9_4_2.Features >; +def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel, + FeatureISAVersion9_5_0.Features +>; + // [gfx900, gfx902, gfx904, gfx906, gfx909, gfx90c] def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel, FeatureISAVersion9_Generic.Features diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6ff964077d8fd0..1b06756a8a1016 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -106,6 +106,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool GFX9Insts = false; bool GFX90AInsts = false; bool GFX940Insts = false; + bool GFX950Insts = false; bool GFX10Insts = false; bool GFX11Insts = false; bool GFX12Insts = false; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 55ba5ebbebb8fd..ffde4d33f1341a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -96,6 +96,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case 
ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: AK = GK_GFX950; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; @@ -182,6 +183,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941; case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942; + case GK_GFX950: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 7dfb8c021a8a5f..b0385915f3042b 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -107,6 +107,7 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx941"}, {"gfx941"}, GK_GFX941, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx942"}, {"gfx942"}, GK_GFX942, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, + {{"gfx950"}, {"gfx950"}, GK_GFX950, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, {{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, {{"gfx1012"}, {"gfx1012"}, GK_GFX1012, 
FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, @@ -262,6 +263,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX940: return {9, 4, 0}; case GK_GFX941: return {9, 4, 1}; case GK_GFX942: return {9, 4, 2}; + case GK_GFX950: return {9, 5, 0}; case GK_GFX1010: return {10, 1, 0}; case GK_GFX1011: return {10, 1, 1}; case GK_GFX1012: return {10, 1, 2}; @@ -361,7 +363,8 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["wavefrontsize32"] = true; Features["wavefrontsize64"] = true; } else if (T.isAMDGCN()) { - switch (parseArchAMDGCN(GPU)) { + AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU); + switch (Kind) { case GK_GFX1201: case GK_GFX1200: case GK_GFX12_GENERIC: @@ -466,12 +469,16 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["s-memtime-inst"] = true; Features["gws"] = true; break; + case GK_GFX950: + Features["gfx950-insts"] = true; + [[fallthrough]]; case GK_GFX942: case GK_GFX941: case GK_GFX940: Features["fp8-insts"] = true; Features["fp8-conversion-insts"] = true; - Features["xf32-insts"] = true; + if (Kind != GK_GFX950) + Features["xf32-insts"] = true; [[fallthrough]]; case GK_GFX9_4_GENERIC: Features["gfx940-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 1c9f35dd45feeb..425fc5884cec7f 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s +; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s ; TODO: Add global-isel when it can support bf16 @@ -198,19 +199,33 @@ entry: } define amdgpu_ps void 
@fptrunc_f32_to_bf16(float %a, ptr %out) { -; GCN-LABEL: fptrunc_f32_to_bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v1, v1, v0, s0 -; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f32_to_bf16: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_mov_b32_e32 v3, v2 +; GFX-940-NEXT: v_mov_b32_e32 v2, v1 +; GFX-940-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v1, v1, v0, s0 +; GFX-940-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX-940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f32_to_bf16: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_mov_b32_e32 v3, v2 +; GFX-950-NEXT: v_mov_b32_e32 v2, v1 +; GFX-950-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v1, v1, v0, s0 +; GFX-950-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX-950-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.cvt = fptrunc float %a to bfloat store bfloat %a.cvt, ptr %out @@ -218,20 +233,35 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { -; GCN-LABEL: fptrunc_f32_to_bf16_abs: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 -; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 
v4, v4, v1, s0 -; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f32_to_bf16_abs: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_mov_b32_e32 v3, v2 +; GFX-940-NEXT: v_mov_b32_e32 v2, v1 +; GFX-940-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; GFX-940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-940-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-940-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f32_to_bf16_abs: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_mov_b32_e32 v3, v2 +; GFX-950-NEXT: v_mov_b32_e32 v2, v1 +; GFX-950-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; GFX-950-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-950-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-950-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.abs = call float @llvm.fabs.f32(float %a) %a.cvt = fptrunc float %a.abs to bfloat @@ -240,20 +270,35 @@ entry: } define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { -; GCN-LABEL: fptrunc_f32_to_bf16_neg: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 -; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v1, s0 -; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 -; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 -; GCN-NEXT: s_nop 1 -; 
GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f32_to_bf16_neg: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_mov_b32_e32 v3, v2 +; GFX-940-NEXT: v_mov_b32_e32 v2, v1 +; GFX-940-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GFX-940-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-940-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-940-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f32_to_bf16_neg: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_mov_b32_e32 v3, v2 +; GFX-950-NEXT: v_mov_b32_e32 v2, v1 +; GFX-950-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GFX-950-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v1, s0 +; GFX-950-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX-950-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.neg = fneg float %a %a.cvt = fptrunc float %a.neg to bfloat @@ -262,29 +307,53 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { -; GCN-LABEL: fptrunc_f64_to_bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GCN-NEXT: v_and_b32_e32 v7, 1, v6 -; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GCN-NEXT: v_add_u32_e32 v4, v6, v4 -; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GCN-NEXT: s_brev_b32 s0, 1 -; GCN-NEXT: 
v_and_or_b32 v5, v1, s0, v4 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 -; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f64_to_bf16: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX-940-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-940-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX-940-NEXT: s_brev_b32 s0, 1 +; GFX-940-NEXT: v_and_or_b32 v5, v1, s0, v4 +; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f64_to_bf16: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX-950-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX-950-NEXT: s_brev_b32 s0, 1 +; 
GFX-950-NEXT: v_and_or_b32 v5, v1, s0, v4 +; GFX-950-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-950-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.cvt = fptrunc double %a to bfloat store bfloat %a.cvt, ptr %out @@ -292,30 +361,55 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { -; GCN-LABEL: fptrunc_f64_to_bf16_neg: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GCN-NEXT: v_and_b32_e32 v8, 1, v7 -; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GCN-NEXT: v_add_u32_e32 v4, v7, v4 -; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc -; GCN-NEXT: s_brev_b32 s4, 1 -; GCN-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: v_and_or_b32 v5, v6, s4, v4 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 -; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GCN-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f64_to_bf16_neg: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-940-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] 
+; GFX-940-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-940-NEXT: s_brev_b32 s4, 1 +; GFX-940-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-940-NEXT: v_and_or_b32 v5, v6, s4, v4 +; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-940-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: fptrunc_f64_to_bf16_neg: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-950-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: s_brev_b32 s4, 1 +; GFX-950-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GFX-950-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-950-NEXT: v_and_or_b32 v5, v6, s4, v4 +; GFX-950-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-950-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-950-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.neg = fneg double %a %a.cvt = fptrunc double %a.neg to bfloat @@ -324,30 +418,55 @@ entry: } define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { -; GCN-LABEL: fptrunc_f64_to_bf16_abs: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_cvt_f32_f64_e64 v7, 
|v[0:1]| -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GCN-NEXT: v_and_b32_e32 v8, 1, v7 -; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GCN-NEXT: v_add_u32_e32 v4, v7, v4 -; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc -; GCN-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: s_brev_b32 s0, 1 -; GCN-NEXT: v_and_or_b32 v5, v6, s0, v4 -; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 -; GCN-NEXT: s_movk_i32 s0, 0x7fff -; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 -; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 -; GCN-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 -; GCN-NEXT: s_endpgm +; GFX-940-LABEL: fptrunc_f64_to_bf16_abs: +; GFX-940: ; %bb.0: ; %entry +; GFX-940-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-940-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-940-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-940-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-940-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-940-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-940-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-940-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-940-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GFX-940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-940-NEXT: s_brev_b32 s0, 1 +; GFX-940-NEXT: v_and_or_b32 v5, v6, s0, v4 +; GFX-940-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-940-NEXT: s_movk_i32 s0, 0x7fff +; GFX-940-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-940-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-940-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| +; GFX-940-NEXT: s_nop 1 +; GFX-940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-940-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GFX-940-NEXT: s_endpgm +; +; GFX-950-LABEL: 
fptrunc_f64_to_bf16_abs: +; GFX-950: ; %bb.0: ; %entry +; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-950-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v4, v7, v4 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GFX-950-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX-950-NEXT: s_brev_b32 s0, 1 +; GFX-950-NEXT: v_and_or_b32 v5, v6, s0, v4 +; GFX-950-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GFX-950-NEXT: s_movk_i32 s0, 0x7fff +; GFX-950-NEXT: v_add3_u32 v4, v4, v5, s0 +; GFX-950-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX-950-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX-950-NEXT: flat_store_short_d16_hi v[2:3], v0 +; GFX-950-NEXT: s_endpgm entry: %a.abs = call double @llvm.fabs.f64(double %a) %a.cvt = fptrunc double %a.abs to bfloat diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll index 4eac26e853c2a0..b64968c9336b93 100644 --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -80,6 +80,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX942-NOXNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX942-XNACK %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX950-NOXNACK %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX950-XNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX1010 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX1010-NOXNACK %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX1010-XNACK %s @@ -180,6 +183,9 @@ ; GFX942: .amdgcn_target "amdgcn-amd-amdhsa--gfx942" ; GFX942-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack-" ; GFX942-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack+" +; GFX950: .amdgcn_target "amdgcn-amd-amdhsa--gfx950" +; GFX950-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack-" +; GFX950-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack+" ; GFX1010: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010" ; GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack-" ; GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack+" diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll index f1f4edb94a6178..99344f16d4cd68 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -57,6 +57,7 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX940 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx941 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX941 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX942 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX950 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1010 < %s | llvm-readobj --file-header - | FileCheck 
--check-prefixes=ALL,ARCH-GCN,GFX1010 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1011 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1012 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1012 %s @@ -139,6 +140,7 @@ ; GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40) ; GFX941: EF_AMDGPU_MACH_AMDGCN_GFX941 (0x4B) ; GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C) +; GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) ; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34) ; GFX1012: EF_AMDGPU_MACH_AMDGCN_GFX1012 (0x35) diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll index 961b89ab28f623..3ad2a9df764be5 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll @@ -12,6 +12,9 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s + ; NO-SRAM-ECC-GFX906: Flags [ ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_FEATURE_XNACK_V3 (0x100) ; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) @@ -44,6 +47,11 @@ ; SRAM-ECC-GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40) ; SRAM-ECC-GFX940: ] +; SRAM-ECC-GFX950: Flags [ +; SRAM-ECC-GFX950: EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200) +; SRAM-ECC-GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) +; SRAM-ECC-GFX950: ] + define amdgpu_kernel void 
@elf_header() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 27282a453075b3..08122cd0d89eab 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s define float @v_fmaximum3_f32(float %a, float %b, float %c) { ; GFX12-LABEL: v_fmaximum3_f32: @@ -19,9 +20,11 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -46,9 +49,11 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -71,10 +76,13 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre ; GFX9-NEXT: v_max_f32_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 
0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -101,9 +109,11 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -129,9 +139,11 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call float @llvm.fabs.f32(float %b) @@ -157,9 +169,11 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call float @llvm.fabs.f32(float %c) @@ -185,9 +199,11 @@ define float 
@v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -215,9 +231,11 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a @@ -245,9 +263,11 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -278,9 +298,11 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float 
%a @@ -306,9 +328,11 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg float %b @@ -334,9 +358,11 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg float %c @@ -362,9 +388,11 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v2, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float 8.0, float %b) @@ -389,9 +417,11 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float 
%a, float %b) @@ -416,9 +446,11 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) { ; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float 4.0, float %b) @@ -443,9 +475,11 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) { ; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -472,9 +506,11 @@ define float @v_fmaximum3_f32_const1_const2(float %a) { ; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f32_e32 v1, 0x41800000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float 8.0) @@ -500,15 +536,19 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX9-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; 
GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v5, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) @@ -534,15 +574,19 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX9-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v0, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) @@ -568,15 +612,19 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v6, |v1|, |v3| ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v2| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v0, |v4| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v1, |v5| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) @@ -605,15 +653,19 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v6, -v1, -v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v0, -v4 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_max_f32_e64 v2, v1, -v5 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x float> %a @@ -642,15 +694,19 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX9-NEXT: v_max_f32_e32 v4, 2.0, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v4, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> 
@llvm.maximum.v2f32(<2 x float> %a, <2 x float> ) @@ -676,15 +732,19 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX9-NEXT: v_max_f32_e32 v4, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) @@ -711,21 +771,27 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX9-NEXT: v_max_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v6, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v7, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v8, v2 ; GFX9-NEXT: 
v_cmp_o_f32_e32 vcc, v8, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) @@ -752,21 +818,27 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX9-NEXT: v_max_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v0, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v1, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v2, v8 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) @@ -793,21 +865,27 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v9, |v2|, |v5| ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e64 v5, |v1|, |v4| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e64 v4, |v0|, |v3| +; 
GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v0, |v6| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v1, |v7| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v2, |v8| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) @@ -837,21 +915,27 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_max_f32_e64 v9, -v2, -v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_max_f32_e64 v5, -v1, -v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_max_f32_e64 v4, -v0, -v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v0, -v6 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v1, -v7 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_max_f32_e64 v3, v2, -v8 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x float> %a @@ -881,21 +965,27 @@ define <3 x float> 
@v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> ) @@ -922,21 +1012,27 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX9-NEXT: v_max_f32_e32 v6, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc ; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; 
GFX9-NEXT: v_max_f32_e32 v3, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) @@ -962,9 +1058,11 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -989,9 +1087,11 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1016,11 +1116,14 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part 
epilog %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1048,9 +1151,11 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1076,9 +1181,11 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1104,9 +1211,11 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1132,9 +1241,11 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, 
v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1162,9 +1273,11 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1192,9 +1305,11 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1225,9 +1340,11 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1253,9 +1370,11 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, 
vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b @@ -1281,9 +1400,11 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c @@ -1309,9 +1430,11 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v2, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half 8.0, half %b) @@ -1336,9 +1459,11 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) { ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1363,9 +1488,11 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 
s[30:31] %max0 = call half @llvm.maximum.f16(half 4.0, half %b) @@ -1390,9 +1517,11 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) { ; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_max_f16_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half %b) @@ -1419,9 +1548,11 @@ define half @v_fmaximum3_f16_const1_const2(half %a) { ; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f16_e32 v1, 0x4c00, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.maximum.f16(half %a, half 8.0) @@ -1448,19 +1579,23 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0) @@ -1486,19 +1621,23 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1527,22 +1666,25 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 ; GFX9-NEXT: v_pk_max_f16 v3, v3, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 
0x5040100 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v0, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) @@ -1571,19 +1713,23 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 
v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b @@ -1610,21 +1756,25 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v3, v0, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> ) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1650,19 +1800,23 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX9-NEXT: v_pk_max_f16 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 
src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> ) @@ -1690,29 +1844,35 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v5, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: 
v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0) @@ -1740,29 +1900,35 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: 
v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1799,33 +1965,37 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v6, 
v9, v1, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v6, v9, v1, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_pk_max_f16 v6, v6, v10 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) @@ -1856,29 +2026,35 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] 
neg_hi:[0,1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b @@ -1907,29 +2083,34 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 +; GFX9-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x7e00 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: s_mov_b32 s5, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v4, v5, v0, s1 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v2 -; GFX9-NEXT: s_movk_i32 s4, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4 +; GFX9-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v3 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> ) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1957,29 +2138,35 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX9-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; 
GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> ) @@ -2007,33 +2194,41 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v5, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: 
s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0) @@ -2061,33 +2256,41 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 
v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2124,37 +2327,43 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX9-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX9-NEXT: v_perm_b32 v2, v8, v1, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4 +; GFX9-NEXT: v_perm_b32 v6, v9, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v6, v6, v10 
+; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) @@ -2185,33 +2394,41 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b @@ -2240,35 +2457,41 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4 +; GFX9-NEXT: v_perm_b32 v4, 
v8, v1, s0 ; GFX9-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v8, v5, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_pk_max_f16 v8, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> ) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2296,33 +2519,41 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX9-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 
0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> ) @@ -2346,12 +2577,14 @@ define double @v_fmaximum3_f64(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2377,12 +2610,14 @@ define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2404,19 +2639,20 @@ define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, do ; ; GFX9-LABEL: s_fmaximum3_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX9-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call double @llvm.maximum.f64(double %a, double %b) %max1 = call double @llvm.maximum.f64(double %max0, double %c) @@ -2447,12 +2683,14 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] 
+; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2479,12 +2717,14 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2511,12 +2751,14 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2543,12 +2785,14 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], 
|v[0:1]|, |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2577,12 +2821,14 @@ define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2611,12 +2857,14 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 
s[30:31] @@ -2648,12 +2896,14 @@ define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2680,12 +2930,14 @@ define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2712,12 +2964,14 @@ define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], 
-v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2743,15 +2997,17 @@ define double @v_fmaximum3_f64_const0(double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_const0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2777,14 +3033,15 @@ define double @v_fmaximum3_f64__const2(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2810,12 +3067,14 @@ define double 
@v_fmaximum3_f64_inlineimm0(double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], 4.0 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2841,12 +3100,14 @@ define double @v_fmaximum3_f64__inlineimm(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], 4.0 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2871,17 +3132,18 @@ define double @v_fmaximum3_f64_const1_const2(double %a) { ; GFX9-LABEL: v_fmaximum3_f64_const1_const2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40300000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, 
v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40300000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2909,9 +3171,11 @@ define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c) ; GFX9-NEXT: v_max_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_max_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -2935,11 +3199,14 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f32__multi_use(float inreg %a, float ; GFX9-NEXT: v_max_f32_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.maximum.f32(float %a, float %b) @@ -2973,9 +3240,11 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) { ; GFX9-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: 
v_max_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3002,11 +3271,13 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in ; GFX9-NEXT: v_max_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_max_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 @@ -3043,19 +3314,23 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x 
half> %b) %max1 = call <2 x half> @llvm.maximum.f16(<2 x half> %max0, <2 x half> %c) @@ -3080,12 +3355,14 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3095,3 +3372,6 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 ret <2 x double> %insert.1 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX940: {{.*}} +; GFX950: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index d9ba2de48bb010..43293512c8c21d 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s define float @v_fminimum3_f32(float %a, float %b, float %c) { ; GFX12-LABEL: v_fminimum3_f32: @@ -19,9 +20,11 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -46,9 +49,11 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -71,10 +76,13 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre ; GFX9-NEXT: v_min_f32_e32 
v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -101,9 +109,11 @@ define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -129,9 +139,11 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call float @llvm.fabs.f32(float %b) @@ -157,9 +169,11 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call float 
@llvm.fabs.f32(float %c) @@ -185,9 +199,11 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -215,9 +231,11 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a @@ -245,9 +263,11 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) @@ -278,9 +298,11 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, 
vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a @@ -306,9 +328,11 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg float %b @@ -334,9 +358,11 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg float %c @@ -362,9 +388,11 @@ define float @v_fminimum3_f32_const0(float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v2, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float 8.0, float %b) @@ -389,9 +417,11 @@ define float @v_fminimum3_f32__const2(float %a, float %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: 
s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -416,9 +446,11 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) { ; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float 4.0, float %b) @@ -443,9 +475,11 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) { ; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -472,9 +506,11 @@ define float @v_fminimum3_f32_const1_const2(float %a) { ; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float 8.0) @@ -500,15 +536,19 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX9-NEXT: v_min_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 
v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v5, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) @@ -534,15 +574,19 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX9-NEXT: v_min_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v0, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) @@ -568,15 +612,19 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v6, |v1|, |v3| ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e64 v3, |v0|, |v2| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v0, |v4| ; GFX9-NEXT: v_cmp_o_f32_e64 
vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v1, |v5| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) @@ -605,15 +653,19 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v6, -v1, -v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e64 v3, -v0, -v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v0, -v4 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc ; GFX9-NEXT: v_min_f32_e64 v2, v1, -v5 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x float> %a @@ -642,15 +694,19 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX9-NEXT: v_min_f32_e32 v4, 2.0, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v4, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: 
s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> ) @@ -676,15 +732,19 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX9-NEXT: v_min_f32_e32 v4, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) @@ -711,21 +771,27 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX9-NEXT: v_min_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v6, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v7, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: 
v_min_f32_e32 v3, v8, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) @@ -752,21 +818,27 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX9-NEXT: v_min_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v0, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v1, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v2, v8 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) @@ -793,21 +865,27 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v9, |v2|, |v5| ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e64 v5, |v1|, |v4| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: 
v_min_f32_e64 v4, |v0|, |v3| +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v0, |v6| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v1, |v7| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v2, |v8| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) @@ -837,21 +915,27 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX9-NEXT: v_min_f32_e64 v9, -v2, -v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_min_f32_e64 v5, -v1, -v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_min_f32_e64 v4, -v0, -v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v0, -v6 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v1, -v7 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc ; GFX9-NEXT: v_min_f32_e64 v3, v2, -v8 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x float> %a @@ -881,21 
+965,27 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> ) @@ -922,21 +1012,27 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX9-NEXT: v_min_f32_e32 v6, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc ; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) @@ -962,9 +1058,11 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -989,9 +1087,11 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1016,11 +1116,14 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg % ; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; 
GFX9-NEXT: ; return to shader part epilog %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1048,9 +1151,11 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, |v0|, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1076,9 +1181,11 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, v0, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fabs = call half @llvm.fabs.f16(half %b) @@ -1104,9 +1211,11 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fabs = call half @llvm.fabs.f16(half %c) @@ -1132,9 +1241,11 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, |v0|, |v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1162,9 +1273,11 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, -v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1192,9 +1305,11 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, -|v0|, -|v1| ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, -|v2| ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call half @llvm.fabs.f16(half %a) @@ -1225,9 +1340,11 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, -v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg half %a @@ -1253,9 +1370,11 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e64 v3, v0, -v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %b.fneg = fneg half %b @@ -1281,9 +1400,11 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %c.fneg = fneg half %c @@ -1309,9 +1430,11 @@ define half @v_fminimum3_f16_const0(half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v2, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 8.0, half %b) @@ -1336,9 +1459,11 @@ define half @v_fminimum3_f16__const2(half %a, half %b) { ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1363,9 +1488,11 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v2, 4.0, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, 
v3, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half 4.0, half %b) @@ -1390,9 +1517,11 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) { ; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: v_min_f16_e32 v1, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half %b) @@ -1419,9 +1548,11 @@ define half @v_fminimum3_f16_const1_const2(half %a) { ; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f16_e32 v1, 0x4c00, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call half @llvm.minimum.f16(half %a, half 8.0) @@ -1448,19 +1579,23 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 
src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0) @@ -1486,19 +1621,23 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1527,22 +1666,25 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 ; GFX9-NEXT: v_pk_min_f16 v3, v3, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 
src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v0, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) @@ -1571,19 +1713,23 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: 
s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b @@ -1610,21 +1756,25 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v3, v0, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> ) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) @@ -1650,19 +1800,23 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX9-NEXT: v_pk_min_f16 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; 
GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> ) @@ -1690,29 +1844,35 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v5, v1 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 
v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0) @@ -1740,29 +1900,35 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, 
vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1799,33 +1965,37 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 ; GFX9-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v2, v8, v0, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: 
v_perm_b32 v6, v9, v1, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v6, v9, v1, s0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_pk_min_f16 v6, v6, v10 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) @@ -1856,29 +2026,35 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 
neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b @@ -1907,29 +2083,34 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 +; GFX9-NEXT: s_mov_b32 s1, 0x5040100 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x7e00 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: s_mov_b32 s5, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v4, v5, v0, s1 ; GFX9-NEXT: v_pk_min_f16 v4, v4, v2 -; GFX9-NEXT: s_movk_i32 s4, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4 +; GFX9-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX9-NEXT: v_pk_min_f16 v7, v7, v3 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> ) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) @@ -1957,29 +2138,35 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX9-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s0 ; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, 
vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> ) @@ -2007,33 +2194,41 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v5, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; 
GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0) @@ -2061,33 +2256,41 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2124,37 +2327,43 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 ; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 ; GFX9-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX9-NEXT: v_perm_b32 v2, v8, v1, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4 +; GFX9-NEXT: v_perm_b32 v6, v9, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: 
v_pk_min_f16 v6, v6, v10 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) @@ -2185,33 +2394,41 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; 
GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b @@ -2240,35 +2457,41 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4 +; 
GFX9-NEXT: v_perm_b32 v4, v8, v1, s0 ; GFX9-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v8, v5, v0, s0 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_pk_min_f16 v8, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> ) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) @@ -2296,33 +2519,41 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX9-NEXT: v_pk_min_f16 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; 
GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4 +; GFX9-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4 +; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> ) @@ -2346,12 +2577,14 @@ define double @v_fminimum3_f64(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2377,12 +2610,14 @@ define double @v_fminimum3_f64_commute(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2404,19 +2639,20 @@ define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, do ; ; GFX9-LABEL: s_fminimum3_f64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX9-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call double @llvm.minimum.f64(double %a, double %b) %max1 = call double @llvm.minimum.f64(double %max0, double %c) @@ -2447,12 +2683,14 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: 
v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2479,12 +2717,14 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2511,12 +2751,14 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2543,12 +2785,14 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2577,12 +2821,14 @@ define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2611,12 +2857,14 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]| ; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, 
v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2648,12 +2896,14 @@ define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2680,12 +2930,14 @@ define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], -v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2712,12 +2964,14 @@ define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5] ; 
GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2743,15 +2997,17 @@ define double @v_fminimum3_f64_const0(double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_const0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2777,14 +3033,15 @@ define double @v_fminimum3_f64__const2(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ 
-2810,12 +3067,14 @@ define double @v_fminimum3_f64_inlineimm0(double %b, double %c) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], 4.0 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2841,12 +3100,14 @@ define double @v_fminimum3_f64__inlineimm(double %a, double %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2871,17 +3132,18 @@ define double @v_fminimum3_f64_const1_const2(double %a) { ; GFX9-LABEL: v_fminimum3_f64_const1_const2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40200000 +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40300000 +; 
GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40300000 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[0:1] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2909,9 +3171,11 @@ define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c) ; GFX9-NEXT: v_min_f32_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; GFX9-NEXT: v_min_f32_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -2935,11 +3199,14 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float ; GFX9-NEXT: v_min_f32_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f32_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %max0 = call float @llvm.minimum.f32(float %a, float %b) @@ -2973,9 +3240,11 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) { ; GFX9-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 
v0, v4, v3, vcc ; GFX9-NEXT: v_min_f16_e32 v1, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3002,11 +3271,13 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in ; GFX9-NEXT: v_min_f16_e32 v1, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX9-NEXT: v_min_f16_e32 v1, s2, v0 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 @@ -3043,19 +3314,23 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v5, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> 
@llvm.minimum.f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.f16(<2 x half> %max0, <2 x half> %c) @@ -3080,12 +3355,14 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc ; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3095,3 +3372,6 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1 ret <2 x double> %insert.1 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX940: {{.*}} +; GFX950: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index 8313f5b655efba..bd35ee3f009736 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index d90c4a75ac5dea..e782f53cee6087 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -30,24 +30,24 @@ define half @v_maximum_f16(half %src0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16: ; GFX10: ; %bb.0: @@ -102,12 +102,6 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) { ; 
GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -156,24 +150,24 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nsz: ; GFX10: ; %bb.0: @@ -228,12 +222,6 @@ define half @v_maximum_f16__nnan_nsz(half 
%src0, half %src1) { ; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -284,26 +272,26 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; 
GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src0: ; GFX10: ; %bb.0: @@ -365,26 +353,26 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f16__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX9-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f16__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX940-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f16__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX900-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f16__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX950-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_src1: ; GFX10: ; %bb.0: @@ -453,34 +441,34 @@ define void @s_maximum_f16(half inreg %src0, 
half inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_max_f16_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_max_f16_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_max_f16_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_max_f16_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: 
;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_f16: ; GFX10: ; %bb.0: @@ -567,35 +555,35 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 
0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16: ; GFX10: ; %bb.0: @@ -668,12 +656,6 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -736,35 +718,35 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, 
v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16__nsz: ; GFX10: ; %bb.0: @@ -837,12 +819,6 @@ define <2 x half> 
@v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -917,50 +893,50 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_lshr_b32 s4, s17, 16 -; GFX9-NEXT: v_pk_max_f16 v1, s16, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: s_lshr_b32 s5, s16, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: s_lshr_b32 s1, s1, 16 -; GFX940-NEXT: v_pk_max_f16 v1, s0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_lshr_b32 s0, s0, 16 -; GFX940-NEXT: v_mov_b32_e32 v3, s1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 
-; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_mov_b32_e32 v1, s17 +; GFX900-NEXT: s_lshr_b32 s4, s17, 16 +; GFX900-NEXT: v_pk_max_f16 v1, s16, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: s_lshr_b32 s5, s16, 16 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: s_lshr_b32 s1, s1, 16 +; GFX950-NEXT: v_pk_max_f16 v1, s0, v1 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_lshr_b32 s0, s0, 16 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: @@ -1065,41 +1041,41 @@ define <3 x half> 
@v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, 
v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16: ; GFX10: ; %bb.0: @@ -1187,13 +1163,6 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1269,41 +1238,41 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16__nsz: ; GFX10: ; %bb.0: @@ -1391,13 +1360,6 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1487,51 +1449,51 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 
v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 
src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16: ; GFX10: ; %bb.0: @@ -1635,13 +1597,6 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: 
s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1731,51 +1686,51 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; 
GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16__nsz: ; GFX10: ; %bb.0: @@ -1879,13 +1834,6 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2023,83 +1971,83 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v8f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX9-NEXT: v_pk_max_f16 v7, v2, v6 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX9-NEXT: v_pk_max_f16 v6, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, 
v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX9-NEXT: v_pk_max_f16 v5, v0, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v7, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v8f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v7, v2, v6 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX940-NEXT: v_perm_b32 v3, v3, v10, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v6, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX940-NEXT: v_perm_b32 v2, v2, v8, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v5, v0, v4 -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX940-NEXT: v_perm_b32 v1, v1, v7, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v6, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v8f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v8, v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX900-NEXT: v_pk_max_f16 v7, v2, v6 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX900-NEXT: v_pk_max_f16 v6, v1, v5 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX900-NEXT: v_pk_max_f16 v5, v0, v4 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 +; GFX900-NEXT: 
v_perm_b32 v2, v2, v8, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v8f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v8, v3, v7 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v7, v2, v6 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX950-NEXT: v_perm_b32 v3, v3, v10, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v6, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX950-NEXT: v_perm_b32 v2, v2, v8, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v5, v0, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX950-NEXT: v_perm_b32 v1, v1, v7, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v6, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f16: ; GFX10: ; %bb.0: @@ -2400,147 +2348,147 
@@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v16f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v16, v7, v15 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX9-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX9-NEXT: v_pk_max_f16 v14, v5, v13 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX9-NEXT: v_pk_max_f16 v13, v4, v12 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX9-NEXT: v_pk_max_f16 v12, v3, v11 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX9-NEXT: v_pk_max_f16 v11, v2, v10 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX9-NEXT: 
v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX9-NEXT: v_pk_max_f16 v10, v1, v9 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX9-NEXT: v_pk_max_f16 v9, v0, v8 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v10, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v12, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v13, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v14, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v15, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v18, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v16f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v16, v7, v15 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX940-NEXT: v_perm_b32 v7, v7, v18, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX940-NEXT: 
v_lshrrev_b32_e32 v15, 16, v15 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v14, v5, v13 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX940-NEXT: v_perm_b32 v6, v6, v16, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v13, v4, v12 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX940-NEXT: v_perm_b32 v5, v5, v15, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v12, v3, v11 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX940-NEXT: v_perm_b32 v4, v4, v14, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v11, v2, v10 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX940-NEXT: v_perm_b32 v3, v3, v13, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v10, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX940-NEXT: v_perm_b32 v2, v2, v12, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_max_f16 v9, v0, v8 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX940-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v10, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v16f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v16, v7, v15 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX900-NEXT: v_pk_max_f16 v15, v6, v14 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX900-NEXT: v_pk_max_f16 v14, v5, v13 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX900-NEXT: v_pk_max_f16 v13, v4, v12 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 
16, v13 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX900-NEXT: v_pk_max_f16 v12, v3, v11 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX900-NEXT: v_pk_max_f16 v11, v2, v10 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX900-NEXT: v_pk_max_f16 v10, v1, v9 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX900-NEXT: v_pk_max_f16 v9, v0, v8 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v12, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v13, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v14, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v15, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v18, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v16f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v16, v7, v15 +; GFX950-NEXT: 
v_mov_b32_e32 v17, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v15, v6, v14 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX950-NEXT: v_perm_b32 v7, v7, v18, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v14, v5, v13 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX950-NEXT: v_perm_b32 v6, v6, v16, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v13, v4, v12 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX950-NEXT: v_perm_b32 v5, v5, v15, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v12, v3, v11 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX950-NEXT: v_perm_b32 v4, v4, v14, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v11, v2, v10 +; 
GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX950-NEXT: v_perm_b32 v3, v3, v13, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v10, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX950-NEXT: v_perm_b32 v2, v2, v12, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_max_f16 v9, v0, v8 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX950-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v10, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f16: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 48851cb030233d..c1fdfa2c4cf9ab 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | 
FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -26,24 +27,24 @@ define float @v_maximum_f32(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, 
vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32: ; GFX10: ; %bb.0: @@ -94,12 +95,6 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -144,24 +139,24 @@ define float @v_maximum_f32__nsz(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; 
GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nsz: ; GFX10: ; %bb.0: @@ -212,12 +207,6 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) { ; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -264,26 +253,26 @@ define float @v_maximum_f32__nnan_src0(float %arg0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32__nnan_src0: +; GFX950: ; %bb.0: +; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nnan_src0: ; GFX10: ; %bb.0: @@ -341,26 +330,26 @@ define float @v_maximum_f32__nnan_src1(float %src0, float %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f32__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f32__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX940-NEXT: v_max_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f32__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f32__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX950-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, 
v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nnan_src1: ; GFX10: ; %bb.0: @@ -424,32 +413,32 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_max_f32_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_max_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_max_f32_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_max_f32_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, 
v2, v1, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_f32: ; GFX10: ; %bb.0: @@ -517,31 +506,31 @@ define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: 
v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32: ; GFX10: ; %bb.0: @@ -601,13 +590,6 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -660,31 +642,31 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; 
GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32__nsz: ; GFX10: ; %bb.0: @@ -744,13 +726,6 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -813,40 +788,40 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s19 -; GFX9-NEXT: v_max_f32_e32 v1, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; 
GFX9-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_max_f32_e32 v3, s16, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_max_f32_e32 v1, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_max_f32_e32 v3, s0, v0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s19 +; GFX900-NEXT: v_max_f32_e32 v1, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_max_f32_e32 v3, s16, v0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s3 +; GFX950-NEXT: v_max_f32_e32 v1, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, s2 +; GFX950-NEXT: v_max_f32_e32 v3, s0, v0 +; 
GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_v2f32: ; GFX10: ; %bb.0: @@ -927,38 +902,38 @@ define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32: ; GFX10: ; %bb.0: @@ -1028,14 +1003,6 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1097,38 +1064,38 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX9-NEXT: 
v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX950-NEXT: 
s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_max_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32__nsz: ; GFX10: ; %bb.0: @@ -1198,14 +1165,6 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr ; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1273,45 +1232,45 @@ define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX940-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 
vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32: ; GFX10: ; %bb.0: @@ -1391,15 +1350,6 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1469,45 +1419,45 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v8, v0, v4 -; 
GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX950-NEXT: s_nop 
1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_max_f32_e32 v4, v3, v7 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32__nsz: ; GFX10: ; %bb.0: @@ -1587,15 +1537,6 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1689,73 +1630,73 @@ define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v8f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v16, v0, v8 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v1, v9 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v2, v10 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v3, v11 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v4, v12 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v5, v13 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; 
GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v6, v14 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX9-NEXT: v_max_f32_e32 v8, v7, v15 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v8f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v16, v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX940-NEXT: v_max_f32_e32 v8, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v2, v10 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v3, v11 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v4, v12 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v5, v13 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v6, v14 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX940-NEXT: v_max_f32_e32 v8, v7, v15 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v8f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v16, v0, v8 +; 
GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v1, v9 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v2, v10 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v3, v11 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v4, v12 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v5, v13 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v6, v14 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX900-NEXT: v_max_f32_e32 v8, v7, v15 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v8f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v16, v0, v8 +; GFX950-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX950-NEXT: v_max_f32_e32 v8, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v2, v10 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v3, v11 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v4, v12 +; 
GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v5, v13 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v6, v14 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX950-NEXT: v_max_f32_e32 v8, v7, v15 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f32: ; GFX10: ; %bb.0: @@ -1968,136 +1909,136 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v16f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v31, s30, 0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_max_f32_e32 v18, v13, v29 -; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX9-NEXT: v_writelane_b32 v31, s31, 1 -; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 -; GFX9-NEXT: 
v_max_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_max_f32_e32 v19, v14, v30 -; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] -; GFX9-NEXT: v_readlane_b32 s31, v31, 1 -; GFX9-NEXT: v_readlane_b32 s30, v31, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v18, v15, v16 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v16f32: -; GFX940: 
; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_mov_b32_e32 v32, 0x7fc00000 -; GFX940-NEXT: v_max_f32_e32 v33, v0, v16 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX940-NEXT: v_max_f32_e32 v34, v1, v17 -; GFX940-NEXT: v_max_f32_e32 v35, v2, v18 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX940-NEXT: v_max_f32_e32 v36, v3, v19 -; GFX940-NEXT: v_max_f32_e32 v37, v4, v20 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX940-NEXT: v_max_f32_e32 v38, v5, v21 -; GFX940-NEXT: v_max_f32_e32 v39, v6, v22 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX940-NEXT: v_max_f32_e32 v48, v7, v23 -; GFX940-NEXT: v_max_f32_e32 v49, v8, v24 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX940-NEXT: v_max_f32_e32 v50, v9, v25 -; GFX940-NEXT: v_max_f32_e32 v51, v10, v26 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX940-NEXT: v_max_f32_e32 v52, v11, v27 -; GFX940-NEXT: v_max_f32_e32 v53, v12, v28 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX940-NEXT: v_max_f32_e32 v54, v13, v29 -; GFX940-NEXT: v_max_f32_e32 v55, v14, v30 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f32_e32 v16, v15, v31 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v10, v32, v51, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v32, v52, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v16f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v31, s30, 0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_max_f32_e32 v18, v13, v29 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 +; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX900-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX900-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX900-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX900-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX900-NEXT: 
v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_readlane_b32 s31, v31, 1 +; GFX900-NEXT: v_readlane_b32 s30, v31, 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_max_f32_e32 v18, v15, v16 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v16f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v32, 0x7fc00000 +; GFX950-NEXT: v_max_f32_e32 v33, v0, v16 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 +; GFX950-NEXT: v_max_f32_e32 v34, v1, v17 +; GFX950-NEXT: v_max_f32_e32 v35, v2, v18 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX950-NEXT: v_max_f32_e32 v36, v3, v19 +; GFX950-NEXT: v_max_f32_e32 v37, v4, v20 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 +; GFX950-NEXT: v_max_f32_e32 v38, v5, v21 +; GFX950-NEXT: v_max_f32_e32 v39, v6, v22 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 +; GFX950-NEXT: v_max_f32_e32 v48, v7, v23 +; GFX950-NEXT: v_max_f32_e32 v49, v8, v24 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 +; GFX950-NEXT: v_max_f32_e32 v50, v9, v25 +; GFX950-NEXT: v_max_f32_e32 v51, v10, v26 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 +; GFX950-NEXT: v_max_f32_e32 v52, v11, v27 +; GFX950-NEXT: v_max_f32_e32 v53, v12, v28 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 +; GFX950-NEXT: v_max_f32_e32 v54, v13, v29 +; GFX950-NEXT: v_max_f32_e32 v55, v14, v30 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v16, v15, v31 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc +; GFX950-NEXT: 
v_cmp_o_f32_e32 vcc, v11, v27 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v32, v52, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f32: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index 80a0a194713d90..e354ec6fb3dd78 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -28,26 +29,26 @@ define double @v_maximum_f64(double %src0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; -; GFX9-LABEL: v_maximum_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64: ; GFX10: ; %bb.0: @@ -100,12 +101,6 @@ define double @v_maximum_f64__nnan(double %src0, double %src1) { ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; 
GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -152,26 +147,26 @@ define double @v_maximum_f64__nsz(double %src0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64__nsz: ; GFX10: ; %bb.0: @@ -224,12 
+219,6 @@ define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) { ; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -278,28 +267,28 @@ define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f64__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f64__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f64__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: v_maximum_f64__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64__nnan_src0: ; GFX10: ; %bb.0: @@ -362,28 +351,28 @@ define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_f64__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_f64__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_f64__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX900-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_f64__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f64__nnan_src1: ; GFX10: ; %bb.0: @@ -454,35 +443,35 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: 
v_mov_b32_e32 v1, s19 +; GFX900-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_f64: ; GFX10: ; %bb.0: @@ -555,35 +544,35 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], 
v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64: ; GFX10: ; %bb.0: @@ -648,13 +637,6 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -712,35 +694,35 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v2f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64__nsz: ; GFX10: ; %bb.0: @@ -805,13 +787,6 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> ; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v2f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -883,46 +858,46 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_maximum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1] -; 
GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] -; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:3] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_maximum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[18:19] -; GFX940-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GFX940-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_maximum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s22 +; GFX900-NEXT: v_mov_b32_e32 v4, s20 +; GFX900-NEXT: v_mov_b32_e32 v1, s23 +; GFX900-NEXT: v_mov_b32_e32 v5, s21 +; GFX900-NEXT: v_max_f64 v[2:3], s[18:19], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX900-NEXT: v_max_f64 v[0:1], s[16:17], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] +; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX900-NEXT: 
v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_maximum_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_v2f64: ; GFX10: ; %bb.0: @@ -1012,44 +987,44 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: 
v_maximum_v3f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 
v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64: ; GFX10: ; %bb.0: @@ -1125,14 +1100,6 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src ; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1201,44 +1168,44 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, 
s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v3f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64__nsz: ; GFX10: ; %bb.0: @@ -1314,14 +1281,6 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> ; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v3f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1398,53 +1357,53 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_max_f64 v[10:11], v[4:5], 
v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: 
v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX900-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64: ; GFX10: ; %bb.0: @@ -1532,15 +1491,6 
@@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src ; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1620,53 +1570,53 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v4f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[16:17], v[0:1], 
v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX900-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: v_maximum_v4f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64__nsz: ; GFX10: ; %bb.0: @@ -1754,15 +1704,6 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> ; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_maximum_v4f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_maximum_v4f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1878,89 +1819,89 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc 
; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v8f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_max_f64 v[32:33], v[2:3], v[18:19] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX9-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[16:17] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] -; GFX9-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX9-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX9-NEXT: v_max_f64 v[16:17], v[8:9], v[24:25] -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX9-NEXT: v_max_f64 v[22:23], v[10:11], v[26:27] -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX9-NEXT: v_max_f64 v[24:25], v[12:13], v[28:29] -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[18:19], v[14:15], v[30:31] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] 
-; -; GFX940-LABEL: v_maximum_v8f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_mov_b32_e32 v54, 0x7ff80000 -; GFX940-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX940-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] -; GFX940-NEXT: v_max_f64 v[36:37], v[4:5], v[20:21] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX940-NEXT: v_max_f64 v[38:39], v[6:7], v[22:23] -; GFX940-NEXT: v_max_f64 v[48:49], v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] -; GFX940-NEXT: v_max_f64 v[50:51], v[10:11], v[26:27] -; GFX940-NEXT: v_max_f64 v[52:53], v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX940-NEXT: 
v_cndmask_b32_e32 v15, v17, v54, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v8f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: v_max_f64 v[32:33], v[2:3], v[18:19] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX900-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX900-NEXT: v_max_f64 v[2:3], v[0:1], v[16:17] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX900-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX900-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX900-NEXT: v_max_f64 v[16:17], v[8:9], v[24:25] +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX900-NEXT: v_max_f64 v[22:23], v[10:11], v[26:27] +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX900-NEXT: v_max_f64 v[24:25], v[12:13], v[28:29] +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_max_f64 v[18:19], v[14:15], v[30:31] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX900-NEXT: 
v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v8f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v54, 0x7ff80000 +; GFX950-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] +; GFX950-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] +; GFX950-NEXT: v_max_f64 v[36:37], v[4:5], v[20:21] +; GFX950-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX950-NEXT: v_max_f64 v[38:39], v[6:7], v[22:23] +; GFX950-NEXT: v_max_f64 v[48:49], v[8:9], v[24:25] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] +; GFX950-NEXT: v_max_f64 v[50:51], v[10:11], v[26:27] +; GFX950-NEXT: v_max_f64 v[52:53], v[12:13], v[28:29] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc +; GFX950-NEXT: 
v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f64: ; GFX10: ; %bb.0: @@ -2332,295 +2273,295 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v16f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_writelane_b32 v34, s30, 0 -; GFX9-NEXT: v_writelane_b32 v34, s31, 1 -; GFX9-NEXT: v_writelane_b32 v34, s34, 2 -; GFX9-NEXT: v_writelane_b32 v34, s35, 3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] -; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] -; GFX9-NEXT: v_max_f64 v[8:9], v[8:9], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] -; GFX9-NEXT: v_max_f64 v[10:11], v[10:11], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] -; GFX9-NEXT: v_max_f64 v[12:13], v[12:13], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] -; GFX9-NEXT: v_max_f64 v[14:15], v[14:15], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] -; GFX9-NEXT: v_max_f64 v[16:17], v[16:17], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] -; 
GFX9-NEXT: v_max_f64 v[18:19], v[18:19], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] -; GFX9-NEXT: v_max_f64 v[20:21], v[20:21], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] -; GFX9-NEXT: v_max_f64 v[22:23], v[22:23], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] -; GFX9-NEXT: v_max_f64 v[24:25], v[24:25], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] -; GFX9-NEXT: v_max_f64 v[26:27], v[26:27], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] -; GFX9-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 
s[34:35], v[30:31], v[32:33] -; GFX9-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] -; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX9-NEXT: v_readlane_b32 s35, v34, 3 -; GFX9-NEXT: v_readlane_b32 s34, v34, 2 -; GFX9-NEXT: v_readlane_b32 s31, v34, 1 -; GFX9-NEXT: v_readlane_b32 s30, v34, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_maximum_v16f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse -; GFX940-NEXT: 
v_accvgpr_write_b32 a8, v47 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:16 -; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:12 -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:24 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:20 -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:32 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:28 -; GFX940-NEXT: scratch_load_dword v57, off, s32 offset:8 -; GFX940-NEXT: scratch_load_dword v56, off, s32 offset:4 -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:40 -; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:36 -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:48 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:44 -; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:56 -; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:52 -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 -; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 -; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:72 -; GFX940-NEXT: scratch_load_dword v54, off, s32 offset:68 -; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:80 -; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:76 -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 -; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:96 -; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:92 -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:104 -; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:100 -; GFX940-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse -; 
GFX940-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[58:59], v[2:3], v[36:37] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:112 -; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[60:61], v[4:5], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:120 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:116 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[62:63], v[6:7], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:128 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:124 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_max_f64 v[2:3], v[0:1], v[56:57] -; GFX940-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] -; GFX940-NEXT: v_mov_b32_e32 v0, 0x7ff80000 -; GFX940-NEXT: s_waitcnt vmcnt(23) -; GFX940-NEXT: v_max_f64 v[56:57], v[8:9], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] -; GFX940-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX940-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] -; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_max_f64 v[46:47], v[10:11], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] -; GFX940-NEXT: s_waitcnt vmcnt(19) -; GFX940-NEXT: v_max_f64 v[44:45], v[12:13], v[42:43] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc -; 
GFX940-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] -; GFX940-NEXT: s_waitcnt vmcnt(17) -; GFX940-NEXT: v_max_f64 v[42:43], v[14:15], v[40:41] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] -; GFX940-NEXT: s_waitcnt vmcnt(15) -; GFX940-NEXT: v_max_f64 v[40:41], v[16:17], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] -; GFX940-NEXT: s_waitcnt vmcnt(13) -; GFX940-NEXT: v_max_f64 v[54:55], v[18:19], v[52:53] -; GFX940-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] -; GFX940-NEXT: s_waitcnt vmcnt(11) -; GFX940-NEXT: v_max_f64 v[52:53], v[20:21], v[50:51] -; GFX940-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] -; GFX940-NEXT: s_waitcnt vmcnt(9) -; GFX940-NEXT: v_max_f64 v[50:51], v[22:23], v[34:35] -; GFX940-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] -; GFX940-NEXT: s_waitcnt vmcnt(6) -; GFX940-NEXT: v_max_f64 v[34:35], v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload 
Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(4) -; GFX940-NEXT: v_max_f64 v[32:33], v[26:27], v[36:37] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(2) -; GFX940-NEXT: v_max_f64 v[32:33], v[28:29], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_max_f64 v[32:33], v[30:31], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v16f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 
+; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_writelane_b32 v34, s30, 0 +; GFX900-NEXT: v_writelane_b32 v34, s31, 1 +; GFX900-NEXT: v_writelane_b32 v34, s34, 2 +; GFX900-NEXT: v_writelane_b32 v34, s35, 3 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX900-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX900-NEXT: v_max_f64 v[2:3], v[2:3], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX900-NEXT: v_max_f64 v[4:5], v[4:5], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX900-NEXT: v_max_f64 v[6:7], v[6:7], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX900-NEXT: v_max_f64 v[8:9], v[8:9], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX900-NEXT: v_max_f64 
v[10:11], v[10:11], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX900-NEXT: v_max_f64 v[12:13], v[12:13], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX900-NEXT: v_max_f64 v[14:15], v[14:15], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX900-NEXT: v_max_f64 v[16:17], v[16:17], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX900-NEXT: v_max_f64 v[18:19], v[18:19], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX900-NEXT: v_max_f64 v[20:21], v[20:21], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], 
v[31:32] +; GFX900-NEXT: v_max_f64 v[22:23], v[22:23], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX900-NEXT: v_max_f64 v[24:25], v[24:25], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX900-NEXT: v_max_f64 v[26:27], v[26:27], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] +; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX900-NEXT: 
v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX900-NEXT: v_readlane_b32 s35, v34, 3 +; GFX900-NEXT: v_readlane_b32 s34, v34, 2 +; GFX900-NEXT: v_readlane_b32 s31, v34, 1 +; GFX900-NEXT: v_readlane_b32 s30, v34, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v16f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:12 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:28 +; GFX950-NEXT: 
scratch_load_dword v57, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:72 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:68 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:80 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:76 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:88 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[58:59], v[2:3], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[60:61], v[4:5], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] +; GFX950-NEXT: 
scratch_load_dword v39, off, s32 offset:120 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[62:63], v[6:7], v[48:49] +; GFX950-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[56:57] +; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] +; GFX950-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_max_f64 v[56:57], v[8:9], v[46:47] +; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] +; GFX950-NEXT: s_waitcnt vmcnt(21) +; GFX950-NEXT: v_max_f64 v[46:47], v[10:11], v[44:45] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] +; GFX950-NEXT: s_waitcnt vmcnt(19) +; GFX950-NEXT: v_max_f64 v[44:45], v[12:13], v[42:43] +; GFX950-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] +; GFX950-NEXT: s_waitcnt vmcnt(17) +; GFX950-NEXT: v_max_f64 v[42:43], v[14:15], v[40:41] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] +; GFX950-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_max_f64 v[40:41], v[16:17], v[54:55] +; GFX950-NEXT: v_cndmask_b32_e64 v7, v63, v0, 
s[2:3] +; GFX950-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] +; GFX950-NEXT: s_waitcnt vmcnt(13) +; GFX950-NEXT: v_max_f64 v[54:55], v[18:19], v[52:53] +; GFX950-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] +; GFX950-NEXT: s_waitcnt vmcnt(11) +; GFX950-NEXT: v_max_f64 v[52:53], v[20:21], v[50:51] +; GFX950-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_max_f64 v[50:51], v[22:23], v[34:35] +; GFX950-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_max_f64 v[34:35], v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse +; GFX950-NEXT: 
v_accvgpr_read_b32 v43, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: v_max_f64 v[32:33], v[26:27], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_max_f64 v[32:33], v[28:29], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[32:33], v[30:31], v[48:49] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f64: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index a74043378a2598..329a85f91c2514 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -2,7 +2,8 @@ ; xUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck 
-check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -17,24 +18,24 @@ define half @v_minimum_f16(half %src0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16: ; GFX10: ; %bb.0: @@ -79,12 +80,6 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) { ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; -; GFX940-LABEL: v_minimum_f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -120,24 +115,24 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nsz: ; GFX10: ; %bb.0: @@ -182,12 +177,6 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) { ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -224,26 +213,26 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, 
v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src0: ; GFX10: ; %bb.0: @@ -291,26 +280,26 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f16__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX9-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f16__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX940-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f16__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX900-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f16__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX950-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_src1: ; GFX10: ; %bb.0: @@ -362,34 +351,34 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_min_f16_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_min_f16_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_min_f16_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_min_f16_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: s_minimum_f16: ; GFX10: ; %bb.0: @@ -456,35 +445,35 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16: ; GFX10: ; %bb.0: @@ -542,12 +531,6 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -590,35 +573,35 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; 
GFX940-LABEL: v_minimum_v2f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16__nsz: ; GFX10: ; %bb.0: @@ -676,12 +659,6 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; 
GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -729,50 +706,50 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-NEXT: s_lshr_b32 s4, s17, 16 -; GFX9-NEXT: v_pk_min_f16 v1, s16, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 -; GFX9-NEXT: s_lshr_b32 s5, s16, 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_v2f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: s_lshr_b32 s1, s1, 16 -; GFX940-NEXT: v_pk_min_f16 v1, s0, v1 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX940-NEXT: s_lshr_b32 s0, s0, 16 -; GFX940-NEXT: v_mov_b32_e32 v3, s1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, 
v2, v1, vcc -; GFX940-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_mov_b32_e32 v1, s17 +; GFX900-NEXT: s_lshr_b32 s4, s17, 16 +; GFX900-NEXT: v_pk_min_f16 v1, s16, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0 +; GFX900-NEXT: s_lshr_b32 s5, s16, 16 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: s_lshr_b32 s1, s1, 16 +; GFX950-NEXT: v_pk_min_f16 v1, s0, v1 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 +; GFX950-NEXT: s_lshr_b32 s0, s0, 16 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: @@ -850,41 +827,41 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: 
v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; 
GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16: ; GFX10: ; %bb.0: @@ -952,13 +929,6 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f16__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1007,41 +977,41 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 
v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16__nsz: ; GFX10: ; %bb.0: @@ -1109,13 +1079,6 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1171,51 +1134,51 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; 
GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16: ; GFX10: ; %bb.0: @@ -1294,13 +1257,6 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f16__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f16__nnan: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1356,51 +1312,51 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f16__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f16__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; 
GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f16__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f16__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 
src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16__nsz: ; GFX10: ; %bb.0: @@ -1479,13 +1435,6 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f16__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX940-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1561,83 +1510,83 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v8f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX9-NEXT: v_pk_min_f16 v7, v2, v6 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX9-NEXT: v_pk_min_f16 v6, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX9-NEXT: v_pk_min_f16 v5, v0, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v6, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v7, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v8, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v10, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v8f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v7, v2, v6 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX940-NEXT: v_perm_b32 v3, v3, v10, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v6, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX940-NEXT: v_perm_b32 v2, v2, v8, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v5, v0, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 
-; GFX940-NEXT: v_perm_b32 v1, v1, v7, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v6, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v8f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v8, v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX900-NEXT: v_pk_min_f16 v7, v2, v6 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX900-NEXT: v_pk_min_f16 v6, v1, v5 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX900-NEXT: v_pk_min_f16 v5, v0, v4 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v6, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v7, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v8, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s4 +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v8f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v8, v3, v7 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v7, v2, v6 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX950-NEXT: v_perm_b32 v3, v3, v10, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v6, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 +; GFX950-NEXT: v_perm_b32 v2, v2, v8, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v5, v0, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 +; GFX950-NEXT: v_perm_b32 v1, v1, v7, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v6, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f16: ; GFX10: ; %bb.0: @@ -1818,147 +1767,147 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; 
GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v16f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v16, v7, v15 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX9-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX9-NEXT: v_pk_min_f16 v14, v5, v13 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX9-NEXT: v_pk_min_f16 v13, v4, v12 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX9-NEXT: v_pk_min_f16 v12, v3, v11 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX9-NEXT: v_pk_min_f16 v11, v2, v10 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; 
GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX9-NEXT: v_pk_min_f16 v10, v1, v9 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX9-NEXT: v_pk_min_f16 v9, v0, v8 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v10, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v11, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v12, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v13, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v14, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v15, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v16, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v18, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v16f16: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_min_f16 v16, v7, v15 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX940-NEXT: v_perm_b32 v7, v7, v18, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v14, v5, v13 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX940-NEXT: v_perm_b32 v6, v6, v16, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v13, v4, v12 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX940-NEXT: v_perm_b32 v5, v5, v15, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v12, v3, v11 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX940-NEXT: v_perm_b32 v4, v4, v14, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v11, v2, v10 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX940-NEXT: v_perm_b32 v3, v3, v13, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v10, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX940-NEXT: v_perm_b32 v2, v2, v12, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX940-NEXT: 
v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: v_pk_min_f16 v9, v0, v8 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX940-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v10, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v16f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v16, v7, v15 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX900-NEXT: v_pk_min_f16 v15, v6, v14 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX900-NEXT: v_pk_min_f16 v14, v5, v13 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX900-NEXT: v_pk_min_f16 v13, v4, v12 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; 
GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX900-NEXT: v_pk_min_f16 v12, v3, v11 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX900-NEXT: v_pk_min_f16 v11, v2, v10 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX900-NEXT: v_pk_min_f16 v10, v1, v9 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX900-NEXT: v_pk_min_f16 v9, v0, v8 +; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v10, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v11, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v12, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v13, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v14, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v15, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v16, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v18, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v16f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_min_f16 v16, v7, v15 +; GFX950-NEXT: v_mov_b32_e32 v17, 0x7e00 +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 +; GFX950-NEXT: s_mov_b32 s0, 
0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v15, v6, v14 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 +; GFX950-NEXT: v_perm_b32 v7, v7, v18, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v14, v5, v13 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 +; GFX950-NEXT: v_perm_b32 v6, v6, v16, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v13, v4, v12 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 +; GFX950-NEXT: v_perm_b32 v5, v5, v15, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v12, v3, v11 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 +; GFX950-NEXT: v_perm_b32 v4, v4, v14, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v11, v2, v10 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, 
v2, v10 +; GFX950-NEXT: v_perm_b32 v3, v3, v13, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v10, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 +; GFX950-NEXT: v_perm_b32 v2, v2, v12, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_pk_min_f16 v9, v0, v8 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc +; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 +; GFX950-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc +; GFX950-NEXT: v_perm_b32 v0, v0, v10, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f16: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 2b3041290b5866..2614fb3bf9f737 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck 
-check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -26,24 +27,24 @@ define float @v_minimum_f32(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32: ; GFX10: ; %bb.0: @@ -94,12 +95,6 @@ 
define float @v_minimum_f32__nnan(float %src0, float %src1) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -144,24 +139,24 @@ define float @v_minimum_f32__nsz(float %src0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32__nsz: 
; GFX10: ; %bb.0: @@ -212,12 +207,6 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) { ; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -264,26 +253,26 @@ define float @v_minimum_f32__nnan_src0(float %arg0, float %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX950-NEXT: v_min_f32_e32 
v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32__nnan_src0: ; GFX10: ; %bb.0: @@ -341,26 +330,26 @@ define float @v_minimum_f32__nnan_src1(float %src0, float %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f32__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f32__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f32__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX900-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f32__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX950-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_minimum_f32__nnan_src1: ; GFX10: ; %bb.0: @@ -424,32 +413,32 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s17 -; GFX9-NEXT: v_min_f32_e32 v1, s16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_min_f32_e32 v1, s0, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s17 +; GFX900-NEXT: v_min_f32_e32 v1, s16, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s1 +; GFX950-NEXT: v_min_f32_e32 v1, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 
s[30:31] ; ; GFX10-LABEL: s_minimum_f32: ; GFX10: ; %bb.0: @@ -517,31 +506,31 @@ define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 
v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32: ; GFX10: ; %bb.0: @@ -601,13 +590,6 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -660,31 +642,31 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v2, v1, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 +; GFX950-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32__nsz: ; GFX10: ; %bb.0: @@ -744,13 +726,6 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -813,40 +788,40 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s19 -; GFX9-NEXT: v_min_f32_e32 v1, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: 
v_min_f32_e32 v3, s16, v0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_v2f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_min_f32_e32 v1, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_min_f32_e32 v3, s0, v0 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s19 +; GFX900-NEXT: v_min_f32_e32 v1, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0 +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_min_f32_e32 v3, s16, v0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_v2f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s3 +; GFX950-NEXT: v_min_f32_e32 v1, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, s2 +; GFX950-NEXT: v_min_f32_e32 v3, s0, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: 
v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_v2f32: ; GFX10: ; %bb.0: @@ -927,38 +902,38 @@ define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, 
v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32: ; GFX10: ; %bb.0: @@ -1028,14 +1003,6 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1097,38 +1064,38 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: 
v_min_f32_e32 v3, v1, v4 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v6, v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v3, v1, v4 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_min_f32_e32 v3, v2, v5 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX900-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v6, v0, v3 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 +; GFX950-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 +; GFX950-NEXT: s_nop 1 +; 
GFX950-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX950-NEXT: v_min_f32_e32 v3, v2, v5 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32__nsz: ; GFX10: ; %bb.0: @@ -1198,14 +1165,6 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr ; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1273,45 +1232,45 @@ define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 
-; GFX940-NEXT: v_min_f32_e32 v4, v1, v5 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v2, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v3, v7 +; 
GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32: ; GFX10: ; %bb.0: @@ -1391,15 +1350,6 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f32__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1469,45 +1419,45 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f32__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v1, v5 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v2, v6 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f32__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v8, v0, v4 -; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: v_min_f32_e32 v4, v1, v5 -; 
GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v2, v6 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: v_min_f32_e32 v4, v3, v7 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f32__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX900-NEXT: v_min_f32_e32 v4, v3, v7 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f32__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX950-NEXT: v_mov_b32_e32 v9, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX950-NEXT: v_min_f32_e32 v4, v3, v7 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, 
v3, v7 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32__nsz: ; GFX10: ; %bb.0: @@ -1587,15 +1537,6 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f32__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX940-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX940-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1689,73 +1630,73 @@ define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v8f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v16, v0, v8 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v1, v9 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v2, v10 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v3, v11 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v4, v12 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v5, v13 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v6, v14 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; 
GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX9-NEXT: v_min_f32_e32 v8, v7, v15 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v8f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v16, v0, v8 -; GFX940-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX940-NEXT: v_min_f32_e32 v8, v1, v9 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v2, v10 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v3, v11 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v4, v12 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v5, v13 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v6, v14 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc -; GFX940-NEXT: v_min_f32_e32 v8, v7, v15 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v8f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v16, v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; 
GFX900-NEXT: v_min_f32_e32 v8, v1, v9 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v2, v10 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v3, v11 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v4, v12 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v5, v13 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v6, v14 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX900-NEXT: v_min_f32_e32 v8, v7, v15 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v8f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v16, v0, v8 +; GFX950-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 +; GFX950-NEXT: v_min_f32_e32 v8, v1, v9 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v2, v10 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v3, v11 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v4, v12 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, 
v5, v13 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v6, v14 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX950-NEXT: v_min_f32_e32 v8, v7, v15 +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f32: ; GFX10: ; %bb.0: @@ -1968,136 +1909,136 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v16f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v31, s30, 0 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX9-NEXT: v_min_f32_e32 v18, v13, v29 -; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 -; GFX9-NEXT: v_writelane_b32 v31, s31, 1 -; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v8, 
v24 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_min_f32_e32 v19, v14, v30 -; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] -; GFX9-NEXT: v_readlane_b32 s31, v31, 1 -; GFX9-NEXT: v_readlane_b32 s30, v31, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v18, v15, v16 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v16f32: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_mov_b32_e32 v32, 
0x7fc00000 -; GFX940-NEXT: v_min_f32_e32 v33, v0, v16 -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX940-NEXT: v_min_f32_e32 v34, v1, v17 -; GFX940-NEXT: v_min_f32_e32 v35, v2, v18 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX940-NEXT: v_min_f32_e32 v36, v3, v19 -; GFX940-NEXT: v_min_f32_e32 v37, v4, v20 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX940-NEXT: v_min_f32_e32 v38, v5, v21 -; GFX940-NEXT: v_min_f32_e32 v39, v6, v22 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX940-NEXT: v_min_f32_e32 v48, v7, v23 -; GFX940-NEXT: v_min_f32_e32 v49, v8, v24 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX940-NEXT: v_min_f32_e32 v50, v9, v25 -; GFX940-NEXT: v_min_f32_e32 v51, v10, v26 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX940-NEXT: v_min_f32_e32 v52, v11, v27 -; GFX940-NEXT: v_min_f32_e32 v53, v12, v28 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX940-NEXT: v_min_f32_e32 v54, v13, v29 -; GFX940-NEXT: v_min_f32_e32 v55, v14, v30 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_min_f32_e32 v16, v15, v31 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v32, v52, 
vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc -; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v16f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v31, s30, 0 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX900-NEXT: v_min_f32_e32 v18, v13, v29 +; GFX900-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX900-NEXT: v_writelane_b32 v31, s31, 1 +; GFX900-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX900-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX900-NEXT: v_min_f32_e32 
v9, v9, v25 +; GFX900-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX900-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX900-NEXT: v_readlane_b32 s31, v31, 1 +; GFX900-NEXT: v_readlane_b32 s30, v31, 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_min_f32_e32 v18, v15, v16 +; GFX900-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v16f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v32, 0x7fc00000 +; GFX950-NEXT: v_min_f32_e32 v33, v0, v16 +; 
GFX950-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 +; GFX950-NEXT: v_min_f32_e32 v34, v1, v17 +; GFX950-NEXT: v_min_f32_e32 v35, v2, v18 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 +; GFX950-NEXT: v_min_f32_e32 v36, v3, v19 +; GFX950-NEXT: v_min_f32_e32 v37, v4, v20 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 +; GFX950-NEXT: v_min_f32_e32 v38, v5, v21 +; GFX950-NEXT: v_min_f32_e32 v39, v6, v22 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 +; GFX950-NEXT: v_min_f32_e32 v48, v7, v23 +; GFX950-NEXT: v_min_f32_e32 v49, v8, v24 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 +; GFX950-NEXT: v_min_f32_e32 v50, v9, v25 +; GFX950-NEXT: v_min_f32_e32 v51, v10, v26 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 +; GFX950-NEXT: v_min_f32_e32 v52, v11, v27 +; GFX950-NEXT: v_min_f32_e32 v53, v12, v28 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 +; GFX950-NEXT: v_min_f32_e32 v54, v13, v29 +; GFX950-NEXT: v_min_f32_e32 v55, v14, v30 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f32_e32 v16, v15, v31 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v32, v52, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 +; 
GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc +; GFX950-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f32: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 567582c9f58ff2..71fdd691a15122 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GCN,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s @@ -28,26 +29,26 @@ define double @v_minimum_f64(double %src0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] 
-; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64: ; GFX10: ; %bb.0: @@ -100,12 +101,6 @@ define double @v_minimum_f64__nnan(double %src0, double %src1) { ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -152,26 +147,26 @@ 
define double @v_minimum_f64__nsz(double %src0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64__nsz: ; GFX10: ; %bb.0: @@ -224,12 +219,6 @@ define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) { ; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX940-LABEL: v_minimum_f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -278,28 +267,28 @@ define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64__nnan_src0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64__nnan_src0: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64__nnan_src0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64__nnan_src0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], 
v[0:1], 1.0 +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64__nnan_src0: ; GFX10: ; %bb.0: @@ -362,28 +351,28 @@ define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) { ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_f64__nnan_src1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_f64__nnan_src1: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; GFX940-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX940-NEXT: v_mov_b32_e32 v1, 0x7ff80000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_f64__nnan_src1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX900-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_f64__nnan_src1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX950-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f64__nnan_src1: ; GFX10: ; %bb.0: @@ -454,35 +443,35 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) { ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s18 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:1] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:1] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s18 +; GFX900-NEXT: v_mov_b32_e32 v1, s19 +; GFX900-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; 
GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_f64: ; GFX10: ; %bb.0: @@ -555,35 +544,35 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, 
vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64: ; GFX10: ; %bb.0: @@ -648,13 +637,6 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src ; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f64__nnan: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -712,35 +694,35 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v2f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; 
GFX900-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64__nsz: ; GFX10: ; %bb.0: @@ -805,13 +787,6 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> ; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v2f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -883,46 +858,46 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_minimum_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s22 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] -; GFX9-NEXT: v_min_f64 v[0:1], s[16:17], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v6, 
0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v[0:3] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: s_minimum_v2f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[18:19] -; GFX940-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[16:17] -; GFX940-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc -; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v[0:3] -; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_minimum_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s22 +; GFX900-NEXT: v_mov_b32_e32 v4, s20 +; GFX900-NEXT: v_mov_b32_e32 v1, s23 +; GFX900-NEXT: v_mov_b32_e32 v5, s21 +; GFX900-NEXT: v_min_f64 v[2:3], s[18:19], v[0:1] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1] +; GFX900-NEXT: v_min_f64 v[0:1], s[16:17], v[4:5] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[4:5] +; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: s_minimum_v2f64: +; 
GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_v2f64: ; GFX10: ; %bb.0: @@ -1012,44 +987,44 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v3f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; 
GFX940-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; 
GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64: ; GFX10: ; %bb.0: @@ -1125,14 +1100,6 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src ; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1201,44 +1168,44 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: s_setpc_b64 s[30:31] 
-; -; GFX940-LABEL: v_minimum_v3f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc -; GFX940-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX900-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX900-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX950-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX950-NEXT: s_nop 
0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX950-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64__nsz: ; GFX10: ; %bb.0: @@ -1314,14 +1281,6 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> ; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v3f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1398,53 +1357,53 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 
0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; 
GFX900-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64: ; GFX10: ; %bb.0: @@ -1532,15 +1491,6 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src ; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: 
v_minimum_v4f64__nnan: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1620,53 +1570,53 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1 ; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f64__nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v4f64__nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc -; 
GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc -; GFX940-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f64__nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX900-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX900-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX900-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f64__nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; 
GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX950-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX950-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX950-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX950-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64__nsz: ; GFX10: ; %bb.0: @@ -1754,15 +1704,6 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> ; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX940-LABEL: v_minimum_v4f64__nnan_nsz: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] -; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] -; GFX940-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] -; GFX940-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: v_minimum_v4f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1878,89 +1819,89 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v8f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 -; GFX9-NEXT: v_min_f64 v[32:33], v[2:3], v[18:19] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX9-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[16:17] -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] -; GFX9-NEXT: v_mov_b32_e32 v34, 0x7ff80000 -; GFX9-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX9-NEXT: v_min_f64 v[16:17], v[8:9], v[24:25] -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] -; GFX9-NEXT: v_min_f64 v[22:23], v[10:11], v[26:27] -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] -; GFX9-NEXT: v_min_f64 v[24:25], v[12:13], v[28:29] -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_min_f64 v[18:19], v[14:15], v[30:31] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v8f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: 
v_mov_b32_e32 v54, 0x7ff80000 -; GFX940-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] -; GFX940-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] -; GFX940-NEXT: v_min_f64 v[36:37], v[4:5], v[20:21] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] -; GFX940-NEXT: v_min_f64 v[38:39], v[6:7], v[22:23] -; GFX940-NEXT: v_min_f64 v[48:49], v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] -; GFX940-NEXT: v_min_f64 v[50:51], v[10:11], v[26:27] -; GFX940-NEXT: v_min_f64 v[52:53], v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v8f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: v_min_f64 v[32:33], v[2:3], v[18:19] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX900-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX900-NEXT: v_min_f64 v[2:3], v[0:1], v[16:17] +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX900-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX900-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX900-NEXT: v_min_f64 v[16:17], v[8:9], v[24:25] +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX900-NEXT: v_min_f64 v[22:23], v[10:11], v[26:27] +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX900-NEXT: v_min_f64 v[24:25], v[12:13], v[28:29] +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_min_f64 v[18:19], v[14:15], v[30:31] +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v8f64: +; GFX950: ; %bb.0: +; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_mov_b32_e32 v54, 0x7ff80000 +; GFX950-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] +; GFX950-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] +; GFX950-NEXT: v_min_f64 v[36:37], v[4:5], v[20:21] +; GFX950-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX950-NEXT: v_min_f64 v[38:39], v[6:7], v[22:23] +; GFX950-NEXT: v_min_f64 v[48:49], v[8:9], v[24:25] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] +; GFX950-NEXT: v_min_f64 v[50:51], v[10:11], v[26:27] +; GFX950-NEXT: v_min_f64 v[52:53], v[12:13], v[28:29] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc +; GFX950-NEXT: s_setpc_b64 
s[30:31] ; ; GFX10-LABEL: v_minimum_v8f64: ; GFX10: ; %bb.0: @@ -2332,295 +2273,295 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v16f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_writelane_b32 v34, s30, 0 -; GFX9-NEXT: v_writelane_b32 v34, s31, 1 -; GFX9-NEXT: v_writelane_b32 v34, s34, 2 -; GFX9-NEXT: v_writelane_b32 v34, s35, 3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] -; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] -; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] -; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] -; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:40 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] -; GFX9-NEXT: v_min_f64 v[8:9], v[8:9], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] -; GFX9-NEXT: v_min_f64 v[10:11], v[10:11], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] -; GFX9-NEXT: v_min_f64 v[12:13], v[12:13], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] -; GFX9-NEXT: v_min_f64 v[14:15], v[14:15], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] -; GFX9-NEXT: v_min_f64 v[16:17], v[16:17], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] -; GFX9-NEXT: v_min_f64 v[18:19], v[18:19], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX9-NEXT: v_cndmask_b32_e64 v18, 
v18, 0, s[20:21] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] -; GFX9-NEXT: v_min_f64 v[20:21], v[20:21], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] -; GFX9-NEXT: v_min_f64 v[22:23], v[22:23], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] -; GFX9-NEXT: v_min_f64 v[24:25], v[24:25], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] -; GFX9-NEXT: v_min_f64 v[26:27], v[26:27], v[31:32] -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] -; GFX9-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] -; GFX9-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] -; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v32, 
s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] -; GFX9-NEXT: v_readlane_b32 s35, v34, 3 -; GFX9-NEXT: v_readlane_b32 s34, v34, 2 -; GFX9-NEXT: v_readlane_b32 s31, v34, 1 -; GFX9-NEXT: v_readlane_b32 s30, v34, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: v_minimum_v16f64: -; GFX940: ; %bb.0: -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:16 
-; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:12 -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:24 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:20 -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:32 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:28 -; GFX940-NEXT: scratch_load_dword v57, off, s32 offset:8 -; GFX940-NEXT: scratch_load_dword v56, off, s32 offset:4 -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:40 -; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:36 -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:48 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:44 -; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:56 -; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:52 -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 -; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 -; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:72 -; GFX940-NEXT: scratch_load_dword v54, off, s32 offset:68 -; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:80 -; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:76 -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 -; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:96 -; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:92 -; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:104 -; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:100 -; GFX940-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[58:59], v[2:3], v[36:37] -; GFX940-NEXT: 
v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] -; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:112 -; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[60:61], v[4:5], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:120 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:116 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[62:63], v[6:7], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:128 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:124 -; GFX940-NEXT: s_waitcnt vmcnt(25) -; GFX940-NEXT: v_min_f64 v[2:3], v[0:1], v[56:57] -; GFX940-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] -; GFX940-NEXT: v_mov_b32_e32 v0, 0x7ff80000 -; GFX940-NEXT: s_waitcnt vmcnt(23) -; GFX940-NEXT: v_min_f64 v[56:57], v[8:9], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] -; GFX940-NEXT: v_accvgpr_write_b32 a0, v1 -; GFX940-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] -; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_min_f64 v[46:47], v[10:11], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] -; GFX940-NEXT: s_waitcnt vmcnt(19) -; GFX940-NEXT: v_min_f64 v[44:45], v[12:13], v[42:43] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] -; GFX940-NEXT: s_waitcnt vmcnt(17) -; GFX940-NEXT: v_min_f64 v[42:43], v[14:15], v[40:41] -; GFX940-NEXT: 
v_cndmask_b32_e64 v6, v62, 0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] -; GFX940-NEXT: s_waitcnt vmcnt(15) -; GFX940-NEXT: v_min_f64 v[40:41], v[16:17], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] -; GFX940-NEXT: s_waitcnt vmcnt(13) -; GFX940-NEXT: v_min_f64 v[54:55], v[18:19], v[52:53] -; GFX940-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] -; GFX940-NEXT: s_waitcnt vmcnt(11) -; GFX940-NEXT: v_min_f64 v[52:53], v[20:21], v[50:51] -; GFX940-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] -; GFX940-NEXT: s_waitcnt vmcnt(9) -; GFX940-NEXT: v_min_f64 v[50:51], v[22:23], v[34:35] -; GFX940-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] -; GFX940-NEXT: s_waitcnt vmcnt(6) -; GFX940-NEXT: v_min_f64 v[34:35], v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] -; GFX940-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v57, a10 
; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse -; GFX940-NEXT: s_waitcnt vmcnt(4) -; GFX940-NEXT: v_min_f64 v[32:33], v[26:27], v[36:37] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(2) -; GFX940-NEXT: v_min_f64 v[32:33], v[28:29], v[38:39] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_min_f64 v[32:33], v[30:31], v[48:49] -; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc -; GFX940-NEXT: v_accvgpr_read_b32 v0, a0 -; GFX940-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v16f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_writelane_b32 v34, s30, 0 +; GFX900-NEXT: v_writelane_b32 v34, s31, 1 +; GFX900-NEXT: v_writelane_b32 v34, s34, 2 +; GFX900-NEXT: 
v_writelane_b32 v34, s35, 3 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX900-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX900-NEXT: v_min_f64 v[2:3], v[2:3], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX900-NEXT: v_min_f64 v[4:5], v[4:5], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX900-NEXT: v_min_f64 v[6:7], v[6:7], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX900-NEXT: v_min_f64 v[8:9], v[8:9], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX900-NEXT: v_min_f64 v[10:11], v[10:11], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX900-NEXT: v_min_f64 v[12:13], v[12:13], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX900-NEXT: v_min_f64 v[14:15], v[14:15], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX900-NEXT: v_min_f64 v[16:17], v[16:17], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX900-NEXT: v_min_f64 v[18:19], v[18:19], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX900-NEXT: v_min_f64 v[20:21], v[20:21], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX900-NEXT: v_min_f64 v[22:23], v[22:23], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX900-NEXT: 
v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX900-NEXT: v_min_f64 v[24:25], v[24:25], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX900-NEXT: v_min_f64 v[26:27], v[26:27], v[31:32] +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX900-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX900-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] +; GFX900-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX900-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX900-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; 
GFX900-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX900-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX900-NEXT: v_readlane_b32 s35, v34, 3 +; GFX900-NEXT: v_readlane_b32 s34, v34, 2 +; GFX900-NEXT: v_readlane_b32 s31, v34, 1 +; GFX900-NEXT: v_readlane_b32 s30, v34, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v16f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:16 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:12 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:36 +; 
GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:72 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:68 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:80 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:76 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:88 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:84 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:96 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:92 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:100 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[58:59], v[2:3], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:108 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[60:61], v[4:5], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:120 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[62:63], v[6:7], v[48:49] +; GFX950-NEXT: 
v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:128 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[56:57] +; GFX950-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] +; GFX950-NEXT: v_mov_b32_e32 v0, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_min_f64 v[56:57], v[8:9], v[46:47] +; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v3, v0, s[4:5] +; GFX950-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] +; GFX950-NEXT: s_waitcnt vmcnt(21) +; GFX950-NEXT: v_min_f64 v[46:47], v[10:11], v[44:45] +; GFX950-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] +; GFX950-NEXT: s_waitcnt vmcnt(19) +; GFX950-NEXT: v_min_f64 v[44:45], v[12:13], v[42:43] +; GFX950-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] +; GFX950-NEXT: s_waitcnt vmcnt(17) +; GFX950-NEXT: v_min_f64 v[42:43], v[14:15], v[40:41] +; GFX950-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] +; GFX950-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_min_f64 v[40:41], v[16:17], v[54:55] +; GFX950-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] +; GFX950-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] +; GFX950-NEXT: s_waitcnt vmcnt(13) +; 
GFX950-NEXT: v_min_f64 v[54:55], v[18:19], v[52:53] +; GFX950-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] +; GFX950-NEXT: s_waitcnt vmcnt(11) +; GFX950-NEXT: v_min_f64 v[52:53], v[20:21], v[50:51] +; GFX950-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_min_f64 v[50:51], v[22:23], v[34:35] +; GFX950-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_min_f64 v[34:35], v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] +; GFX950-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e64 v24, v34, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse +; 
GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: v_min_f64 v[32:33], v[26:27], v[36:37] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_min_f64 v[32:33], v[28:29], v[38:39] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[32:33], v[30:31], v[48:49] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f64: ; GFX10: ; %bb.0: diff --git a/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s b/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s index fde3d2057b2ad1..d3ca4281dca414 100644 --- a/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s +++ b/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s @@ -1,4 +1,5 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck -check-prefix=GFX940 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX940 %s scratch_load_dword a2, v4, s6 // GFX940: scratch_load_dword a2, v4, s6 ; encoding: [0x00,0x60,0x50,0xdc,0x04,0x00,0x86,0x02] diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s index e208b6cf903d38..e2e84f27b828a4 100644 --- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s @@ -1,4 +1,5 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX940 
--strict-whitespace %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX90A --implicit-check-not=error: %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX10 --implicit-check-not=error: %s diff --git a/llvm/test/MC/AMDGPU/gfx950-unsupported.s b/llvm/test/MC/AMDGPU/gfx950-unsupported.s new file mode 100644 index 00000000000000..f8bbd40b700fd8 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950-unsupported.s @@ -0,0 +1,179 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck -check-prefix=ERR %s + +//===----------------------------------------------------------------------===// +// v_mfma_f32_32x32x4_xf32 +//===----------------------------------------------------------------------===// + +v_mfma_f32_32x32x4_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction 
not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + + +//===----------------------------------------------------------------------===// +// v_mfma_f32_16x16x8_xf32 +//===----------------------------------------------------------------------===// + +v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: 
instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + + +v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + 
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7] +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt new file mode 100644 index 00000000000000..0697ee8661e76d --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx950 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX950 %s + +# GFX950: warning: invalid instruction encoding +0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04 + +# GFX950: warning: invalid instruction encoding +0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04 + +# GFX950: warning: invalid instruction encoding +0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04 + +# GFX950: warning: invalid instruction encoding +0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04 \ No newline at end of file diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt index 9575e50f16312f..63e425fdb4ec96 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt @@ -1,4 +1,5 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 
-disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s # GFX940: global_load_dword v2, v[2:3], off sc0 ; encoding: [0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02] 0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02 diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml index 9c79ea588f6247..416419b3a333f8 100644 --- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml +++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml @@ -162,6 +162,10 @@ # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX942 %s # RUN: obj2yaml %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX942 %s +# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX950/' %s | yaml2obj -o %t.o.AMDGCN_GFX950 +# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX950 %s +# RUN: obj2yaml %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX950 %s + # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX1010/' %s | yaml2obj -o %t.o.AMDGCN_GFX1010 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1010 %s # RUN: obj2yaml %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1010 %s @@ -411,6 +415,9 @@ # ELF-AMDGCN-GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C) # YAML-AMDGCN-GFX942: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX942 ] +# ELF-AMDGCN-GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F) +# YAML-AMDGCN-GFX950: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX950 ] + # ELF-AMDGCN-GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) # YAML-AMDGCN-GFX1010: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1010 ] diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll index 45071ecb751321..475f6f6d8322c7 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll @@ -146,6 +146,11 @@ define amdgpu_kernel void @test_kernel() { ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-generic -filetype=obj -O0 -o %t.o %s ; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-generic %t.o > %t-specify.txt ; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt +; RUN: diff %t-specify.txt %t-detect.txt + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=obj -O0 -o %t.o %s +; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx950 %t.o > %t-specify.txt +; RUN: llvm-objdump -D %t.o > %t-detect.txt ; RUN: diff %t-specify.txt %t-detect.txt ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj -O0 -o %t.o %s diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test index 34c22dca3aa183..7de64a6edfe2e6 100644 --- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test @@ -223,6 +223,15 @@ # RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942 -DFLAG_VALUE=0x4C +# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F + +# RUN: yaml2obj %s -o %t -DABI_VERSION=1 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG
--match-full-lines -DABI_VERSION=1 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F + +# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F + # RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010 -DFLAG_VALUE=0x33 diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 1012cd020d525e..6360a169cbeda9 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1617,6 +1617,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = { ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \ + ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \ diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 96cb79b7d071c5..c76ad018ab4fe7 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -43,7 +43,7 @@ set(include_directory ${devicertl_base_directory}/include) set(source_directory ${devicertl_base_directory}/src) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010" + "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx950;gfx1010" "gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035" "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150" "gfx1151;gfx1152;gfx1153")