From 75d820dcdd868b8ab6c12ff62f90216ad377ce20 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer <smeijer@nvidia.com>
Date: Thu, 11 Jan 2024 09:46:47 +0000
Subject: [PATCH] [AArch64] MI Scheduler: create more LDP/STP pairs (#77565)

Target hook `canPairLdStOpc` is missing quite a few opcodes for which
LDPs/STPs can be created. I was hoping that it would not be necessary to
add these missing opcodes here and that the attached motivating test
case would be handled by the LoadStoreOptimiser (especially after
#71908), but it's not. The problem is that after register allocation
some things are a lot harder to do. Consider the motivating
example:

```
[1] renamable $q1 = LDURQi renamable $x9, -16 :: (load (s128) from %ir.r51, align 8, !tbaa !0)
[2] renamable $q2 = LDURQi renamable $x0, -16 :: (load (s128) from %ir.r53, align 8, !tbaa !4)
[3] renamable $q1 = nnan ninf nsz arcp contract afn reassoc nofpexcept FMLSv2f64 killed renamable $q1(tied-def 0), killed renamable $q2, renamable $q0, implicit $fpcr
[4] STURQi killed renamable $q1, renamable $x9, -16 :: (store (s128) into %ir.r51, align 1, !tbaa !0)
[5] renamable $q1 = LDRQui renamable $x9, 0 :: (load (s128) from %ir.r.G0001_609.0, align 8, !tbaa !0)
```
We can't combine the load on line [5] into the load on line [1]:
register q1 is used in between. And we can't combine [1] into
[5]: it aliases with the store (STURQi) on line [4].

So, adding some missing opcodes here seems the best/easiest approach.
I will follow up to add some more missing cases here.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  3 ++
 .../test/CodeGen/AArch64/arm64-ldp-cluster.ll | 41 +++++++++++++++++++
 .../CodeGen/AArch64/machine-combiner-copy.ll  |  8 ++--
 3 files changed, 48 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1cfbf4737a6f..42b7a6418032 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4214,6 +4214,9 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
   switch (FirstOpc) {
   default:
     return false;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
   case AArch64::LDRWui:
   case AArch64::LDURWi:
     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 4fa34e846b20..83f86d1c3a7c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -113,3 +113,44 @@ define <2 x i64> @ldq_cluster(ptr %p) {
   %res  = mul nsw <2 x i64> %tmp2, %tmp3
   ret <2 x i64> %res
 }
+
+; Test LDURQi / LDRQui clustering
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: LDURQi_LDRQui:%bb.1 vector_body
+;
+; CHECK: Cluster ld/st SU(0) - SU(4)
+; CHECK: Cluster ld/st SU(1) - SU(5)
+;
+; CHECK: SU(0): %{{[0-9]+}}:fpr128 = LDURQi
+; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDURQi
+; CHECK: SU(4): %{{[0-9]+}}:fpr128 = LDRQui
+; CHECK: SU(5): %{{[0-9]+}}:fpr128 = LDRQui
+;
+define void @LDURQi_LDRQui(ptr nocapture readonly %arg) {
+entry:
+  br label %vector_body
+vector_body:
+  %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
+  %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
+  %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
+  %r51 = getelementptr i8, ptr %phi1, i64 -16
+  %r52 = load <2 x double>, ptr %r51, align 8
+  %r53 = getelementptr i8, ptr %phi2, i64 -16
+  %r54 = load <2 x double>, ptr %r53, align 8
+  %r55 = fmul fast <2 x double> %r54, <double 3.0, double 4.0>
+  %r56 = fsub fast <2 x double> %r52, %r55
+  store <2 x double> %r56, ptr %r51, align 1
+  %r57 = load <2 x double>, ptr %phi1, align 8
+  %r58 = load <2 x double>, ptr %phi2, align 8
+  %r59 = fmul fast <2 x double> %r58,<double 3.0, double 4.0>
+  %r60 = fsub fast <2 x double> %r57, %r59
+  store <2 x double> %r60, ptr %phi1, align 1
+  %r61 = add i32 %phi3, 4
+  %r62 = getelementptr i8, ptr %phi2, i64 32
+  %r63 = getelementptr i8, ptr %phi1, i64 32
+  %r.not = icmp eq i32 %r61, 0
+  br i1 %r.not, label %exit, label %vector_body
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 15a484d11b0a..4c8e589391c3 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -20,13 +20,13 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
 ; CHECK-NEXT:    mov x12, x9
 ; CHECK-NEXT:  .LBB0_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q1, q3, [x11, #-16]
+; CHECK-NEXT:    ldp q1, q4, [x10, #-16]
 ; CHECK-NEXT:    subs x12, x12, #16
-; CHECK-NEXT:    ldp q2, q4, [x10, #-16]
+; CHECK-NEXT:    ldp q2, q3, [x11, #-16]
 ; CHECK-NEXT:    add x11, x11, #32
-; CHECK-NEXT:    fmla v2.8h, v1.8h, v0.h[0]
+; CHECK-NEXT:    fmla v1.8h, v2.8h, v0.h[0]
 ; CHECK-NEXT:    fmla v4.8h, v3.8h, v0.h[0]
-; CHECK-NEXT:    stp q2, q4, [x10, #-16]
+; CHECK-NEXT:    stp q1, q4, [x10, #-16]
 ; CHECK-NEXT:    add x10, x10, #32
 ; CHECK-NEXT:    b.ne .LBB0_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block