Skip to content

Commit

Permalink
[AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. (llvm…
Browse files Browse the repository at this point in the history
…#111538)

Introduce a description of late forwarding to the Neoverse-V1 Scheduling model.
  • Loading branch information
Rin18 authored Oct 11, 2024
1 parent b5ea5be commit 303c8d2
Show file tree
Hide file tree
Showing 3 changed files with 1,645 additions and 121 deletions.
207 changes: 155 additions & 52 deletions llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,89 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
V1UnitV, V1UnitV, V1UnitV,
V1UnitV, V1UnitV, V1UnitV]>;

//===----------------------------------------------------------------------===//
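// Define forwarded types: SchedWriteRes / SchedReadAdvance pairs that describe
// late forwarding of a result into a dependent instruction's operand.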

// Integer multiply and multiply-accumulate.
//
// NOTE(review): SOG, p. 20, n. 2 appears to say that accumulator forwarding is
// not supported for consumers of 64 bit multiply high operations. The original
// note was phrased as an open question -- confirm against the SOG.

// Write selected when NeoverseMULIdiomPred holds: 2 cycles on V1UnitM.
def V1Wr_IM : SchedWriteRes<[V1UnitM]> { let Latency = 2; }
// Write selected otherwise: 2 cycles on V1UnitM0. This is the only producer
// that V1Rd_IMA accepts forwarding from.
def V1Wr_IMA : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
// Picks one of the two writes above per instruction. NoSchedPred is the
// fallback variant. NeoverseMULIdiomPred is defined outside this section;
// presumably it matches the plain-multiply idiom -- TODO confirm.
def V1WriteIM : SchedWriteVariant<
[SchedVar<NeoverseMULIdiomPred, [V1Wr_IM]>,
SchedVar<NoSchedPred, [V1Wr_IMA]>]>;
// Operand read advanced by 1 cycle, but only when the producing write is
// V1Wr_IMA; reads fed by any other write get no advance.
def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>;

// Scalar FP multiply accumulate: 4 cycles on V1UnitV.
def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
// Operand read advanced by 2 cycles when the producer is either a WriteFMul
// or another V1Wr_FMA.
def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>;

// ASIMD accumulating operations. Each pair below is a write (latency and
// issue resource) plus a read whose advance applies only when the producer is
// that same write, i.e. when instructions of the same class are chained.

// ASIMD absolute diff accum (long), pairwise add and accumulate long.
def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>;

// ASIMD dot product.
def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>;

// ASIMD matrix multiply-accumulate.
def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>;

// ASIMD multiply accumulate.
def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>;

// ASIMD multiply accumulate long.
def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>;

// ASIMD shift accumulate.
def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>;

// ASIMD FP complex multiply add.
def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>;

// ASIMD FP multiply (3 cycles) and FP multiply accumulate (4 cycles).
def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
// Advance of 2 cycles from either an FP multiply or an FP multiply accumulate
// producer.
def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>;

// ASIMD FP multiply accumulate long: 5 cycles, advance of 3 from the same
// write only.
def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>;

// ASIMD BFloat16 operations. Every read below advances by 2 cycles and only
// for a producer of the same write.

// BF16 dot product.
def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>;

// BF16 matrix multiply accumulate.
def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>;

// BF16 multiply accumulate long.
def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>;

// CRC checksum ops: 2 cycles on V1UnitM0, with a 1-cycle advance when the
// producer is another CRC write.
def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>;

// SVE accumulating operations. As with the ASIMD types, each read advances
// only for a producer of the matching write.

// SVE dot product, 8 bit.
def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>;

// SVE dot product, 8 bit, using signed and unsigned integers.
def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>;

// SVE dot product, 16 bit.
def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; }
def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>;

// SVE matrix multiply-accumulate.
def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>;

// SVE multiply accumulate, D element size: two micro-ops, both on V1UnitV0.
// The brace-less `let ... in` applies only to the single def that follows it
// (V1Wr_ZMAD), not to V1Rd_ZMAD.
let Latency = 5, NumMicroOps = 2 in
def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>;
def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>;

// SVE floating point complex multiply add.
def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>;

// SVE floating point multiply accumulate.
def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>;

// SVE BFloat16: dot product, matrix multiply accumulate, and multiply
// accumulate long, in that order.
def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>;
def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>;
def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>;

// Miscellaneous Instructions
// -----------------------------------------------------------------------------
Expand Down Expand Up @@ -553,16 +636,19 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>;
def : SchedAlias<WriteID32, V1Write_12c5_1M0>;
def : SchedAlias<WriteID64, V1Write_20c5_1M0>;

def : SchedAlias<WriteIM32, V1Write_2c_1M>;
def : SchedAlias<WriteIM64, V1Write_2c_1M>;

// Multiply
// Multiply accumulate
// Multiply accumulate, long
// Multiply long
def V1WriteIM : SchedWriteVariant<
[SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
SchedVar<NoSchedPred, [V1Write_2c_1M0]>]>;
def : SchedAlias<WriteIM32, V1WriteIM>;
def : SchedAlias<WriteIM64, V1WriteIM>;
// Multiply accumulate, W-form
// Multiply accumulate, X-form
def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA],
(instregex "^M(ADD|SUB)[WX]rrr$")>;

// Multiply accumulate long
// Multiply long
def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA],
(instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
// Multiply high
def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;

Expand Down Expand Up @@ -680,10 +766,11 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>;
def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>;

// FP multiply
def : SchedAlias<WriteFMul, V1Write_3c_1V>;
def : WriteRes<WriteFMul, [V1UnitV]> { let Latency = 3; }

// FP multiply accumulate
def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA],
(instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;

// FP round to integral
def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
Expand Down Expand Up @@ -824,7 +911,7 @@ def : SchedAlias<WriteVq, V1Write_2c_1V>;
// ASIMD absolute diff accum
// ASIMD absolute diff accum long
// ASIMD pairwise add and accumulate long
def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;

// ASIMD arith, reduce, 4H/4S
// ASIMD max/min, reduce, 4H/4S
Expand All @@ -843,23 +930,26 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",

// ASIMD dot product
// ASIMD dot product using signed and unsigned integers
def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
def : InstRW<[V1Wr_VDOT, V1Rd_VDOT],
(instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;

// ASIMD matrix multiply- accumulate
def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
// ASIMD matrix multiply-accumulate
def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;

// ASIMD multiply
def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;

// ASIMD multiply accumulate
def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>;

// ASIMD multiply accumulate long
def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;

// ASIMD multiply accumulate high
def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;

// ASIMD multiply accumulate saturating long
def : InstRW<[V1Write_4c_1V02],
(instregex "^MUL(v[148]i16|v[124]i32)$",
"^SQR?DMULH(v[48]i16|v[24]i32)$",
"^ML[AS](v[148]i16|v[124]i32)$",
"^[SU]ML[AS]Lv",
"^SQRDML[AS]H(v[148]i16|v[124]i32)$",
"^SQDML[AS]Lv")>;
def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;

// ASIMD multiply/multiply long (8x8) polynomial
def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
Expand All @@ -868,11 +958,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>;

// ASIMD shift accumulate
def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>;

// ASIMD shift by immed, complex
// ASIMD shift by register, complex
def : InstRW<[V1Write_4c_1V13],
(instregex "^[SU]R?SRAv",
"^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
(instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
"^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
"^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv",
"^[SU]Q?RSHLv", "^[SU]QSHLv")>;
Expand All @@ -890,16 +981,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv",
// ASIMD FP absolute value/difference
// ASIMD FP arith, normal
// ASIMD FP compare
// ASIMD FP complex add
// ASIMD FP max/min, normal
// ASIMD FP max/min, pairwise
// ASIMD FP negate
// Covered by "SchedAlias (WriteV[dq]...)" above

// ASIMD FP complex add
def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>;

// ASIMD FP complex multiply add
def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>;

// ASIMD FP multiply
def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>;

// ASIMD FP multiply accumulate
def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$",
"^FML[AS]v")>;
def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>;

// ASIMD FP multiply accumulate long
def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>;

// ASIMD FP convert, long (F16 to F32)
def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>;
Expand Down Expand Up @@ -953,12 +1053,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
// ASIMD FP max/min, reduce, Q-form F16
def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;

// ASIMD FP multiply
def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>;

// ASIMD FP multiply accumulate long
def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>;

// ASIMD FP round, D-form F32 and Q-form F64
def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;

Expand All @@ -976,13 +1070,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>;

// ASIMD dot product
def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;

// ASIMD matrix multiply accumulate
def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>;
def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>;

// ASIMD multiply accumulate long
def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>;
def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>;

// Scalar convert, F32 to BF16
def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>;
Expand Down Expand Up @@ -1300,7 +1394,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
// -----------------------------------------------------------------------------

// CRC checksum ops
def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>;
def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;


// SVE Predicate instructions
Expand Down Expand Up @@ -1440,13 +1534,14 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
"^[SU]DIV_ZPZZ_D")>;

// Dot product, 8 bit
def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>;
def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>;

// Dot product, 8 bit, using signed and unsigned integers
def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB],
(instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;

// Dot product, 16 bit
def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>;
def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>;

// Duplicate, immediate and indexed form
def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$",
Expand Down Expand Up @@ -1488,7 +1583,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
"^MOVPRFX_ZZ$")>;

// Matrix multiply-accumulate
def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;

// Multiply, B, H, S element size
def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
Expand All @@ -1497,12 +1592,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
"^[SU]MULH_ZPZZ_[BHS]")>;

// Multiply, D element size
// Multiply accumulate, D element size
def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
"^MUL_ZPZZ_D",
"^[SU]MULH_(ZPmZ|ZZZ)_D",
"^[SU]MULH_ZPZZ_D",
"^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
"^[SU]MULH_ZPZZ_D")>;

// Multiply accumulate, D element size
def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD],
(instregex "^ML[AS]_ZPZZZ_D")>;
def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD],
(instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;

// Multiply accumulate, B, H, S element size
// NOTE: This is not specified in the SOG.
Expand Down Expand Up @@ -1583,8 +1682,8 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>;

// Floating point complex multiply add
def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
"^FCMLA_ZZZI_[HS]$")>;
def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;

// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
// Floating point convert to integer, F32
Expand Down Expand Up @@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
"^FMUL_ZPZ[IZ]_[HSD]")>;

// Floating point multiply accumulate
def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA],
(instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
"^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA],
(instregex "^FML[AS]_ZZZI_[HSD]",
"^FN?ML[AS]_ZPZZZ_[HSD]")>;

// Floating point reciprocal step
def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
"^FN?ML[AS]_ZPZZZ_[HSD]",
"^FML[AS]_ZZZI_[HSD]$",
"^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>;
def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;

// Floating point reciprocal estimate, F16
def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>;
Expand Down Expand Up @@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$",
def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;

// Dot product
def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;

// Matrix multiply accumulate
def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>;
def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>;

// Multiply accumulate long
def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;


// SVE Load instructions
Expand Down
Loading

0 comments on commit 303c8d2

Please sign in to comment.