Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU][MC][True16] VOP3dot instruction update for true16/fake16 #113474

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

broxigarchen
Copy link
Contributor

@broxigarchen broxigarchen commented Oct 23, 2024

Update the VOP3dot instructions to support both the true16 and fake16 formats.

This patch covers the following instructions:
V_DOT2_F16_F16
V_DOT2_BF16_BF16

@broxigarchen broxigarchen marked this pull request as ready for review October 23, 2024 18:05
@llvmbot llvmbot added backend:AMDGPU mc Machine (object) code labels Oct 23, 2024
@llvmbot
Copy link

llvmbot commented Oct 23, 2024

@llvm/pr-subscribers-mc

Author: Brox Chen (broxigarchen)

Changes

Patch is 171.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113474.diff

14 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+39-7)
  • (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+29-12)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3.s (+78-66)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s (+98-25)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s (+96-26)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+57-33)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s (+34-22)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s (+32-20)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt (+160-37)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt (+190-21)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt (+184-21)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+140-29)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+70-10)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt (+70-10)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34ecdb56e8689d..caceb251f719b1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -931,6 +931,30 @@ class VOP3_DOT_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
   let HasOMod = 0;
 }
 
+class VOP3_DOT_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_True16<P, Features> {
+  let HasClamp = 0;
+  let HasOMod = 0;
+  // Override modifiers for bf16(i16) (same as float modifiers).
+  let HasSrc0Mods = 1;
+  let HasSrc1Mods = 1;
+  let HasSrc2Mods = 1;
+  let Src0ModVOP3DPP = FPVRegInputMods;
+  let Src1ModVOP3DPP = FP32VCSrcInputMods;
+  let Src2ModVOP3DPP = FPT16VCSrcInputMods</*IsFake16*/0>;
+}
+
+class VOP3_DOT_Profile_fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Fake16<P, Features> {
+  let HasClamp = 0;
+  let HasOMod = 0;
+  // Override modifiers for bf16(i16) (same as float modifiers).
+  let HasSrc0Mods = 1;
+  let HasSrc1Mods = 1;
+  let HasSrc2Mods = 1;
+  let AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
+   HasOpSel, HasOMod, IsVOP3P, HasModifiers, 1/*HasSrc0Mods*/, 1/*HasSrc1Mods*/,
+   1/*HasSrc2Mods*/, DstVT>.ret;
+}
+
 let SubtargetPredicate = isGFX11Plus in {
   defm V_MAXMIN_F32     : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
   defm V_MINMAX_F32     : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -951,9 +975,15 @@ let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
   defm V_MINIMUMMAXIMUM_F16 : VOP3Inst<"v_minimummaximum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
 } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
 
-let OtherPredicates = [HasDot9Insts], IsDOT=1 in {
-  defm V_DOT2_F16_F16 :   VOP3Inst<"v_dot2_f16_f16",   VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
-  defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_BF16_V2BF16_V2BF16_BF16>, int_amdgcn_fdot2_bf16_bf16>;
+let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
+  defm V_DOT2_F16_F16 :   VOP3Inst_t16_with_profiles<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>,
+                                                      VOP3_DOT_Profile_t16<VOP_F16_V2F16_V2F16_F16>,
+                                                      VOP3_DOT_Profile_fake16<VOP_F16_V2F16_V2F16_F16>,
+                                                      int_amdgcn_fdot2_f16_f16>;
+  defm V_DOT2_BF16_BF16 : VOP3Inst_t16_with_profiles<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_BF16_V2BF16_V2BF16_BF16>,
+                                                      VOP3_DOT_Profile_t16<VOP_BF16_V2BF16_V2BF16_BF16>,
+                                                      VOP3_DOT_Profile_fake16<VOP_BF16_V2BF16_V2BF16_BF16>,
+                                                      int_amdgcn_fdot2_bf16_bf16>;
 }
 
 class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp,
@@ -1112,8 +1142,10 @@ multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
   VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
   VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
 
-multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op> :
-  VOP3Dot_Realtriple<GFX11Gen, op>, VOP3Dot_Realtriple<GFX12Gen, op>;
+multiclass VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME> {
+  defm _t16: VOP3Dot_Realtriple_gfx11_gfx12<op, asmName, 0, opName#"_t16">;
+  defm _fake16: VOP3Dot_Realtriple_gfx11_gfx12<op, asmName, 0, opName#"_fake16">;
+}
 
 multiclass VOP3_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME,
                                            string pseudo_mnemonic = "", bit isSingle = 0> :
@@ -1205,8 +1237,8 @@ defm V_MAXMIN_U32          : VOP3_Realtriple_gfx11_gfx12<0x262>;
 defm V_MINMAX_U32          : VOP3_Realtriple_gfx11_gfx12<0x263>;
 defm V_MAXMIN_I32          : VOP3_Realtriple_gfx11_gfx12<0x264>;
 defm V_MINMAX_I32          : VOP3_Realtriple_gfx11_gfx12<0x265>;
-defm V_DOT2_F16_F16        : VOP3Dot_Realtriple_gfx11_gfx12<0x266>;
-defm V_DOT2_BF16_BF16      : VOP3Dot_Realtriple_gfx11_gfx12<0x267>;
+defm V_DOT2_F16_F16        : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x266, "v_dot2_f16_f16">;
+defm V_DOT2_BF16_BF16      : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x267, "v_dot2_bf16_bf16">;
 defm V_DIV_SCALE_F32       : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
 defm V_DIV_SCALE_F64       : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
 defm V_MAD_U64_U32_gfx11   : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index aab5dc7465d938..58dc812bc3ef6a 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -347,9 +347,12 @@ class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gf
    let Inst{14-13} = byte_sel;  // op_sel2/3
  }
 
-class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
+class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> :
+    VOP3e_t16_gfx11_gfx12<op, p>{
   let Inst{11} = ?;
   let Inst{12} = ?;
+  let Inst{13} = !if(p.HasSrc2Mods, src2_modifiers{2}, 0);
+  let Inst{14} = !if(!and(p.HasDst, p.HasSrc0Mods),  src0_modifiers{3}, 0);
 }
 
 // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
@@ -1611,10 +1614,12 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
   }
 }
 
-multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
+multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string asmName, string opName = NAME,
                              bit isSingle = 0> {
   defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
-  let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+  let AsmString = asmName # ps.AsmOperands,
+      DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"),
+      IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
     def _e64#Gen.Suffix :
       VOP3_Real_Gen<ps, Gen>,
       VOP3DotOpSel_gfx11_gfx12<op, ps.Pfl>;
@@ -1678,9 +1683,13 @@ multiclass VOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
       VOP3_DPP16_Gen<op, ps, Gen>;
 }
 
-multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName, string opName = NAME> {
+  defvar ps = !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp");
   def _e64_dpp#Gen.Suffix :
-    VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen> {
+    VOP3_DPP16_Gen_t16<op, ps, Gen> {
+      let AsmString = asmName # ps.Pfl.AsmVOP3DPP16;
+      let DecoderNamespace = Gen.DecoderNamespace
+                             # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
       let Inst{11} = ?;
       let Inst{12} = ?;
     }
@@ -1702,12 +1711,14 @@ multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
   }
 }
 
-multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName, string opName = NAME> {
   defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
-  def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
+  def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8_t16<op, ps> {
     let Inst{11} = ?;
     let Inst{12} = ?;
-    let DecoderNamespace = Gen.DecoderNamespace;
+    let AsmString = asmName # ps.Pfl.AsmVOP3DPP8;
+    let DecoderNamespace = Gen.DecoderNamespace
+                           # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1760,11 +1771,11 @@ multiclass VOP3_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0,
   VOP3_Real_dpp_Base<Gen, op, opName>,
   VOP3_Real_dpp8_Base<Gen, op, opName>;
 
-multiclass VOP3Dot_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0,
+multiclass VOP3Dot_Realtriple<GFXGen Gen, bits<10> op, string asmName, bit isSingle = 0,
                               string opName = NAME> :
-  VOP3Dot_Real_Base<Gen, op, opName, isSingle>,
-  VOP3Dot_Real_dpp_Base<Gen, op, opName>,
-  VOP3Dot_Real_dpp8_Base<Gen, op, opName>;
+  VOP3Dot_Real_Base<Gen, op, asmName, opName, isSingle>,
+  VOP3Dot_Real_dpp_Base<Gen, op, asmName, opName>,
+  VOP3Dot_Real_dpp8_Base<Gen, op, asmName, opName>;
 
 multiclass VOP3Only_Realtriple<GFXGen Gen, bits<10> op> :
   VOP3_Realtriple<Gen, op, 1>;
@@ -1862,6 +1873,12 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
   VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
   VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
 
+multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0,
+                                          string opName = NAME> :
+  VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>,
+  VOP3Dot_Realtriple<GFX12Gen, op, asmName, isSingle, opName>;
+
+
 //===----------------------------------------------------------------------===//
 
 include "VOPCInstructions.td"
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 210d55898367d8..1f6dc01b5d6e96 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2119,104 +2119,116 @@ v_div_scale_f64 v[254:255], vcc, 0xaf123456, -vcc, -1 clamp div:2
 // W64: encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf]
 // W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_bf16_bf16 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x0e,0x00]
+v_dot2_bf16_bf16 v5.l, v1, v2, s3
+// GFX11: [0x05,0x00,0x67,0xd6,0x01,0x05,0x0e,0x00]
 
-v_dot2_bf16_bf16 v5, v255, v255, s105
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0xff,0xff,0xa7,0x01]
+v_dot2_bf16_bf16 v5.l, v255, v255, s105
+// GFX11: [0x05,0x00,0x67,0xd6,0xff,0xff,0xa7,0x01]
 
-v_dot2_bf16_bf16 v5, s1, s2, v3
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x01,0x04,0x0c,0x04]
+v_dot2_bf16_bf16 v5.l, s1, s2, v3.l
+// GFX11: [0x05,0x00,0x67,0xd6,0x01,0x04,0x0c,0x04]
 
-v_dot2_bf16_bf16 v5, s105, s105, m0
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x69,0xd2,0xf4,0x01]
+v_dot2_bf16_bf16 v5.l, s105, s105, m0
+// GFX11: [0x05,0x00,0x67,0xd6,0x69,0xd2,0xf4,0x01]
 
-v_dot2_bf16_bf16 v5, vcc_lo, ttmp15, v255
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x6a,0xf6,0xfc,0x07]
+v_dot2_bf16_bf16 v5.l, vcc_lo, ttmp15, v255.h
+// GFX11: [0x05,0x20,0x67,0xd6,0x6a,0xf6,0xfc,0x07]
 
-v_dot2_bf16_bf16 v5, vcc_hi, 0xfe0b, vcc_hi
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00]
+v_dot2_bf16_bf16 v5.l, vcc_hi, 0xfe0b, vcc_hi
+// GFX11: [0x05,0x00,0x67,0xd6,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00]
 
-v_dot2_bf16_bf16 v5, ttmp15, src_scc, ttmp15
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x7b,0xfa,0xed,0x01]
+v_dot2_bf16_bf16 v5.l, ttmp15, src_scc, ttmp15
+// GFX11: [0x05,0x00,0x67,0xd6,0x7b,0xfa,0xed,0x01]
 
-v_dot2_bf16_bf16 v5, |m0|, -1, -vcc_lo
-// GFX11: encoding: [0x05,0x01,0x67,0xd6,0x7d,0x82,0xa9,0x81]
+v_dot2_bf16_bf16 v5.l, |m0|, -1, -vcc_lo
+// GFX11: [0x05,0x01,0x67,0xd6,0x7d,0x82,0xa9,0x81]
 
-v_dot2_bf16_bf16 v5, -|exec_lo|, null, -|0xfe0b|
-// GFX11: encoding: [0x05,0x05,0x67,0xd6,0x7e,0xf8,0xfc,0xa3,0x0b,0xfe,0x00,0x00]
+v_dot2_bf16_bf16 v5.l, -|exec_lo|, null, -|0xfe0b|
+// GFX11: [0x05,0x05,0x67,0xd6,0x7e,0xf8,0xfc,0xa3,0x0b,0xfe,0x00,0x00]
 
-v_dot2_bf16_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo|
-// GFX11: encoding: [0x05,0x07,0x67,0xd6,0x7f,0xfc,0xf8,0xe1]
+v_dot2_bf16_bf16 v5.l, -|exec_hi|, -|exec_lo|, -|exec_lo|
+// GFX11: [0x05,0x07,0x67,0xd6,0x7f,0xfc,0xf8,0xe1]
 
-v_dot2_bf16_bf16 v5, null, -exec_hi, |src_scc|
-// GFX11: encoding: [0x05,0x04,0x67,0xd6,0x7c,0xfe,0xf4,0x43]
+v_dot2_bf16_bf16 v5.l, null, -exec_hi, |src_scc|
+// GFX11: [0x05,0x04,0x67,0xd6,0x7c,0xfe,0xf4,0x43]
 
-v_dot2_bf16_bf16 v5, -1, -|m0|, -|exec_hi| op_sel:[0,0,0,0]
-// GFX11: encoding: [0x05,0x06,0x67,0xd6,0xc1,0xfa,0xfc,0xc1]
+v_dot2_bf16_bf16 v5.l, -1, -|m0|, -|exec_hi|
+// GFX11: [0x05,0x06,0x67,0xd6,0xc1,0xfa,0xfc,0xc1]
 
-v_dot2_bf16_bf16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: encoding: [0x05,0x22,0x67,0xd6,0xfd,0xd4,0x04,0x23]
+v_dot2_bf16_bf16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: [0x05,0x02,0x67,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_dot2_bf16_bf16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1]
-// GFX11: encoding: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_dot2_bf16_bf16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_dot2_bf16_bf16 v2, v0, 0x20004000, v2
-// GFX11: v_dot2_bf16_bf16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+v_dot2_bf16_bf16 v2.l, v0, 0x20004000, v2.l
+// GFX11: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
 
-v_dot2_bf16_bf16 v2, 0x20004000, v0, v2
-// GFX11: v_dot2_bf16_bf16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+v_dot2_bf16_bf16 v2.l, 0x20004000, v0, v2.l
+// GFX11: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
 
-v_dot2_f16_f16 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00]
+v_dot2_bf16_bf16 v5.l, vcc_lo, ttmp15, v255.h
+// GFX11: [0x05,0x20,0x67,0xd6,0x6a,0xf6,0xfc,0x07]
 
-v_dot2_f16_f16 v5, v255, s2, s105
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0xff,0x05,0xa4,0x01]
+v_dot2_bf16_bf16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_dot2_f16_f16 v5, s1, v255, exec_hi
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x01,0xfe,0xff,0x01]
+v_dot2_f16_f16 v5.l, v1, v2, s3
+// GFX11: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00]
 
-v_dot2_f16_f16 v5, s105, s105, exec_lo
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x69,0xd2,0xf8,0x01]
+v_dot2_f16_f16 v5.l, v255, s2, s105
+// GFX11: [0x05,0x00,0x66,0xd6,0xff,0x05,0xa4,0x01]
 
-v_dot2_f16_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x6a,0xf6,0x0c,0x04]
+v_dot2_f16_f16 v5.l, s1, v255, exec_hi
+// GFX11: [0x05,0x00,0x66,0xd6,0x01,0xfe,0xff,0x01]
 
-v_dot2_f16_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_dot2_f16_f16 v5.l, s105, s105, exec_lo
+// GFX11: [0x05,0x00,0x66,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_dot2_f16_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: encoding: [0x05,0x07,0x66,0xd6,0x7b,0xfa,0xed,0xe1]
+v_dot2_f16_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: [0x05,0x00,0x66,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_dot2_f16_f16 v5, m0, 0.5, m0
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x7d,0xe0,0xf5,0x01]
+v_dot2_f16_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x66,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_dot2_f16_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: encoding: [0x05,0x01,0x66,0xd6,0x7e,0x82,0xad,0x01]
+v_dot2_f16_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: [0x05,0x07,0x66,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_dot2_f16_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: encoding: [0x05,0x05,0x66,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_dot2_f16_f16 v5.l, m0, 0.5, m0
+// GFX11: [0x05,0x00,0x66,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_dot2_f16_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: encoding: [0x05,0x04,0x66,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_dot2_f16_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: [0x05,0x01,0x66,0xd6,0x7e,0x82,0xad,0x01]
 
-v_dot2_f16_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: encoding: [0x05,0x06,0x66,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_dot2_f16_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: [0x05,0x05,0x66,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_dot2_f16_f16 v5, 0.5, -m0, 0.5 op_sel:[0,0,0,0]
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0xf0,0xfa,0xc0,0x43]
+v_dot2_f16_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: [0x05,0x04,0x66,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: encoding: [0x05,0x22,0x66,0xd6,0xfd,0xd4,0x04,0x23]
+v_dot2_f16_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: [0x05,0x06,0x66,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_dot2_f16_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1]
-// GFX11: encoding: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_dot2_f16_f16 v5.l, 0.5, -m0, 0.5
+// GFX11: [0x05,0x00,0x66,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_dot2_f16_f16 v2, v0, 0x20004000, v2
-// GFX11: v_dot2_f16_f16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+v_dot2_f16_f16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: [0x05,0x02,0x66,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_dot2_f16_f16 v2, 0x20004000, v0, v2
-// GFX11: v_dot2_f16_f16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+v_dot2_f16_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+v_dot2_f16_f16 v2.l, v0, 0x20004000, v2.l
+// GFX11: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+
+v_dot2_f16_f16 v2.l, 0x20004000, v0, v2.l
+// GFX11: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+
+v_dot2_f16_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x66,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_dot2_f16_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_fma_dx9_zero_f32 v5, v1, v2, s3
 // GFX11: encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index c82b61e21edf64..ba594e7406fc52 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -4700,44 +4700,117 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban
 v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX11: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 
-v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand
+
+v_dot2_f16_f16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
 // GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand
 
-v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+v_dot2_bf16_bf16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 // GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 // GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v3.l quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h quad_perm:[0,1,2,3]
+// GFX11: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xf...
[truncated]

@llvmbot
Copy link

llvmbot commented Oct 23, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

Patch is 171.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113474.diff

14 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+39-7)
  • (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+29-12)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3.s (+78-66)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s (+98-25)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s (+96-26)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+57-33)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s (+34-22)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s (+32-20)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt (+160-37)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt (+190-21)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt (+184-21)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+140-29)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt (+70-10)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt (+70-10)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34ecdb56e8689d..caceb251f719b1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -931,6 +931,30 @@ class VOP3_DOT_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
   let HasOMod = 0;
 }
 
+class VOP3_DOT_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_True16<P, Features> {
+  let HasClamp = 0;
+  let HasOMod = 0;
+  // Override modifiers for bf16(i16) (same as float modifiers).
+  let HasSrc0Mods = 1;
+  let HasSrc1Mods = 1;
+  let HasSrc2Mods = 1;
+  let Src0ModVOP3DPP = FPVRegInputMods;
+  let Src1ModVOP3DPP = FP32VCSrcInputMods;
+  let Src2ModVOP3DPP = FPT16VCSrcInputMods</*IsFake16*/0>;
+}
+
+class VOP3_DOT_Profile_fake16<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Fake16<P, Features> {
+  let HasClamp = 0;
+  let HasOMod = 0;
+  // Override modifiers for bf16(i16) (same as float modifiers).
+  let HasSrc0Mods = 1;
+  let HasSrc1Mods = 1;
+  let HasSrc2Mods = 1;
+  let AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
+   HasOpSel, HasOMod, IsVOP3P, HasModifiers, 1/*HasSrc0Mods*/, 1/*HasSrc1Mods*/,
+   1/*HasSrc2Mods*/, DstVT>.ret;
+}
+
 let SubtargetPredicate = isGFX11Plus in {
   defm V_MAXMIN_F32     : VOP3Inst<"v_maxmin_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
   defm V_MINMAX_F32     : VOP3Inst<"v_minmax_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -951,9 +975,15 @@ let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
   defm V_MINIMUMMAXIMUM_F16 : VOP3Inst<"v_minimummaximum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
 } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
 
-let OtherPredicates = [HasDot9Insts], IsDOT=1 in {
-  defm V_DOT2_F16_F16 :   VOP3Inst<"v_dot2_f16_f16",   VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
-  defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_BF16_V2BF16_V2BF16_BF16>, int_amdgcn_fdot2_bf16_bf16>;
+let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
+  defm V_DOT2_F16_F16 :   VOP3Inst_t16_with_profiles<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>,
+                                                      VOP3_DOT_Profile_t16<VOP_F16_V2F16_V2F16_F16>,
+                                                      VOP3_DOT_Profile_fake16<VOP_F16_V2F16_V2F16_F16>,
+                                                      int_amdgcn_fdot2_f16_f16>;
+  defm V_DOT2_BF16_BF16 : VOP3Inst_t16_with_profiles<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_BF16_V2BF16_V2BF16_BF16>,
+                                                      VOP3_DOT_Profile_t16<VOP_BF16_V2BF16_V2BF16_BF16>,
+                                                      VOP3_DOT_Profile_fake16<VOP_BF16_V2BF16_V2BF16_BF16>,
+                                                      int_amdgcn_fdot2_bf16_bf16>;
 }
 
 class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp,
@@ -1112,8 +1142,10 @@ multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
   VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
   VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
 
-multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op> :
-  VOP3Dot_Realtriple<GFX11Gen, op>, VOP3Dot_Realtriple<GFX12Gen, op>;
+multiclass VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME> {
+  defm _t16: VOP3Dot_Realtriple_gfx11_gfx12<op, asmName, 0, opName#"_t16">;
+  defm _fake16: VOP3Dot_Realtriple_gfx11_gfx12<op, asmName, 0, opName#"_fake16">;
+}
 
 multiclass VOP3_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, string opName = NAME,
                                            string pseudo_mnemonic = "", bit isSingle = 0> :
@@ -1205,8 +1237,8 @@ defm V_MAXMIN_U32          : VOP3_Realtriple_gfx11_gfx12<0x262>;
 defm V_MINMAX_U32          : VOP3_Realtriple_gfx11_gfx12<0x263>;
 defm V_MAXMIN_I32          : VOP3_Realtriple_gfx11_gfx12<0x264>;
 defm V_MINMAX_I32          : VOP3_Realtriple_gfx11_gfx12<0x265>;
-defm V_DOT2_F16_F16        : VOP3Dot_Realtriple_gfx11_gfx12<0x266>;
-defm V_DOT2_BF16_BF16      : VOP3Dot_Realtriple_gfx11_gfx12<0x267>;
+defm V_DOT2_F16_F16        : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x266, "v_dot2_f16_f16">;
+defm V_DOT2_BF16_BF16      : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x267, "v_dot2_bf16_bf16">;
 defm V_DIV_SCALE_F32       : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
 defm V_DIV_SCALE_F64       : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
 defm V_MAD_U64_U32_gfx11   : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index aab5dc7465d938..58dc812bc3ef6a 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -347,9 +347,12 @@ class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gf
    let Inst{14-13} = byte_sel;  // op_sel2/3
  }
 
-class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
+class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> :
+    VOP3e_t16_gfx11_gfx12<op, p>{
   let Inst{11} = ?;
   let Inst{12} = ?;
+  let Inst{13} = !if(p.HasSrc2Mods, src2_modifiers{2}, 0);
+  let Inst{14} = !if(!and(p.HasDst, p.HasSrc0Mods),  src0_modifiers{3}, 0);
 }
 
 // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
@@ -1611,10 +1614,12 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
   }
 }
 
-multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
+multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string asmName, string opName = NAME,
                              bit isSingle = 0> {
   defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
-  let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+  let AsmString = asmName # ps.AsmOperands,
+      DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"),
+      IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
     def _e64#Gen.Suffix :
       VOP3_Real_Gen<ps, Gen>,
       VOP3DotOpSel_gfx11_gfx12<op, ps.Pfl>;
@@ -1678,9 +1683,13 @@ multiclass VOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
       VOP3_DPP16_Gen<op, ps, Gen>;
 }
 
-multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string asmName, string opName = NAME> {
+  defvar ps = !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp");
   def _e64_dpp#Gen.Suffix :
-    VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen> {
+    VOP3_DPP16_Gen_t16<op, ps, Gen> {
+      let AsmString = asmName # ps.Pfl.AsmVOP3DPP16;
+      let DecoderNamespace = Gen.DecoderNamespace
+                             # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
       let Inst{11} = ?;
       let Inst{12} = ?;
     }
@@ -1702,12 +1711,14 @@ multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
   }
 }
 
-multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string asmName, string opName = NAME> {
   defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
-  def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
+  def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8_t16<op, ps> {
     let Inst{11} = ?;
     let Inst{12} = ?;
-    let DecoderNamespace = Gen.DecoderNamespace;
+    let AsmString = asmName # ps.Pfl.AsmVOP3DPP8;
+    let DecoderNamespace = Gen.DecoderNamespace
+                           # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1760,11 +1771,11 @@ multiclass VOP3_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0,
   VOP3_Real_dpp_Base<Gen, op, opName>,
   VOP3_Real_dpp8_Base<Gen, op, opName>;
 
-multiclass VOP3Dot_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0,
+multiclass VOP3Dot_Realtriple<GFXGen Gen, bits<10> op, string asmName, bit isSingle = 0,
                               string opName = NAME> :
-  VOP3Dot_Real_Base<Gen, op, opName, isSingle>,
-  VOP3Dot_Real_dpp_Base<Gen, op, opName>,
-  VOP3Dot_Real_dpp8_Base<Gen, op, opName>;
+  VOP3Dot_Real_Base<Gen, op, asmName, opName, isSingle>,
+  VOP3Dot_Real_dpp_Base<Gen, op, asmName, opName>,
+  VOP3Dot_Real_dpp8_Base<Gen, op, asmName, opName>;
 
 multiclass VOP3Only_Realtriple<GFXGen Gen, bits<10> op> :
   VOP3_Realtriple<Gen, op, 1>;
@@ -1862,6 +1873,12 @@ multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
   VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
   VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
 
+multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op, string asmName, bit isSingle = 0,
+                                          string opName = NAME> :
+  VOP3Dot_Realtriple<GFX11Gen, op, asmName, isSingle, opName>,
+  VOP3Dot_Realtriple<GFX12Gen, op, asmName, isSingle, opName>;
+
+
 //===----------------------------------------------------------------------===//
 
 include "VOPCInstructions.td"
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 210d55898367d8..1f6dc01b5d6e96 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -2119,104 +2119,116 @@ v_div_scale_f64 v[254:255], vcc, 0xaf123456, -vcc, -1 clamp div:2
 // W64: encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf]
 // W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_bf16_bf16 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x0e,0x00]
+v_dot2_bf16_bf16 v5.l, v1, v2, s3
+// GFX11: [0x05,0x00,0x67,0xd6,0x01,0x05,0x0e,0x00]
 
-v_dot2_bf16_bf16 v5, v255, v255, s105
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0xff,0xff,0xa7,0x01]
+v_dot2_bf16_bf16 v5.l, v255, v255, s105
+// GFX11: [0x05,0x00,0x67,0xd6,0xff,0xff,0xa7,0x01]
 
-v_dot2_bf16_bf16 v5, s1, s2, v3
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x01,0x04,0x0c,0x04]
+v_dot2_bf16_bf16 v5.l, s1, s2, v3.l
+// GFX11: [0x05,0x00,0x67,0xd6,0x01,0x04,0x0c,0x04]
 
-v_dot2_bf16_bf16 v5, s105, s105, m0
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x69,0xd2,0xf4,0x01]
+v_dot2_bf16_bf16 v5.l, s105, s105, m0
+// GFX11: [0x05,0x00,0x67,0xd6,0x69,0xd2,0xf4,0x01]
 
-v_dot2_bf16_bf16 v5, vcc_lo, ttmp15, v255
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x6a,0xf6,0xfc,0x07]
+v_dot2_bf16_bf16 v5.l, vcc_lo, ttmp15, v255.h
+// GFX11: [0x05,0x20,0x67,0xd6,0x6a,0xf6,0xfc,0x07]
 
-v_dot2_bf16_bf16 v5, vcc_hi, 0xfe0b, vcc_hi
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00]
+v_dot2_bf16_bf16 v5.l, vcc_hi, 0xfe0b, vcc_hi
+// GFX11: [0x05,0x00,0x67,0xd6,0x6b,0xfe,0xad,0x01,0x0b,0xfe,0x00,0x00]
 
-v_dot2_bf16_bf16 v5, ttmp15, src_scc, ttmp15
-// GFX11: encoding: [0x05,0x00,0x67,0xd6,0x7b,0xfa,0xed,0x01]
+v_dot2_bf16_bf16 v5.l, ttmp15, src_scc, ttmp15
+// GFX11: [0x05,0x00,0x67,0xd6,0x7b,0xfa,0xed,0x01]
 
-v_dot2_bf16_bf16 v5, |m0|, -1, -vcc_lo
-// GFX11: encoding: [0x05,0x01,0x67,0xd6,0x7d,0x82,0xa9,0x81]
+v_dot2_bf16_bf16 v5.l, |m0|, -1, -vcc_lo
+// GFX11: [0x05,0x01,0x67,0xd6,0x7d,0x82,0xa9,0x81]
 
-v_dot2_bf16_bf16 v5, -|exec_lo|, null, -|0xfe0b|
-// GFX11: encoding: [0x05,0x05,0x67,0xd6,0x7e,0xf8,0xfc,0xa3,0x0b,0xfe,0x00,0x00]
+v_dot2_bf16_bf16 v5.l, -|exec_lo|, null, -|0xfe0b|
+// GFX11: [0x05,0x05,0x67,0xd6,0x7e,0xf8,0xfc,0xa3,0x0b,0xfe,0x00,0x00]
 
-v_dot2_bf16_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo|
-// GFX11: encoding: [0x05,0x07,0x67,0xd6,0x7f,0xfc,0xf8,0xe1]
+v_dot2_bf16_bf16 v5.l, -|exec_hi|, -|exec_lo|, -|exec_lo|
+// GFX11: [0x05,0x07,0x67,0xd6,0x7f,0xfc,0xf8,0xe1]
 
-v_dot2_bf16_bf16 v5, null, -exec_hi, |src_scc|
-// GFX11: encoding: [0x05,0x04,0x67,0xd6,0x7c,0xfe,0xf4,0x43]
+v_dot2_bf16_bf16 v5.l, null, -exec_hi, |src_scc|
+// GFX11: [0x05,0x04,0x67,0xd6,0x7c,0xfe,0xf4,0x43]
 
-v_dot2_bf16_bf16 v5, -1, -|m0|, -|exec_hi| op_sel:[0,0,0,0]
-// GFX11: encoding: [0x05,0x06,0x67,0xd6,0xc1,0xfa,0xfc,0xc1]
+v_dot2_bf16_bf16 v5.l, -1, -|m0|, -|exec_hi|
+// GFX11: [0x05,0x06,0x67,0xd6,0xc1,0xfa,0xfc,0xc1]
 
-v_dot2_bf16_bf16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: encoding: [0x05,0x22,0x67,0xd6,0xfd,0xd4,0x04,0x23]
+v_dot2_bf16_bf16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: [0x05,0x02,0x67,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_dot2_bf16_bf16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1]
-// GFX11: encoding: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_dot2_bf16_bf16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_dot2_bf16_bf16 v2, v0, 0x20004000, v2
-// GFX11: v_dot2_bf16_bf16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+v_dot2_bf16_bf16 v2.l, v0, 0x20004000, v2.l
+// GFX11: [0x02,0x00,0x67,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
 
-v_dot2_bf16_bf16 v2, 0x20004000, v0, v2
-// GFX11: v_dot2_bf16_bf16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+v_dot2_bf16_bf16 v2.l, 0x20004000, v0, v2.l
+// GFX11: [0x02,0x00,0x67,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
 
-v_dot2_f16_f16 v5, v1, v2, s3
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00]
+v_dot2_bf16_bf16 v5.l, vcc_lo, ttmp15, v255.h
+// GFX11: [0x05,0x20,0x67,0xd6,0x6a,0xf6,0xfc,0x07]
 
-v_dot2_f16_f16 v5, v255, s2, s105
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0xff,0x05,0xa4,0x01]
+v_dot2_bf16_bf16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x67,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_dot2_f16_f16 v5, s1, v255, exec_hi
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x01,0xfe,0xff,0x01]
+v_dot2_f16_f16 v5.l, v1, v2, s3
+// GFX11: [0x05,0x00,0x66,0xd6,0x01,0x05,0x0e,0x00]
 
-v_dot2_f16_f16 v5, s105, s105, exec_lo
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x69,0xd2,0xf8,0x01]
+v_dot2_f16_f16 v5.l, v255, s2, s105
+// GFX11: [0x05,0x00,0x66,0xd6,0xff,0x05,0xa4,0x01]
 
-v_dot2_f16_f16 v5, vcc_lo, ttmp15, v3
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x6a,0xf6,0x0c,0x04]
+v_dot2_f16_f16 v5.l, s1, v255, exec_hi
+// GFX11: [0x05,0x00,0x66,0xd6,0x01,0xfe,0xff,0x01]
 
-v_dot2_f16_f16 v5, vcc_hi, 0xfe0b, v255
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+v_dot2_f16_f16 v5.l, s105, s105, exec_lo
+// GFX11: [0x05,0x00,0x66,0xd6,0x69,0xd2,0xf8,0x01]
 
-v_dot2_f16_f16 v5, -|ttmp15|, -|src_scc|, -|ttmp15|
-// GFX11: encoding: [0x05,0x07,0x66,0xd6,0x7b,0xfa,0xed,0xe1]
+v_dot2_f16_f16 v5.l, vcc_lo, ttmp15, v3.l
+// GFX11: [0x05,0x00,0x66,0xd6,0x6a,0xf6,0x0c,0x04]
 
-v_dot2_f16_f16 v5, m0, 0.5, m0
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0x7d,0xe0,0xf5,0x01]
+v_dot2_f16_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x66,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
 
-v_dot2_f16_f16 v5, |exec_lo|, -1, vcc_hi
-// GFX11: encoding: [0x05,0x01,0x66,0xd6,0x7e,0x82,0xad,0x01]
+v_dot2_f16_f16 v5.l, -|ttmp15|, -|src_scc|, -|ttmp15|
+// GFX11: [0x05,0x07,0x66,0xd6,0x7b,0xfa,0xed,0xe1]
 
-v_dot2_f16_f16 v5, -|exec_hi|, null, -|vcc_lo|
-// GFX11: encoding: [0x05,0x05,0x66,0xd6,0x7f,0xf8,0xa8,0xa1]
+v_dot2_f16_f16 v5.l, m0, 0.5, m0
+// GFX11: [0x05,0x00,0x66,0xd6,0x7d,0xe0,0xf5,0x01]
 
-v_dot2_f16_f16 v5, null, exec_lo, -|0xfe0b|
-// GFX11: encoding: [0x05,0x04,0x66,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
+v_dot2_f16_f16 v5.l, |exec_lo|, -1, vcc_hi
+// GFX11: [0x05,0x01,0x66,0xd6,0x7e,0x82,0xad,0x01]
 
-v_dot2_f16_f16 v5, -1, -|exec_hi|, -|src_scc|
-// GFX11: encoding: [0x05,0x06,0x66,0xd6,0xc1,0xfe,0xf4,0xc3]
+v_dot2_f16_f16 v5.l, -|exec_hi|, null, -|vcc_lo|
+// GFX11: [0x05,0x05,0x66,0xd6,0x7f,0xf8,0xa8,0xa1]
 
-v_dot2_f16_f16 v5, 0.5, -m0, 0.5 op_sel:[0,0,0,0]
-// GFX11: encoding: [0x05,0x00,0x66,0xd6,0xf0,0xfa,0xc0,0x43]
+v_dot2_f16_f16 v5.l, null, exec_lo, -|0xfe0b|
+// GFX11: [0x05,0x04,0x66,0xd6,0x7c,0xfc,0xfc,0x83,0x0b,0xfe,0x00,0x00]
 
-v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
-// GFX11: encoding: [0x05,0x22,0x66,0xd6,0xfd,0xd4,0x04,0x23]
+v_dot2_f16_f16 v5.l, -1, -|exec_hi|, -|src_scc|
+// GFX11: [0x05,0x06,0x66,0xd6,0xc1,0xfe,0xf4,0xc3]
 
-v_dot2_f16_f16 v255, -|0xfe0b|, -|vcc_hi|, null op_sel:[0,0,0,1]
-// GFX11: encoding: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_dot2_f16_f16 v5.l, 0.5, -m0, 0.5
+// GFX11: [0x05,0x00,0x66,0xd6,0xf0,0xfa,0xc0,0x43]
 
-v_dot2_f16_f16 v2, v0, 0x20004000, v2
-// GFX11: v_dot2_f16_f16 v2, v0, 0x20004000, v2 ; encoding: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+v_dot2_f16_f16 v5.l, -src_scc, |vcc_lo|, -1
+// GFX11: [0x05,0x02,0x66,0xd6,0xfd,0xd4,0x04,0x23]
 
-v_dot2_f16_f16 v2, 0x20004000, v0, v2
-// GFX11: v_dot2_f16_f16 v2, 0x20004000, v0, v2 ; encoding: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+v_dot2_f16_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+v_dot2_f16_f16 v2.l, v0, 0x20004000, v2.l
+// GFX11: [0x02,0x00,0x66,0xd6,0x00,0xff,0x09,0x04,0x00,0x40,0x00,0x20]
+
+v_dot2_f16_f16 v2.l, 0x20004000, v0, v2.l
+// GFX11: [0x02,0x00,0x66,0xd6,0xff,0x00,0x0a,0x04,0x00,0x40,0x00,0x20]
+
+v_dot2_f16_f16 v5.l, vcc_hi, 0xfe0b, v255.h
+// GFX11: [0x05,0x20,0x66,0xd6,0x6b,0xfe,0xfd,0x07,0x0b,0xfe,0x00,0x00]
+
+v_dot2_f16_f16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: [0xff,0x43,0x66,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_fma_dx9_zero_f32 v5, v1, v2, s3
 // GFX11: encoding: [0x05,0x00,0x09,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index c82b61e21edf64..ba594e7406fc52 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -4700,44 +4700,117 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban
 v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX11: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 
-v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand
+
+v_dot2_f16_f16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_dot2_f16_f16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+v_dot2_bf16_bf16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
 // GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand
 
-v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+v_dot2_bf16_bf16_e64_dpp v0.l, s1, v2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 // GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
+v_dot2_bf16_bf16_e64_dpp v0.l, v1, s2, v3.l quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
 // GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
+v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v3.l quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
 
-v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|s3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX11: encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc0,0x01,0xe4,0x04,0x00]
+v_dot2_f16_f16_e64_dpp v5.l, v1, v2, v255.h quad_perm:[0,1,2,3]
+// GFX11: [0x05,0x20,0x66,0xd6,0xfa,0x04,0xf...
[truncated]

@broxigarchen broxigarchen requested review from Pierre-vh and removed request for Sisyph October 28, 2024 21:42
@broxigarchen
Copy link
Contributor Author

ping!

@kosarev kosarev requested a review from Sisyph October 30, 2024 11:37
v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
// GFX11: encoding: [0x05,0x22,0x66,0xd6,0xfd,0xd4,0x04,0x23]
v_dot2_f16_f16 v5.l, -src_scc, |vcc_lo|, -1
// GFX11: encoding: [0x05,0x02,0x66,0xd6,0xfd,0xd4,0x04,0x23]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you confirm whether the op_sel operand was incorrect here, raising a .h bit on a non-VGPR operand?

Copy link
Contributor Author

@broxigarchen broxigarchen Nov 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the old op_sel test line is selecting .h on the -1 operand, and the assembly does match it. So it does test the assembler, but it probably makes no sense for the instruction itself?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Raising a .h bit on a non-VGPR operand should be valid. We should keep the existing test cases (which requires adding .l to the dst operand, to match the op_sel operand).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like we are still missing tests that existed previously. Where is this test? "v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]"

As far as I could see, we didn't lose any tests in the other asm test files. Maybe it's just this one?

Copy link
Contributor Author

@broxigarchen broxigarchen Nov 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added op_sel back to this test, and to the gfx12 test as well.

I also checked the other files: when I manually added op_sel tests to the dpp tests, I didn't remove the op_sel syntax in cases where only VGPRs are set to 1. I think it does not hurt anything, so I will keep them there.

v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
v_dot2_f16_f16_e64_dpp v0.l, v1, v2, v3.l op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit puzzled with this change. Why do we test erroring out op_sel operands here and where do we test the [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] code for GFX11 now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think some op_sel test lines are missing, and we might have the same problem in #113603 as well. I will check and address these.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added back the [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] line, and also added some missing op_sel test lines.

llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s Outdated Show resolved Hide resolved
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt Outdated Show resolved Hide resolved
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt Outdated Show resolved Hide resolved
@broxigarchen broxigarchen force-pushed the main-merge-true16-vop3dot-mc branch 2 times, most recently from 26dd4b9 to 1572406 Compare November 8, 2024 03:13
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt Outdated Show resolved Hide resolved
v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]
// GFX11: encoding: [0x05,0x22,0x66,0xd6,0xfd,0xd4,0x04,0x23]
v_dot2_f16_f16 v5.l, -src_scc, |vcc_lo|, -1
// GFX11: encoding: [0x05,0x02,0x66,0xd6,0xfd,0xd4,0x04,0x23]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like we are still missing tests that existed previously. Where is this test? "v_dot2_f16_f16 v5, -src_scc, |vcc_lo|, -1 op_sel:[0,0,1,0]"

As far as I could see, we didn't lose any tests in the other asm test files. Maybe it's just this one?

Copy link
Contributor

@Sisyph Sisyph left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@broxigarchen
Copy link
Contributor Author

Squashed the commits to make it easier to bring up the downstream patch.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants