From 39337ff2dc366fde83b07193b72c294a846c5959 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 2 Dec 2024 09:23:17 -0500 Subject: [PATCH] AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard (#117844) gfx950 SP changes doc says: No 4 clk forwarding on opcodes that convert from F32/F16->F8 or F32/F16->F4. Must insert a NOP or instruction writing some other destination VREG after a conversion to F4/F8 since it writes either low/high half or bytes. Co-authored-by: Pravin Jagtap Co-authored-by: Jeffrey Byrnes --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 14 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 10 + llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 5 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 11 +- llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir | 396 ++++++++++++++++++ .../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll | 2 + .../AMDGPU/materialize-frame-index-sgpr.ll | 6 +- 8 files changed, 435 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 4c37ef8855a5ba..ecf03b14143ee3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -909,8 +909,9 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { // There are three different types of instructions // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3 - // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and - // CVT_SR_BF8_F32 with op_sel[3:2] + // which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst + // (instructions with dest byte sel, e.g. 
CVT_SR_BF8_F32) with + // op_sel[3:2] // != 0 if (SIInstrInfo::isSDWA(MI)) { // Type 1: SDWA with dst_sel != DWORD @@ -918,8 +919,8 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { if (DstSel->getImm() == AMDGPU::SDWA::DWORD) return nullptr; } else { - // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and - // CVT_SR_BF8_F32 with op_sel[3:2] != 0) + // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst + // with op_sel[3:2] != 0) if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) || !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & SISrcMods::DST_OP_SEL || @@ -983,7 +984,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); } - if (ST.hasDstSelForwardingHazard()) { + if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) { const int Shift16DefWaitstates = 1; auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) { @@ -1094,7 +1095,8 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { // problematic thus far. 
// see checkVALUHazards() - if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard()) + if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() && + !ST.hasCvtScaleForwardingHazard()) return 0; const MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ea5e159fdd8363..5cecaf6349c883 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1264,6 +1264,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } + bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } + bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } bool requiresCodeObjectV6() const { return RequiresCOV6; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index ab5f0694c07f95..5a0e812748fbb7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -378,6 +378,14 @@ struct VOPTrue16Info { bool IsTrue16; }; +#define GET_FP8DstByteSelTable_DECL +#define GET_FP8DstByteSelTable_IMPL + +struct DPMACCInstructionInfo { + uint16_t Opcode; + bool IsDPMACCInstruction; +}; + struct FP8DstByteSelInfo { uint16_t Opcode; bool HasFP8DstByteSel; @@ -418,6 +426,8 @@ struct FP8DstByteSelInfo { #define GET_getMFMA_F8F6F4_WithSize_DECL #define GET_getMFMA_F8F6F4_WithSize_IMPL #define GET_isMFMA_F8F6F4Table_IMPL +#define GET_isCvtScaleF32_F32F16ToF8F4Table_IMPL + #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 9f7fbec6a542f7..ea497d7b239d7e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -103,6 +103,10 @@ struct MFMA_F8F6F4_Info { uint8_t 
NumRegsSrcB; }; +struct CvtScaleF32_F32F16ToF8F4_Info { + unsigned Opcode; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -112,6 +116,7 @@ struct MFMA_F8F6F4_Info { #define GET_MAIInstInfoTable_DECL #define GET_MAIInstInfoTable_DECL #define GET_isMFMA_F8F6F4Table_DECL +#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index c8c36714909adf..1160975f3302a9 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -970,11 +970,16 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile : VOP3_Profil let HasOMod = 0; } +class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile

{ + let HasFP8DstByteSel = 1; +} + class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile

{ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0, Int32InputMods:$src1_modifiers, Src1RC64:$src1, FP32InputMods:$src2_modifiers, Src2RC64:$src2, VGPR_32:$vdst_in, op_sel0:$op_sel); + let HasFP8DstByteSel = 1; } @@ -992,6 +997,7 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile : VOP3_Profile< HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); let HasExtVOP3DPP = 0; + let HasFP8DstByteSel = 1; } class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile : @@ -1004,6 +1010,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile : let HasExtVOP3DPP = 0; let HasOpSel = 1; let HasOMod = 0; + let HasFP8DstByteSel = 1; } def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile, VOP3_OPSEL> { @@ -1015,6 +1022,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile : VOP3_Profile, @@ -1090,7 +1098,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in { defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile>; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { - defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile>; + defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile>; let Constraints = "@earlyclobber $vdst" in { defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile>; defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile>; @@ -2047,6 +2055,7 @@ multiclass VOP3_Real_BITOP3_gfx9 op, string AsmName, bit isSingle = 0> } } } + } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; diff --git 
a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 75834316750951..6a25e346c89447 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -255,3 +255,399 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... + + +--- +# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_bf16_hazard +# GCN: V_CVT_SCALEF32_SR_FP8_BF16_e64 +# GCN: GLOBAL_STORE_DWORD +name: test_cvt_scalef32_sr_fp8_bf16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f16_hazard +# GCN: V_CVT_SCALEF32_SR_FP8_F16_e64 +# GCN: GLOBAL_STORE_DWORD +name: test_cvt_scalef32_sr_fp8_f16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr5, 0, 0, implicit $exec + S_ENDPGM 0 +... 
+ +--- +# GCN-LABEL: name: test_cvt_scalef32_sr_fp8_f32_hazard +# GCN: V_CVT_SCALEF32_SR_FP8_F32_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_sr_fp8_f32_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + renamable $vgpr5 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr5 = V_CVT_SCALEF32_SR_FP8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr5, 0, implicit $mode, implicit $exec + renamable $vgpr2 = V_ADD_U32_e32 4, killed $vgpr5, implicit $exec + GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_hazard +# GCN: V_CVT_SCALEF32_PK_FP8_F32_e64 +# GCN: S_NOP 0 +# GCN: V_PK_ADD_U16 +name: test_cvt_scalef32_pk_fp8_f32_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + S_WAITCNT 0 + renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_hazard +# GCN: V_CVT_SCALEF32_PK_FP8_F16_e64 +# GCN: S_NOP 0 +# GCN: V_PK_ADD_U16 +name: test_cvt_scalef32_pk_fp8_f16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + S_WAITCNT 0 + renamable $vgpr0 = V_CVT_SCALEF32_PK_FP8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +# GCN-LABEL: test_cvt_scalef32_sr_bf8_bf16_hazard +# GCN: V_CVT_SCALEF32_SR_BF8_BF16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_sr_bf8_bf16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_sr_bf8_f16_hazard +# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_sr_bf8_f16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_sr_bf8_f32_hazard +# GCN: V_CVT_SCALEF32_SR_BF8_F32_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_sr_bf8_f32_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F32_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +# GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_hazard +# GCN: V_CVT_SCALEF32_PK_BF8_F32_e64 +# GCN: S_NOP 0 +# GCN: V_PK_ADD_U16 +name: test_cvt_scalef32_pk_bf8_f32_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + S_WAITCNT 0 + renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 0, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_hazard +# GCN: V_CVT_SCALEF32_PK_BF8_F16_e64 +# GCN: S_NOP 0 +# GCN: V_PK_ADD_U16 +name: test_cvt_scalef32_pk_bf8_f16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + S_WAITCNT 0 + renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_F16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_hazard +# GCN: V_CVT_SCALEF32_PK_BF8_BF16_e64 +# GCN: S_NOP 0 +# GCN: V_PK_ADD_U16 +name: test_cvt_scalef32_pk_bf8_bf16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + S_WAITCNT 0 + renamable $vgpr0 = V_CVT_SCALEF32_PK_BF8_BF16_e64 8, killed $vgpr1, 0, killed $vgpr2, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_PK_ADD_U16 8, killed $vgpr0, 8, $vgpr0, 0, 0, 0, 0, 0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +# GCN-LABEL: test_cvt_scale_fp4_f32_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_F32_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scale_fp4_f32_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + S_WAITCNT 0 + renamable $vgpr1 = V_AND_B32_e32 2147483647, killed $vgpr1, implicit $exec + renamable $vgpr2 = V_XOR_B32_e32 -2147483648, killed $vgpr2, implicit $exec + renamable $vgpr0 = V_CVT_SCALEF32_PK_FP4_F32_e64 8, killed $vgpr1, 0, killed $vgpr2, 4, killed $vgpr3, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_scalef32_sr_pk_fp4_f16_hazard +# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_scalef32_sr_pk_fp4_f16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +# GCN-LABEL: test_scalef32_sr_pk_fp4_bf16_hazard +# GCN: V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_scalef32_sr_pk_fp4_bf16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_hazard +# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_scalef32_sr_pk_fp4_f32_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 8, killed $vgpr2_vgpr3, 0, killed $vgpr4, 4, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_fp4_f16_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_fp4_f16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +# GCN-LABEL: test_cvt_scalef32_fp4_bf16_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_BF16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_fp4_bf16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_BF16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr2, $vgpr2, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_hazard_skipping_over_meta_instr +# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_hazard_skipping_over_meta_instr +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + $vgpr4 = KILL + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_f16_to_fp4_to_f16_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 +# GCN: S_NOP 0 +# GCN: V_CVT_SCALEF32_PK_F16_FP4_e64 +# GCN: S_SETPC_B64_return +name: test_cvt_f16_to_fp4_to_f16_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + renamable $vgpr0 = V_CVT_SCALEF32_PK_F16_FP4_e64 4, killed $vgpr2, 0, killed $vgpr1, 0, implicit $mode, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +# GCN-LABEL: test_cvt_scalef32_hazard_pseudo +# GCN: V_CVT_SCALEF32_SR_BF8_F16_e64 +# GCN: S_NOP 0 +# GCN: V_ADD_U32_e32 +name: test_cvt_scalef32_hazard_pseudo +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + S_WAITCNT 0 + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + renamable $vgpr0 = V_CVT_SCALEF32_SR_BF8_F16_e64 8, killed $vgpr2, 0, killed $vgpr3, 4, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + WAVE_BARRIER + renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr0, $vgpr0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... + +--- +# GCN-LABEL: test_call_consuming_cvt_scalef32_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 +# GCN: SI_CALL +name: test_call_consuming_cvt_scalef32_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + BUNDLE implicit-def $sgpr0_sgpr1, implicit-def $sgpr0, implicit-def $sgpr0_lo16, implicit-def $sgpr0_hi16, implicit-def $sgpr1, implicit-def $sgpr1_lo16, implicit-def $sgpr1_hi16, implicit-def $scc { + $sgpr0_sgpr1 = S_GETPC_B64 + $sgpr0 = S_ADD_U32 internal $sgpr0, target-flags(amdgpu-gotprel32-lo) @test_cvt_scalef32_hazard_pseudo + 4, implicit-def $scc + $sgpr1 = S_ADDC_U32 internal $sgpr1, target-flags(amdgpu-gotprel32-hi) @test_cvt_scalef32_hazard_pseudo + 12, implicit-def $scc, implicit internal $scc + } + renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 0, 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr0_sgpr1, @test_cvt_scalef32_hazard_pseudo, csr_amdgpu_gfx90ainsts, implicit undef $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit undef $sgpr10_sgpr11, implicit undef $sgpr12, implicit undef $sgpr13, implicit undef $sgpr14, implicit-def $sgpr15, implicit undef $vgpr31, implicit killed 
$vgpr2, implicit-def $vgpr2 + SI_RETURN_TO_EPILOG killed $vgpr0 +... + +--- +# GCN-LABEL: test_cvt_scalef32_inlineasm_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 +# GCN: S_NOP 0 +# GCN: INLINEASM +name: test_cvt_scalef32_inlineasm_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, killed renamable $vgpr2 + S_SETPC_B64_return undef $sgpr30_sgpr31 +... + +--- +# GCN-LABEL: test_cvt_scale_cvt_scale_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 +# GCN: S_NOP 0 +# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64 +# GCN: S_NOP 0 +# GCN: S_SETPC_B64_return +name: test_cvt_scale_cvt_scale_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... 
+ +--- +# GCN-LABEL: test_cvt_scale_cvt_scale_waw_hazard +# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64 +# GCN: S_NOP 0 +# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64 +# GCN: S_SETPC_B64_return +name: test_cvt_scale_cvt_scale_waw_hazard +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + S_WAITCNT 0 + renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec + early-clobber renamable $vgpr2 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 8, killed $vgpr0, 0, killed $vgpr3, 4, killed $vgpr1, killed $vgpr1, 0, implicit $mode, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index f80f2935856e36..046a72b9307d09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -1305,6 +1305,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0] +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 1) @@ -1351,6 +1352,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0] +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 
d4110850f32066..7646197f13175b 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX900 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck -check-prefixes=GFX940 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10_1 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10_3 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s @@ -2249,5 +2249,3 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; are reserved at the end for xnack + vcc). attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX9: {{.*}}