Skip to content

Commit

Permalink
AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard (#117844)
Browse files Browse the repository at this point in the history
gfx950 SP changes doc says:
No 4 clk forwarding on opcodes that convert from
F32/F16->F8 or F32/F16->F4. Must insert a NOP or
instruction writing some other destination VREG
after a conversion to F4/F8 since it writes either
low/high half or bytes.

Co-authored-by: Pravin Jagtap <[email protected]>
Co-authored-by: Jeffrey Byrnes <[email protected]>
  • Loading branch information
3 people authored Dec 2, 2024
1 parent 61a58fc commit 39337ff
Show file tree
Hide file tree
Showing 8 changed files with 435 additions and 11 deletions.
14 changes: 8 additions & 6 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -909,17 +909,18 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {

// There are three different types of instructions
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
// which write hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and
// CVT_SR_BF8_F32 with op_sel[3:2]
// which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
// (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
// op_sel[3:2]
// != 0
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return nullptr;
} else {
// Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
// CVT_SR_BF8_F32 with op_sel[3:2] != 0)
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
// with op_sel[3:2] != 0)
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
SISrcMods::DST_OP_SEL ||
Expand Down Expand Up @@ -983,7 +984,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}

if (ST.hasDstSelForwardingHazard()) {
if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
const int Shift16DefWaitstates = 1;

auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
Expand Down Expand Up @@ -1094,7 +1095,8 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
// problematic thus far.

// see checkVALUHazards()
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
!ST.hasCvtScaleForwardingHazard())
return 0;

const MachineRegisterInfo &MRI = MF.getRegInfo();
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1264,6 +1264,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

/// \returns the subtarget's HasVALUTransUseHazard flag.
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }

/// \returns true if the subtarget has the cvt_scale forwarding hazard, i.e.
/// whenever GFX950Insts is set. On gfx950 there is no 4 clk forwarding for
/// opcodes that convert F32/F16 -> F8 or F32/F16 -> F4, because they write
/// only the low/high half or individual bytes of the destination; a NOP or
/// an instruction writing a different destination VGPR must follow them.
bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }

/// \returns the subtarget's HasForceStoreSC0SC1 flag.
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }

/// \returns the subtarget's RequiresCOV6 flag.
bool requiresCodeObjectV6() const { return RequiresCOV6; }
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,14 @@ struct VOPTrue16Info {
bool IsTrue16;
};

#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL

// Record pairing an instruction opcode with a flag saying whether it is a
// DPMACC instruction.
// NOTE(review): presumably the row type of a TableGen-generated searchable
// table pulled in from AMDGPUGenSearchableTables.inc -- confirm against the
// corresponding GET_*_DECL/IMPL macros.
struct DPMACCInstructionInfo {
uint16_t Opcode;
bool IsDPMACCInstruction;
};

struct FP8DstByteSelInfo {
uint16_t Opcode;
bool HasFP8DstByteSel;
Expand Down Expand Up @@ -418,6 +426,8 @@ struct FP8DstByteSelInfo {
#define GET_getMFMA_F8F6F4_WithSize_DECL
#define GET_getMFMA_F8F6F4_WithSize_IMPL
#define GET_isMFMA_F8F6F4Table_IMPL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_IMPL

#include "AMDGPUGenSearchableTables.inc"

int getMTBUFBaseOpcode(unsigned Opc) {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ struct MFMA_F8F6F4_Info {
uint8_t NumRegsSrcB;
};

// Record holding the opcode of a cvt_scale instruction that converts
// F32/F16 to F8/F4.
// NOTE(review): presumably the row type of the isCvtScaleF32_F32F16ToF8F4
// searchable table enabled by GET_isCvtScaleF32_F32F16ToF8F4Table_DECL, so
// the field layout must match what TableGen emits -- confirm.
struct CvtScaleF32_F32F16ToF8F4_Info {
unsigned Opcode;
};

#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
Expand All @@ -112,6 +116,7 @@ struct MFMA_F8F6F4_Info {
#define GET_MAIInstInfoTable_DECL
#define GET_MAIInstInfoTable_DECL
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#include "AMDGPUGenSearchableTables.inc"

namespace IsaInfo {
Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -970,11 +970,16 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profil
let HasOMod = 0;
}

// Same as VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile, but additionally
// sets HasFP8DstByteSel. Used by V_CVT_SCALEF32_PK_FP4_F32.
// NOTE(review): setting HasFP8DstByteSel presumably makes the instruction
// count as an "FP8DstSelInst" for the dst-sel forwarding hazard check in
// GCNHazardRecognizer -- confirm.
class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
let HasFP8DstByteSel = 1;
}

// Extends VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile with a three-source
// op_sel operand list (src0/src1/src2 with modifiers, the tied $vdst_in, and
// $op_sel) and sets HasFP8DstByteSel.
// NOTE(review): setting HasFP8DstByteSel presumably makes the instruction
// count as an "FP8DstSelInst" for the dst-sel forwarding hazard check in
// GCNHazardRecognizer -- confirm.
class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
FP32InputMods:$src2_modifiers, Src2RC64:$src2,
VGPR_32:$vdst_in, op_sel0:$op_sel);
let HasFP8DstByteSel = 1;
}


Expand All @@ -992,6 +997,7 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
HasSrc0FloatMods, HasSrc1FloatMods,
HasSrc2FloatMods>.ret);
let HasExtVOP3DPP = 0;
let HasFP8DstByteSel = 1;
}

class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
Expand All @@ -1004,6 +1010,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
}

def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
Expand All @@ -1015,6 +1022,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
}

class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
Expand Down Expand Up @@ -1090,7 +1098,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
Expand Down Expand Up @@ -2047,6 +2055,7 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
}

} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"

defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
Expand Down
Loading

0 comments on commit 39337ff

Please sign in to comment.