Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard #117844

Merged
merged 1 commit into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -909,17 +909,18 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {

// There are three different types of instructions
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
// which write hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and
// CVT_SR_BF8_F32 with op_sel[3:2]
// which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
// (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
// op_sel[3:2]
// != 0
if (SIInstrInfo::isSDWA(MI)) {
// Type 1: SDWA with dst_sel != DWORD
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
return nullptr;
} else {
// Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
// CVT_SR_BF8_F32 with op_sel[3:2] != 0)
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
// with op_sel[3:2] != 0)
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
SISrcMods::DST_OP_SEL ||
Expand Down Expand Up @@ -983,7 +984,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
}

if (ST.hasDstSelForwardingHazard()) {
if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
const int Shift16DefWaitstates = 1;

auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
Expand Down Expand Up @@ -1094,7 +1095,8 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
// problematic thus far.

// see checkVALUHazards()
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
!ST.hasCvtScaleForwardingHazard())
return 0;

const MachineRegisterInfo &MRI = MF.getRegInfo();
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1264,6 +1264,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,

/// \returns the value of the HasVALUTransUseHazard member.
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }

/// \returns true if the cvt_scale forwarding hazard applies. There is no
/// dedicated flag for it: the result is exactly the value of GFX950Insts.
bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }

/// \returns the value of the HasForceStoreSC0SC1 member.
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }

/// \returns the value of the RequiresCOV6 member.
bool requiresCodeObjectV6() const { return RequiresCOV6; }
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,14 @@ struct VOPTrue16Info {
bool IsTrue16;
};

#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL

/// Plain record pairing an instruction opcode with a DPMACC flag.
///
/// NOTE(review): presumably the row type of a generated searchable table
/// (AMDGPUGenSearchableTables.inc is included further down in this file) --
/// confirm against the matching TableGen definition before reordering or
/// retyping the fields.
struct DPMACCInstructionInfo {
  /// Opcode this record describes.
  uint16_t Opcode;

  /// Flag stored alongside the opcode; by its name, marks the opcode as a
  /// DPMACC instruction.
  bool IsDPMACCInstruction;
};

struct FP8DstByteSelInfo {
uint16_t Opcode;
bool HasFP8DstByteSel;
Expand Down Expand Up @@ -418,6 +426,8 @@ struct FP8DstByteSelInfo {
#define GET_getMFMA_F8F6F4_WithSize_DECL
#define GET_getMFMA_F8F6F4_WithSize_IMPL
#define GET_isMFMA_F8F6F4Table_IMPL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_IMPL

#include "AMDGPUGenSearchableTables.inc"

int getMTBUFBaseOpcode(unsigned Opc) {
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ struct MFMA_F8F6F4_Info {
uint8_t NumRegsSrcB;
};

/// Single-field record holding the opcode of one instruction.
///
/// NOTE(review): presumably the row type behind the
/// isCvtScaleF32_F32F16ToF8F4Table searchable table whose _DECL macro is
/// defined below -- confirm against the TableGen side.
struct CvtScaleF32_F32F16ToF8F4_Info {
  /// Opcode of the instruction this entry stands for.
  unsigned Opcode;
};

#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
Expand All @@ -112,6 +116,7 @@ struct MFMA_F8F6F4_Info {
#define GET_MAIInstInfoTable_DECL
#define GET_MAIInstInfoTable_DECL
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
#include "AMDGPUGenSearchableTables.inc"

namespace IsaInfo {
Expand Down
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -970,11 +970,16 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profil
let HasOMod = 0;
}

// Same as VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P>, except that it
// additionally sets HasFP8DstByteSel. Keeping this as a separate subclass
// leaves the base profile's own HasFP8DstByteSel setting untouched for its
// other users.
class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
let HasFP8DstByteSel = 1;
}

// Derives from VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> and
// overrides two things:
//  * InsVOP3OpSel: three (modifiers, source) operand pairs -- src0 and src2
//    use FP32InputMods, src1 uses Int32InputMods -- followed by the
//    VGPR_32 $vdst_in operand and $op_sel.
//  * HasFP8DstByteSel is set to 1.
class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
FP32InputMods:$src2_modifiers, Src2RC64:$src2,
VGPR_32:$vdst_in, op_sel0:$op_sel);
let HasFP8DstByteSel = 1;
}


Expand All @@ -992,6 +997,7 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
HasSrc0FloatMods, HasSrc1FloatMods,
HasSrc2FloatMods>.ret);
let HasExtVOP3DPP = 0;
let HasFP8DstByteSel = 1;
}

class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
Expand All @@ -1004,6 +1010,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
}

def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
Expand All @@ -1015,6 +1022,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
let HasFP8DstByteSel = 1;
}

class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
Expand Down Expand Up @@ -1090,7 +1098,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
Expand Down Expand Up @@ -2047,6 +2055,7 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
}
}
}

} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"

defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
Expand Down
Loading