Skip to content

Commit

Permalink
AMDGPU: Define new sched model for gfx950
Browse files Browse the repository at this point in the history
A few instructions changed rate.
  • Loading branch information
arsenm committed Nov 22, 2024
1 parent 549b571 commit d203e5d
Show file tree
Hide file tree
Showing 7 changed files with 2,398 additions and 1,222 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/GCNProcessors.td
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_4_2.Features
>;

def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel,
def : ProcessorModel<"gfx950", SIDPGFX950FullSpeedModel,
FeatureISAVersion9_5_0.Features
>;

Expand Down
63 changes: 63 additions & 0 deletions llvm/lib/Target/AMDGPU/SISchedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def Write8PassMAI : SchedWrite;
def Write16PassMAI : SchedWrite;
def Write4PassDGEMM : SchedWrite;
def Write8PassDGEMM : SchedWrite;
def Write16PassDGEMM : SchedWrite;

// Scalar float instructions
def WriteSFPU : SchedWrite;
Expand Down Expand Up @@ -94,6 +95,7 @@ def SIFullSpeedModel : SISchedMachineModel;
def SIQuarterSpeedModel : SISchedMachineModel;
def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def SIDPGFX950FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
def GFX12SpeedModel : SISchedMachineModel;
Expand Down Expand Up @@ -169,6 +171,8 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
let ReleaseAtCycles = [8] in
def : HWVALUWriteRes<Write8PassDGEMM, 8>;
let ReleaseAtCycles = [16] in
def : HWVALUWriteRes<Write16PassDGEMM, 16>;

let ReleaseAtCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
Expand Down Expand Up @@ -201,6 +205,13 @@ def WriteCopy : SchedWriteVariant<[
SchedVar<PredIsVGPR64Copy, [Write64Bit]>,
SchedVar<NoSchedPred, [WriteSALU]>]>;

// Check if any matrix inputs are interpreted as f8 in an f8f6f4 mfma
// instruction.
//
// The predicate reads the immediate values of the instruction's `cbsz` and
// `blgp` named operands and holds when either one is less than or equal to
// AMDGPU::MFMAScaleFormats::FP8_E5M2.
//
// NOTE(review): this relies on every f8 encoding in MFMAScaleFormats being
// numbered at or below FP8_E5M2, and on both named operands being present on
// every instruction the predicate is evaluated for (the getNamedOperand()
// result is dereferenced without a null check) -- confirm against the enum
// definition and the instructions that use this predicate.
def PredIsF8_MFMA_SCALE : SchedPredicate<[{
TII->getNamedOperand(*MI, AMDGPU::OpName::cbsz)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2 ||
TII->getNamedOperand(*MI, AMDGPU::OpName::blgp)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2
}]>;

let SchedModel = SIFullSpeedModel in {

defm : SICommonWriteRes;
Expand Down Expand Up @@ -299,6 +310,58 @@ def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>;

} // End SchedModel = SIDPGFX940FullSpeedModel


// Scheduling model for gfx950. Every record in this block is attached to
// SIDPGFX950FullSpeedModel through the enclosing `let SchedModel`.
//
// NOTE(review): the layout appears to follow the SIDPGFX940FullSpeedModel
// block, with different pass counts for some MFMA shapes -- compare the two
// blocks side by side when changing either.
let SchedModel = SIDPGFX950FullSpeedModel in {
// Instantiate the write resources shared by the SI scheduling models.
defm : SICommonWriteRes;

// Per-model VALU write resources.
// NOTE(review): the integer argument is presumably the latency in cycles --
// confirm against the HWVALUWriteRes class definition.
def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 1>;
def : HWVALUWriteRes<WriteDoubleAdd, 1>;
def : HWVALUWriteRes<WriteDoubleCvt, 1>;
def : HWVALUWriteRes<WriteTrans64, 4>;
def : HWVALUWriteRes<WriteIntMul, 1>;
def : HWVALUWriteRes<Write64Bit, 1>;

// Per-instruction overrides. Each InstRW binds the listed SchedWrite /
// SchedRead records to the instructions selected by name. All patterns are
// anchored at the start of the opcode name with `^`; the `.` in `.32` / `.64`
// matches any single character in that position.
def : InstRW<[WriteCopy], (instrs COPY)>;
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;

// 16x16 MFMA: names continuing with 8X, 16, 32 or 64 use Write4PassMAI; names
// continuing with 1 or 4 followed by F, B or I use Write8PassMAI.
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>;
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>;
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>;
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X64")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>;

// 32x32 MFMA: names continuing with 4XF, 8, 16 or 32_ use Write8PassMAI; names
// continuing with 1, 2 or 4 followed by F, B or I use Write16PassMAI.
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X32_")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>;

// 64-bit MFMA (DGEMM writes): the 4x4 shape uses Write4PassDGEMM and the
// 16x16 shape uses Write16PassDGEMM.
def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;
def : InstRW<[Write16PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>;

// SMFMAC: every 16x16 shape uses Write4PassMAI and every 32x32 shape uses
// Write8PassMAI.
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>;
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>;


// If either matrix format is f8, the instruction takes 2x as many
// cycles. TODO: This isn't reflected in MCA.
//
// Each variant selects the larger pass count when PredIsF8_MFMA_SCALE holds
// and falls back to the smaller one through NoSchedPred otherwise.
def WriteMFMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[
SchedVar<PredIsF8_MFMA_SCALE, [Write8PassMAI]>,
SchedVar<NoSchedPred, [Write4PassMAI]>]>;
def WriteMFMAScale_32X32X64_F8F6F4 : SchedWriteVariant<[
SchedVar<PredIsF8_MFMA_SCALE, [Write16PassMAI]>,
SchedVar<NoSchedPred, [Write8PassMAI]>]>;

// The optional `(_SCALE)?` group lets one pattern cover both the V_MFMA_ and
// the V_MFMA_SCALE_ spelling of each F8F6F4 opcode.
def : InstRW<[WriteMFMAScale_16X16X128_F8F6F4, MIMFMARead],
(instregex "^V_MFMA(_SCALE)?_.32_16X16X128_F8F6F4")>;
def : InstRW<[WriteMFMAScale_32X32X64_F8F6F4, MIMFMARead],
(instregex "^V_MFMA(_SCALE)?_.32_32X32X64_F8F6F4")>;

} // End SchedModel = SIDPGFX950FullSpeedModel


let SchedModel = GFX10SpeedModel in {

// The latency values are 1 / (operations / cycle).
Expand Down
Loading

0 comments on commit d203e5d

Please sign in to comment.