Skip to content

Commit

Permalink
AMDGPU: Add minimum3/maximum3 pkf16 for gfx950 encodings (#117601)
Browse files Browse the repository at this point in the history
  • Loading branch information
arsenm authored Nov 26, 2024
1 parent a5174de commit ae719f0
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 1 deletion.
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,12 @@ def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
>;

// Subtarget feature "minimum3-maximum3-pkf16": advertises the packed f16
// v_pk_minimum3_f16 / v_pk_maximum3_f16 instructions. Its backing subtarget
// field is HasMinimum3Maximum3PKF16, with feature value "true".
def FeatureMinimum3Maximum3PKF16 : SubtargetFeature<"minimum3-maximum3-pkf16",
"HasMinimum3Maximum3PKF16",
"true",
"Has v_pk_minimum3_f16 and v_pk_maximum3_f16 instructions"
>;

def FeatureSupportsXNACK : SubtargetFeature<"xnack-support",
"SupportsXNACK",
"true",
Expand Down Expand Up @@ -432,7 +438,8 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
FeatureFP4ConversionScaleInsts,
FeatureFP6BF6ConversionScaleInsts,
FeatureF16BF16ToFP6BF6ConversionScaleInsts,
FeatureMinimum3Maximum3F32
FeatureMinimum3Maximum3F32,
FeatureMinimum3Maximum3PKF16
]
>;

Expand Down Expand Up @@ -2146,6 +2153,10 @@ def HasMinimum3Maximum3F16 :
Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;

// Predicate guarding the packed f16 minimum3/maximum3 instructions. It pairs
// the C++ subtarget query Subtarget->hasMinimum3Maximum3PKF16() with the
// assembler-side requirement that FeatureMinimum3Maximum3PKF16 is enabled.
def HasMinimum3Maximum3PKF16 :
Predicate<"Subtarget->hasMinimum3Maximum3PKF16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3PKF16)>;


def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasAshrPkInsts = false;
bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
bool HasMinimum3Maximum3PKF16 = false;

bool RequiresCOV6 = false;

Expand Down Expand Up @@ -1348,6 +1349,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasMinimum3Maximum3F16;
}

/// \returns true if this subtarget has the packed f16 v_pk_minimum3_f16 and
/// v_pk_maximum3_f16 instructions (the "minimum3-maximum3-pkf16" feature).
bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; }

/// \returns The maximum number of instructions that can be enclosed in an
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
/// instruction.
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts

// Packed f16 minimum3/maximum3. Both definitions are gated on
// HasMinimum3Maximum3PKF16 and share the same VOP3P profile
// (VOP_V2F16_V2F16_V2F16_V2F16 -- presumably a v2f16 result with three v2f16
// sources; confirm against the profile definition).
// No selection pattern operator is passed to VOP3PInst here; the MC tests
// exercise these through the assembler and disassembler.
let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
} // End SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1

// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Expand Down Expand Up @@ -2050,6 +2055,9 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x10>;
defm V_PK_MIN_F16 : VOP3P_Real_vi <0x11>;
defm V_PK_MAX_F16 : VOP3P_Real_vi <0x12>;

// Real encodings for the packed f16 minimum3/maximum3 definitions
// (VOP3P opcodes 0x1b and 0x1c).
defm V_PK_MINIMUM3_F16 : VOP3P_Real_vi <0x1b>;
defm V_PK_MAXIMUM3_F16 : VOP3P_Real_vi <0x1c>;

defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x20>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x21>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x22>;
Expand Down
97 changes: 97 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_features.s
Original file line number Diff line number Diff line change
Expand Up @@ -1182,3 +1182,100 @@ v_maximum3_f32 v1, v2, s8, v3
// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_minimum3_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa8,0xd2,0x01,0x05,0x0e,0x04]
v_minimum3_f32 v0, v1, v2, v3


// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x40,0x9b,0xd3,0x02,0x07,0x12,0x1c]
v_pk_minimum3_f16 v1, v2, v3, v4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v1, v2, v3, 2.0 ; encoding: [0x01,0x40,0x9b,0xd3,0x02,0x07,0xd2,0x1b]
v_pk_minimum3_f16 v1, v2, v3, 2.0

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v1, v2, 2.0, v3 ; encoding: [0x01,0x40,0x9b,0xd3,0x02,0xe9,0x0d,0x1c]
v_pk_minimum3_f16 v1, v2, 2.0, v3

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v1, 2.0, v2, v3 ; encoding: [0x01,0x40,0x9b,0xd3,0xf4,0x04,0x0e,0x1c]
v_pk_minimum3_f16 v1, 2.0, v2, v3

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v1, v2, v3, v4 clamp ; encoding: [0x01,0xc0,0x9b,0xd3,0x02,0x07,0x12,0x1c]
v_pk_minimum3_f16 v1, v2, v3, v4 clamp

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v8, v0, s8, v1 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x11,0x04,0x1c]
v_pk_minimum3_f16 v8, v0, s8, v1

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v8, v0, v1, s8 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x03,0x22,0x18]
v_pk_minimum3_f16 v8, v0, v1, s8

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x01,0x04,0x1c]
v_pk_minimum3_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x01,0x04,0x1c]
v_pk_minimum3_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x01,0x04,0x1c]
v_pk_minimum3_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x9b,0xd3,0x00,0x01,0x04,0x04]
v_pk_minimum3_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x9b,0xd3,0x00,0x01,0x04,0x04]
v_pk_minimum3_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x40,0x9c,0xd3,0x02,0x07,0x12,0x1c]
v_pk_maximum3_f16 v1, v2, v3, v4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v1, v2, v3, 2.0 ; encoding: [0x01,0x40,0x9c,0xd3,0x02,0x07,0xd2,0x1b]
v_pk_maximum3_f16 v1, v2, v3, 2.0

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v1, v2, 2.0, v3 ; encoding: [0x01,0x40,0x9c,0xd3,0x02,0xe9,0x0d,0x1c]
v_pk_maximum3_f16 v1, v2, 2.0, v3

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v1, 2.0, v2, v3 ; encoding: [0x01,0x40,0x9c,0xd3,0xf4,0x04,0x0e,0x1c]
v_pk_maximum3_f16 v1, 2.0, v2, v3

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v1, v2, v3, v4 clamp ; encoding: [0x01,0xc0,0x9c,0xd3,0x02,0x07,0x12,0x1c]
v_pk_maximum3_f16 v1, v2, v3, v4 clamp

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v8, v0, s8, v1 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x11,0x04,0x1c]
v_pk_maximum3_f16 v8, v0, s8, v1

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v8, v0, v1, s8 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x03,0x22,0x18]
v_pk_maximum3_f16 v8, v0, v1, s8

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x01,0x04,0x1c]
v_pk_maximum3_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x01,0x04,0x1c]
v_pk_maximum3_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x01,0x04,0x1c]
v_pk_maximum3_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x9c,0xd3,0x00,0x01,0x04,0x04]
v_pk_maximum3_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0]

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x9c,0xd3,0x00,0x01,0x04,0x04]
v_pk_maximum3_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1]
6 changes: 6 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -386,3 +386,9 @@ v_minimum3_f32 v0, s1, s2, v3

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: literal operands are not supported
v_minimum3_f32 v0, v1, v2, 0xdeadbeef

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions)
v_pk_minimum3_f16 v0, s1, s2, v3

// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand (violates constant bus restrictions)
v_pk_maximum3_f16 v0, s1, s2, v3
61 changes: 61 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -881,3 +881,64 @@

# GFX950: v_minimum3_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa8,0xd2,0x01,0x05,0x0e,0x04]
0x00,0x00,0xa8,0xd2,0x01,0x05,0x0e,0x04


# GFX950: v_pk_maximum3_f16 v1, 2.0, v2, v3 ; encoding: [0x01,0x40,0x9c,0xd3,0xf4,0x04,0x0e,0x1c]
0x01,0x40,0x9c,0xd3,0xf4,0x04,0x0e,0x1c

# GFX950: v_pk_maximum3_f16 v1, v2, 2.0, v3 ; encoding: [0x01,0x40,0x9c,0xd3,0x02,0xe9,0x0d,0x1c]
0x01,0x40,0x9c,0xd3,0x02,0xe9,0x0d,0x1c

# GFX950: v_pk_maximum3_f16 v1, v2, v3, 2.0 ; encoding: [0x01,0x40,0x9c,0xd3,0x02,0x07,0xd2,0x1b]
0x01,0x40,0x9c,0xd3,0x02,0x07,0xd2,0x1b

# GFX950: v_pk_maximum3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x40,0x9c,0xd3,0x02,0x07,0x12,0x1c]
0x01,0x40,0x9c,0xd3,0x02,0x07,0x12,0x1c

# GFX950: v_pk_maximum3_f16 v1, v2, v3, v4 clamp ; encoding: [0x01,0xc0,0x9c,0xd3,0x02,0x07,0x12,0x1c]
0x01,0xc0,0x9c,0xd3,0x02,0x07,0x12,0x1c

# GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x01,0x04,0x1c]
0x08,0x40,0x9c,0xd3,0x00,0x01,0x04,0x1c

# GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x9c,0xd3,0x00,0x01,0x04,0x04]
0x08,0x60,0x9c,0xd3,0x00,0x01,0x04,0x04

# GFX950: v_pk_maximum3_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x9c,0xd3,0x00,0x01,0x04,0x04]
0x08,0x00,0x9c,0xd3,0x00,0x01,0x04,0x04

# GFX950: v_pk_maximum3_f16 v8, v0, s8, v1 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x11,0x04,0x1c]
0x08,0x40,0x9c,0xd3,0x00,0x11,0x04,0x1c

# GFX950: v_pk_maximum3_f16 v8, v0, v1, s8 ; encoding: [0x08,0x40,0x9c,0xd3,0x00,0x03,0x22,0x18]
0x08,0x40,0x9c,0xd3,0x00,0x03,0x22,0x18

# GFX950: v_pk_minimum3_f16 v1, 2.0, v2, v3 ; encoding: [0x01,0x40,0x9b,0xd3,0xf4,0x04,0x0e,0x1c]
0x01,0x40,0x9b,0xd3,0xf4,0x04,0x0e,0x1c

# GFX950: v_pk_minimum3_f16 v1, v2, 2.0, v3 ; encoding: [0x01,0x40,0x9b,0xd3,0x02,0xe9,0x0d,0x1c]
0x01,0x40,0x9b,0xd3,0x02,0xe9,0x0d,0x1c

# GFX950: v_pk_minimum3_f16 v1, v2, v3, 2.0 ; encoding: [0x01,0x40,0x9b,0xd3,0x02,0x07,0xd2,0x1b]
0x01,0x40,0x9b,0xd3,0x02,0x07,0xd2,0x1b

# GFX950: v_pk_minimum3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x40,0x9b,0xd3,0x02,0x07,0x12,0x1c]
0x01,0x40,0x9b,0xd3,0x02,0x07,0x12,0x1c

# GFX950: v_pk_minimum3_f16 v1, v2, v3, v4 clamp ; encoding: [0x01,0xc0,0x9b,0xd3,0x02,0x07,0x12,0x1c]
0x01,0xc0,0x9b,0xd3,0x02,0x07,0x12,0x1c

# GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x01,0x04,0x1c]
0x08,0x40,0x9b,0xd3,0x00,0x01,0x04,0x1c

# GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x9b,0xd3,0x00,0x01,0x04,0x04]
0x08,0x60,0x9b,0xd3,0x00,0x01,0x04,0x04

# GFX950: v_pk_minimum3_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x9b,0xd3,0x00,0x01,0x04,0x04]
0x08,0x00,0x9b,0xd3,0x00,0x01,0x04,0x04

# GFX950: v_pk_minimum3_f16 v8, v0, s8, v1 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x11,0x04,0x1c]
0x08,0x40,0x9b,0xd3,0x00,0x11,0x04,0x1c

# GFX950: v_pk_minimum3_f16 v8, v0, v1, s8 ; encoding: [0x08,0x40,0x9b,0xd3,0x00,0x03,0x22,0x18]
0x08,0x40,0x9b,0xd3,0x00,0x03,0x22,0x18

0 comments on commit ae719f0

Please sign in to comment.