From f3bd6761caca3e6c5cc106cdb4654407c766aca7 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 25 Jul 2024 15:49:58 -0400 Subject: [PATCH] AMDGPU: Add support for V_CVT_PK_F16_F32 instruction for gfx950 Co-authored-by: Shilei Tian --- llvm/lib/Target/AMDGPU/AMDGPU.td | 12 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 11 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 18 + llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 313 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx950_asm_features.s | 24 ++ .../Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 18 + 8 files changed, 398 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 5978f5b0bbae5f..ebe6f0965deb97 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -432,6 +432,12 @@ def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts", "Has Arithmetic Shift Pack instructions" >; +def FeatureCvtPkF16F32Inst : SubtargetFeature<"cvt-pk-f16-f32-inst", + "HasCvtPkF16F32Inst", + "true", + "Has cvt_pk_f16_f32 instruction" +>; + def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", "GFX950Insts", "true", @@ -445,8 +451,9 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", FeatureFP6BF6ConversionScaleInsts, FeatureF16BF16ToFP6BF6ConversionScaleInsts, FeatureF32ToF16BF16ConversionSRInsts, + FeatureCvtPkF16F32Inst, FeatureMinimum3Maximum3F32, - FeatureMinimum3Maximum3PKF16 + FeatureMinimum3Maximum3PKF16, ] >; @@ -2510,6 +2517,9 @@ def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionSca def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">, AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>; +def HasCvtPkF16F32Inst : Predicate<"Subtarget->hasCvtPkF16F32Inst()">, + AssemblerPredicate<(all_of FeatureCvtPkF16F32Inst)>; + def 
HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">, AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9bf1f281c32a09..2e66f7525b9ccf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1040,10 +1040,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .lower(); } - getActionDefinitionsBuilder(G_FPTRUNC) - .legalFor({{S32, S64}, {S16, S32}}) - .scalarize(0) - .lower(); + auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC); + if (ST.hasCvtPkF16F32Inst()) + FPTruncActions.legalFor( + {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}}); + else + FPTruncActions.legalFor({{S32, S64}, {S16, S32}}); + FPTruncActions.scalarize(0).lower(); getActionDefinitionsBuilder(G_FPEXT) .legalFor({{S64, S32}, {S32, S16}}) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index c5c951b58b8d6d..7701fef5365841 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -55,6 +55,7 @@ class AMDGPUSubtarget { bool HasFP4ConversionScaleInsts = false; bool HasFP6BF6ConversionScaleInsts = false; bool HasF16BF16ToFP6BF6ConversionScaleInsts = false; + bool HasCvtPkF16F32Inst = false; bool HasF32ToF16BF16ConversionSRInsts = false; bool EnableRealTrue16Insts = false; bool HasBF16ConversionInsts = false; @@ -191,6 +192,8 @@ class AMDGPUSubtarget { bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; } + bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; } + bool hasF32ToF16BF16ConversionSRInsts() const { return HasF32ToF16BF16ConversionSRInsts; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a212a9218ca0db..70230b5abc5171 
100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -902,6 +902,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasCvtPkF16F32Inst()) { + setOperationAction(ISD::FP_ROUND, MVT::v2f16, Legal); + } + setTargetDAGCombine({ISD::ADD, ISD::UADDO_CARRY, ISD::SUB, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index f93535ddee25ba..065abde62af8ad 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1147,6 +1147,21 @@ let SubtargetPredicate = HasGFX950Insts, mayRaiseFPException = 0 in { defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_2xpk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile, int_amdgcn_cvt_scalef32_2xpk16_bf6_f32>; } +let SubtargetPredicate = HasCvtPkF16F32Inst in { + let ReadsModeReg = 0 in { + defm V_CVT_PK_F16_F32 : VOP3Inst<"v_cvt_pk_f16_f32", VOP3_Profile<VOP_V2F16_F32_F32>>; + } + + def : GCNPat<(v2f16 (fpround v2f32:$src)), + (V_CVT_PK_F16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>; + def : GCNPat<(v2f16 (fpround v2f64:$src)), + (V_CVT_PK_F16_F32_e64 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub0_sub1)), + 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub2_sub3)))>; + def : GCNPat<(v2f16 (build_vector (f16 (fpround (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (f16 (fpround (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))), + (V_CVT_PK_F16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>; +} + class Cvt_Scale_FP4FP8BF8ToF16F32_Pat : GCNPat< (DstTy (node i32:$src0, f32:$src1, timm:$index)), (inst (SrcAndDstSelToOpSelXForm_0_0 $index), $src0, (SrcAndDstSelToOpSelXForm_1_0 $index), $src1) @@ -2274,6 +2289,9 @@ defm V_CVT_SR_BF16_F32: VOP3OpSel_Real_gfx9 <0x2a7>; defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>; defm
V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>; +let OtherPredicates = [HasCvtPkF16F32Inst] in { +defm V_CVT_PK_F16_F32 : VOP3_Real_gfx9<0x267, "v_cvt_pk_f16_f32">; +} defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">; defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">; diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 0005f1179a5d28..188b2ada646869 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -5,6 +5,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global 
-denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL %s @@ -99,6 +101,36 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fptrunc_f32_to_f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_f32_to_f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -234,6 +266,38 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-GISEL-NEXT: 
buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fptrunc_f64_to_f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_f64_to_f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -382,6 +446,37 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fptrunc_v2f32_to_v2f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1 +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_v2f32_to_v2f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1 +; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -543,6 +638,42 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 
v0, v0, v2 +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v2 +; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -685,6 +816,36 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fneg_fptrunc_f32_to_f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fneg_fptrunc_f32_to_f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -815,6 +976,36 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fabs_fptrunc_f32_to_f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fabs_fptrunc_f32_to_f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -945,6 
+1136,36 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0| +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2| +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1076,6 +1297,36 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1211,6 +1462,36 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; 
GFX950-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -1353,6 +1634,38 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 
+; GFX950-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; ; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s index 490dbb4cd97fef..389b17296c0457 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s @@ -1551,3 +1551,27 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 // NOT-GFX950: error: instruction not supported on this GPU // GFX950: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 ; encoding: [0x00,0x00,0x54,0xd2,0x06,0x4d,0x9e,0x04] v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 + +// NOT-GFX950: error: instruction not supported on this GPU +// GFX950: v_cvt_pk_f16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x67,0xd2,0x01,0x05,0x02,0x00] +v_cvt_pk_f16_f32 v5, v1, v2 + +// NOT-GFX950: error: instruction not supported on this GPU +// GFX950: v_cvt_pk_f16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x67,0xd2,0xff,0xff,0x03,0x00] +v_cvt_pk_f16_f32 v5, v255, v255 + +// NOT-GFX950: error: instruction not supported on this GPU +// GFX950: v_cvt_pk_f16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x67,0xd2,0x7c,0xe0,0x01,0x00] +v_cvt_pk_f16_f32 v5, m0, 0.5 + +// NOT-GFX950: error: instruction not supported on this GPU +// GFX950: v_cvt_pk_f16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x67,0xd2,0x7e,0x82,0x01,0x00] +v_cvt_pk_f16_f32 v5, exec_lo, -1 + +// NOT-GFX950: error: instruction not supported on this GPU +// GFX950: v_cvt_pk_f16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x67,0xd2,0xc1,0xfe,0x00,0x00] +v_cvt_pk_f16_f32 v5, -1, exec_hi + +// NOT-GFX950: error: instruction not supported on this GPU +// GFX950: v_cvt_pk_f16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x67,0xd2,0xf0,0xf8,0x00,0x08] +v_cvt_pk_f16_f32 v5, 0.5, m0 mul:2 diff 
--git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt index f86d4325d63e58..7cd97ac87057e7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt @@ -1146,3 +1146,21 @@ # GFX950: v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 ; encoding: [0x00,0x00,0x54,0xd2,0x06,0x4d,0x9e,0x04] 0x00,0x00,0x54,0xd2,0x06,0x4d,0x9e,0x04 + +# GFX950: v_cvt_pk_f16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x67,0xd2,0x01,0x05,0x02,0x00] +0x05,0x00,0x67,0xd2,0x01,0x05,0x02,0x00 + +# GFX950: v_cvt_pk_f16_f32 v5, v255, v255 ; encoding: [0x05,0x00,0x67,0xd2,0xff,0xff,0x03,0x00] +0x05,0x00,0x67,0xd2,0xff,0xff,0x03,0x00 + +# GFX950: v_cvt_pk_f16_f32 v5, m0, 0.5 ; encoding: [0x05,0x00,0x67,0xd2,0x7c,0xe0,0x01,0x00] +0x05,0x00,0x67,0xd2,0x7c,0xe0,0x01,0x00 + +# GFX950: v_cvt_pk_f16_f32 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x67,0xd2,0x7e,0x82,0x01,0x00] +0x05,0x00,0x67,0xd2,0x7e,0x82,0x01,0x00 + +# GFX950: v_cvt_pk_f16_f32 v5, -1, exec_hi ; encoding: [0x05,0x00,0x67,0xd2,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x67,0xd2,0xc1,0xfe,0x00,0x00 + +# GFX950: v_cvt_pk_f16_f32 v5, 0.5, m0 mul:2 ; encoding: [0x05,0x00,0x67,0xd2,0xf0,0xf8,0x00,0x08] +0x05,0x00,0x67,0xd2,0xf0,0xf8,0x00,0x08