Skip to content

Commit

Permalink
AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32*
Browse files Browse the repository at this point in the history
For multipass instructions, overlap between the VDST and SRC registers
would result in a hardware race and undefined results.

Co-authored-by: Pravin Jagtap <[email protected]>
  • Loading branch information
pravinjagtap authored and arsenm committed Nov 27, 2024
1 parent b26cbd2 commit bbc7178
Show file tree
Hide file tree
Showing 5 changed files with 353 additions and 257 deletions.
12 changes: 7 additions & 5 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1088,9 +1088,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
}
}
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
Expand All @@ -1103,7 +1105,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
}
}

let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>;
defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>;
defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>;
Expand All @@ -1112,7 +1114,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>;
}

let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>;
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;
Expand Down
214 changes: 154 additions & 60 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
Original file line number Diff line number Diff line change
Expand Up @@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) {
}

define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) {
; GCN-LABEL: test_cvt_scale_pk32_f32_fp6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
ret <32 x float> %ret
}

define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) {
; GCN-LABEL: test_cvt_scale_pk32_f32_bf6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
ret <32 x float> %ret
}

define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) {
; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
ret <32 x half> %ret
}
Expand All @@ -897,26 +957,26 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
ret <32 x half> %ret
Expand All @@ -926,7 +986,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
Expand Down Expand Up @@ -958,14 +1025,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
Expand Down Expand Up @@ -1000,11 +1067,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
}

define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) {
; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
; GCN-NEXT: s_setpc_b64 s[30:31]
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
ret <32 x half> %ret
}
Expand All @@ -1013,26 +1100,26 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
ret <32 x half> %ret
Expand All @@ -1042,7 +1129,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
Expand Down Expand Up @@ -1074,14 +1168,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
Expand Down
Loading

0 comments on commit bbc7178

Please sign in to comment.