-
Notifications
You must be signed in to change notification settings - Fork 12.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Add v_mfma_i32_32x32x32_i8 for gfx950 #117052
Merged
Merged
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This was referenced Nov 20, 2024
arsenm
added
backend:AMDGPU
clang
Clang issues not falling into any other category
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
llvm:analysis
llvm:ir
mc
Machine (object) code
labels
Nov 20, 2024
— with
Graphite App
arsenm
requested review from
jayfoad,
kosarev,
pravinjagtap,
rampitec,
scchan,
shiltian and
Sisyph
November 20, 2024 21:23
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesPatch is 51.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117052.diff 12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index b4c7c12fec6475..6c51e52a2ef665 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -441,6 +441,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x32_f16, "V4fV8hV8hV4fIiIiIi", "nc
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_f16, "V16fV8hV8hV16fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x64_i8, "V4iV4iV4iV4iIiIiIi", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x32_i8, "V16iV4iV4iV16iIiIiIi", "nc", "gfx950-insts")
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
index b69db6410a7905..b21394b6982631 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -452,4 +452,10 @@ v4i test_mfma_i32_16x16x64_i8(v4i a, v4i b, v4i c) {
return __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 1, 2, 3);
}
+// CHECK-GFX950-LABEL: @test_mfma_i32_32x32x32_i8(
+// CHECK-GFX950: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 1, i32 2, i32 3)
+v16i test_mfma_i32_32x32x32_i8(v4i a, v4i b, v16i c) {
+ return __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 1, 2, 3);
+}
+
#endif
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
index 3597042fadcf88..9c14c0541ff3b8 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl
@@ -7,6 +7,7 @@ typedef half half8 __attribute__((ext_vector_type(8)));
typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
typedef int int4 __attribute__((ext_vector_type(4)));
typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
void test_mfma_f32_16x16x32_f16(__global float4* out, half8 a, half8 b, float4 c, int X) {
@@ -48,3 +49,9 @@ void test_mfma_i32_16x16x64_i8(__global int4* out, int4 a, int4 b, int4 c, int X
*out = __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x64_i8' must be a constant integer}}
*out = __builtin_amdgcn_mfma_i32_16x16x64_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x64_i8' must be a constant integer}}
}
+
+void test_mfma_i32_32x32x32_i8(__global int16* out, int4 a, int4 b, int16 c, int X) {
+ *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, X, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
+ *out = __builtin_amdgcn_mfma_i32_32x32x32_i8(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x32_i8' must be a constant integer}}
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
index 3a27fbf2439353..71a110066342cb 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl
@@ -32,6 +32,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
*out1 = __builtin_amdgcn_mfma_f32_32x32x16_f16(a1, b1, c1, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_f16' needs target feature gfx950-insts}}
*out2 = __builtin_amdgcn_mfma_f32_32x32x16_bf16(a2, b2, c2, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_32x32x16_bf16' needs target feature gfx950-insts}}
*out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
+ *out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 35f45095b1ad04..b5d5eae0c7cd7e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3147,6 +3147,7 @@ defset list<Intrinsic> AMDGPUMFMAIntrinsics950 = {
def int_amdgcn_mfma_f32_16x16x32_f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_f32_32x32x16_f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8f16_ty>;
def int_amdgcn_mfma_i32_16x16x64_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty>;
+def int_amdgcn_mfma_i32_32x32x32_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty>;
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f8ef1879496bb2..ea9c688c710418 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4750,7 +4750,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
case Intrinsic::amdgcn_mfma_f32_16x16x32_f16:
case Intrinsic::amdgcn_mfma_f32_32x32x16_f16:
- case Intrinsic::amdgcn_mfma_i32_16x16x64_i8: {
+ case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
+ case Intrinsic::amdgcn_mfma_i32_32x32x32_i8: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7a83e76ba4c919..98d5e3f199cddd 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -725,6 +725,10 @@ def VOPProfileMAI_F32_V4I32_V4I32_X512_VCD : VOPProfileMAI<VOP_V16F32_V4I32_V4I3
def VOPProfileMAI_I32_V4I32_X128 : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32, AISrc_128_f32, ADst_128, AVSrc_128>;
def VOPProfileMAI_I32_V4I32_X128_VCD : VOPProfileMAI<VOP_V4I32_V4I32_V4I32_V4I32, VISrc_128_f32, VDst_128, AVSrc_128>;
+// For i32_32x32x32_i8
+def VOPProfileMAI_I32_V4I32_X16 : VOPProfileMAI<VOP_V16I32_V4I32_V4I32_V16I32, AISrc_512_b32, ADst_512, AVSrc_128>;
+def VOPProfileMAI_I32_V4I32_X16_VCD : VOPProfileMAI<VOP_V16I32_V4I32_V4I32_V16I32, VISrc_512_b32, VDst_512, AVSrc_128>;
+
class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
@@ -950,6 +954,7 @@ defm V_MFMA_F32_16X16X32_F16 : MAIInst<"v_mfma_f32_16x16x32f16", "F32_V8F16
defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16_X16", int_amdgcn_mfma_f32_32x32x16_f16>;
defm V_MFMA_I32_16X16X64_I8 : MAIInst<"v_mfma_i32_16x16x64i8", "I32_V4I32_X128", int_amdgcn_mfma_i32_16x16x64_i8>;
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
+defm V_MFMA_I32_32X32X32_I8 : MAIInst<"v_mfma_i32_32x32x32i8", "I32_V4I32_X16", int_amdgcn_mfma_i32_32x32x32_i8>;
defm V_MFMA_F32_16X16X128_F8F6F4 : MAIInst_SrcFormats_mc<"v_mfma_f32_16x16x128f8f6f4",
"_X128", mfma_f32_16x16x128_f8f6f4>;
@@ -2075,6 +2080,7 @@ defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x
defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">;
defm V_MFMA_I32_16X16X64_I8 : VOP3P_Real_MFMA_gfx950 <0x36, "v_mfma_i32_16x16x64_i8">;
defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">;
+defm V_MFMA_I32_32X32X32_I8 : VOP3P_Real_MFMA_gfx950 <0x38, "v_mfma_i32_32x32x32_i8">;
defm V_MFMA_LD_SCALE_B32 : VOP3P_Real_vi <0x2c>;
defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mfma_f32_16x16x128_f8f6f4">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 48c6c42b66b9a5..26021a56790aae 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -314,6 +314,15 @@ define amdgpu_kernel void @mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1
ret void
}
+declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32>, <4 x i32>, <16 x i32>, i32 immarg, i32 immarg, i32 immarg)
+
+; CHECK: DIVERGENT: %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) {
+ %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
+ store <16 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 555756d3fdf3f8..29668b2c5bfdbf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -555,5 +555,765 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
ret void
}
+; --------------------------------------------------------------------
+; llvm.amdgcn.mfma.i32.32x32x32.i8
+; --------------------------------------------------------------------
+
+declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32>, <4 x i32>, <16 x i32>, i32 immarg, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 {
+; SDAG-LABEL: test_mfma_i32_32x32x32_i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a31, s19
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a30, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a29, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a28, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a27, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a26, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a25, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a24, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a23, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a22, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a21, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a20, s8
+; SDAG-NEXT: v_accvgpr_write_b32 a19, s7
+; SDAG-NEXT: v_accvgpr_write_b32 a18, s6
+; SDAG-NEXT: v_accvgpr_write_b32 a17, s5
+; SDAG-NEXT: v_accvgpr_write_b32 a16, s4
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 4
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SDAG-NEXT: v_mov_b32_e32 v1, s5
+; SDAG-NEXT: v_mov_b32_e32 v2, s6
+; SDAG-NEXT: v_mov_b32_e32 v3, s7
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_i32_32x32x32_i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], 0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], 48
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[24:25]
+; GISEL-NEXT: v_accvgpr_write_b32 a31, s19
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[26:27]
+; GISEL-NEXT: v_accvgpr_write_b32 a30, s18
+; GISEL-NEXT: v_accvgpr_write_b32 a29, s17
+; GISEL-NEXT: v_accvgpr_write_b32 a28, s16
+; GISEL-NEXT: v_accvgpr_write_b32 a27, s15
+; GISEL-NEXT: v_accvgpr_write_b32 a26, s14
+; GISEL-NEXT: v_accvgpr_write_b32 a25, s13
+; GISEL-NEXT: v_accvgpr_write_b32 a24, s12
+; GISEL-NEXT: v_accvgpr_write_b32 a23, s11
+; GISEL-NEXT: v_accvgpr_write_b32 a22, s10
+; GISEL-NEXT: v_accvgpr_write_b32 a21, s9
+; GISEL-NEXT: v_accvgpr_write_b32 a20, s8
+; GISEL-NEXT: v_accvgpr_write_b32 a19, s7
+; GISEL-NEXT: v_accvgpr_write_b32 a18, s6
+; GISEL-NEXT: v_accvgpr_write_b32 a17, s5
+; GISEL-NEXT: v_accvgpr_write_b32 a16, s4
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], 16
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], 32
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 6
+; GISEL-NEXT: global_store_dwordx4 v[8:9], a[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[10:11], a[12:15], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
+ %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
+ store volatile <16 x i32> %result, ptr addrspace(1) null
+ store volatile <16 x i32> %arg2, ptr addrspace(1) null
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) #1 {
+; SDAG-LABEL: test_mfma_i32_32x32x32_i8__flags:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx8 s[20:27], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a31, s19
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a30, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a29, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a28, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a27, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a26, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a25, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a24, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a23, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a22, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a21, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a20, s8
+; SDAG-NEXT: v_accvgpr_write_b32 a19, s7
+; SDAG-NEXT: v_accvgpr_write_b32 a18, s6
+; SDAG-NEXT: v_accvgpr_write_b32 a17, s5
+; SDAG-NEXT: v_accvgpr_write_b32 a16, s4
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32
+; SDAG-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: s_nop 4
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SDAG-NEXT: v_mov_b32_e32 v1, s5
+; SDAG-NEXT: v_mov_b32_e32 v2, s6
+; SDAG-NEXT: v_mov_b32_e32 v3, s7
+; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_mfma_i32_32x32x32_i8__flags:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_lo...
[truncated]
|
shiltian
approved these changes
Nov 20, 2024
arsenm
force-pushed
the
users/arsenm/gfx950/v_mfma_i32_16x16x64_i8
branch
from
November 21, 2024 17:00
2277d71
to
0468b29
Compare
Base automatically changed from
users/arsenm/gfx950/v_mfma_i32_16x16x64_i8
to
main
November 21, 2024 17:02
arsenm
force-pushed
the
users/arsenm/gfx950/v_mfma_i32_32x32x32_i8
branch
from
November 21, 2024 17:04
522f526
to
535d10e
Compare
Merge activity
|
This was referenced Nov 21, 2024
This was referenced Nov 21, 2024
Merged
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
backend:AMDGPU
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
clang
Clang issues not falling into any other category
llvm:analysis
llvm:ir
mc
Machine (object) code
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.