diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 752a751a25a0c7..4758f6053ccb6d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -467,6 +467,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950 TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4f16, "V4hV4h*3", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4bf16, "V4yV4y*3", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts") TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index cb9c23b8e0a0d0..8d162d3b8add40 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19726,6 +19726,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: { Intrinsic::ID IID; switch (BuiltinID) { @@ -19751,6 +19753,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, IID = Intrinsic::amdgcn_ds_read_tr6_b96; break; case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16: IID = Intrinsic::amdgcn_ds_read_tr16_b64; break; } 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl index 39fa46d5845f42..91e04430d4973a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl @@ -4,6 +4,8 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v3i __attribute__((ext_vector_type(3))); typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef __bf16 v4y __attribute__((ext_vector_type(4))); // GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32( // GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { @@ -48,3 +50,24 @@ v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr) { return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr); } + +// GFX950-LABEL: define dso_local <4 x half> @test_amdgcn_ds_read_b64_tr_b16_v2f16( +// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// GFX950-NEXT: entry: +// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[INPTR]]) +// GFX950-NEXT: ret <4 x half> [[TMP0]] +// +v4h test_amdgcn_ds_read_b64_tr_b16_v2f16(local v4h* inptr) +{ + return __builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr); +} + +// GFX950-LABEL: define dso_local <4 x bfloat> @test_amdgcn_ds_read_b64_tr_b16_v2bf16( +// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// GFX950-NEXT: entry: +// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) [[INPTR]]) +// GFX950-NEXT: ret <4 x bfloat> [[TMP0]] +v4y test_amdgcn_ds_read_b64_tr_b16_v2bf16(local v4y* inptr) +{ + return __builtin_amdgcn_ds_read_tr16_b64_v4bf16(inptr); +} diff --git 
a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 7cbd6d2dc62097..ef618727258cf2 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1202,6 +1202,8 @@ let SubtargetPredicate = HasGFX950Insts in { def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>; def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>; def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>; + def : DSLoadTrPat <DS_READ_B64_TR_B16, v4f16, int_amdgcn_ds_read_tr16_b64>; + def : DSLoadTrPat <DS_READ_B64_TR_B16, v4bf16, int_amdgcn_ds_read_tr16_b64>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index fb1420ee340043..aa5208560817fd 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -305,6 +305,28 @@ bb: ret void } +declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3)) + +; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep) +define amdgpu_kernel void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep) + store <4 x half> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3)) + +; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep) +define amdgpu_kernel void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep) + store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> 
@llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll index 0689af0d56268d..8481a3c2ccdb15 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -6,6 +6,8 @@ declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3)) declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3)) declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3)) declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3)) +declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3)) +declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3)) define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) { ; GFX950-SDAG-LABEL: ds_read_b64_tr_b4: @@ -106,3 +108,53 @@ entry: store <4 x i16> %val, ptr addrspace(1) %use ret void } + +define amdgpu_ps void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + 
%val = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3) %gep) + store <4 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4bf16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3) %gep) + store <4 x bfloat> %val, ptr addrspace(1) %use + ret void +}