Skip to content

Commit

Permalink
AMDGPU: Allow f16/bf16 for DS_READ_TR16_B64 gfx950 builtins (#118297)
Browse files Browse the repository at this point in the history
Co-authored-by: Sirish Pande <[email protected]>
  • Loading branch information
arsenm and srpande authored Dec 2, 2024
1 parent 3d43789 commit a796f59
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 0 deletions.
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4f16, "V4hV4h*3", "nc", "gfx950-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4bf16, "V4yV4y*3", "nc", "gfx950-insts")

TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts")
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19726,6 +19726,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
Intrinsic::ID IID;
switch (BuiltinID) {
Expand All @@ -19751,6 +19753,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
IID = Intrinsic::amdgcn_ds_read_tr6_b96;
break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4f16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4bf16:
IID = Intrinsic::amdgcn_ds_read_tr16_b64;
break;
}
Expand Down
23 changes: 23 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v3i __attribute__((ext_vector_type(3)));
typedef short v4s __attribute__((ext_vector_type(4)));
typedef half v4h __attribute__((ext_vector_type(4)));
typedef __bf16 v4y __attribute__((ext_vector_type(4)));

// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
Expand Down Expand Up @@ -48,3 +50,24 @@ v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
{
return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
}

// GFX950-LABEL: define dso_local <4 x half> @test_amdgcn_ds_read_b64_tr_b16_v2f16(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <4 x half> [[TMP0]]
//
v4h test_amdgcn_ds_read_b64_tr_b16_v2f16(local v4h* inptr)
{
return __builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr);
}

// GFX950-LABEL: define dso_local <4 x bfloat> @test_amdgcn_ds_read_b64_tr_b16_v2bf16(
// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// GFX950-NEXT: entry:
// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) [[INPTR]])
// GFX950-NEXT: ret <4 x bfloat> [[TMP0]]
v4y test_amdgcn_ds_read_b64_tr_b16_v2bf16(local v4y* inptr)
{
return __builtin_amdgcn_ds_read_tr16_b64_v4bf16(inptr);
}
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1202,6 +1202,8 @@ let SubtargetPredicate = HasGFX950Insts in {
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4f16, int_amdgcn_ds_read_tr16_b64>;
def : DSLoadTrPat <DS_READ_B64_TR_B16, v4bf16, int_amdgcn_ds_read_tr16_b64>;
}

//===----------------------------------------------------------------------===//
Expand Down
22 changes: 22 additions & 0 deletions llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,28 @@ bb:
ret void
}

declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
define amdgpu_kernel void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
store <4 x half> %tmp0, ptr addrspace(1) %out, align 16
ret void
}

declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3))

; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
define amdgpu_kernel void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
bb:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
ret void
}

declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)

Expand Down
52 changes: 52 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3))
declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3))
declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3))

define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
Expand Down Expand Up @@ -106,3 +108,53 @@ entry:
store <4 x i16> %val, ptr addrspace(1) %use
ret void
}

define amdgpu_ps void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4f16:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4f16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16.p3(ptr addrspace(3) %gep)
store <4 x half> %val, ptr addrspace(1) %use
ret void
}

define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4bf16:
; GFX950-SDAG: ; %bb.0: ; %entry
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16:
; GFX950-GISEL: ; %bb.0: ; %entry
; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX950-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3) %gep)
store <4 x bfloat> %val, ptr addrspace(1) %use
ret void
}

0 comments on commit a796f59

Please sign in to comment.