diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 1160975f3302a9..f93535ddee25ba 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -763,6 +763,10 @@ let SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts in { def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>; def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>; } // End SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts +let True16Predicate = UseRealTrue16Insts in { + def : OpSelBinOpClampPat<saddsat, V_ADD_NC_I16_t16_e64>; + def : OpSelBinOpClampPat<ssubsat, V_SUB_NC_I16_t16_e64>; +} // End True16Predicate = UseRealTrue16Insts let True16Predicate = UseFakeTrue16Insts in { def : OpSelBinOpClampPat<saddsat, V_ADD_NC_I16_fake16_e64>; def : OpSelBinOpClampPat<ssubsat, V_SUB_NC_I16_fake16_e64>; diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 9c5214338c54a7..a034cf6941c268 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_saddsat_i8: @@ -34,14 +35,32 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_saddsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_saddsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_saddsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_saddsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -76,11 +95,24 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_saddsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_saddsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_saddsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_saddsat_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 71017f15e3c6d1..6ed19bd6d764b8 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_ssubsat_i8: @@ -34,14 +35,32 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_ssubsat_i8: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 -; GFX10PLUS-NEXT: 
s_setpc_b64 s[30:31] +; GFX10-LABEL: v_ssubsat_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_ssubsat_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: v_sub_nc_i16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_ssubsat_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX11-FAKE16-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -76,11 +95,24 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_ssubsat_i16: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_ssubsat_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_ssubsat_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_sub_nc_i16 v0.l, v0.l, v0.h clamp +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_ssubsat_i16: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result }