Skip to content

Commit

Permalink
[X86] combineTruncateWithSat - relax minimum truncation size for PACK…
Browse files Browse the repository at this point in the history
…SS/PACKUS

truncateVectorWithPACK handling of sub-128-bit result types was improved some time ago, so remove the old 64-bit limit

Fixes #68466
  • Loading branch information
RKSimon committed Nov 1, 2023
1 parent 39dfaf0 commit f471f6f
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 599 deletions.
6 changes: 2 additions & 4 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49604,14 +49604,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
(Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
!(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);

if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
VT.getSizeInBits() >= 64 &&
if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
// Only do this when the result is at least 64 bits or we'll leaving
// dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) {
EVT MidVT = VT.changeVectorElementType(MVT::i16);
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
Expand Down
259 changes: 48 additions & 211 deletions llvm/test/CodeGen/X86/fpclamptosat_vec.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1092,38 +1092,14 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) nounwind {
; SSE-LABEL: stest_f64i16:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <32767,32767,u,u>
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: stest_f64i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512-LABEL: stest_f64i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpmovdw %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: stest_f64i16:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv, <i32 32767, i32 32767>
Expand Down Expand Up @@ -1198,24 +1174,11 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) nounwind {
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX2-LABEL: ustest_f64i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512-LABEL: ustest_f64i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpmovdw %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: ustest_f64i16:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv, <i32 65535, i32 65535>
Expand Down Expand Up @@ -1652,40 +1615,16 @@ define <2 x i8> @stest_f64i8(<2 x double> %x) nounwind {
; SSE-LABEL: stest_f64i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <127,127,u,u>
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <4294967168,4294967168,u,u>
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: stest_f64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: stest_f64i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpmovdb %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: stest_f64i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv, <i32 127, i32 127>
Expand Down Expand Up @@ -1748,39 +1687,16 @@ define <2 x i8> @ustest_f64i8(<2 x double> %x) nounwind {
; SSE-LABEL: ustest_f64i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <255,255,u,u>
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: ustest_f64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ustest_f64i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpmovdb %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: ustest_f64i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%0 = icmp slt <2 x i32> %conv, <i32 255, i32 255>
Expand All @@ -1795,37 +1711,16 @@ define <4 x i8> @stest_f32i8(<4 x float> %x) nounwind {
; SSE-LABEL: stest_f32i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: stest_f32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: retq
;
; AVX512-LABEL: stest_f32i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT: vpmovsdb %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: stest_f32i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%0 = icmp slt <4 x i32> %conv, <i32 127, i32 127, i32 127, i32 127>
Expand Down Expand Up @@ -1888,37 +1783,16 @@ define <4 x i8> @ustest_f32i8(<4 x float> %x) nounwind {
; SSE-LABEL: ustest_f32i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: ustest_f32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-NEXT: retq
;
; AVX512-LABEL: ustest_f32i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpmovusdb %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: ustest_f32i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <4 x float> %x to <4 x i32>
%0 = icmp slt <4 x i32> %conv, <i32 255, i32 255, i32 255, i32 255>
Expand Down Expand Up @@ -3863,38 +3737,14 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) nounwind {
; SSE-LABEL: stest_f64i16_mm:
; SSE: # %bb.0: # %entry
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = <32767,32767,u,u>
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pandn %xmm0, %xmm1
; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: packssdw %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: stest_f64i16_mm:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512-LABEL: stest_f64i16_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpmovdw %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: stest_f64i16_mm:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> <i32 32767, i32 32767>)
Expand Down Expand Up @@ -3966,24 +3816,11 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) nounwind {
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX2-LABEL: ustest_f64i16_mm:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512-LABEL: ustest_f64i16_mm:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpmovdw %xmm0, %xmm0
; AVX512-NEXT: retq
; AVX-LABEL: ustest_f64i16_mm:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
%spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> <i32 65535, i32 65535>)
Expand Down
Loading

0 comments on commit f471f6f

Please sign in to comment.