From 37b79e779f447f1c714af7f907e7a2ec846d1da0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 1 Oct 2023 14:27:55 +0100 Subject: [PATCH] [X86] combineConcatVectorOps - only concatenate single-use subops We could maybe extend this by allowing the lowest subop to have multiple uses and extract the lowest subvector result of the concatenated op, but let's just get the fix in first. Fixes #67333 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/test/CodeGen/X86/pr67333.ll | 190 ++++++++++++++ llvm/test/CodeGen/X86/subvector-broadcast.ll | 8 +- .../vector-interleaved-store-i8-stride-7.ll | 16 +- llvm/test/CodeGen/X86/widen_bitcnt.ll | 242 +++++++++--------- .../zero_extend_vector_inreg_of_broadcast.ll | 64 ++--- 6 files changed, 358 insertions(+), 164 deletions(-) create mode 100644 llvm/test/CodeGen/X86/pr67333.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d9750ea22e2bac..426e3143ac9b21 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57239,7 +57239,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, // TODO - combineX86ShufflesRecursively should handle shuffle concatenation // but it currently struggles with different vector widths. if (llvm::all_of(Ops, [Op0](SDValue Op) { - return Op.getOpcode() == Op0.getOpcode(); + return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse(); })) { auto ConcatSubOperand = [&](EVT VT, ArrayRef SubOps, unsigned I) { SmallVector Subs; diff --git a/llvm/test/CodeGen/X86/pr67333.ll b/llvm/test/CodeGen/X86/pr67333.ll new file mode 100644 index 00000000000000..64c7f4fb143bfe --- /dev/null +++ b/llvm/test/CodeGen/X86/pr67333.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0 +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0 + +define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 { +; CHECK-LABEL: SHA256_Compress_Generic: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movbel 0, %eax +; CHECK-NEXT: movbel 12(%rdi), %ecx +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128] +; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vpsrld $17, %xmm2, %xmm0 +; CHECK-NEXT: vpslld $15, %xmm2, %xmm3 +; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vpsrld $19, %xmm2, %xmm3 +; CHECK-NEXT: vpslld $13, %xmm2, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %ecx, %xmm3 +; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-NEXT: vpsrld $17, %xmm1, %xmm0 +; CHECK-NEXT: vpslld $15, %xmm1, %xmm3 +; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vpsrld $19, %xmm1, %xmm3 +; CHECK-NEXT: vpslld $13, %xmm1, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3 +; CHECK-NEXT: vpslld $15, %xmm0, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4 +; CHECK-NEXT: vpslld $13, %xmm0, %xmm5 +; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpsrld $10, %xmm0, %xmm0 +; CHECK-NEXT: 
vpxor %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3 +; CHECK-NEXT: vpslld $15, %xmm0, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4 +; CHECK-NEXT: vpslld $13, %xmm0, %xmm5 +; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpsrld $10, %xmm0, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3 +; CHECK-NEXT: vpslld $15, %xmm2, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsrld $19, %xmm2, %xmm4 +; CHECK-NEXT: vpslld $13, %xmm2, %xmm5 +; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vpsrld $17, %xmm1, %xmm2 +; CHECK-NEXT: vpslld $15, %xmm1, %xmm4 +; CHECK-NEXT: vpor %xmm2, %xmm4, %xmm2 +; CHECK-NEXT: vpsrld $19, %xmm1, %xmm4 +; CHECK-NEXT: vpslld $13, %xmm1, %xmm5 +; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; CHECK-NEXT: vpsrld $10, %xmm1, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3 +; CHECK-NEXT: vpslld $15, %xmm2, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsrld $19, %xmm2, %xmm4 +; CHECK-NEXT: vpslld $13, %xmm2, %xmm5 +; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpsrld $17, %xmm0, %xmm2 +; CHECK-NEXT: vpslld $15, %xmm0, %xmm3 +; CHECK-NEXT: vpor %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpsrld $19, %xmm0, %xmm3 +; CHECK-NEXT: vpslld $13, %xmm0, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpsrld $10, %xmm0, %xmm3 +; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpsllq $32, %xmm1, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-NEXT: vmovdqu %ymm0, 132(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = load i32, ptr null, align 4 + %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3 + %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3 + %2 = load i32, ptr %arrayidx14, align 4 + %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3 + %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1 + %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> ) + %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> ) + %7 = xor <2 x i32> %5, %6 + %8 = lshr <2 x i32> %4, zeroinitializer + %9 = xor <2 x i32> %7, %8 + %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0 + %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> + %12 = add <2 x i32> %11, %9 + %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> ) + %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> ) + %15 = 
xor <2 x i32> %13, %14 + %16 = lshr <2 x i32> %12, zeroinitializer + %17 = xor <2 x i32> %15, %16 + %18 = add <2 x i32> %4, %17 + %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> ) + %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> ) + %21 = xor <2 x i32> %19, %20 + %22 = lshr <2 x i32> %18, + %23 = xor <2 x i32> %21, %22 + %24 = add <2 x i32> %4, %23 + %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> ) + %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> ) + %27 = xor <2 x i32> %25, %26 + %28 = lshr <2 x i32> %24, + %29 = xor <2 x i32> %27, %28 + %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> + %31 = add <2 x i32> %30, %29 + %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> ) + %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> ) + %34 = xor <2 x i32> %32, %33 + %35 = lshr <2 x i32> %31, + %36 = xor <2 x i32> %34, %35 + %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> + %38 = add <2 x i32> %37, %36 + %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33 + store <2 x i32> %38, ptr %arrayidx918, align 4 + %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35 + %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> ) + %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> ) + %41 = xor <2 x i32> %39, %40 + %42 = lshr <2 x i32> %38, + %43 = xor <2 x i32> %41, %42 + %44 = add <2 x i32> %37, %43 + store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4 + %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37 + %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> ) + %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> ) + %47 = xor <2 x i32> %45, %46 + %48 = lshr <2 x i32> %44, + %49 = xor <2 x i32> %47, %48 + %50 = lshr <2 x i32> %24, zeroinitializer + %51 = add <2 x i32> %50, %49 + store <2 x i32> %51, ptr %arrayidx1106, align 4 + %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39 + %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> ) + %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> ) + %54 = xor <2 x i32> %52, %53 + %55 = lshr <2 x i32> %51, + %56 = xor <2 x i32> %54, %55 + %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> + %58 = insertelement <2 x i32> %57, i32 0, i64 0 + %59 = add <2 x i32> %58, %56 + store <2 x i32> %59, ptr %arrayidx1200, align 4 + ret void + +; uselistorder directives + uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 } + uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 } +} + +declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2 + +; uselistorder directives +uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 } + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" 
"target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index 5b6d9cd463c71c..4832304b4c6faf 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -1768,8 +1768,8 @@ define void @PR51226() { ; X86-AVX2-LABEL: PR51226: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpslld $16, %ymm0, %ymm0 ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vmovups %ymm0, (%eax) @@ -1779,8 +1779,8 @@ define void @PR51226() { ; X86-AVX512-LABEL: PR51226: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX512-NEXT: vpslld $16, %xmm0, %xmm0 ; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; X86-AVX512-NEXT: vpslld $16, %ymm0, %ymm0 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0 ; X86-AVX512-NEXT: vmovups %ymm0, (%eax) @@ -1801,8 +1801,8 @@ define void @PR51226() { ; X64-AVX2-LABEL: PR51226: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX2-NEXT: vpslld $16, %xmm0, %xmm0 ; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpslld $16, %ymm0, %ymm0 ; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vmovups %ymm0, (%rax) @@ -1812,8 +1812,8 @@ define void @PR51226() { ; X64-AVX512-LABEL: PR51226: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX512-NEXT: vpslld $16, %xmm0, %xmm0 ; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpslld $16, %ymm0, %ymm0 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vmovups %ymm0, (%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 22f7707ca2b470..a0ef2d8fcbd3b2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -726,17 +726,19 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = 
zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512BW-SLOW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index 8e46ace5cd8bef..5c07fbb7ec166e 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -630,74 +630,73 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; ; AVX2-LABEL: widen_ctlz_v2i32_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm4 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm8 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX2-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm6, %xmm9 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm6 -; AVX2-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm10 -; AVX2-NEXT: vpand %xmm7, %xmm10, %xmm7 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm7, %xmm10 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm11 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm7 -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-NEXT: vpshufb %ymm11, %ymm7, %ymm10 -; AVX2-NEXT: vpand %ymm9, %ymm10, %ymm9 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm10 -; AVX2-NEXT: vpsrlw $4, %ymm11, %ymm11 -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm10, %ymm11, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm12, %ymm12 -; AVX2-NEXT: vpaddb %ymm12, %ymm9, %ymm12 -; AVX2-NEXT: vpshufb %ymm11, %ymm7, %ymm11 -; AVX2-NEXT: vpaddb %ymm11, %ymm9, %ymm9 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpsrlw $8, %ymm8, %ymm8 -; AVX2-NEXT: vpand %ymm8, %ymm12, %ymm11 +; AVX2-NEXT: vpand %xmm7, %xmm5, %xmm8 +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm9 +; AVX2-NEXT: vpand %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX2-NEXT: vpaddb %xmm4, %xmm8, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm8 +; AVX2-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX2-NEXT: vpand %xmm4, %xmm8, %xmm8 +; AVX2-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX2-NEXT: vpaddw %xmm4, %xmm8, %xmm4 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm8 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm9 +; AVX2-NEXT: vpand %xmm7, %xmm9, %xmm9 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm10 +; AVX2-NEXT: vpand %xmm10, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX2-NEXT: vpaddb %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm9 +; AVX2-NEXT: vpsrlw $8, %xmm9, %xmm9 +; AVX2-NEXT: vpand %xmm9, %xmm8, %xmm9 +; AVX2-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX2-NEXT: vpaddw %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm9 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm10 +; AVX2-NEXT: vpand %xmm7, %xmm10, %xmm10 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm10, %xmm11 +; AVX2-NEXT: vpand %xmm11, %xmm9, %xmm9 +; AVX2-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-NEXT: vpaddb %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm10 +; AVX2-NEXT: vpsrlw $8, %xmm10, %xmm10 +; AVX2-NEXT: vpand %xmm10, %xmm9, %xmm10 +; AVX2-NEXT: vpsrlw $8, %xmm9, %xmm9 +; AVX2-NEXT: vpaddw %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm10 +; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm11 +; AVX2-NEXT: vpand %xmm7, %xmm11, %xmm7 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm7, %xmm11 +; AVX2-NEXT: vpand %xmm11, %xmm10, %xmm10 +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpaddb %xmm6, %xmm10, %xmm6 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm3, 
%xmm7 +; AVX2-NEXT: vpsrlw $8, %xmm7, %xmm7 +; AVX2-NEXT: vpand %xmm7, %xmm6, %xmm7 +; AVX2-NEXT: vpsrlw $8, %xmm6, %xmm6 +; AVX2-NEXT: vpaddw %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm5 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $8, %ymm9, %ymm3 -; AVX2-NEXT: vpaddw %ymm3, %ymm11, %ymm11 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vpand %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm9 -; AVX2-NEXT: vpaddw %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm8 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm5, %ymm3 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm6, %xmm6 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm2 -; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm5 -; AVX2-NEXT: vpand %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm0, %ymm10, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm2 -; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm2 -; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: retq ; @@ -1011,74 +1010,73 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; ; AVX2-LABEL: widen_ctlz_undef_v2i32_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm4 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm8 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX2-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm6, %xmm9 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm6 -; AVX2-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm10 -; AVX2-NEXT: vpand %xmm7, %xmm10, %xmm7 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm7, %xmm10 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm11 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX2-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm7 -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-NEXT: vpshufb %ymm11, %ymm7, %ymm10 -; AVX2-NEXT: vpand %ymm9, %ymm10, %ymm9 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm10 -; AVX2-NEXT: vpsrlw $4, %ymm11, %ymm11 -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm10 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm10, %ymm11, %ymm11 -; AVX2-NEXT: vpshufb %ymm11, %ymm12, %ymm12 -; AVX2-NEXT: vpaddb %ymm12, %ymm9, %ymm12 -; AVX2-NEXT: vpshufb %ymm11, %ymm7, %ymm11 -; AVX2-NEXT: vpaddb %ymm11, %ymm9, %ymm9 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpsrlw $8, %ymm8, %ymm8 -; AVX2-NEXT: vpand %ymm8, %ymm12, %ymm11 +; AVX2-NEXT: vpand %xmm7, %xmm5, %xmm8 +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm9 +; AVX2-NEXT: vpand %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX2-NEXT: vpaddb %xmm4, %xmm8, %xmm4 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm8 +; AVX2-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX2-NEXT: vpand %xmm4, %xmm8, %xmm8 +; AVX2-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX2-NEXT: vpaddw %xmm4, %xmm8, %xmm4 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm8 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm9 +; AVX2-NEXT: vpand %xmm7, %xmm9, %xmm9 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm9, %xmm10 +; AVX2-NEXT: vpand %xmm10, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX2-NEXT: vpaddb %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm9 +; AVX2-NEXT: vpsrlw $8, %xmm9, %xmm9 +; AVX2-NEXT: vpand %xmm9, %xmm8, %xmm9 +; AVX2-NEXT: vpsrlw $8, %xmm8, %xmm8 +; AVX2-NEXT: vpaddw %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm9 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm10 +; AVX2-NEXT: vpand %xmm7, %xmm10, %xmm10 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm10, %xmm11 +; AVX2-NEXT: vpand %xmm11, %xmm9, %xmm9 +; AVX2-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-NEXT: vpaddb %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm10 +; AVX2-NEXT: vpsrlw $8, %xmm10, %xmm10 +; AVX2-NEXT: vpand %xmm10, %xmm9, %xmm10 +; AVX2-NEXT: vpsrlw $8, %xmm9, %xmm9 +; AVX2-NEXT: vpaddw %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm10 +; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm11 +; AVX2-NEXT: vpand %xmm7, %xmm11, %xmm7 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm7, %xmm11 +; AVX2-NEXT: vpand %xmm11, %xmm10, %xmm10 +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpaddb %xmm6, %xmm10, %xmm6 +; AVX2-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm7 +; AVX2-NEXT: vpsrlw $8, %xmm7, %xmm7 +; AVX2-NEXT: vpand %xmm7, %xmm6, %xmm7 +; AVX2-NEXT: vpsrlw $8, %xmm6, %xmm6 +; AVX2-NEXT: vpaddw %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpcmpeqw %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm5 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $8, %ymm9, %ymm3 -; AVX2-NEXT: vpaddw %ymm3, %ymm11, %ymm11 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm11, %ymm1 -; AVX2-NEXT: vpand %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm9 -; AVX2-NEXT: vpaddw %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm8 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm5, %xmm5 -; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm5, %ymm3 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm3 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm6, %xmm6 -; AVX2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm3 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm2 -; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm5 -; AVX2-NEXT: vpand 
%ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm0, %ymm10, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm2 -; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm2 -; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 8f9d2169aa19b9..92973274f29190 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -5682,9 +5682,10 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in ; ; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5794,9 +5795,10 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in ; ; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5906,9 +5908,10 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.v ; ; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 
(%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5999,9 +6002,10 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6205,12 +6209,12 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i ; ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6324,12 +6328,12 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in. 
; ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6443,12 +6447,12 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i ; ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq
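
The functional change above is the one-line edit in X86ISelLowering.cpp: combineConcatVectorOps now requires Op.hasOneUse() in addition to matching opcodes before it concatenates subvector ops, so the fold only fires when the concatenation is, in effect, the sole consumer of each subop; everything else in the patch is regenerated test output. Below is a minimal standalone C++ sketch of that single-use guard on a toy value graph. Node, makeNode, and combineConcat are hypothetical names chosen for illustration only, not LLVM's SelectionDAG API, and the fold shown is a simplified stand-in for the real combine.

// single_use_concat.cpp -- illustrative only; not LLVM's SelectionDAG API.
#include <cassert>
#include <memory>
#include <string>
#include <vector>

struct Node {
  std::string op;               // e.g. "leaf", "shl", "concat"
  std::vector<Node *> operands;
  unsigned useCount = 0;        // number of nodes that consume this value
};

using Pool = std::vector<std::unique_ptr<Node>>;

static Node *makeNode(std::string op, std::vector<Node *> operands, Pool &pool) {
  auto n = std::make_unique<Node>();
  n->op = std::move(op);
  n->operands = std::move(operands);
  for (Node *o : n->operands)
    ++o->useCount;              // record the new use of each operand
  pool.push_back(std::move(n));
  return pool.back().get();
}

// Try to rewrite concat(op(x), op(y)) as op(concat(x, y)) and return the
// replacement node, or nullptr if the fold is refused.  The useCount checks
// mirror the hasOneUse() requirement added by the patch: if a subop has any
// user besides the concat, the narrow result is still needed elsewhere, so
// we bail out instead of folding it away.
static Node *combineConcat(Node *concat, Pool &pool) {
  Node *a = concat->operands[0];
  Node *b = concat->operands[1];
  if (a->op != b->op)
    return nullptr;             // matching opcodes required, as before
  if (a->operands.size() != 1 || b->operands.size() != 1)
    return nullptr;             // keep the toy fold to unary subops
  if (a->useCount != 1 || b->useCount != 1)
    return nullptr;             // new requirement: single-use subops only
  Node *wide = makeNode("concat", {a->operands[0], b->operands[0]}, pool);
  return makeNode(a->op, {wide}, pool);
}

int main() {
  Pool pool;
  Node *x = makeNode("leaf", {}, pool);
  Node *y = makeNode("leaf", {}, pool);
  Node *sa = makeNode("shl", {x}, pool);
  Node *sb = makeNode("shl", {y}, pool);
  Node *cat = makeNode("concat", {sa, sb}, pool);
  assert(combineConcat(cat, pool) != nullptr);   // both subops single-use: folds

  Node *extra = makeNode("add", {sa, sa}, pool); // give sa additional uses
  (void)extra;
  Node *cat2 = makeNode("concat", {sa, sb}, pool);
  assert(combineConcat(cat2, pool) == nullptr);  // multi-use subop: no fold
  return 0;
}

Building and running the sketch with any C++14 compiler exercises both paths: the fold succeeds while every subop is single-use and bails out once one of them gains another user, which also matches the commit message's TODO about possibly relaxing the rule for the lowest subop later.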