From 964ade8f169beff85fdaf20a64b2a8ee9a8398e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Fri, 19 Apr 2024 20:46:27 +0200 Subject: [PATCH] Implement LLVM x86 AVX2 intrinsics --- src/shims/x86/avx.rs | 71 +- src/shims/x86/avx2.rs | 444 ++++++++ src/shims/x86/mod.rs | 402 +++++++ src/shims/x86/sse2.rs | 75 +- src/shims/x86/sse41.rs | 59 +- src/shims/x86/ssse3.rs | 65 +- tests/pass/intrinsics-x86-avx2.rs | 1613 +++++++++++++++++++++++++++++ 7 files changed, 2473 insertions(+), 256 deletions(-) create mode 100644 src/shims/x86/avx2.rs create mode 100644 tests/pass/intrinsics-x86-avx2.rs diff --git a/src/shims/x86/avx.rs b/src/shims/x86/avx.rs index 23c78647b9..41c20d768f 100644 --- a/src/shims/x86/avx.rs +++ b/src/shims/x86/avx.rs @@ -7,7 +7,8 @@ use rustc_target::spec::abi::Abi; use super::{ bin_op_simd_float_all, conditional_dot_product, convert_float_to_int, horizontal_bin_op, - round_all, test_bits_masked, test_high_bits_masked, unary_op_ps, FloatBinOp, FloatUnaryOp, + mask_load, mask_store, round_all, test_bits_masked, test_high_bits_masked, unary_op_ps, + FloatBinOp, FloatUnaryOp, }; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -347,71 +348,3 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: Ok(EmulateForeignItemResult::NeedsJumping) } } - -/// Conditionally loads from `ptr` according the high bit of each -/// element of `mask`. `ptr` does not need to be aligned. -fn mask_load<'tcx>( - this: &mut crate::MiriInterpCx<'_, 'tcx>, - ptr: &OpTy<'tcx, Provenance>, - mask: &OpTy<'tcx, Provenance>, - dest: &MPlaceTy<'tcx, Provenance>, -) -> InterpResult<'tcx, ()> { - let (mask, mask_len) = this.operand_to_simd(mask)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(dest_len, mask_len); - - let mask_item_size = mask.layout.field(this, 0).size; - let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); - - let ptr = this.read_pointer(ptr)?; - for i in 0..dest_len { - let mask = this.project_index(&mask, i)?; - let dest = this.project_index(&dest, i)?; - - if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { - // Size * u64 is implemented as always checked - #[allow(clippy::arithmetic_side_effects)] - let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx); - // Unaligned copy, which is what we want. - this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?; - } else { - this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?; - } - } - - Ok(()) -} - -/// Conditionally stores into `ptr` according the high bit of each -/// element of `mask`. `ptr` does not need to be aligned. -fn mask_store<'tcx>( - this: &mut crate::MiriInterpCx<'_, 'tcx>, - ptr: &OpTy<'tcx, Provenance>, - mask: &OpTy<'tcx, Provenance>, - value: &OpTy<'tcx, Provenance>, -) -> InterpResult<'tcx, ()> { - let (mask, mask_len) = this.operand_to_simd(mask)?; - let (value, value_len) = this.operand_to_simd(value)?; - - assert_eq!(value_len, mask_len); - - let mask_item_size = mask.layout.field(this, 0).size; - let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); - - let ptr = this.read_pointer(ptr)?; - for i in 0..value_len { - let mask = this.project_index(&mask, i)?; - let value = this.project_index(&value, i)?; - - if this.read_scalar(&mask)?.to_uint(mask_item_size)? 
>> high_bit_offset != 0 { - // Size * u64 is implemented as always checked - #[allow(clippy::arithmetic_side_effects)] - let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx); - // Unaligned copy, which is what we want. - this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?; - } - } - - Ok(()) -} diff --git a/src/shims/x86/avx2.rs b/src/shims/x86/avx2.rs new file mode 100644 index 0000000000..bbf53f9f1e --- /dev/null +++ b/src/shims/x86/avx2.rs @@ -0,0 +1,444 @@ +use crate::rustc_middle::ty::layout::LayoutOf as _; +use rustc_middle::mir; +use rustc_middle::ty::Ty; +use rustc_span::Symbol; +use rustc_target::spec::abi::Abi; + +use super::{ + horizontal_bin_op, int_abs, mask_load, mask_store, mpsadbw, packssdw, packsswb, packusdw, + packuswb, pmulhrsw, psign, shift_simd_by_scalar, shift_simd_by_simd, ShiftOp, +}; +use crate::*; +use shims::foreign_items::EmulateForeignItemResult; + +impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {} +pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: + crate::MiriInterpCxExt<'mir, 'tcx> +{ + fn emulate_x86_avx2_intrinsic( + &mut self, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx, Provenance>], + dest: &MPlaceTy<'tcx, Provenance>, + ) -> InterpResult<'tcx, EmulateForeignItemResult> { + let this = self.eval_context_mut(); + this.expect_target_feature_for_intrinsic(link_name, "avx2")?; + // Prefix should have already been checked. + let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.avx2.").unwrap(); + + match unprefixed_name { + // Used to implement the _mm256_abs_epi{8,16,32} functions. + // Calculates the absolute value of packed 8/16/32-bit integers. + "pabs.b" | "pabs.w" | "pabs.d" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + int_abs(this, op, dest)?; + } + // Used to implement the _mm256_h{add,adds,sub}_epi{16,32} functions. + // Horizontally add / add with saturation / subtract adjacent 16/32-bit + // integer values in `left` and `right`. + "phadd.w" | "phadd.sw" | "phadd.d" | "phsub.w" | "phsub.sw" | "phsub.d" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (which, saturating) = match unprefixed_name { + "phadd.w" | "phadd.d" => (mir::BinOp::Add, false), + "phadd.sw" => (mir::BinOp::Add, true), + "phsub.w" | "phsub.d" => (mir::BinOp::Sub, false), + "phsub.sw" => (mir::BinOp::Sub, true), + _ => unreachable!(), + }; + + horizontal_bin_op(this, which, saturating, left, right, dest)?; + } + // Used to implement `_mm{,_mask}_{i32,i64}gather_{epi32,epi64,pd,ps}` functions + // Gathers elements from `slice` using `offsets * scale` as indices. + // When the highest bit of the corresponding element of `mask` is 0, + // the value is copied from `src` instead. 
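+            // For example, with 32-bit elements, offsets of [0, 4, 8, 12] and a scale of 4,
+            // the gathered values are read from byte offsets 0, 16, 32 and 48 of `slice`
+            // (i.e., each element address is `slice + offset * scale`).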
+ "gather.d.d" | "gather.d.d.256" | "gather.d.q" | "gather.d.q.256" | "gather.q.d" + | "gather.q.d.256" | "gather.q.q" | "gather.q.q.256" | "gather.d.pd" + | "gather.d.pd.256" | "gather.q.pd" | "gather.q.pd.256" | "gather.d.ps" + | "gather.d.ps.256" | "gather.q.ps" | "gather.q.ps.256" => { + let [src, slice, offsets, mask, scale] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + assert_eq!(dest.layout, src.layout); + + let (src, _) = this.operand_to_simd(src)?; + let (offsets, offsets_len) = this.operand_to_simd(offsets)?; + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + // There are cases like dest: i32x4, offsets: i64x2 + let actual_len = dest_len.min(offsets_len); + + assert_eq!(dest_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let scale = this.read_scalar(scale)?.to_i8()?; + if !matches!(scale, 1 | 2 | 4 | 8) { + throw_unsup_format!("invalid gather scale {scale}"); + } + let scale = i64::from(scale); + + let slice = this.read_pointer(slice)?; + for i in 0..actual_len { + let mask = this.project_index(&mask, i)?; + let dest = this.project_index(&dest, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + let offset = this.project_index(&offsets, i)?; + let offset = + i64::try_from(this.read_scalar(&offset)?.to_int(offset.layout.size)?) + .unwrap(); + let ptr = slice + .wrapping_signed_offset(offset.checked_mul(scale).unwrap(), &this.tcx); + // Unaligned copy, which is what we want. + this.mem_copy( + ptr, + dest.ptr(), + dest.layout.size, + /*nonoverlapping*/ true, + )?; + } else { + this.copy_op(&this.project_index(&src, i)?, &dest)?; + } + } + for i in actual_len..dest_len { + let dest = this.project_index(&dest, i)?; + this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?; + } + } + // Used to implement the _mm256_madd_epi16 function. + // Multiplies packed signed 16-bit integers in `left` and `right`, producing + // intermediate signed 32-bit integers. Horizontally add adjacent pairs of + // intermediate 32-bit integers, and pack the results in `dest`. + "pmadd.wd" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(left_len, right_len); + assert_eq!(dest_len.checked_mul(2).unwrap(), left_len); + + for i in 0..dest_len { + let j1 = i.checked_mul(2).unwrap(); + let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_i16()?; + let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i16()?; + + let j2 = j1.checked_add(1).unwrap(); + let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_i16()?; + let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i16()?; + + let dest = this.project_index(&dest, i)?; + + // Multiplications are i16*i16->i32, which will not overflow. + let mul1 = i32::from(left1).checked_mul(right1.into()).unwrap(); + let mul2 = i32::from(left2).checked_mul(right2.into()).unwrap(); + // However, this addition can overflow in the most extreme case + // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000 + let res = mul1.wrapping_add(mul2); + + this.write_scalar(Scalar::from_i32(res), &dest)?; + } + } + // Used to implement the _mm256_maddubs_epi16 function. 
+ // Multiplies packed 8-bit unsigned integers from `left` and packed + // signed 8-bit integers from `right` into 16-bit signed integers. Then, + // the saturating sum of the products with indices `2*i` and `2*i+1` + // produces the output at index `i`. + "pmadd.ub.sw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(left_len, right_len); + assert_eq!(dest_len.checked_mul(2).unwrap(), left_len); + + for i in 0..dest_len { + let j1 = i.checked_mul(2).unwrap(); + let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_u8()?; + let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i8()?; + + let j2 = j1.checked_add(1).unwrap(); + let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_u8()?; + let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i8()?; + + let dest = this.project_index(&dest, i)?; + + // Multiplication of a u8 and an i8 into an i16 cannot overflow. + let mul1 = i16::from(left1).checked_mul(right1.into()).unwrap(); + let mul2 = i16::from(left2).checked_mul(right2.into()).unwrap(); + let res = mul1.saturating_add(mul2); + + this.write_scalar(Scalar::from_i16(res), &dest)?; + } + } + // Used to implement the _mm_maskload_epi32, _mm_maskload_epi64, + // _mm256_maskload_epi32 and _mm256_maskload_epi64 functions. + // For the element `i`, if the high bit of the `i`-th element of `mask` + // is one, it is loaded from `ptr.wrapping_add(i)`, otherwise zero is + // loaded. + "maskload.d" | "maskload.q" | "maskload.d.256" | "maskload.q.256" => { + let [ptr, mask] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mask_load(this, ptr, mask, dest)?; + } + // Used to implement the _mm_maskstore_epi32, _mm_maskstore_epi64, + // _mm256_maskstore_epi32 and _mm256_maskstore_epi64 functions. + // For the element `i`, if the high bit of the element `i`-th of `mask` + // is one, it is stored into `ptr.wapping_add(i)`. + // Unlike SSE2's _mm_maskmoveu_si128, these are not non-temporal stores. + "maskstore.d" | "maskstore.q" | "maskstore.d.256" | "maskstore.q.256" => { + let [ptr, mask, value] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mask_store(this, ptr, mask, value)?; + } + // Used to implement the _mm256_mpsadbw_epu8 function. + // Compute the sum of absolute differences of quadruplets of unsigned + // 8-bit integers in `left` and `right`, and store the 16-bit results + // in `right`. Quadruplets are selected from `left` and `right` with + // offsets specified in `imm`. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8 + "mpsadbw" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mpsadbw(this, left, right, imm, dest)?; + } + // Used to implement the _mm256_mulhrs_epi16 function. + // Multiplies packed 16-bit signed integer values, truncates the 32-bit + // product to the 18 most significant bits by right-shifting, and then + // divides the 18-bit value by 2 (rounding to nearest) by first adding + // 1 and then taking the bits `1..=16`. 
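+            // For example, when both inputs are 0x4000 (16384), the 32-bit product is
+            // 0x1000_0000; shifting right by 14 gives 0x4000, then adding 1 and shifting
+            // right by 1 yields 0x2000 (8192).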
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16 + "pmul.hr.sw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + pmulhrsw(this, left, right, dest)?; + } + // Used to implement the _mm256_packs_epi16 function. + // Converts two 16-bit integer vectors to a single 8-bit integer + // vector with signed saturation. + "packsswb" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packsswb(this, left, right, dest)?; + } + // Used to implement the _mm256_packs_epi32 function. + // Converts two 32-bit integer vectors to a single 16-bit integer + // vector with signed saturation. + "packssdw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packssdw(this, left, right, dest)?; + } + // Used to implement the _mm256_packus_epi16 function. + // Converts two 16-bit signed integer vectors to a single 8-bit + // unsigned integer vector with saturation. + "packuswb" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packuswb(this, left, right, dest)?; + } + // Used to implement the _mm256_packus_epi32 function. + // Concatenates two 32-bit signed integer vectors and converts + // the result to a 16-bit unsigned integer vector with saturation. + "packusdw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packusdw(this, left, right, dest)?; + } + // Used to implement the _mm256_permutevar8x32_epi32 and + // _mm256_permutevar8x32_ps function. + // Shuffles `left` using the three low bits of each element of `right` + // as indices. + "permd" | "permps" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let dest = this.project_index(&dest, i)?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u32()?; + let left = this.project_index(&left, (right & 0b111).into())?; + + this.copy_op(&left, &dest)?; + } + } + // Used to implement the _mm256_permute2x128_si256 function. + // Shuffles 128-bit blocks of `a` and `b` using `imm` as pattern. + "vperm2i128" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + assert_eq!(left.layout.size.bits(), 256); + assert_eq!(right.layout.size.bits(), 256); + assert_eq!(dest.layout.size.bits(), 256); + + // Transmute to `[i128; 2]` + + let array_layout = + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.i128, 2))?; + let left = left.transmute(array_layout, this)?; + let right = right.transmute(array_layout, this)?; + let dest = dest.transmute(array_layout, this)?; + + let imm = this.read_scalar(imm)?.to_u8()?; + + for i in 0..2 { + let dest = this.project_index(&dest, i)?; + let src = match (imm >> i.checked_mul(4).unwrap()) & 0b11 { + 0 => this.project_index(&left, 0)?, + 1 => this.project_index(&left, 1)?, + 2 => this.project_index(&right, 0)?, + 3 => this.project_index(&right, 1)?, + _ => unreachable!(), + }; + + this.copy_op(&src, &dest)?; + } + } + // Used to implement the _mm256_sad_epu8 function. 
+ // Compute the absolute differences of packed unsigned 8-bit integers + // in `left` and `right`, then horizontally sum each consecutive 8 + // differences to produce four unsigned 16-bit integers, and pack + // these unsigned 16-bit integers in the low 16 bits of 64-bit elements + // in `dest`. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8 + "psad.bw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(left_len, right_len); + assert_eq!(left_len, dest_len.checked_mul(8).unwrap()); + + for i in 0..dest_len { + let dest = this.project_index(&dest, i)?; + + let mut acc: u16 = 0; + for j in 0..8 { + let src_index = i.checked_mul(8).unwrap().checked_add(j).unwrap(); + + let left = this.project_index(&left, src_index)?; + let left = this.read_scalar(&left)?.to_u8()?; + + let right = this.project_index(&right, src_index)?; + let right = this.read_scalar(&right)?.to_u8()?; + + acc = acc.checked_add(left.abs_diff(right).into()).unwrap(); + } + + this.write_scalar(Scalar::from_u64(acc.into()), &dest)?; + } + } + // Used to implement the _mm256_shuffle_epi8 intrinsic. + // Shuffles bytes from `left` using `right` as pattern. + // Each 128-bit block is shuffled independently. + "pshuf.b" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?; + let dest = this.project_index(&dest, i)?; + + let res = if right & 0x80 == 0 { + // Shuffle each 128-bit (16-byte) block independently. + let j = u64::from(right % 16).checked_add(i & !15).unwrap(); + this.read_scalar(&this.project_index(&left, j)?)? + } else { + // If the highest bit in `right` is 1, write zero. + Scalar::from_u8(0) + }; + + this.write_scalar(res, &dest)?; + } + } + // Used to implement the _mm256_sign_epi{8,16,32} functions. + // Negates elements from `left` when the corresponding element in + // `right` is negative. If an element from `right` is zero, zero + // is writen to the corresponding output element. + // Basically, we multiply `left` with `right.signum()`. + "psign.b" | "psign.w" | "psign.d" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + psign(this, left, right, dest)?; + } + // Used to implement the _mm256_{sll,srl,sra}_epi{16,32,64} functions + // (except _mm256_sra_epi64, which is not available in AVX2). + // Shifts N-bit packed integers in left by the amount in right. + // `right` is as 128-bit vector. but it is interpreted as a single + // 64-bit integer (remaining bits are ignored). + // For logic shifts, when right is larger than N - 1, zero is produced. + // For arithmetic shifts, when right is larger than N - 1, the sign bit + // is copied to remaining bits. 
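+            // For example, shifting 16-bit lanes by 20: a logical shift yields 0 in every
+            // lane, while an arithmetic shift of a negative lane such as -1 still yields -1.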
+ "psll.w" | "psrl.w" | "psra.w" | "psll.d" | "psrl.d" | "psra.d" | "psll.q" + | "psrl.q" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "psll.w" | "psll.d" | "psll.q" => ShiftOp::Left, + "psrl.w" | "psrl.d" | "psrl.q" => ShiftOp::RightLogic, + "psra.w" | "psra.d" => ShiftOp::RightArith, + _ => unreachable!(), + }; + + shift_simd_by_scalar(this, left, right, which, dest)?; + } + // Used to implement the _mm{,256}_{sllv,srlv,srav}_epi{32,64} functions + // (except _mm{,256}_srav_epi64, which are not available in AVX2). + "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" | "psrlv.d" | "psrlv.d.256" + | "psrlv.q" | "psrlv.q.256" | "psrav.d" | "psrav.d.256" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" => ShiftOp::Left, + "psrlv.d" | "psrlv.d.256" | "psrlv.q" | "psrlv.q.256" => ShiftOp::RightLogic, + "psrav.d" | "psrav.d.256" => ShiftOp::RightArith, + _ => unreachable!(), + }; + + shift_simd_by_simd(this, left, right, which, dest)?; + } + _ => return Ok(EmulateForeignItemResult::NotSupported), + } + Ok(EmulateForeignItemResult::NeedsJumping) + } +} diff --git a/src/shims/x86/mod.rs b/src/shims/x86/mod.rs index 615821b2e3..9b8ea7b690 100644 --- a/src/shims/x86/mod.rs +++ b/src/shims/x86/mod.rs @@ -14,6 +14,7 @@ use shims::foreign_items::EmulateForeignItemResult; mod aesni; mod avx; +mod avx2; mod sse; mod sse2; mod sse3; @@ -136,6 +137,11 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: this, link_name, abi, args, dest, ); } + name if name.starts_with("avx2.") => { + return avx2::EvalContextExt::emulate_x86_avx2_intrinsic( + this, link_name, abi, args, dest, + ); + } _ => return Ok(EmulateForeignItemResult::NotSupported), } @@ -534,6 +540,61 @@ fn shift_simd_by_scalar<'tcx>( Ok(()) } +/// Shifts each element of `left` by the corresponding element of `right`. +/// +/// For logic shifts, when right is larger than BITS - 1, zero is produced. +/// For arithmetic right-shifts, when right is larger than BITS - 1, the sign +/// bit is copied to remaining bits. +fn shift_simd_by_simd<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + which: ShiftOp, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?; + let right = this.read_scalar(&this.project_index(&right, i)?)?; + let dest = this.project_index(&dest, i)?; + + // It is ok to saturate the value to u32::MAX because any value + // above BITS - 1 will produce the same result. + let shift = u32::try_from(right.to_uint(dest.layout.size)?).unwrap_or(u32::MAX); + + let res = match which { + ShiftOp::Left => { + let left = left.to_uint(dest.layout.size)?; + let res = left.checked_shl(shift).unwrap_or(0); + // `truncate` is needed as left-shift can make the absolute value larger. 
+ Scalar::from_uint(dest.layout.size.truncate(res), dest.layout.size) + } + ShiftOp::RightLogic => { + let left = left.to_uint(dest.layout.size)?; + let res = left.checked_shr(shift).unwrap_or(0); + // No `truncate` needed as right-shift can only make the absolute value smaller. + Scalar::from_uint(res, dest.layout.size) + } + ShiftOp::RightArith => { + let left = left.to_int(dest.layout.size)?; + // On overflow, copy the sign bit to the remaining bits + let res = left.checked_shr(shift).unwrap_or(left >> 127); + // No `truncate` needed as right-shift can only make the absolute value smaller. + Scalar::from_int(res, dest.layout.size) + } + }; + this.write_scalar(res, &dest)?; + } + + Ok(()) +} + /// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts /// the first value. fn extract_first_u64<'tcx>( @@ -664,6 +725,30 @@ fn convert_float_to_int<'tcx>( Ok(()) } +/// Calculates absolute value of integers in `op` and stores the result in `dest`. +fn int_abs<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + op: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(op_len, dest_len); + + for i in 0..dest_len { + let op = this.read_scalar(&this.project_index(&op, i)?)?; + let dest = this.project_index(&dest, i)?; + + // Converting to a host "i128" works since the input is always signed. + let res = op.to_int(dest.layout.size)?.unsigned_abs(); + + this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?; + } + + Ok(()) +} + /// Splits `op` (which must be a SIMD vector) into 128-bit chuncks. /// /// Returns a tuple where: @@ -874,3 +959,320 @@ fn test_high_bits_masked<'tcx>( Ok((direct, negated)) } + +/// Conditionally loads from `ptr` according the high bit of each +/// element of `mask`. `ptr` does not need to be aligned. +fn mask_load<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + ptr: &OpTy<'tcx, Provenance>, + mask: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let ptr = this.read_pointer(ptr)?; + for i in 0..dest_len { + let mask = this.project_index(&mask, i)?; + let dest = this.project_index(&dest, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + // Size * u64 is implemented as always checked + #[allow(clippy::arithmetic_side_effects)] + let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx); + // Unaligned copy, which is what we want. + this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?; + } else { + this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?; + } + } + + Ok(()) +} + +/// Conditionally stores into `ptr` according the high bit of each +/// element of `mask`. `ptr` does not need to be aligned. 
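+///
+/// For example, with a 4-lane `value` and an i32 mask of [-1, 0, -1, 0], only
+/// lanes 0 and 2 are written to memory; lanes 1 and 3 are left untouched.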
+fn mask_store<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + ptr: &OpTy<'tcx, Provenance>, + mask: &OpTy<'tcx, Provenance>, + value: &OpTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (value, value_len) = this.operand_to_simd(value)?; + + assert_eq!(value_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let ptr = this.read_pointer(ptr)?; + for i in 0..value_len { + let mask = this.project_index(&mask, i)?; + let value = this.project_index(&value, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + // Size * u64 is implemented as always checked + #[allow(clippy::arithmetic_side_effects)] + let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx); + // Unaligned copy, which is what we want. + this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?; + } + } + + Ok(()) +} + +/// Compute the sum of absolute differences of quadruplets of unsigned +/// 8-bit integers in `left` and `right`, and store the 16-bit results +/// in `right`. Quadruplets are selected from `left` and `right` with +/// offsets specified in `imm`. +/// +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8 +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn mpsadbw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + imm: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + assert_eq!(left.layout, right.layout); + assert_eq!(left.layout.size, dest.layout.size); + + let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?; + let (_, _, right) = split_simd_to_128bit_chunks(this, right)?; + let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?; + + assert_eq!(op_items_per_chunk, dest_items_per_chunk.checked_mul(2).unwrap()); + + let imm = this.read_scalar(imm)?.to_uint(imm.layout.size)?; + // Bit 2 of `imm` specifies the offset for indices of `left`. + // The offset is 0 when the bit is 0 or 4 when the bit is 1. + let left_offset = u64::try_from((imm >> 2) & 1).unwrap().checked_mul(4).unwrap(); + // Bits 0..=1 of `imm` specify the offset for indices of + // `right` in blocks of 4 elements. + let right_offset = u64::try_from(imm & 0b11).unwrap().checked_mul(4).unwrap(); + + for i in 0..num_chunks { + let left = this.project_index(&left, i)?; + let right = this.project_index(&right, i)?; + let dest = this.project_index(&dest, i)?; + + for j in 0..dest_items_per_chunk { + let left_offset = left_offset.checked_add(j).unwrap(); + let mut res: u16 = 0; + for k in 0..4 { + let left = this + .read_scalar(&this.project_index(&left, left_offset.checked_add(k).unwrap())?)? + .to_u8()?; + let right = this + .read_scalar( + &this.project_index(&right, right_offset.checked_add(k).unwrap())?, + )? 
+ .to_u8()?; + res = res.checked_add(left.abs_diff(right).into()).unwrap(); + } + this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, j)?)?; + } + } + + Ok(()) +} + +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// product to the 18 most significant bits by right-shifting, and then +/// divides the 18-bit value by 2 (rounding to nearest) by first adding +/// 1 and then taking the bits `1..=16`. +/// +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16 +fn pmulhrsw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; + let dest = this.project_index(&dest, i)?; + + let res = + (i32::from(left).checked_mul(right.into()).unwrap() >> 14).checked_add(1).unwrap() >> 1; + + // The result of this operation can overflow a signed 16-bit integer. + // When `left` and `right` are -0x8000, the result is 0x8000. + #[allow(clippy::cast_possible_truncation)] + let res = res as i16; + + this.write_scalar(Scalar::from_i16(res), &dest)?; + } + + Ok(()) +} + +fn pack_generic<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, + f: impl Fn(Scalar) -> InterpResult<'tcx, Scalar>, +) -> InterpResult<'tcx, ()> { + assert_eq!(left.layout, right.layout); + assert_eq!(left.layout.size, dest.layout.size); + + let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?; + let (_, _, right) = split_simd_to_128bit_chunks(this, right)?; + let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?; + + assert_eq!(dest_items_per_chunk, op_items_per_chunk.checked_mul(2).unwrap()); + + for i in 0..num_chunks { + let left = this.project_index(&left, i)?; + let right = this.project_index(&right, i)?; + let dest = this.project_index(&dest, i)?; + + for j in 0..op_items_per_chunk { + let left = this.read_scalar(&this.project_index(&left, j)?)?; + let right = this.read_scalar(&this.project_index(&right, j)?)?; + let left_dest = this.project_index(&dest, j)?; + let right_dest = + this.project_index(&dest, j.checked_add(op_items_per_chunk).unwrap())?; + + let left_res = f(left)?; + let right_res = f(right)?; + + this.write_scalar(left_res, &left_dest)?; + this.write_scalar(right_res, &right_dest)?; + } + } + + Ok(()) +} + +/// Converts two 16-bit integer vectors to a single 8-bit integer +/// vector with signed saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). 
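+///
+/// For example, the i16 lanes [1, 200, -200, i16::MIN] saturate to the i8 lanes
+/// [1, 127, -128, -128].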
+fn packsswb<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i16()?; + let res = i8::try_from(op).unwrap_or(if op < 0 { i8::MIN } else { i8::MAX }); + Ok(Scalar::from_i8(res)) + }) +} + +/// Converts two 16-bit signed integer vectors to a single 8-bit +/// unsigned integer vector with saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn packuswb<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i16()?; + let res = u8::try_from(op).unwrap_or(if op < 0 { 0 } else { u8::MAX }); + Ok(Scalar::from_u8(res)) + }) +} + +/// Converts two 32-bit integer vectors to a single 16-bit integer +/// vector with signed saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn packssdw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i32()?; + let res = i16::try_from(op).unwrap_or(if op < 0 { i16::MIN } else { i16::MAX }); + Ok(Scalar::from_i16(res)) + }) +} + +/// Converts two 32-bit integer vectors to a single 16-bit integer +/// vector with unsigned saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn packusdw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i32()?; + let res = u16::try_from(op).unwrap_or(if op < 0 { 0 } else { u16::MAX }); + Ok(Scalar::from_u16(res)) + }) +} + +/// Negates elements from `left` when the corresponding element in +/// `right` is negative. If an element from `right` is zero, zero +/// is writen to the corresponding output element. +/// In other words, multiplies `left` with `right.signum()`. 
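+///
+/// For example, psign([1, -2, 3, -4], [0, 1, -1, 2]) yields [0, -2, -3, -4].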
+fn psign<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let dest = this.project_index(&dest, i)?; + let left = this.read_immediate(&this.project_index(&left, i)?)?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_int(dest.layout.size)?; + + let res = this.wrapping_binary_op( + mir::BinOp::Mul, + &left, + &ImmTy::from_int(right.signum(), dest.layout), + )?; + + this.write_immediate(*res, &dest)?; + } + + Ok(()) +} diff --git a/src/shims/x86/sse2.rs b/src/shims/x86/sse2.rs index 9db30d7ddc..63b6a30194 100644 --- a/src/shims/x86/sse2.rs +++ b/src/shims/x86/sse2.rs @@ -3,8 +3,8 @@ use rustc_span::Symbol; use rustc_target::spec::abi::Abi; use super::{ - bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, shift_simd_by_scalar, - FloatBinOp, ShiftOp, + bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, packssdw, packsswb, + packuswb, shift_simd_by_scalar, FloatBinOp, ShiftOp, }; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -176,29 +176,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - // left and right are i16x8, dest is i8x16 - assert_eq!(left_len, 8); - assert_eq!(right_len, 8); - assert_eq!(dest_len, 16); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = - i8::try_from(left).unwrap_or(if left < 0 { i8::MIN } else { i8::MAX }); - let right_res = - i8::try_from(right).unwrap_or(if right < 0 { i8::MIN } else { i8::MAX }); - - this.write_scalar(Scalar::from_i8(left_res), &left_dest)?; - this.write_scalar(Scalar::from_i8(right_res), &right_dest)?; - } + packsswb(this, left, right, dest)?; } // Used to implement the _mm_packus_epi16 function. 
// Converts two 16-bit signed integer vectors to a single 8-bit @@ -207,28 +185,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - // left and right are i16x8, dest is u8x16 - assert_eq!(left_len, 8); - assert_eq!(right_len, 8); - assert_eq!(dest_len, 16); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = u8::try_from(left).unwrap_or(if left < 0 { 0 } else { u8::MAX }); - let right_res = - u8::try_from(right).unwrap_or(if right < 0 { 0 } else { u8::MAX }); - - this.write_scalar(Scalar::from_u8(left_res), &left_dest)?; - this.write_scalar(Scalar::from_u8(right_res), &right_dest)?; - } + packuswb(this, left, right, dest)?; } // Used to implement the _mm_packs_epi32 function. // Converts two 32-bit integer vectors to a single 16-bit integer @@ -237,29 +194,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - // left and right are i32x4, dest is i16x8 - assert_eq!(left_len, 4); - assert_eq!(right_len, 4); - assert_eq!(dest_len, 8); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = - i16::try_from(left).unwrap_or(if left < 0 { i16::MIN } else { i16::MAX }); - let right_res = - i16::try_from(right).unwrap_or(if right < 0 { i16::MIN } else { i16::MAX }); - - this.write_scalar(Scalar::from_i16(left_res), &left_dest)?; - this.write_scalar(Scalar::from_i16(right_res), &right_dest)?; - } + packssdw(this, left, right, dest)?; } // Used to implement _mm_min_sd and _mm_max_sd functions. 
// Note that the semantics are a bit different from Rust simd_min diff --git a/src/shims/x86/sse41.rs b/src/shims/x86/sse41.rs index 16a82eed99..19bc27421d 100644 --- a/src/shims/x86/sse41.rs +++ b/src/shims/x86/sse41.rs @@ -1,7 +1,7 @@ use rustc_span::Symbol; use rustc_target::spec::abi::Abi; -use super::{conditional_dot_product, round_all, round_first, test_bits_masked}; +use super::{conditional_dot_product, mpsadbw, packusdw, round_all, round_first, test_bits_masked}; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -68,27 +68,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(left_len, right_len); - assert_eq!(dest_len, left_len.checked_mul(2).unwrap()); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = - u16::try_from(left).unwrap_or(if left < 0 { 0 } else { u16::MAX }); - let right_res = - u16::try_from(right).unwrap_or(if right < 0 { 0 } else { u16::MAX }); - - this.write_scalar(Scalar::from_u16(left_res), &left_dest)?; - this.write_scalar(Scalar::from_u16(right_res), &right_dest)?; - } + packusdw(this, left, right, dest)?; } // Used to implement the _mm_dp_ps and _mm_dp_pd functions. // Conditionally multiplies the packed floating-point elements in @@ -176,40 +156,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right, imm] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(left_len, right_len); - assert_eq!(left_len, dest_len.checked_mul(2).unwrap()); - - let imm = this.read_scalar(imm)?.to_u8()?; - // Bit 2 of `imm` specifies the offset for indices of `left`. - // The offset is 0 when the bit is 0 or 4 when the bit is 1. - let left_offset = u64::from((imm >> 2) & 1).checked_mul(4).unwrap(); - // Bits 0..=1 of `imm` specify the offset for indices of - // `right` in blocks of 4 elements. - let right_offset = u64::from(imm & 0b11).checked_mul(4).unwrap(); - - for i in 0..dest_len { - let left_offset = left_offset.checked_add(i).unwrap(); - let mut res: u16 = 0; - for j in 0..4 { - let left = this - .read_scalar( - &this.project_index(&left, left_offset.checked_add(j).unwrap())?, - )? - .to_u8()?; - let right = this - .read_scalar( - &this - .project_index(&right, right_offset.checked_add(j).unwrap())?, - )? - .to_u8()?; - res = res.checked_add(left.abs_diff(right).into()).unwrap(); - } - this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, i)?)?; - } + mpsadbw(this, left, right, imm, dest)?; } // Used to implement the _mm_testz_si128, _mm_testc_si128 // and _mm_testnzc_si128 functions. 
diff --git a/src/shims/x86/ssse3.rs b/src/shims/x86/ssse3.rs index dd5d064b20..4f8e52dbb7 100644 --- a/src/shims/x86/ssse3.rs +++ b/src/shims/x86/ssse3.rs @@ -2,7 +2,7 @@ use rustc_middle::mir; use rustc_span::Symbol; use rustc_target::spec::abi::Abi; -use super::horizontal_bin_op; +use super::{horizontal_bin_op, int_abs, pmulhrsw, psign}; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -28,20 +28,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: "pabs.b.128" | "pabs.w.128" | "pabs.d.128" => { let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (op, op_len) = this.operand_to_simd(op)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(op_len, dest_len); - - for i in 0..dest_len { - let op = this.read_scalar(&this.project_index(&op, i)?)?; - let dest = this.project_index(&dest, i)?; - - // Converting to a host "i128" works since the input is always signed. - let res = op.to_int(dest.layout.size)?.unsigned_abs(); - - this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?; - } + int_abs(this, op, dest)?; } // Used to implement the _mm_shuffle_epi8 intrinsic. // Shuffles bytes from `left` using `right` as pattern. @@ -136,30 +123,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(dest_len, left_len); - assert_eq!(dest_len, right_len); - - for i in 0..dest_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; - let dest = this.project_index(&dest, i)?; - - let res = (i32::from(left).checked_mul(right.into()).unwrap() >> 14) - .checked_add(1) - .unwrap() - >> 1; - - // The result of this operation can overflow a signed 16-bit integer. - // When `left` and `right` are -0x8000, the result is 0x8000. - #[allow(clippy::cast_possible_truncation)] - let res = res as i16; - - this.write_scalar(Scalar::from_i16(res), &dest)?; - } + pmulhrsw(this, left, right, dest)?; } // Used to implement the _mm_sign_epi{8,16,32} functions. // Negates elements from `left` when the corresponding element in @@ -170,28 +134,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(dest_len, left_len); - assert_eq!(dest_len, right_len); - - for i in 0..dest_len { - let dest = this.project_index(&dest, i)?; - let left = this.read_immediate(&this.project_index(&left, i)?)?; - let right = this - .read_scalar(&this.project_index(&right, i)?)? 
- .to_int(dest.layout.size)?; - - let res = this.wrapping_binary_op( - mir::BinOp::Mul, - &left, - &ImmTy::from_int(right.signum(), dest.layout), - )?; - - this.write_immediate(*res, &dest)?; - } + psign(this, left, right, dest)?; } _ => return Ok(EmulateForeignItemResult::NotSupported), } diff --git a/tests/pass/intrinsics-x86-avx2.rs b/tests/pass/intrinsics-x86-avx2.rs new file mode 100644 index 0000000000..80d125bb85 --- /dev/null +++ b/tests/pass/intrinsics-x86-avx2.rs @@ -0,0 +1,1613 @@ +// Ignore everything except x86 and x86_64 +// Any new targets that are added to CI should be ignored here. +// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.) +//@ignore-target-aarch64 +//@ignore-target-arm +//@ignore-target-avr +//@ignore-target-s390x +//@ignore-target-thumbv7em +//@ignore-target-wasm32 +//@compile-flags: -C target-feature=+avx2 + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::mem::transmute; + +fn main() { + assert!(is_x86_feature_detected!("avx2")); + + unsafe { + test_avx2(); + } +} + +#[target_feature(enable = "avx2")] +unsafe fn test_avx2() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/avx2.rs + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_abs_epi32() { + #[rustfmt::skip] + let a = _mm256_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi32(a); + #[rustfmt::skip] + let e = _mm256_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + test_mm256_abs_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_abs_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi16(a); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + test_mm256_abs_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_abs_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32, + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi8(a); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32, + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + test_mm256_abs_epi8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hadd_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadd_epi16(a, b); + let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi16( + i16::MAX, + 1, + i16::MAX, + 2, + i16::MAX, + 3, + i16::MAX, + 4, + i16::MAX, + 5, + i16::MAX, + 6, + i16::MAX, + 7, + i16::MAX, + 8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + -1, + i16::MIN, + -2, + i16::MIN, + -3, + i16::MIN, + -4, + i16::MIN, + -5, + i16::MIN, + -6, + i16::MIN, + -7, + i16::MIN, + -8, + ); + let expected = _mm256_setr_epi16( + i16::MIN, + i16::MIN + 1, + i16::MIN + 2, + i16::MIN + 3, + i16::MAX, + i16::MAX - 1, + i16::MAX - 2, + i16::MAX - 3, + i16::MIN + 4, + i16::MIN + 5, + i16::MIN + 6, + i16::MIN + 7, + 
i16::MAX - 4, + i16::MAX - 5, + i16::MAX - 6, + i16::MAX - 7, + ); + let r = _mm256_hadd_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hadd_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hadd_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hadd_epi32(a, b); + let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi32(i32::MAX, 1, i32::MAX, 2, i32::MAX, 3, i32::MAX, 4); + let b = _mm256_setr_epi32(i32::MIN, -1, i32::MIN, -2, i32::MIN, -3, i32::MIN, -4); + let expected = _mm256_setr_epi32( + i32::MIN, + i32::MIN + 1, + i32::MAX, + i32::MAX - 1, + i32::MIN + 2, + i32::MIN + 3, + i32::MAX - 2, + i32::MAX - 3, + ); + let r = _mm256_hadd_epi32(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hadd_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hadds_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16::<0>(a, 0x7fff); + let a = _mm256_insert_epi16::<1>(a, 1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadds_epi16(a, b); + let e = _mm256_setr_epi16(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq_m256i(r, e); + + // Test saturating on overflow + let a = _mm256_setr_epi16( + i16::MAX, + 1, + i16::MAX, + 2, + i16::MAX, + 3, + i16::MAX, + 4, + i16::MAX, + 5, + i16::MAX, + 6, + i16::MAX, + 7, + i16::MAX, + 8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + -1, + i16::MIN, + -2, + i16::MIN, + -3, + i16::MIN, + -4, + i16::MIN, + -5, + i16::MIN, + -6, + i16::MIN, + -7, + i16::MIN, + -8, + ); + let expected = _mm256_setr_epi16( + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + ); + let r = _mm256_hadds_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hadds_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hsub_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsub_epi16(a, b); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi16( + i16::MAX, + -1, + i16::MAX, + -2, + i16::MAX, + -3, + i16::MAX, + -4, + i16::MAX, + -5, + i16::MAX, + -6, + i16::MAX, + -7, + i16::MAX, + -8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + 1, + i16::MIN, + 2, + i16::MIN, + 3, + i16::MIN, + 4, + i16::MIN, + 5, + i16::MIN, + 6, + i16::MIN, + 7, + i16::MIN, + 8, + ); + let expected = _mm256_setr_epi16( + i16::MIN, + i16::MIN + 1, + i16::MIN + 2, + i16::MIN + 3, + i16::MAX, + i16::MAX - 1, + i16::MAX - 2, + i16::MAX - 3, + i16::MIN + 4, + i16::MIN + 5, + i16::MIN + 6, + i16::MIN + 7, + i16::MAX - 4, + i16::MAX - 5, + i16::MAX - 6, + i16::MAX - 7, + ); + let r = _mm256_hsub_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hsub_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hsub_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hsub_epi32(a, b); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi32(i32::MAX, -1, i32::MAX, -2, i32::MAX, -3, i32::MAX, -4); + let b = _mm256_setr_epi32(i32::MIN, 1, i32::MIN, 2, i32::MIN, 3, i32::MIN, 4); + let expected = _mm256_setr_epi32( + i32::MIN, + i32::MIN + 1, + i32::MAX, + i32::MAX - 1, + i32::MIN + 2, + i32::MIN + 3, + i32::MAX - 2, + i32::MAX - 
3, + ); + let r = _mm256_hsub_epi32(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hsub_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hsubs_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16::<0>(a, 0x7fff); + let a = _mm256_insert_epi16::<1>(a, -1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsubs_epi16(a, b); + let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF); + assert_eq_m256i(r, e); + + // Test saturating on overflow + let a = _mm256_setr_epi16( + i16::MAX, + -1, + i16::MAX, + -2, + i16::MAX, + -3, + i16::MAX, + -4, + i16::MAX, + -5, + i16::MAX, + -6, + i16::MAX, + -7, + i16::MAX, + -8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + 1, + i16::MIN, + 2, + i16::MIN, + 3, + i16::MIN, + 4, + i16::MIN, + 5, + i16::MIN, + 6, + i16::MIN, + 7, + i16::MIN, + 8, + ); + let expected = _mm256_setr_epi16( + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + ); + let r = _mm256_hsubs_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hsubs_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48)); + } + test_mm_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_mask_i32gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256)); + } + test_mm_mask_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = + _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + } + test_mm256_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_mask_i32gather_epi32::<4>( + _mm256_set1_epi32(256), + arr.as_ptr(), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256)); + } + test_mm256_mask_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); + } + test_mm_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_mask_i32gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), + ); + 
assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); + } + test_mm_mask_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = + _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0)); + } + test_mm256_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm256_mask_i32gather_ps::<4>( + _mm256_set1_ps(256.0), + arr.as_ptr(), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0), + ); + assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0)); + } + test_mm256_mask_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)); + assert_eq_m128i(r, _mm_setr_epi64x(0, 16)); + } + test_mm_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_mask_i32gather_epi64::<8>( + _mm_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_epi64x(-1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi64x(16, 256)); + } + test_mm_mask_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48)); + } + test_mm256_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_mask_i32gather_epi64::<8>( + _mm256_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256)); + } + test_mm256_mask_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)); + assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); + } + test_mm_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_mask_i32gather_pd::<8>( + _mm_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_pd(-1.0, 0.0), + ); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); + } + test_mm_mask_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for 
f64s + let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); + } + test_mm256_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_mask_i32gather_pd::<8>( + _mm256_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); + } + test_mm256_mask_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0)); + } + test_mm_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_mask_i64gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm_setr_epi64x(0, 16), + _mm_setr_epi32(-1, 0, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0)); + } + test_mm_mask_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48)); + } + test_mm256_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_mask_i64gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256)); + } + test_mm256_mask_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0)); + } + test_mm_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_mask_i64gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm_setr_epi64x(0, 16), + _mm_setr_ps(-1.0, 0.0, -1.0, 0.0), + ); + assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0)); + } + test_mm_mask_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); + } + test_mm256_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = 
_mm256_mask_i64gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); + } + test_mm256_mask_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi64x(0, 16)); + } + test_mm_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_mask_i64gather_epi64::<8>( + _mm_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi64x(16, 16), + _mm_setr_epi64x(-1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi64x(16, 256)); + } + test_mm_mask_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48)); + } + test_mm256_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_mask_i64gather_epi64::<8>( + _mm256_set1_epi64x(256), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256)); + } + test_mm256_mask_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); + } + test_mm_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_mask_i64gather_pd::<8>( + _mm_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi64x(16, 16), + _mm_setr_pd(-1.0, 0.0), + ); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); + } + test_mm_mask_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); + } + test_mm256_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_mask_i64gather_pd::<8>( + _mm256_set1_pd(256.0), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); + } + test_mm256_mask_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_madd_epi16() { + let a = _mm256_set1_epi16(2); + let b = 
_mm256_set1_epi16(4); + let r = _mm256_madd_epi16(a, b); + let e = _mm256_set1_epi32(16); + assert_eq_m256i(r, e); + } + test_mm256_madd_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maddubs_epi16() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_maddubs_epi16(a, b); + let e = _mm256_set1_epi16(16); + assert_eq_m256i(r, e); + } + test_mm256_maddubs_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskload_epi32() { + let nums = [1, 2, 3, 4]; + let a = &nums as *const i32; + let mask = _mm_setr_epi32(-1, 0, 0, -1); + let r = _mm_maskload_epi32(a, mask); + let e = _mm_setr_epi32(1, 0, 0, 4); + assert_eq_m128i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i32, 2, 3, 4]); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let r = _mm_maskload_epi32(a.as_ptr().cast(), mask); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i32]; + let mask = _mm_setr_epi32(!0, 0, 0, 0); + let r = _mm_maskload_epi32(a.as_ptr(), mask); + let e = _mm_setr_epi32(2, 0, 0, 0); + assert_eq_m128i(r, e); + + // Only loading last element, so slice can be short. + let a = &[2i32]; + let mask = _mm_setr_epi32(0, 0, 0, !0); + let r = _mm_maskload_epi32(a.as_ptr().wrapping_sub(3), mask); + let e = _mm_setr_epi32(0, 0, 0, 2); + assert_eq_m128i(r, e); + } + test_mm_maskload_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskload_epi32() { + let nums = [1, 2, 3, 4, 5, 6, 7, 8]; + let a = &nums as *const i32; + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + let r = _mm256_maskload_epi32(a, mask); + let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0); + assert_eq_m256i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i32, 2, 3, 4, 5, 6, 7, 8]); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let r = _mm256_maskload_epi32(a.as_ptr().cast(), mask); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i32]; + let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_maskload_epi32(a.as_ptr(), mask); + let e = _mm256_setr_epi32(2, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + + // Only loading last element, so slice can be short. + let a = &[2i32]; + let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0); + let r = _mm256_maskload_epi32(a.as_ptr().wrapping_sub(7), mask); + let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 2); + assert_eq_m256i(r, e); + } + test_mm256_maskload_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskload_epi64() { + let nums = [1_i64, 2_i64]; + let a = &nums as *const i64; + let mask = _mm_setr_epi64x(0, -1); + let r = _mm_maskload_epi64(a, mask); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i64, 2]); + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_epi64(a.as_ptr().cast(), mask); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i64]; + let mask = _mm_setr_epi64x(!0, 0); + let r = _mm_maskload_epi64(a.as_ptr(), mask); + let e = _mm_setr_epi64x(2, 0); + assert_eq_m128i(r, e); + + // Only loading last element, so slice can be short. 
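+        // The pointer is moved back so that only the enabled lane's address falls
+        // inside the slice; masked-off lanes are never accessed.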
+ let a = &[2i64]; + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_epi64(a.as_ptr().wrapping_sub(1), mask); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + test_mm_maskload_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskload_epi64() { + let nums = [1_i64, 2_i64, 3_i64, 4_i64]; + let a = &nums as *const i64; + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + let r = _mm256_maskload_epi64(a, mask); + let e = _mm256_setr_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i64, 2, 3, 4]); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let r = _mm256_maskload_epi64(a.as_ptr().cast(), mask); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i64]; + let mask = _mm256_setr_epi64x(!0, 0, 0, 0); + let r = _mm256_maskload_epi64(a.as_ptr(), mask); + let e = _mm256_setr_epi64x(2, 0, 0, 0); + assert_eq_m256i(r, e); + + // Only loading last element, so slice can be short. + let a = &[2i64]; + let mask = _mm256_setr_epi64x(0, 0, 0, !0); + let r = _mm256_maskload_epi64(a.as_ptr().wrapping_sub(3), mask); + let e = _mm256_setr_epi64x(0, 0, 0, 2); + assert_eq_m256i(r, e); + } + test_mm256_maskload_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskstore_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let mut arr = [-1, -1, -1, -1]; + let mask = _mm_setr_epi32(-1, 0, 0, -1); + _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let e = [1, -1, -1, 4]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i32; 4]); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let a = _mm_setr_epi32(1, 2, 3, 4); + _mm_maskstore_epi32(r.as_mut_ptr().cast(), mask, a); + let e = [0i32, 2, 0, 4]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i32]; + let mask = _mm_setr_epi32(!0, 0, 0, 0); + let a = _mm_setr_epi32(1, 2, 3, 4); + _mm_maskstore_epi32(r.as_mut_ptr(), mask, a); + let e = [1i32]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. + let mut r = [0i32]; + let mask = _mm_setr_epi32(0, 0, 0, !0); + let a = _mm_setr_epi32(1, 2, 3, 4); + _mm_maskstore_epi32(r.as_mut_ptr().wrapping_sub(3), mask, a); + let e = [4i32]; + assert_eq!(r, e); + } + test_mm_maskstore_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskstore_epi32() { + let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8); + let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1]; + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let e = [1, -1, -1, 42, -1, 6, 7, -1]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i32; 8]); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + _mm256_maskstore_epi32(r.as_mut_ptr().cast(), mask, a); + let e = [0i32, 2, 0, 4, 0, 6, 0, 8]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i32]; + let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0); + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + _mm256_maskstore_epi32(r.as_mut_ptr(), mask, a); + let e = [1i32]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. 
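+        // As with the masked loads, lanes with a zero mask are never written, so only
+        // the enabled lane's address needs to be in bounds.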
+ let mut r = [0i32]; + let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0); + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + _mm256_maskstore_epi32(r.as_mut_ptr().wrapping_sub(7), mask, a); + let e = [8i32]; + assert_eq!(r, e); + } + test_mm256_maskstore_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskstore_epi64() { + let a = _mm_setr_epi64x(1_i64, 2_i64); + let mut arr = [-1_i64, -1_i64]; + let mask = _mm_setr_epi64x(0, -1); + _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let e = [-1, 2]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i64; 2]); + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_epi64x(1, 2); + _mm_maskstore_epi64(r.as_mut_ptr().cast(), mask, a); + let e = [0i64, 2]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i64]; + let mask = _mm_setr_epi64x(!0, 0); + let a = _mm_setr_epi64x(1, 2); + _mm_maskstore_epi64(r.as_mut_ptr(), mask, a); + let e = [1i64]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. + let mut r = [0i64]; + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_epi64x(1, 2); + _mm_maskstore_epi64(r.as_mut_ptr().wrapping_sub(1), mask, a); + let e = [2i64]; + assert_eq!(r, e); + } + test_mm_maskstore_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskstore_epi64() { + let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64); + let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64]; + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let e = [-1, 2, 3, -1]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i64; 4]); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let a = _mm256_setr_epi64x(1, 2, 3, 4); + _mm256_maskstore_epi64(r.as_mut_ptr().cast(), mask, a); + let e = [0i64, 2, 0, 4]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i64]; + let mask = _mm256_setr_epi64x(!0, 0, 0, 0); + let a = _mm256_setr_epi64x(1, 2, 3, 4); + _mm256_maskstore_epi64(r.as_mut_ptr(), mask, a); + let e = [1i64]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. 
+        let mut r = [0i64];
+        let mask = _mm256_setr_epi64x(0, 0, 0, !0);
+        let a = _mm256_setr_epi64x(1, 2, 3, 4);
+        _mm256_maskstore_epi64(r.as_mut_ptr().wrapping_sub(3), mask, a);
+        let e = [4i64];
+        assert_eq!(r, e);
+    }
+    test_mm256_maskstore_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mpsadbw_epu8() {
+        let a = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 2, 4, 6, 8, 10, 12, 14, 16,
+            18, 20, 22, 24, 26, 28, 30,
+        );
+
+        let r = _mm256_mpsadbw_epu8::<0b000>(a, a);
+        let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b001>(a, a);
+        let e = _mm256_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12, 32, 24, 16, 8, 0, 8, 16, 24);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b100>(a, a);
+        let e = _mm256_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44, 32, 40, 48, 56, 64, 72, 80, 88);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b101>(a, a);
+        let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b111>(a, a);
+        let e = _mm256_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4, 64, 56, 48, 40, 32, 24, 16, 8);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_mpsadbw_epu8();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mulhrs_epi16() {
+        let a = _mm256_set1_epi16(16384);
+        let b = _mm256_set1_epi16(16384);
+        let r = _mm256_mulhrs_epi16(a, b);
+        // Rounded high half of the doubled product: (((16384 * 16384) >> 14) + 1) >> 1 = 8192.
+        let e = _mm256_set1_epi16(8192);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_mulhrs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_packs_epi16(a, b);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_packs_epi32(a, b);
+        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packs_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_packus_epi16(a, b);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packus_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_packus_epi32(a, b);
+        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packus_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_permutevar8x32_epi32() {
+        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
+        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
+        let r = _mm256_permutevar8x32_epi32(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_permutevar8x32_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_permute2x128_si256() {
+        let a = _mm256_setr_epi64x(100, 200, 500, 600);
+        let
b = _mm256_setr_epi64x(300, 400, 700, 800); + let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b); + let e = _mm256_setr_epi64x(700, 800, 500, 600); + assert_eq_m256i(r, e); + } + test_mm256_permute2x128_si256(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_permutevar8x32_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4); + let r = _mm256_permutevar8x32_ps(a, b); + let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.); + assert_eq_m256(r, e); + } + test_mm256_permutevar8x32_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_sad_epu8(a, b); + let e = _mm256_set1_epi64x(16); + assert_eq_m256i(r, e); + } + test_mm256_sad_epu8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + ); + #[rustfmt::skip] + let expected = _mm256_setr_epi8( + 5, 0, 5, 4, 9, 13, 7, 4, + 13, 6, 6, 11, 5, 2, 9, 1, + 21, 0, 21, 20, 25, 29, 23, 20, + 29, 22, 22, 27, 21, 18, 25, 17, + ); + let r = _mm256_shuffle_epi8(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_shuffle_epi8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sign_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let r = _mm256_sign_epi16(a, b); + let e = _mm256_set1_epi16(-2); + assert_eq_m256i(r, e); + } + test_mm256_sign_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sign_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(-1); + let r = _mm256_sign_epi32(a, b); + let e = _mm256_set1_epi32(-2); + assert_eq_m256i(r, e); + } + test_mm256_sign_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sign_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let r = _mm256_sign_epi8(a, b); + let e = _mm256_set1_epi8(-2); + assert_eq_m256i(r, e); + } + test_mm256_sign_epi8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sll_epi16() { + let a = _mm256_setr_epi16( + 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE, + -0xEE, 0xFF, -0xFF, + ); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi16( + 0x880, -0x880, 0x990, -0x990, 0xAA0, -0xAA0, 0xBB0, -0xBB0, 0xCC0, -0xCC0, 0xDD0, + -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0, + ), + ); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + } + test_mm256_sll_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sll_epi32() { + let a = + _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi32( + 0xCCCC0, -0xCCCC0, 0xDDDD0, -0xDDDD0, 0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0, + ), + ); + let r = _mm256_sll_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + 
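+        // A shift count of 32 (the full lane width) or larger clears every lane.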
let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + } + test_mm256_sll_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sll_epi64() { + let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i(r, _mm256_set_epi64x(0xEEEEEEEE0, -0xEEEEEEEE0, 0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + } + test_mm256_sll_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sra_epi16() { + let a = _mm256_setr_epi16( + 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE, + -0xEE, 0xFF, -0xFF, + ); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi16( + 0x8, -0x9, 0x9, -0xA, 0xA, -0xB, 0xB, -0xC, 0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, + -0x10, + ), + ); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m256i( + r, + _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), + ); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i( + r, + _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), + ); + } + test_mm256_sra_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sra_epi32() { + let a = + _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi32(0xCCC, -0xCCD, 0xDDD, -0xDDE, 0xEEE, -0xEEF, 0xFFF, -0x1000), + ); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1)); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1)); + } + test_mm256_sra_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srl_epi16() { + let a = _mm256_setr_epi16( + 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE, + -0xEE, 0xFF, -0xFF, + ); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi16( + 0x8, 0xFF7, 0x9, 0xFF6, 0xA, 0xFF5, 0xB, 0xFF4, 0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, + 0xF, 0xFF0, + ), + ); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + } + test_mm256_srl_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srl_epi32() { + let a = + _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi32( + 0xCCC, 0xFFFF333, 0xDDD, 0xFFFF222, 0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000, + ), + ); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(4, 0)); + 
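+        // Only the low 64 bits of the count vector are used, so this count is 0
+        // and `a` is returned unchanged.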
assert_eq_m256i(r, a); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + } + test_mm256_srl_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srl_epi64() { + let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_set_epi64x(0xEEEEEEE, 0xFFFFFFFF1111111, 0xFFFFFFF, 0xFFFFFFFF0000000), + ); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + } + test_mm256_srl_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_sllv_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(4, 3, 2, 1); + let r = _mm_sllv_epi32(a, b); + let e = _mm_set_epi32(16, 16, 12, 8); + assert_eq_m128i(r, e); + } + test_mm_sllv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sllv_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let r = _mm256_sllv_epi32(a, b); + let e = _mm256_set_epi32(256, 256, 192, 128, 80, 48, 28, 16); + assert_eq_m256i(r, e); + } + test_mm256_sllv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_sllv_epi64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(1, 2); + let r = _mm_sllv_epi64(a, b); + let e = _mm_set_epi64x(4, 12); + assert_eq_m128i(r, e); + } + test_mm_sllv_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sllv_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(4, 3, 2, 1); + let r = _mm256_sllv_epi64(a, b); + let e = _mm256_set_epi64x(16, 16, 12, 8); + assert_eq_m256i(r, e); + } + test_mm256_sllv_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_srav_epi32() { + let a = _mm_set_epi32(16, -32, 64, -128); + let b = _mm_set_epi32(4, 3, 2, 1); + let r = _mm_srav_epi32(a, b); + let e = _mm_set_epi32(1, -4, 16, -64); + assert_eq_m128i(r, e); + } + test_mm_srav_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srav_epi32() { + let a = _mm256_set_epi32(256, -512, 1024, -2048, 4096, -8192, 16384, -32768); + let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let r = _mm256_srav_epi32(a, b); + let e = _mm256_set_epi32(1, -4, 16, -64, 256, -1024, 4096, -16384); + assert_eq_m256i(r, e); + } + test_mm256_srav_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_srlv_epi32() { + let a = _mm_set_epi32(16, 32, 64, 128); + let b = _mm_set_epi32(4, 3, 2, 1); + let r = _mm_srlv_epi32(a, b); + let e = _mm_set_epi32(1, 4, 16, 64); + assert_eq_m128i(r, e); + } + test_mm_srlv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srlv_epi32() { + let a = _mm256_set_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768); + let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let r = _mm256_srlv_epi32(a, b); + let e = _mm256_set_epi32(1, 4, 16, 64, 256, 1024, 4096, 16384); + assert_eq_m256i(r, e); + } + test_mm256_srlv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_srlv_epi64() { + let a = _mm_set_epi64x(4, 8); + let b = _mm_set_epi64x(2, 1); + let r = _mm_srlv_epi64(a, b); + let e = _mm_set_epi64x(1, 4); + 
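+        // Each lane uses its own shift count here: 4 >> 2 = 1 and 8 >> 1 = 4.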
assert_eq_m128i(r, e);
+    }
+    test_mm_srlv_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srlv_epi64() {
+        let a = _mm256_set_epi64x(16, 32, 64, 128);
+        let b = _mm256_set_epi64x(4, 3, 2, 1);
+        let r = _mm256_srlv_epi64(a, b);
+        let e = _mm256_set_epi64x(1, 4, 16, 64);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_srlv_epi64();
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
+    _mm_set_epi64x(b, a)
+}
+
+#[track_caller]
+#[target_feature(enable = "sse")]
+unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+    let r = _mm_cmpeq_ps(a, b);
+    if _mm_movemask_ps(r) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256(a: __m256, b: __m256) {
+    let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
+    if _mm256_movemask_ps(cmp) != 0b11111111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
+    let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
+    if _mm256_movemask_pd(cmp) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+    assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
+}
+
+/// Stores `T` at an unaligned address
+struct Unaligned<T> {
+    buf: Vec<u8>,
+    offset: bool,
+    _marker: std::marker::PhantomData<T>,
+}
+
+impl<T> Unaligned<T> {
+    fn new(value: T) -> Self {
+        // Allocate extra byte for unalignment headroom
+        let len = std::mem::size_of::<T>();
+        let mut buf = Vec::<u8>::with_capacity(len + 1);
+        // Force the address to be a non-multiple of 2, so it is as unaligned as it can get.
+        let offset = (buf.as_ptr() as usize % 2) == 0;
+        let value_ptr: *const T = &value;
+        unsafe {
+            buf.as_mut_ptr().add(offset.into()).copy_from_nonoverlapping(value_ptr.cast(), len);
+        }
+        Self { buf, offset, _marker: std::marker::PhantomData }
+    }
+
+    fn as_ptr(&self) -> *const T {
+        unsafe { self.buf.as_ptr().add(self.offset.into()).cast() }
+    }
+
+    fn as_mut_ptr(&mut self) -> *mut T {
+        unsafe { self.buf.as_mut_ptr().add(self.offset.into()).cast() }
+    }
+
+    fn read(&self) -> T {
+        unsafe { self.as_ptr().read_unaligned() }
+    }
+}