From 964ade8f169beff85fdaf20a64b2a8ee9a8398e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Fri, 19 Apr 2024 20:46:27 +0200 Subject: [PATCH] Implement LLVM x86 AVX2 intrinsics --- src/shims/x86/avx.rs | 71 +- src/shims/x86/avx2.rs | 444 ++++++++ src/shims/x86/mod.rs | 402 +++++++ src/shims/x86/sse2.rs | 75 +- src/shims/x86/sse41.rs | 59 +- src/shims/x86/ssse3.rs | 65 +- tests/pass/intrinsics-x86-avx2.rs | 1613 +++++++++++++++++++++++++++++ 7 files changed, 2473 insertions(+), 256 deletions(-) create mode 100644 src/shims/x86/avx2.rs create mode 100644 tests/pass/intrinsics-x86-avx2.rs diff --git a/src/shims/x86/avx.rs b/src/shims/x86/avx.rs index 23c78647b9..41c20d768f 100644 --- a/src/shims/x86/avx.rs +++ b/src/shims/x86/avx.rs @@ -7,7 +7,8 @@ use rustc_target::spec::abi::Abi; use super::{ bin_op_simd_float_all, conditional_dot_product, convert_float_to_int, horizontal_bin_op, - round_all, test_bits_masked, test_high_bits_masked, unary_op_ps, FloatBinOp, FloatUnaryOp, + mask_load, mask_store, round_all, test_bits_masked, test_high_bits_masked, unary_op_ps, + FloatBinOp, FloatUnaryOp, }; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -347,71 +348,3 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: Ok(EmulateForeignItemResult::NeedsJumping) } } - -/// Conditionally loads from `ptr` according the high bit of each -/// element of `mask`. `ptr` does not need to be aligned. -fn mask_load<'tcx>( - this: &mut crate::MiriInterpCx<'_, 'tcx>, - ptr: &OpTy<'tcx, Provenance>, - mask: &OpTy<'tcx, Provenance>, - dest: &MPlaceTy<'tcx, Provenance>, -) -> InterpResult<'tcx, ()> { - let (mask, mask_len) = this.operand_to_simd(mask)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(dest_len, mask_len); - - let mask_item_size = mask.layout.field(this, 0).size; - let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); - - let ptr = this.read_pointer(ptr)?; - for i in 0..dest_len { - let mask = this.project_index(&mask, i)?; - let dest = this.project_index(&dest, i)?; - - if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { - // Size * u64 is implemented as always checked - #[allow(clippy::arithmetic_side_effects)] - let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx); - // Unaligned copy, which is what we want. - this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?; - } else { - this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?; - } - } - - Ok(()) -} - -/// Conditionally stores into `ptr` according the high bit of each -/// element of `mask`. `ptr` does not need to be aligned. -fn mask_store<'tcx>( - this: &mut crate::MiriInterpCx<'_, 'tcx>, - ptr: &OpTy<'tcx, Provenance>, - mask: &OpTy<'tcx, Provenance>, - value: &OpTy<'tcx, Provenance>, -) -> InterpResult<'tcx, ()> { - let (mask, mask_len) = this.operand_to_simd(mask)?; - let (value, value_len) = this.operand_to_simd(value)?; - - assert_eq!(value_len, mask_len); - - let mask_item_size = mask.layout.field(this, 0).size; - let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); - - let ptr = this.read_pointer(ptr)?; - for i in 0..value_len { - let mask = this.project_index(&mask, i)?; - let value = this.project_index(&value, i)?; - - if this.read_scalar(&mask)?.to_uint(mask_item_size)? 
>> high_bit_offset != 0 { - // Size * u64 is implemented as always checked - #[allow(clippy::arithmetic_side_effects)] - let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx); - // Unaligned copy, which is what we want. - this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?; - } - } - - Ok(()) -} diff --git a/src/shims/x86/avx2.rs b/src/shims/x86/avx2.rs new file mode 100644 index 0000000000..bbf53f9f1e --- /dev/null +++ b/src/shims/x86/avx2.rs @@ -0,0 +1,444 @@ +use crate::rustc_middle::ty::layout::LayoutOf as _; +use rustc_middle::mir; +use rustc_middle::ty::Ty; +use rustc_span::Symbol; +use rustc_target::spec::abi::Abi; + +use super::{ + horizontal_bin_op, int_abs, mask_load, mask_store, mpsadbw, packssdw, packsswb, packusdw, + packuswb, pmulhrsw, psign, shift_simd_by_scalar, shift_simd_by_simd, ShiftOp, +}; +use crate::*; +use shims::foreign_items::EmulateForeignItemResult; + +impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {} +pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: + crate::MiriInterpCxExt<'mir, 'tcx> +{ + fn emulate_x86_avx2_intrinsic( + &mut self, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx, Provenance>], + dest: &MPlaceTy<'tcx, Provenance>, + ) -> InterpResult<'tcx, EmulateForeignItemResult> { + let this = self.eval_context_mut(); + this.expect_target_feature_for_intrinsic(link_name, "avx2")?; + // Prefix should have already been checked. + let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.avx2.").unwrap(); + + match unprefixed_name { + // Used to implement the _mm256_abs_epi{8,16,32} functions. + // Calculates the absolute value of packed 8/16/32-bit integers. + "pabs.b" | "pabs.w" | "pabs.d" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + int_abs(this, op, dest)?; + } + // Used to implement the _mm256_h{add,adds,sub}_epi{16,32} functions. + // Horizontally add / add with saturation / subtract adjacent 16/32-bit + // integer values in `left` and `right`. + "phadd.w" | "phadd.sw" | "phadd.d" | "phsub.w" | "phsub.sw" | "phsub.d" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (which, saturating) = match unprefixed_name { + "phadd.w" | "phadd.d" => (mir::BinOp::Add, false), + "phadd.sw" => (mir::BinOp::Add, true), + "phsub.w" | "phsub.d" => (mir::BinOp::Sub, false), + "phsub.sw" => (mir::BinOp::Sub, true), + _ => unreachable!(), + }; + + horizontal_bin_op(this, which, saturating, left, right, dest)?; + } + // Used to implement `_mm{,_mask}_{i32,i64}gather_{epi32,epi64,pd,ps}` functions + // Gathers elements from `slice` using `offsets * scale` as indices. + // When the highest bit of the corresponding element of `mask` is 0, + // the value is copied from `src` instead. 
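+            // For example, with 32-bit elements, offsets of [0, 4, 8, 12] and a scale of 4,
+            // the gathered values are read from byte offsets 0, 16, 32 and 48 of `slice`
+            // (i.e., each element address is `slice + offset * scale`).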
+ "gather.d.d" | "gather.d.d.256" | "gather.d.q" | "gather.d.q.256" | "gather.q.d" + | "gather.q.d.256" | "gather.q.q" | "gather.q.q.256" | "gather.d.pd" + | "gather.d.pd.256" | "gather.q.pd" | "gather.q.pd.256" | "gather.d.ps" + | "gather.d.ps.256" | "gather.q.ps" | "gather.q.ps.256" => { + let [src, slice, offsets, mask, scale] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + assert_eq!(dest.layout, src.layout); + + let (src, _) = this.operand_to_simd(src)?; + let (offsets, offsets_len) = this.operand_to_simd(offsets)?; + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + // There are cases like dest: i32x4, offsets: i64x2 + let actual_len = dest_len.min(offsets_len); + + assert_eq!(dest_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let scale = this.read_scalar(scale)?.to_i8()?; + if !matches!(scale, 1 | 2 | 4 | 8) { + throw_unsup_format!("invalid gather scale {scale}"); + } + let scale = i64::from(scale); + + let slice = this.read_pointer(slice)?; + for i in 0..actual_len { + let mask = this.project_index(&mask, i)?; + let dest = this.project_index(&dest, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + let offset = this.project_index(&offsets, i)?; + let offset = + i64::try_from(this.read_scalar(&offset)?.to_int(offset.layout.size)?) + .unwrap(); + let ptr = slice + .wrapping_signed_offset(offset.checked_mul(scale).unwrap(), &this.tcx); + // Unaligned copy, which is what we want. + this.mem_copy( + ptr, + dest.ptr(), + dest.layout.size, + /*nonoverlapping*/ true, + )?; + } else { + this.copy_op(&this.project_index(&src, i)?, &dest)?; + } + } + for i in actual_len..dest_len { + let dest = this.project_index(&dest, i)?; + this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?; + } + } + // Used to implement the _mm256_madd_epi16 function. + // Multiplies packed signed 16-bit integers in `left` and `right`, producing + // intermediate signed 32-bit integers. Horizontally add adjacent pairs of + // intermediate 32-bit integers, and pack the results in `dest`. + "pmadd.wd" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(left_len, right_len); + assert_eq!(dest_len.checked_mul(2).unwrap(), left_len); + + for i in 0..dest_len { + let j1 = i.checked_mul(2).unwrap(); + let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_i16()?; + let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i16()?; + + let j2 = j1.checked_add(1).unwrap(); + let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_i16()?; + let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i16()?; + + let dest = this.project_index(&dest, i)?; + + // Multiplications are i16*i16->i32, which will not overflow. + let mul1 = i32::from(left1).checked_mul(right1.into()).unwrap(); + let mul2 = i32::from(left2).checked_mul(right2.into()).unwrap(); + // However, this addition can overflow in the most extreme case + // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000 + let res = mul1.wrapping_add(mul2); + + this.write_scalar(Scalar::from_i32(res), &dest)?; + } + } + // Used to implement the _mm256_maddubs_epi16 function. 
+ // Multiplies packed 8-bit unsigned integers from `left` and packed + // signed 8-bit integers from `right` into 16-bit signed integers. Then, + // the saturating sum of the products with indices `2*i` and `2*i+1` + // produces the output at index `i`. + "pmadd.ub.sw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(left_len, right_len); + assert_eq!(dest_len.checked_mul(2).unwrap(), left_len); + + for i in 0..dest_len { + let j1 = i.checked_mul(2).unwrap(); + let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_u8()?; + let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i8()?; + + let j2 = j1.checked_add(1).unwrap(); + let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_u8()?; + let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i8()?; + + let dest = this.project_index(&dest, i)?; + + // Multiplication of a u8 and an i8 into an i16 cannot overflow. + let mul1 = i16::from(left1).checked_mul(right1.into()).unwrap(); + let mul2 = i16::from(left2).checked_mul(right2.into()).unwrap(); + let res = mul1.saturating_add(mul2); + + this.write_scalar(Scalar::from_i16(res), &dest)?; + } + } + // Used to implement the _mm_maskload_epi32, _mm_maskload_epi64, + // _mm256_maskload_epi32 and _mm256_maskload_epi64 functions. + // For the element `i`, if the high bit of the `i`-th element of `mask` + // is one, it is loaded from `ptr.wrapping_add(i)`, otherwise zero is + // loaded. + "maskload.d" | "maskload.q" | "maskload.d.256" | "maskload.q.256" => { + let [ptr, mask] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mask_load(this, ptr, mask, dest)?; + } + // Used to implement the _mm_maskstore_epi32, _mm_maskstore_epi64, + // _mm256_maskstore_epi32 and _mm256_maskstore_epi64 functions. + // For the element `i`, if the high bit of the element `i`-th of `mask` + // is one, it is stored into `ptr.wapping_add(i)`. + // Unlike SSE2's _mm_maskmoveu_si128, these are not non-temporal stores. + "maskstore.d" | "maskstore.q" | "maskstore.d.256" | "maskstore.q.256" => { + let [ptr, mask, value] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mask_store(this, ptr, mask, value)?; + } + // Used to implement the _mm256_mpsadbw_epu8 function. + // Compute the sum of absolute differences of quadruplets of unsigned + // 8-bit integers in `left` and `right`, and store the 16-bit results + // in `right`. Quadruplets are selected from `left` and `right` with + // offsets specified in `imm`. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8 + "mpsadbw" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mpsadbw(this, left, right, imm, dest)?; + } + // Used to implement the _mm256_mulhrs_epi16 function. + // Multiplies packed 16-bit signed integer values, truncates the 32-bit + // product to the 18 most significant bits by right-shifting, and then + // divides the 18-bit value by 2 (rounding to nearest) by first adding + // 1 and then taking the bits `1..=16`. 
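+            // For example, when both inputs are 0x4000 (16384), the 32-bit product is
+            // 0x1000_0000; shifting right by 14 gives 0x4000, then adding 1 and shifting
+            // right by 1 yields 0x2000 (8192).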
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16 + "pmul.hr.sw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + pmulhrsw(this, left, right, dest)?; + } + // Used to implement the _mm256_packs_epi16 function. + // Converts two 16-bit integer vectors to a single 8-bit integer + // vector with signed saturation. + "packsswb" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packsswb(this, left, right, dest)?; + } + // Used to implement the _mm256_packs_epi32 function. + // Converts two 32-bit integer vectors to a single 16-bit integer + // vector with signed saturation. + "packssdw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packssdw(this, left, right, dest)?; + } + // Used to implement the _mm256_packus_epi16 function. + // Converts two 16-bit signed integer vectors to a single 8-bit + // unsigned integer vector with saturation. + "packuswb" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packuswb(this, left, right, dest)?; + } + // Used to implement the _mm256_packus_epi32 function. + // Concatenates two 32-bit signed integer vectors and converts + // the result to a 16-bit unsigned integer vector with saturation. + "packusdw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + packusdw(this, left, right, dest)?; + } + // Used to implement the _mm256_permutevar8x32_epi32 and + // _mm256_permutevar8x32_ps function. + // Shuffles `left` using the three low bits of each element of `right` + // as indices. + "permd" | "permps" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let dest = this.project_index(&dest, i)?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u32()?; + let left = this.project_index(&left, (right & 0b111).into())?; + + this.copy_op(&left, &dest)?; + } + } + // Used to implement the _mm256_permute2x128_si256 function. + // Shuffles 128-bit blocks of `a` and `b` using `imm` as pattern. + "vperm2i128" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + assert_eq!(left.layout.size.bits(), 256); + assert_eq!(right.layout.size.bits(), 256); + assert_eq!(dest.layout.size.bits(), 256); + + // Transmute to `[i128; 2]` + + let array_layout = + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.i128, 2))?; + let left = left.transmute(array_layout, this)?; + let right = right.transmute(array_layout, this)?; + let dest = dest.transmute(array_layout, this)?; + + let imm = this.read_scalar(imm)?.to_u8()?; + + for i in 0..2 { + let dest = this.project_index(&dest, i)?; + let src = match (imm >> i.checked_mul(4).unwrap()) & 0b11 { + 0 => this.project_index(&left, 0)?, + 1 => this.project_index(&left, 1)?, + 2 => this.project_index(&right, 0)?, + 3 => this.project_index(&right, 1)?, + _ => unreachable!(), + }; + + this.copy_op(&src, &dest)?; + } + } + // Used to implement the _mm256_sad_epu8 function. 
+ // Compute the absolute differences of packed unsigned 8-bit integers + // in `left` and `right`, then horizontally sum each consecutive 8 + // differences to produce four unsigned 16-bit integers, and pack + // these unsigned 16-bit integers in the low 16 bits of 64-bit elements + // in `dest`. + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8 + "psad.bw" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(left_len, right_len); + assert_eq!(left_len, dest_len.checked_mul(8).unwrap()); + + for i in 0..dest_len { + let dest = this.project_index(&dest, i)?; + + let mut acc: u16 = 0; + for j in 0..8 { + let src_index = i.checked_mul(8).unwrap().checked_add(j).unwrap(); + + let left = this.project_index(&left, src_index)?; + let left = this.read_scalar(&left)?.to_u8()?; + + let right = this.project_index(&right, src_index)?; + let right = this.read_scalar(&right)?.to_u8()?; + + acc = acc.checked_add(left.abs_diff(right).into()).unwrap(); + } + + this.write_scalar(Scalar::from_u64(acc.into()), &dest)?; + } + } + // Used to implement the _mm256_shuffle_epi8 intrinsic. + // Shuffles bytes from `left` using `right` as pattern. + // Each 128-bit block is shuffled independently. + "pshuf.b" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?; + let dest = this.project_index(&dest, i)?; + + let res = if right & 0x80 == 0 { + // Shuffle each 128-bit (16-byte) block independently. + let j = u64::from(right % 16).checked_add(i & !15).unwrap(); + this.read_scalar(&this.project_index(&left, j)?)? + } else { + // If the highest bit in `right` is 1, write zero. + Scalar::from_u8(0) + }; + + this.write_scalar(res, &dest)?; + } + } + // Used to implement the _mm256_sign_epi{8,16,32} functions. + // Negates elements from `left` when the corresponding element in + // `right` is negative. If an element from `right` is zero, zero + // is writen to the corresponding output element. + // Basically, we multiply `left` with `right.signum()`. + "psign.b" | "psign.w" | "psign.d" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + psign(this, left, right, dest)?; + } + // Used to implement the _mm256_{sll,srl,sra}_epi{16,32,64} functions + // (except _mm256_sra_epi64, which is not available in AVX2). + // Shifts N-bit packed integers in left by the amount in right. + // `right` is as 128-bit vector. but it is interpreted as a single + // 64-bit integer (remaining bits are ignored). + // For logic shifts, when right is larger than N - 1, zero is produced. + // For arithmetic shifts, when right is larger than N - 1, the sign bit + // is copied to remaining bits. 
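+            // For example, shifting 16-bit lanes by 20: a logical shift yields 0 in every
+            // lane, while an arithmetic shift of a negative lane such as -1 still yields -1.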
+ "psll.w" | "psrl.w" | "psra.w" | "psll.d" | "psrl.d" | "psra.d" | "psll.q" + | "psrl.q" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "psll.w" | "psll.d" | "psll.q" => ShiftOp::Left, + "psrl.w" | "psrl.d" | "psrl.q" => ShiftOp::RightLogic, + "psra.w" | "psra.d" => ShiftOp::RightArith, + _ => unreachable!(), + }; + + shift_simd_by_scalar(this, left, right, which, dest)?; + } + // Used to implement the _mm{,256}_{sllv,srlv,srav}_epi{32,64} functions + // (except _mm{,256}_srav_epi64, which are not available in AVX2). + "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" | "psrlv.d" | "psrlv.d.256" + | "psrlv.q" | "psrlv.q.256" | "psrav.d" | "psrav.d.256" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "psllv.d" | "psllv.d.256" | "psllv.q" | "psllv.q.256" => ShiftOp::Left, + "psrlv.d" | "psrlv.d.256" | "psrlv.q" | "psrlv.q.256" => ShiftOp::RightLogic, + "psrav.d" | "psrav.d.256" => ShiftOp::RightArith, + _ => unreachable!(), + }; + + shift_simd_by_simd(this, left, right, which, dest)?; + } + _ => return Ok(EmulateForeignItemResult::NotSupported), + } + Ok(EmulateForeignItemResult::NeedsJumping) + } +} diff --git a/src/shims/x86/mod.rs b/src/shims/x86/mod.rs index 615821b2e3..9b8ea7b690 100644 --- a/src/shims/x86/mod.rs +++ b/src/shims/x86/mod.rs @@ -14,6 +14,7 @@ use shims::foreign_items::EmulateForeignItemResult; mod aesni; mod avx; +mod avx2; mod sse; mod sse2; mod sse3; @@ -136,6 +137,11 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: this, link_name, abi, args, dest, ); } + name if name.starts_with("avx2.") => { + return avx2::EvalContextExt::emulate_x86_avx2_intrinsic( + this, link_name, abi, args, dest, + ); + } _ => return Ok(EmulateForeignItemResult::NotSupported), } @@ -534,6 +540,61 @@ fn shift_simd_by_scalar<'tcx>( Ok(()) } +/// Shifts each element of `left` by the corresponding element of `right`. +/// +/// For logic shifts, when right is larger than BITS - 1, zero is produced. +/// For arithmetic right-shifts, when right is larger than BITS - 1, the sign +/// bit is copied to remaining bits. +fn shift_simd_by_simd<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + which: ShiftOp, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?; + let right = this.read_scalar(&this.project_index(&right, i)?)?; + let dest = this.project_index(&dest, i)?; + + // It is ok to saturate the value to u32::MAX because any value + // above BITS - 1 will produce the same result. + let shift = u32::try_from(right.to_uint(dest.layout.size)?).unwrap_or(u32::MAX); + + let res = match which { + ShiftOp::Left => { + let left = left.to_uint(dest.layout.size)?; + let res = left.checked_shl(shift).unwrap_or(0); + // `truncate` is needed as left-shift can make the absolute value larger. 
+ Scalar::from_uint(dest.layout.size.truncate(res), dest.layout.size) + } + ShiftOp::RightLogic => { + let left = left.to_uint(dest.layout.size)?; + let res = left.checked_shr(shift).unwrap_or(0); + // No `truncate` needed as right-shift can only make the absolute value smaller. + Scalar::from_uint(res, dest.layout.size) + } + ShiftOp::RightArith => { + let left = left.to_int(dest.layout.size)?; + // On overflow, copy the sign bit to the remaining bits + let res = left.checked_shr(shift).unwrap_or(left >> 127); + // No `truncate` needed as right-shift can only make the absolute value smaller. + Scalar::from_int(res, dest.layout.size) + } + }; + this.write_scalar(res, &dest)?; + } + + Ok(()) +} + /// Takes a 128-bit vector, transmutes it to `[u64; 2]` and extracts /// the first value. fn extract_first_u64<'tcx>( @@ -664,6 +725,30 @@ fn convert_float_to_int<'tcx>( Ok(()) } +/// Calculates absolute value of integers in `op` and stores the result in `dest`. +fn int_abs<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + op: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(op_len, dest_len); + + for i in 0..dest_len { + let op = this.read_scalar(&this.project_index(&op, i)?)?; + let dest = this.project_index(&dest, i)?; + + // Converting to a host "i128" works since the input is always signed. + let res = op.to_int(dest.layout.size)?.unsigned_abs(); + + this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?; + } + + Ok(()) +} + /// Splits `op` (which must be a SIMD vector) into 128-bit chuncks. /// /// Returns a tuple where: @@ -874,3 +959,320 @@ fn test_high_bits_masked<'tcx>( Ok((direct, negated)) } + +/// Conditionally loads from `ptr` according the high bit of each +/// element of `mask`. `ptr` does not need to be aligned. +fn mask_load<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + ptr: &OpTy<'tcx, Provenance>, + mask: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let ptr = this.read_pointer(ptr)?; + for i in 0..dest_len { + let mask = this.project_index(&mask, i)?; + let dest = this.project_index(&dest, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + // Size * u64 is implemented as always checked + #[allow(clippy::arithmetic_side_effects)] + let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx); + // Unaligned copy, which is what we want. + this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?; + } else { + this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?; + } + } + + Ok(()) +} + +/// Conditionally stores into `ptr` according the high bit of each +/// element of `mask`. `ptr` does not need to be aligned. 
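+///
+/// For example, with a 4-lane `value` and an i32 mask of [-1, 0, -1, 0], only
+/// lanes 0 and 2 are written to memory; lanes 1 and 3 are left untouched.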
+fn mask_store<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + ptr: &OpTy<'tcx, Provenance>, + mask: &OpTy<'tcx, Provenance>, + value: &OpTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (value, value_len) = this.operand_to_simd(value)?; + + assert_eq!(value_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let ptr = this.read_pointer(ptr)?; + for i in 0..value_len { + let mask = this.project_index(&mask, i)?; + let value = this.project_index(&value, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + // Size * u64 is implemented as always checked + #[allow(clippy::arithmetic_side_effects)] + let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx); + // Unaligned copy, which is what we want. + this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?; + } + } + + Ok(()) +} + +/// Compute the sum of absolute differences of quadruplets of unsigned +/// 8-bit integers in `left` and `right`, and store the 16-bit results +/// in `right`. Quadruplets are selected from `left` and `right` with +/// offsets specified in `imm`. +/// +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8 +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn mpsadbw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + imm: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + assert_eq!(left.layout, right.layout); + assert_eq!(left.layout.size, dest.layout.size); + + let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?; + let (_, _, right) = split_simd_to_128bit_chunks(this, right)?; + let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?; + + assert_eq!(op_items_per_chunk, dest_items_per_chunk.checked_mul(2).unwrap()); + + let imm = this.read_scalar(imm)?.to_uint(imm.layout.size)?; + // Bit 2 of `imm` specifies the offset for indices of `left`. + // The offset is 0 when the bit is 0 or 4 when the bit is 1. + let left_offset = u64::try_from((imm >> 2) & 1).unwrap().checked_mul(4).unwrap(); + // Bits 0..=1 of `imm` specify the offset for indices of + // `right` in blocks of 4 elements. + let right_offset = u64::try_from(imm & 0b11).unwrap().checked_mul(4).unwrap(); + + for i in 0..num_chunks { + let left = this.project_index(&left, i)?; + let right = this.project_index(&right, i)?; + let dest = this.project_index(&dest, i)?; + + for j in 0..dest_items_per_chunk { + let left_offset = left_offset.checked_add(j).unwrap(); + let mut res: u16 = 0; + for k in 0..4 { + let left = this + .read_scalar(&this.project_index(&left, left_offset.checked_add(k).unwrap())?)? + .to_u8()?; + let right = this + .read_scalar( + &this.project_index(&right, right_offset.checked_add(k).unwrap())?, + )? 
+ .to_u8()?; + res = res.checked_add(left.abs_diff(right).into()).unwrap(); + } + this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, j)?)?; + } + } + + Ok(()) +} + +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// product to the 18 most significant bits by right-shifting, and then +/// divides the 18-bit value by 2 (rounding to nearest) by first adding +/// 1 and then taking the bits `1..=16`. +/// +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +/// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16 +fn pmulhrsw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; + let dest = this.project_index(&dest, i)?; + + let res = + (i32::from(left).checked_mul(right.into()).unwrap() >> 14).checked_add(1).unwrap() >> 1; + + // The result of this operation can overflow a signed 16-bit integer. + // When `left` and `right` are -0x8000, the result is 0x8000. + #[allow(clippy::cast_possible_truncation)] + let res = res as i16; + + this.write_scalar(Scalar::from_i16(res), &dest)?; + } + + Ok(()) +} + +fn pack_generic<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, + f: impl Fn(Scalar) -> InterpResult<'tcx, Scalar>, +) -> InterpResult<'tcx, ()> { + assert_eq!(left.layout, right.layout); + assert_eq!(left.layout.size, dest.layout.size); + + let (num_chunks, op_items_per_chunk, left) = split_simd_to_128bit_chunks(this, left)?; + let (_, _, right) = split_simd_to_128bit_chunks(this, right)?; + let (_, dest_items_per_chunk, dest) = split_simd_to_128bit_chunks(this, dest)?; + + assert_eq!(dest_items_per_chunk, op_items_per_chunk.checked_mul(2).unwrap()); + + for i in 0..num_chunks { + let left = this.project_index(&left, i)?; + let right = this.project_index(&right, i)?; + let dest = this.project_index(&dest, i)?; + + for j in 0..op_items_per_chunk { + let left = this.read_scalar(&this.project_index(&left, j)?)?; + let right = this.read_scalar(&this.project_index(&right, j)?)?; + let left_dest = this.project_index(&dest, j)?; + let right_dest = + this.project_index(&dest, j.checked_add(op_items_per_chunk).unwrap())?; + + let left_res = f(left)?; + let right_res = f(right)?; + + this.write_scalar(left_res, &left_dest)?; + this.write_scalar(right_res, &right_dest)?; + } + } + + Ok(()) +} + +/// Converts two 16-bit integer vectors to a single 8-bit integer +/// vector with signed saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). 
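+///
+/// For example, the i16 lanes [1, 200, -200, i16::MIN] saturate to the i8 lanes
+/// [1, 127, -128, -128].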
+fn packsswb<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i16()?; + let res = i8::try_from(op).unwrap_or(if op < 0 { i8::MIN } else { i8::MAX }); + Ok(Scalar::from_i8(res)) + }) +} + +/// Converts two 16-bit signed integer vectors to a single 8-bit +/// unsigned integer vector with saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn packuswb<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i16()?; + let res = u8::try_from(op).unwrap_or(if op < 0 { 0 } else { u8::MAX }); + Ok(Scalar::from_u8(res)) + }) +} + +/// Converts two 32-bit integer vectors to a single 16-bit integer +/// vector with signed saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn packssdw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i32()?; + let res = i16::try_from(op).unwrap_or(if op < 0 { i16::MIN } else { i16::MAX }); + Ok(Scalar::from_i16(res)) + }) +} + +/// Converts two 32-bit integer vectors to a single 16-bit integer +/// vector with unsigned saturation. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the is i-th 128-bit chunk of `dest` is calculated with the i-th +/// 128-bit chunks of `left` and `right`). +fn packusdw<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + pack_generic(this, left, right, dest, |op| { + let op = op.to_i32()?; + let res = u16::try_from(op).unwrap_or(if op < 0 { 0 } else { u16::MAX }); + Ok(Scalar::from_u16(res)) + }) +} + +/// Negates elements from `left` when the corresponding element in +/// `right` is negative. If an element from `right` is zero, zero +/// is writen to the corresponding output element. +/// In other words, multiplies `left` with `right.signum()`. 
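+///
+/// For example, psign([1, -2, 3, -4], [0, 1, -1, 2]) yields [0, -2, -3, -4].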
+fn psign<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &MPlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.mplace_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + for i in 0..dest_len { + let dest = this.project_index(&dest, i)?; + let left = this.read_immediate(&this.project_index(&left, i)?)?; + let right = this.read_scalar(&this.project_index(&right, i)?)?.to_int(dest.layout.size)?; + + let res = this.wrapping_binary_op( + mir::BinOp::Mul, + &left, + &ImmTy::from_int(right.signum(), dest.layout), + )?; + + this.write_immediate(*res, &dest)?; + } + + Ok(()) +} diff --git a/src/shims/x86/sse2.rs b/src/shims/x86/sse2.rs index 9db30d7ddc..63b6a30194 100644 --- a/src/shims/x86/sse2.rs +++ b/src/shims/x86/sse2.rs @@ -3,8 +3,8 @@ use rustc_span::Symbol; use rustc_target::spec::abi::Abi; use super::{ - bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, shift_simd_by_scalar, - FloatBinOp, ShiftOp, + bin_op_simd_float_all, bin_op_simd_float_first, convert_float_to_int, packssdw, packsswb, + packuswb, shift_simd_by_scalar, FloatBinOp, ShiftOp, }; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -176,29 +176,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - // left and right are i16x8, dest is i8x16 - assert_eq!(left_len, 8); - assert_eq!(right_len, 8); - assert_eq!(dest_len, 16); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = - i8::try_from(left).unwrap_or(if left < 0 { i8::MIN } else { i8::MAX }); - let right_res = - i8::try_from(right).unwrap_or(if right < 0 { i8::MIN } else { i8::MAX }); - - this.write_scalar(Scalar::from_i8(left_res), &left_dest)?; - this.write_scalar(Scalar::from_i8(right_res), &right_dest)?; - } + packsswb(this, left, right, dest)?; } // Used to implement the _mm_packus_epi16 function. 
// Converts two 16-bit signed integer vectors to a single 8-bit @@ -207,28 +185,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - // left and right are i16x8, dest is u8x16 - assert_eq!(left_len, 8); - assert_eq!(right_len, 8); - assert_eq!(dest_len, 16); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = u8::try_from(left).unwrap_or(if left < 0 { 0 } else { u8::MAX }); - let right_res = - u8::try_from(right).unwrap_or(if right < 0 { 0 } else { u8::MAX }); - - this.write_scalar(Scalar::from_u8(left_res), &left_dest)?; - this.write_scalar(Scalar::from_u8(right_res), &right_dest)?; - } + packuswb(this, left, right, dest)?; } // Used to implement the _mm_packs_epi32 function. // Converts two 32-bit integer vectors to a single 16-bit integer @@ -237,29 +194,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - // left and right are i32x4, dest is i16x8 - assert_eq!(left_len, 4); - assert_eq!(right_len, 4); - assert_eq!(dest_len, 8); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = - i16::try_from(left).unwrap_or(if left < 0 { i16::MIN } else { i16::MAX }); - let right_res = - i16::try_from(right).unwrap_or(if right < 0 { i16::MIN } else { i16::MAX }); - - this.write_scalar(Scalar::from_i16(left_res), &left_dest)?; - this.write_scalar(Scalar::from_i16(right_res), &right_dest)?; - } + packssdw(this, left, right, dest)?; } // Used to implement _mm_min_sd and _mm_max_sd functions. 
// Note that the semantics are a bit different from Rust simd_min diff --git a/src/shims/x86/sse41.rs b/src/shims/x86/sse41.rs index 16a82eed99..19bc27421d 100644 --- a/src/shims/x86/sse41.rs +++ b/src/shims/x86/sse41.rs @@ -1,7 +1,7 @@ use rustc_span::Symbol; use rustc_target::spec::abi::Abi; -use super::{conditional_dot_product, round_all, round_first, test_bits_masked}; +use super::{conditional_dot_product, mpsadbw, packusdw, round_all, round_first, test_bits_masked}; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -68,27 +68,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(left_len, right_len); - assert_eq!(dest_len, left_len.checked_mul(2).unwrap()); - - for i in 0..left_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i32()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i32()?; - let left_dest = this.project_index(&dest, i)?; - let right_dest = this.project_index(&dest, i.checked_add(left_len).unwrap())?; - - let left_res = - u16::try_from(left).unwrap_or(if left < 0 { 0 } else { u16::MAX }); - let right_res = - u16::try_from(right).unwrap_or(if right < 0 { 0 } else { u16::MAX }); - - this.write_scalar(Scalar::from_u16(left_res), &left_dest)?; - this.write_scalar(Scalar::from_u16(right_res), &right_dest)?; - } + packusdw(this, left, right, dest)?; } // Used to implement the _mm_dp_ps and _mm_dp_pd functions. // Conditionally multiplies the packed floating-point elements in @@ -176,40 +156,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right, imm] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(left_len, right_len); - assert_eq!(left_len, dest_len.checked_mul(2).unwrap()); - - let imm = this.read_scalar(imm)?.to_u8()?; - // Bit 2 of `imm` specifies the offset for indices of `left`. - // The offset is 0 when the bit is 0 or 4 when the bit is 1. - let left_offset = u64::from((imm >> 2) & 1).checked_mul(4).unwrap(); - // Bits 0..=1 of `imm` specify the offset for indices of - // `right` in blocks of 4 elements. - let right_offset = u64::from(imm & 0b11).checked_mul(4).unwrap(); - - for i in 0..dest_len { - let left_offset = left_offset.checked_add(i).unwrap(); - let mut res: u16 = 0; - for j in 0..4 { - let left = this - .read_scalar( - &this.project_index(&left, left_offset.checked_add(j).unwrap())?, - )? - .to_u8()?; - let right = this - .read_scalar( - &this - .project_index(&right, right_offset.checked_add(j).unwrap())?, - )? - .to_u8()?; - res = res.checked_add(left.abs_diff(right).into()).unwrap(); - } - this.write_scalar(Scalar::from_u16(res), &this.project_index(&dest, i)?)?; - } + mpsadbw(this, left, right, imm, dest)?; } // Used to implement the _mm_testz_si128, _mm_testc_si128 // and _mm_testnzc_si128 functions. 
diff --git a/src/shims/x86/ssse3.rs b/src/shims/x86/ssse3.rs index dd5d064b20..4f8e52dbb7 100644 --- a/src/shims/x86/ssse3.rs +++ b/src/shims/x86/ssse3.rs @@ -2,7 +2,7 @@ use rustc_middle::mir; use rustc_span::Symbol; use rustc_target::spec::abi::Abi; -use super::horizontal_bin_op; +use super::{horizontal_bin_op, int_abs, pmulhrsw, psign}; use crate::*; use shims::foreign_items::EmulateForeignItemResult; @@ -28,20 +28,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: "pabs.b.128" | "pabs.w.128" | "pabs.d.128" => { let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (op, op_len) = this.operand_to_simd(op)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(op_len, dest_len); - - for i in 0..dest_len { - let op = this.read_scalar(&this.project_index(&op, i)?)?; - let dest = this.project_index(&dest, i)?; - - // Converting to a host "i128" works since the input is always signed. - let res = op.to_int(dest.layout.size)?.unsigned_abs(); - - this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?; - } + int_abs(this, op, dest)?; } // Used to implement the _mm_shuffle_epi8 intrinsic. // Shuffles bytes from `left` using `right` as pattern. @@ -136,30 +123,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(dest_len, left_len); - assert_eq!(dest_len, right_len); - - for i in 0..dest_len { - let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?; - let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?; - let dest = this.project_index(&dest, i)?; - - let res = (i32::from(left).checked_mul(right.into()).unwrap() >> 14) - .checked_add(1) - .unwrap() - >> 1; - - // The result of this operation can overflow a signed 16-bit integer. - // When `left` and `right` are -0x8000, the result is 0x8000. - #[allow(clippy::cast_possible_truncation)] - let res = res as i16; - - this.write_scalar(Scalar::from_i16(res), &dest)?; - } + pmulhrsw(this, left, right, dest)?; } // Used to implement the _mm_sign_epi{8,16,32} functions. // Negates elements from `left` when the corresponding element in @@ -170,28 +134,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: let [left, right] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.mplace_to_simd(dest)?; - - assert_eq!(dest_len, left_len); - assert_eq!(dest_len, right_len); - - for i in 0..dest_len { - let dest = this.project_index(&dest, i)?; - let left = this.read_immediate(&this.project_index(&left, i)?)?; - let right = this - .read_scalar(&this.project_index(&right, i)?)? 
- .to_int(dest.layout.size)?; - - let res = this.wrapping_binary_op( - mir::BinOp::Mul, - &left, - &ImmTy::from_int(right.signum(), dest.layout), - )?; - - this.write_immediate(*res, &dest)?; - } + psign(this, left, right, dest)?; } _ => return Ok(EmulateForeignItemResult::NotSupported), } diff --git a/tests/pass/intrinsics-x86-avx2.rs b/tests/pass/intrinsics-x86-avx2.rs new file mode 100644 index 0000000000..80d125bb85 --- /dev/null +++ b/tests/pass/intrinsics-x86-avx2.rs @@ -0,0 +1,1613 @@ +// Ignore everything except x86 and x86_64 +// Any new targets that are added to CI should be ignored here. +// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.) +//@ignore-target-aarch64 +//@ignore-target-arm +//@ignore-target-avr +//@ignore-target-s390x +//@ignore-target-thumbv7em +//@ignore-target-wasm32 +//@compile-flags: -C target-feature=+avx2 + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::mem::transmute; + +fn main() { + assert!(is_x86_feature_detected!("avx2")); + + unsafe { + test_avx2(); + } +} + +#[target_feature(enable = "avx2")] +unsafe fn test_avx2() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/avx2.rs + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_abs_epi32() { + #[rustfmt::skip] + let a = _mm256_setr_epi32( + 0, 1, -1, i32::MAX, + i32::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi32(a); + #[rustfmt::skip] + let e = _mm256_setr_epi32( + 0, 1, 1, i32::MAX, + i32::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + test_mm256_abs_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_abs_epi16() { + #[rustfmt::skip] + let a = _mm256_setr_epi16( + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi16(a); + #[rustfmt::skip] + let e = _mm256_setr_epi16( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + test_mm256_abs_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_abs_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32, + 0, 1, -1, 2, -2, 3, -3, 4, + -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32, + ); + let r = _mm256_abs_epi8(a); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32, + 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32, + ); + assert_eq_m256i(r, e); + } + test_mm256_abs_epi8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hadd_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadd_epi16(a, b); + let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi16( + i16::MAX, + 1, + i16::MAX, + 2, + i16::MAX, + 3, + i16::MAX, + 4, + i16::MAX, + 5, + i16::MAX, + 6, + i16::MAX, + 7, + i16::MAX, + 8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + -1, + i16::MIN, + -2, + i16::MIN, + -3, + i16::MIN, + -4, + i16::MIN, + -5, + i16::MIN, + -6, + i16::MIN, + -7, + i16::MIN, + -8, + ); + let expected = _mm256_setr_epi16( + i16::MIN, + i16::MIN + 1, + i16::MIN + 2, + i16::MIN + 3, + i16::MAX, + i16::MAX - 1, + i16::MAX - 2, + i16::MAX - 3, + i16::MIN + 4, + i16::MIN + 5, + i16::MIN + 6, + i16::MIN + 7, + 
i16::MAX - 4, + i16::MAX - 5, + i16::MAX - 6, + i16::MAX - 7, + ); + let r = _mm256_hadd_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hadd_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hadd_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hadd_epi32(a, b); + let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi32(i32::MAX, 1, i32::MAX, 2, i32::MAX, 3, i32::MAX, 4); + let b = _mm256_setr_epi32(i32::MIN, -1, i32::MIN, -2, i32::MIN, -3, i32::MIN, -4); + let expected = _mm256_setr_epi32( + i32::MIN, + i32::MIN + 1, + i32::MAX, + i32::MAX - 1, + i32::MIN + 2, + i32::MIN + 3, + i32::MAX - 2, + i32::MAX - 3, + ); + let r = _mm256_hadd_epi32(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hadd_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hadds_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16::<0>(a, 0x7fff); + let a = _mm256_insert_epi16::<1>(a, 1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hadds_epi16(a, b); + let e = _mm256_setr_epi16(0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); + assert_eq_m256i(r, e); + + // Test saturating on overflow + let a = _mm256_setr_epi16( + i16::MAX, + 1, + i16::MAX, + 2, + i16::MAX, + 3, + i16::MAX, + 4, + i16::MAX, + 5, + i16::MAX, + 6, + i16::MAX, + 7, + i16::MAX, + 8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + -1, + i16::MIN, + -2, + i16::MIN, + -3, + i16::MIN, + -4, + i16::MIN, + -5, + i16::MIN, + -6, + i16::MIN, + -7, + i16::MIN, + -8, + ); + let expected = _mm256_setr_epi16( + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + ); + let r = _mm256_hadds_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hadds_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hsub_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsub_epi16(a, b); + let e = _mm256_set1_epi16(0); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi16( + i16::MAX, + -1, + i16::MAX, + -2, + i16::MAX, + -3, + i16::MAX, + -4, + i16::MAX, + -5, + i16::MAX, + -6, + i16::MAX, + -7, + i16::MAX, + -8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + 1, + i16::MIN, + 2, + i16::MIN, + 3, + i16::MIN, + 4, + i16::MIN, + 5, + i16::MIN, + 6, + i16::MIN, + 7, + i16::MIN, + 8, + ); + let expected = _mm256_setr_epi16( + i16::MIN, + i16::MIN + 1, + i16::MIN + 2, + i16::MIN + 3, + i16::MAX, + i16::MAX - 1, + i16::MAX - 2, + i16::MAX - 3, + i16::MIN + 4, + i16::MIN + 5, + i16::MIN + 6, + i16::MIN + 7, + i16::MAX - 4, + i16::MAX - 5, + i16::MAX - 6, + i16::MAX - 7, + ); + let r = _mm256_hsub_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hsub_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hsub_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(4); + let r = _mm256_hsub_epi32(a, b); + let e = _mm256_set1_epi32(0); + assert_eq_m256i(r, e); + + // Test wrapping on overflow + let a = _mm256_setr_epi32(i32::MAX, -1, i32::MAX, -2, i32::MAX, -3, i32::MAX, -4); + let b = _mm256_setr_epi32(i32::MIN, 1, i32::MIN, 2, i32::MIN, 3, i32::MIN, 4); + let expected = _mm256_setr_epi32( + i32::MIN, + i32::MIN + 1, + i32::MAX, + i32::MAX - 1, + i32::MIN + 2, + i32::MIN + 3, + i32::MAX - 2, + i32::MAX - 
3, + ); + let r = _mm256_hsub_epi32(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hsub_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_hsubs_epi16() { + let a = _mm256_set1_epi16(2); + let a = _mm256_insert_epi16::<0>(a, 0x7fff); + let a = _mm256_insert_epi16::<1>(a, -1); + let b = _mm256_set1_epi16(4); + let r = _mm256_hsubs_epi16(a, b); + let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF); + assert_eq_m256i(r, e); + + // Test saturating on overflow + let a = _mm256_setr_epi16( + i16::MAX, + -1, + i16::MAX, + -2, + i16::MAX, + -3, + i16::MAX, + -4, + i16::MAX, + -5, + i16::MAX, + -6, + i16::MAX, + -7, + i16::MAX, + -8, + ); + let b = _mm256_setr_epi16( + i16::MIN, + 1, + i16::MIN, + 2, + i16::MIN, + 3, + i16::MIN, + 4, + i16::MIN, + 5, + i16::MIN, + 6, + i16::MIN, + 7, + i16::MIN, + 8, + ); + let expected = _mm256_setr_epi16( + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MAX, + i16::MIN, + i16::MIN, + i16::MIN, + i16::MIN, + ); + let r = _mm256_hsubs_epi16(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_hsubs_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48)); + } + test_mm_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_mask_i32gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256)); + } + test_mm_mask_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = + _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + } + test_mm256_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_mask_i32gather_epi32::<4>( + _mm256_set1_epi32(256), + arr.as_ptr(), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256)); + } + test_mm256_mask_i32gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); + } + test_mm_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_mask_i32gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), + ); + 
assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); + } + test_mm_mask_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = + _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4)); + assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0)); + } + test_mm256_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm256_mask_i32gather_ps::<4>( + _mm256_set1_ps(256.0), + arr.as_ptr(), + _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0), + _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0), + ); + assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0)); + } + test_mm256_mask_i32gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)); + assert_eq_m128i(r, _mm_setr_epi64x(0, 16)); + } + test_mm_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_mask_i32gather_epi64::<8>( + _mm_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_epi64x(-1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi64x(16, 256)); + } + test_mm_mask_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48)); + } + test_mm256_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_mask_i32gather_epi64::<8>( + _mm256_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256)); + } + test_mm256_mask_i32gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0)); + assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); + } + test_mm_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_mask_i32gather_pd::<8>( + _mm_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi32(16, 16, 16, 16), + _mm_setr_pd(-1.0, 0.0), + ); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); + } + test_mm_mask_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for 
f64s + let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); + } + test_mm256_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i32gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_mask_i32gather_pd::<8>( + _mm256_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi32(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); + } + test_mm256_mask_i32gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0)); + } + test_mm_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm_mask_i64gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm_setr_epi64x(0, 16), + _mm_setr_epi32(-1, 0, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0)); + } + test_mm_mask_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48)); + } + test_mm256_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_epi32() { + let arr: [i32; 128] = core::array::from_fn(|i| i as i32); + // A multiplier of 4 is word-addressing + let r = _mm256_mask_i64gather_epi32::<4>( + _mm_set1_epi32(256), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_epi32(-1, -1, -1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256)); + } + test_mm256_mask_i64gather_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0)); + } + test_mm_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm_mask_i64gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm_setr_epi64x(0, 16), + _mm_setr_ps(-1.0, 0.0, -1.0, 0.0), + ); + assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0)); + } + test_mm_mask_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0)); + } + test_mm256_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_ps() { + let arr: [f32; 128] = core::array::from_fn(|i| i as f32); + // A multiplier of 4 is word-addressing for f32s + let r = 
_mm256_mask_i64gather_ps::<4>( + _mm_set1_ps(256.0), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm_setr_ps(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0)); + } + test_mm256_mask_i64gather_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128i(r, _mm_setr_epi64x(0, 16)); + } + test_mm_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm_mask_i64gather_epi64::<8>( + _mm_set1_epi64x(256), + arr.as_ptr(), + _mm_setr_epi64x(16, 16), + _mm_setr_epi64x(-1, 0), + ); + assert_eq_m128i(r, _mm_setr_epi64x(16, 256)); + } + test_mm_mask_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48)); + } + test_mm256_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_epi64() { + let arr: [i64; 128] = core::array::from_fn(|i| i as i64); + // A multiplier of 8 is word-addressing for i64s + let r = _mm256_mask_i64gather_epi64::<8>( + _mm256_set1_epi64x(256), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_epi64x(-1, -1, -1, 0), + ); + assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256)); + } + test_mm256_mask_i64gather_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16)); + assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0)); + } + test_mm_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_mask_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm_mask_i64gather_pd::<8>( + _mm_set1_pd(256.0), + arr.as_ptr(), + _mm_setr_epi64x(16, 16), + _mm_setr_pd(-1.0, 0.0), + ); + assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0)); + } + test_mm_mask_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48)); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0)); + } + test_mm256_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_mask_i64gather_pd() { + let arr: [f64; 128] = core::array::from_fn(|i| i as f64); + // A multiplier of 8 is word-addressing for f64s + let r = _mm256_mask_i64gather_pd::<8>( + _mm256_set1_pd(256.0), + arr.as_ptr(), + _mm256_setr_epi64x(0, 16, 64, 96), + _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0), + ); + assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0)); + } + test_mm256_mask_i64gather_pd(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_madd_epi16() { + let a = _mm256_set1_epi16(2); + let b = 
_mm256_set1_epi16(4); + let r = _mm256_madd_epi16(a, b); + let e = _mm256_set1_epi32(16); + assert_eq_m256i(r, e); + } + test_mm256_madd_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maddubs_epi16() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_maddubs_epi16(a, b); + let e = _mm256_set1_epi16(16); + assert_eq_m256i(r, e); + } + test_mm256_maddubs_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskload_epi32() { + let nums = [1, 2, 3, 4]; + let a = &nums as *const i32; + let mask = _mm_setr_epi32(-1, 0, 0, -1); + let r = _mm_maskload_epi32(a, mask); + let e = _mm_setr_epi32(1, 0, 0, 4); + assert_eq_m128i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i32, 2, 3, 4]); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let r = _mm_maskload_epi32(a.as_ptr().cast(), mask); + let e = _mm_setr_epi32(0, 2, 0, 4); + assert_eq_m128i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i32]; + let mask = _mm_setr_epi32(!0, 0, 0, 0); + let r = _mm_maskload_epi32(a.as_ptr(), mask); + let e = _mm_setr_epi32(2, 0, 0, 0); + assert_eq_m128i(r, e); + + // Only loading last element, so slice can be short. + let a = &[2i32]; + let mask = _mm_setr_epi32(0, 0, 0, !0); + let r = _mm_maskload_epi32(a.as_ptr().wrapping_sub(3), mask); + let e = _mm_setr_epi32(0, 0, 0, 2); + assert_eq_m128i(r, e); + } + test_mm_maskload_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskload_epi32() { + let nums = [1, 2, 3, 4, 5, 6, 7, 8]; + let a = &nums as *const i32; + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + let r = _mm256_maskload_epi32(a, mask); + let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0); + assert_eq_m256i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i32, 2, 3, 4, 5, 6, 7, 8]); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let r = _mm256_maskload_epi32(a.as_ptr().cast(), mask); + let e = _mm256_setr_epi32(0, 2, 0, 4, 0, 6, 0, 8); + assert_eq_m256i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i32]; + let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_maskload_epi32(a.as_ptr(), mask); + let e = _mm256_setr_epi32(2, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + + // Only loading last element, so slice can be short. + let a = &[2i32]; + let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0); + let r = _mm256_maskload_epi32(a.as_ptr().wrapping_sub(7), mask); + let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 2); + assert_eq_m256i(r, e); + } + test_mm256_maskload_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskload_epi64() { + let nums = [1_i64, 2_i64]; + let a = &nums as *const i64; + let mask = _mm_setr_epi64x(0, -1); + let r = _mm_maskload_epi64(a, mask); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i64, 2]); + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_epi64(a.as_ptr().cast(), mask); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i64]; + let mask = _mm_setr_epi64x(!0, 0); + let r = _mm_maskload_epi64(a.as_ptr(), mask); + let e = _mm_setr_epi64x(2, 0); + assert_eq_m128i(r, e); + + // Only loading last element, so slice can be short. 
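+        // The pointer is moved back so that only the enabled lane's address falls
+        // inside the slice; masked-off lanes are never accessed.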
+ let a = &[2i64]; + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_epi64(a.as_ptr().wrapping_sub(1), mask); + let e = _mm_setr_epi64x(0, 2); + assert_eq_m128i(r, e); + } + test_mm_maskload_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskload_epi64() { + let nums = [1_i64, 2_i64, 3_i64, 4_i64]; + let a = &nums as *const i64; + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + let r = _mm256_maskload_epi64(a, mask); + let e = _mm256_setr_epi64x(0, 2, 3, 0); + assert_eq_m256i(r, e); + + // Unaligned pointer + let a = Unaligned::new([1i64, 2, 3, 4]); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let r = _mm256_maskload_epi64(a.as_ptr().cast(), mask); + let e = _mm256_setr_epi64x(0, 2, 0, 4); + assert_eq_m256i(r, e); + + // Only loading first element, so slice can be short. + let a = &[2i64]; + let mask = _mm256_setr_epi64x(!0, 0, 0, 0); + let r = _mm256_maskload_epi64(a.as_ptr(), mask); + let e = _mm256_setr_epi64x(2, 0, 0, 0); + assert_eq_m256i(r, e); + + // Only loading last element, so slice can be short. + let a = &[2i64]; + let mask = _mm256_setr_epi64x(0, 0, 0, !0); + let r = _mm256_maskload_epi64(a.as_ptr().wrapping_sub(3), mask); + let e = _mm256_setr_epi64x(0, 0, 0, 2); + assert_eq_m256i(r, e); + } + test_mm256_maskload_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskstore_epi32() { + let a = _mm_setr_epi32(1, 2, 3, 4); + let mut arr = [-1, -1, -1, -1]; + let mask = _mm_setr_epi32(-1, 0, 0, -1); + _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let e = [1, -1, -1, 4]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i32; 4]); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let a = _mm_setr_epi32(1, 2, 3, 4); + _mm_maskstore_epi32(r.as_mut_ptr().cast(), mask, a); + let e = [0i32, 2, 0, 4]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i32]; + let mask = _mm_setr_epi32(!0, 0, 0, 0); + let a = _mm_setr_epi32(1, 2, 3, 4); + _mm_maskstore_epi32(r.as_mut_ptr(), mask, a); + let e = [1i32]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. + let mut r = [0i32]; + let mask = _mm_setr_epi32(0, 0, 0, !0); + let a = _mm_setr_epi32(1, 2, 3, 4); + _mm_maskstore_epi32(r.as_mut_ptr().wrapping_sub(3), mask, a); + let e = [4i32]; + assert_eq!(r, e); + } + test_mm_maskstore_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskstore_epi32() { + let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8); + let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1]; + let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0); + _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a); + let e = [1, -1, -1, 42, -1, 6, 7, -1]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i32; 8]); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + _mm256_maskstore_epi32(r.as_mut_ptr().cast(), mask, a); + let e = [0i32, 2, 0, 4, 0, 6, 0, 8]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i32]; + let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0); + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + _mm256_maskstore_epi32(r.as_mut_ptr(), mask, a); + let e = [1i32]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. 
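+        // As with the masked loads, lanes with a zero mask are never written, so only
+        // the enabled lane's address needs to be in bounds.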
+ let mut r = [0i32]; + let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0); + let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + _mm256_maskstore_epi32(r.as_mut_ptr().wrapping_sub(7), mask, a); + let e = [8i32]; + assert_eq!(r, e); + } + test_mm256_maskstore_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_maskstore_epi64() { + let a = _mm_setr_epi64x(1_i64, 2_i64); + let mut arr = [-1_i64, -1_i64]; + let mask = _mm_setr_epi64x(0, -1); + _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let e = [-1, 2]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i64; 2]); + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_epi64x(1, 2); + _mm_maskstore_epi64(r.as_mut_ptr().cast(), mask, a); + let e = [0i64, 2]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i64]; + let mask = _mm_setr_epi64x(!0, 0); + let a = _mm_setr_epi64x(1, 2); + _mm_maskstore_epi64(r.as_mut_ptr(), mask, a); + let e = [1i64]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. + let mut r = [0i64]; + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_epi64x(1, 2); + _mm_maskstore_epi64(r.as_mut_ptr().wrapping_sub(1), mask, a); + let e = [2i64]; + assert_eq!(r, e); + } + test_mm_maskstore_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_maskstore_epi64() { + let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64); + let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64]; + let mask = _mm256_setr_epi64x(0, -1, -1, 0); + _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a); + let e = [-1, 2, 3, -1]; + assert_eq!(arr, e); + + // Unaligned pointer + let mut r = Unaligned::new([0i64; 4]); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let a = _mm256_setr_epi64x(1, 2, 3, 4); + _mm256_maskstore_epi64(r.as_mut_ptr().cast(), mask, a); + let e = [0i64, 2, 0, 4]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0i64]; + let mask = _mm256_setr_epi64x(!0, 0, 0, 0); + let a = _mm256_setr_epi64x(1, 2, 3, 4); + _mm256_maskstore_epi64(r.as_mut_ptr(), mask, a); + let e = [1i64]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. 
+        let mut r = [0i64];
+        let mask = _mm256_setr_epi64x(0, 0, 0, !0);
+        let a = _mm256_setr_epi64x(1, 2, 3, 4);
+        _mm256_maskstore_epi64(r.as_mut_ptr().wrapping_sub(3), mask, a);
+        let e = [4i64];
+        assert_eq!(r, e);
+    }
+    test_mm256_maskstore_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mpsadbw_epu8() {
+        let a = _mm256_setr_epi8(
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 2, 4, 6, 8, 10, 12, 14, 16,
+            18, 20, 22, 24, 26, 28, 30,
+        );
+
+        let r = _mm256_mpsadbw_epu8::<0b000>(a, a);
+        let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b001>(a, a);
+        let e = _mm256_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12, 32, 24, 16, 8, 0, 8, 16, 24);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b100>(a, a);
+        let e = _mm256_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44, 32, 40, 48, 56, 64, 72, 80, 88);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b101>(a, a);
+        let e = _mm256_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28, 0, 8, 16, 24, 32, 40, 48, 56);
+        assert_eq_m256i(r, e);
+
+        let r = _mm256_mpsadbw_epu8::<0b111>(a, a);
+        let e = _mm256_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4, 64, 56, 48, 40, 32, 24, 16, 8);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_mpsadbw_epu8();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_mulhrs_epi16() {
+        let a = _mm256_set1_epi16(16384);
+        let b = _mm256_set1_epi16(16384);
+        let r = _mm256_mulhrs_epi16(a, b);
+        // Rounded high half of the doubled product: (((16384 * 16384) >> 14) + 1) >> 1 = 8192.
+        let e = _mm256_set1_epi16(8192);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_mulhrs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_packs_epi16(a, b);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packs_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packs_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_packs_epi32(a, b);
+        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packs_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi16() {
+        let a = _mm256_set1_epi16(2);
+        let b = _mm256_set1_epi16(4);
+        let r = _mm256_packus_epi16(a, b);
+        #[rustfmt::skip]
+        let e = _mm256_setr_epi8(
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+            2, 2, 2, 2, 2, 2, 2, 2,
+            4, 4, 4, 4, 4, 4, 4, 4,
+        );
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packus_epi16();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_packus_epi32() {
+        let a = _mm256_set1_epi32(2);
+        let b = _mm256_set1_epi32(4);
+        let r = _mm256_packus_epi32(a, b);
+        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
+
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_packus_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_permutevar8x32_epi32() {
+        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
+        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
+        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
+        let r = _mm256_permutevar8x32_epi32(a, b);
+        assert_eq_m256i(r, expected);
+    }
+    test_mm256_permutevar8x32_epi32();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_permute2x128_si256() {
+        let a = _mm256_setr_epi64x(100, 200, 500, 600);
+        let
b = _mm256_setr_epi64x(300, 400, 700, 800); + let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b); + let e = _mm256_setr_epi64x(700, 800, 500, 600); + assert_eq_m256i(r, e); + } + test_mm256_permute2x128_si256(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_permutevar8x32_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4); + let r = _mm256_permutevar8x32_ps(a, b); + let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.); + assert_eq_m256(r, e); + } + test_mm256_permutevar8x32_ps(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sad_epu8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(4); + let r = _mm256_sad_epu8(a, b); + let e = _mm256_set1_epi64x(16); + assert_eq_m256i(r, e); + } + test_mm256_sad_epu8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_shuffle_epi8() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + #[rustfmt::skip] + let b = _mm256_setr_epi8( + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + 4, 128u8 as i8, 4, 3, 24, 12, 6, 19, + 12, 5, 5, 10, 4, 1, 8, 0, + ); + #[rustfmt::skip] + let expected = _mm256_setr_epi8( + 5, 0, 5, 4, 9, 13, 7, 4, + 13, 6, 6, 11, 5, 2, 9, 1, + 21, 0, 21, 20, 25, 29, 23, 20, + 29, 22, 22, 27, 21, 18, 25, 17, + ); + let r = _mm256_shuffle_epi8(a, b); + assert_eq_m256i(r, expected); + } + test_mm256_shuffle_epi8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sign_epi16() { + let a = _mm256_set1_epi16(2); + let b = _mm256_set1_epi16(-1); + let r = _mm256_sign_epi16(a, b); + let e = _mm256_set1_epi16(-2); + assert_eq_m256i(r, e); + } + test_mm256_sign_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sign_epi32() { + let a = _mm256_set1_epi32(2); + let b = _mm256_set1_epi32(-1); + let r = _mm256_sign_epi32(a, b); + let e = _mm256_set1_epi32(-2); + assert_eq_m256i(r, e); + } + test_mm256_sign_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sign_epi8() { + let a = _mm256_set1_epi8(2); + let b = _mm256_set1_epi8(-1); + let r = _mm256_sign_epi8(a, b); + let e = _mm256_set1_epi8(-2); + assert_eq_m256i(r, e); + } + test_mm256_sign_epi8(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sll_epi16() { + let a = _mm256_setr_epi16( + 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE, + -0xEE, 0xFF, -0xFF, + ); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi16( + 0x880, -0x880, 0x990, -0x990, 0xAA0, -0xAA0, 0xBB0, -0xBB0, 0xCC0, -0xCC0, 0xDD0, + -0xDD0, 0xEE0, -0xEE0, 0xFF0, -0xFF0, + ), + ); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + let r = _mm256_sll_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + } + test_mm256_sll_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sll_epi32() { + let a = + _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi32( + 0xCCCC0, -0xCCCC0, 0xDDDD0, -0xDDDD0, 0xEEEE0, -0xEEEE0, 0xFFFF0, -0xFFFF0, + ), + ); + let r = _mm256_sll_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + 
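+        // A shift count of 32 (the full lane width) or larger clears every lane.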
let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + let r = _mm256_sll_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + } + test_mm256_sll_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sll_epi64() { + let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i(r, _mm256_set_epi64x(0xEEEEEEEE0, -0xEEEEEEEE0, 0xFFFFFFFF0, -0xFFFFFFFF0)); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + let r = _mm256_sll_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + } + test_mm256_sll_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sra_epi16() { + let a = _mm256_setr_epi16( + 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE, + -0xEE, 0xFF, -0xFF, + ); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi16( + 0x8, -0x9, 0x9, -0xA, 0xA, -0xB, 0xB, -0xC, 0xC, -0xD, 0xD, -0xE, 0xE, -0xF, 0xF, + -0x10, + ), + ); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m256i( + r, + _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), + ); + let r = _mm256_sra_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i( + r, + _mm256_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), + ); + } + test_mm256_sra_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sra_epi32() { + let a = + _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi32(0xCCC, -0xCCD, 0xDDD, -0xDDE, 0xEEE, -0xEEF, 0xFFF, -0x1000), + ); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1)); + let r = _mm256_sra_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_setr_epi32(0, -1, 0, -1, 0, -1, 0, -1)); + } + test_mm256_sra_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srl_epi16() { + let a = _mm256_setr_epi16( + 0x88, -0x88, 0x99, -0x99, 0xAA, -0xAA, 0xBB, -0xBB, 0xCC, -0xCC, 0xDD, -0xDD, 0xEE, + -0xEE, 0xFF, -0xFF, + ); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi16( + 0x8, 0xFF7, 0x9, 0xFF6, 0xA, 0xFF5, 0xB, 0xFF4, 0xC, 0xFF3, 0xD, 0xFF2, 0xE, 0xFF1, + 0xF, 0xFF0, + ), + ); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, 16)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + let r = _mm256_srl_epi16(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi16(0)); + } + test_mm256_srl_epi16(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srl_epi32() { + let a = + _mm256_setr_epi32(0xCCCC, -0xCCCC, 0xDDDD, -0xDDDD, 0xEEEE, -0xEEEE, 0xFFFF, -0xFFFF); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_setr_epi32( + 0xCCC, 0xFFFF333, 0xDDD, 0xFFFF222, 0xEEE, 0xFFFF111, 0xFFF, 0xFFFF000, + ), + ); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(4, 0)); + 
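+        // Only the low 64 bits of the count vector are used, so this count is 0
+        // and `a` is returned unchanged.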
assert_eq_m256i(r, a); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, 32)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + let r = _mm256_srl_epi32(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi32(0)); + } + test_mm256_srl_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srl_epi64() { + let a = _mm256_set_epi64x(0xEEEEEEEE, -0xEEEEEEEE, 0xFFFFFFFF, -0xFFFFFFFF); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 4)); + assert_eq_m256i( + r, + _mm256_set_epi64x(0xEEEEEEE, 0xFFFFFFFF1111111, 0xFFFFFFF, 0xFFFFFFFF0000000), + ); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(4, 0)); + assert_eq_m256i(r, a); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, 64)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + let r = _mm256_srl_epi64(a, _mm_set_epi64x(0, i64::MAX)); + assert_eq_m256i(r, _mm256_set1_epi64x(0)); + } + test_mm256_srl_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_sllv_epi32() { + let a = _mm_set_epi32(1, 2, 3, 4); + let b = _mm_set_epi32(4, 3, 2, 1); + let r = _mm_sllv_epi32(a, b); + let e = _mm_set_epi32(16, 16, 12, 8); + assert_eq_m128i(r, e); + } + test_mm_sllv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sllv_epi32() { + let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let r = _mm256_sllv_epi32(a, b); + let e = _mm256_set_epi32(256, 256, 192, 128, 80, 48, 28, 16); + assert_eq_m256i(r, e); + } + test_mm256_sllv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_sllv_epi64() { + let a = _mm_set_epi64x(2, 3); + let b = _mm_set_epi64x(1, 2); + let r = _mm_sllv_epi64(a, b); + let e = _mm_set_epi64x(4, 12); + assert_eq_m128i(r, e); + } + test_mm_sllv_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_sllv_epi64() { + let a = _mm256_set_epi64x(1, 2, 3, 4); + let b = _mm256_set_epi64x(4, 3, 2, 1); + let r = _mm256_sllv_epi64(a, b); + let e = _mm256_set_epi64x(16, 16, 12, 8); + assert_eq_m256i(r, e); + } + test_mm256_sllv_epi64(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_srav_epi32() { + let a = _mm_set_epi32(16, -32, 64, -128); + let b = _mm_set_epi32(4, 3, 2, 1); + let r = _mm_srav_epi32(a, b); + let e = _mm_set_epi32(1, -4, 16, -64); + assert_eq_m128i(r, e); + } + test_mm_srav_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srav_epi32() { + let a = _mm256_set_epi32(256, -512, 1024, -2048, 4096, -8192, 16384, -32768); + let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let r = _mm256_srav_epi32(a, b); + let e = _mm256_set_epi32(1, -4, 16, -64, 256, -1024, 4096, -16384); + assert_eq_m256i(r, e); + } + test_mm256_srav_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_srlv_epi32() { + let a = _mm_set_epi32(16, 32, 64, 128); + let b = _mm_set_epi32(4, 3, 2, 1); + let r = _mm_srlv_epi32(a, b); + let e = _mm_set_epi32(1, 4, 16, 64); + assert_eq_m128i(r, e); + } + test_mm_srlv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm256_srlv_epi32() { + let a = _mm256_set_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768); + let b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + let r = _mm256_srlv_epi32(a, b); + let e = _mm256_set_epi32(1, 4, 16, 64, 256, 1024, 4096, 16384); + assert_eq_m256i(r, e); + } + test_mm256_srlv_epi32(); + + #[target_feature(enable = "avx2")] + unsafe fn test_mm_srlv_epi64() { + let a = _mm_set_epi64x(4, 8); + let b = _mm_set_epi64x(2, 1); + let r = _mm_srlv_epi64(a, b); + let e = _mm_set_epi64x(1, 4); + 
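+        // Each lane uses its own shift count here: 4 >> 2 = 1 and 8 >> 1 = 4.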
assert_eq_m128i(r, e);
+    }
+    test_mm_srlv_epi64();
+
+    #[target_feature(enable = "avx2")]
+    unsafe fn test_mm256_srlv_epi64() {
+        let a = _mm256_set_epi64x(16, 32, 64, 128);
+        let b = _mm256_set_epi64x(4, 3, 2, 1);
+        let r = _mm256_srlv_epi64(a, b);
+        let e = _mm256_set_epi64x(1, 4, 16, 64);
+        assert_eq_m256i(r, e);
+    }
+    test_mm256_srlv_epi64();
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i {
+    _mm_set_epi64x(b, a)
+}
+
+#[track_caller]
+#[target_feature(enable = "sse")]
+unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+    let r = _mm_cmpeq_ps(a, b);
+    if _mm_movemask_ps(r) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256(a: __m256, b: __m256) {
+    let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b);
+    if _mm256_movemask_ps(cmp) != 0b11111111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) {
+    let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b);
+    if _mm256_movemask_pd(cmp) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "avx")]
+unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+    assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b))
+}
+
+/// Stores `T` at an unaligned address
+struct Unaligned<T> {
+    buf: Vec<u8>,
+    offset: bool,
+    _marker: std::marker::PhantomData<T>,
+}
+
+impl<T> Unaligned<T> {
+    fn new(value: T) -> Self {
+        // Allocate extra byte for unalignment headroom
+        let len = std::mem::size_of::<T>();
+        let mut buf = Vec::<u8>::with_capacity(len + 1);
+        // Force the address to be a non-multiple of 2, so it is as unaligned as it can get.
+        let offset = (buf.as_ptr() as usize % 2) == 0;
+        let value_ptr: *const T = &value;
+        unsafe {
+            buf.as_mut_ptr().add(offset.into()).copy_from_nonoverlapping(value_ptr.cast(), len);
+        }
+        Self { buf, offset, _marker: std::marker::PhantomData }
+    }
+
+    fn as_ptr(&self) -> *const T {
+        unsafe { self.buf.as_ptr().add(self.offset.into()).cast() }
+    }
+
+    fn as_mut_ptr(&mut self) -> *mut T {
+        unsafe { self.buf.as_mut_ptr().add(self.offset.into()).cast() }
+    }
+
+    fn read(&self) -> T {
+        unsafe { self.as_ptr().read_unaligned() }
+    }
+}