From 184dc146a5b3c03b21cce99507cc775977f8466d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= <eduardosm-dev@e64.io>
Date: Tue, 26 Sep 2023 21:12:45 +0200
Subject: [PATCH] Implement `llvm.x86.ssse3.*` intrinsics

---
 src/shims/x86/mod.rs                    |  45 ++++
 src/shims/x86/sse3.rs                   |  35 +---
 src/shims/x86/ssse3.rs                  | 206 +++++++++++++++++++
 tests/pass/intrinsics-x86-sse3-ssse3.rs | 262 ++++++++++++++++++++++++
 tests/pass/intrinsics-x86-sse3.rs       | 114 -----------
 5 files changed, 516 insertions(+), 146 deletions(-)
 create mode 100644 src/shims/x86/ssse3.rs
 create mode 100644 tests/pass/intrinsics-x86-sse3-ssse3.rs
 delete mode 100644 tests/pass/intrinsics-x86-sse3.rs

diff --git a/src/shims/x86/mod.rs b/src/shims/x86/mod.rs
index 8b3805ae55..e031a1f9db 100644
--- a/src/shims/x86/mod.rs
+++ b/src/shims/x86/mod.rs
@@ -10,6 +10,7 @@ use shims::foreign_items::EmulateByNameResult;
 mod sse;
 mod sse2;
 mod sse3;
+mod ssse3;
 
 impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
 pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
@@ -94,6 +95,11 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     this, link_name, abi, args, dest,
                 );
             }
+            name if name.starts_with("ssse3.") => {
+                return ssse3::EvalContextExt::emulate_x86_ssse3_intrinsic(
+                    this, link_name, abi, args, dest,
+                );
+            }
             _ => return Ok(EmulateByNameResult::NotSupported),
         }
         Ok(EmulateByNameResult::NeedsJumping)
@@ -292,3 +298,42 @@ fn bin_op_simd_float_all<'tcx, F: rustc_apfloat::Float>(
 
     Ok(())
 }
+
+/// Horizontally applies `which` (saturating if `saturating`) to adjacent element pairs: the
+/// first half of `dest` comes from pairs of `left`, the second half from pairs of `right`.
+fn horizontal_bin_op<'tcx>(
+    this: &mut crate::MiriInterpCx<'_, 'tcx>,
+    which: mir::BinOp,
+    saturating: bool,
+    left: &OpTy<'tcx, Provenance>,
+    right: &OpTy<'tcx, Provenance>,
+    dest: &PlaceTy<'tcx, Provenance>,
+) -> InterpResult<'tcx, ()> {
+    let (left, left_len) = this.operand_to_simd(left)?;
+    let (right, right_len) = this.operand_to_simd(right)?;
+    let (dest, dest_len) = this.place_to_simd(dest)?;
+
+    assert_eq!(dest_len, left_len);
+    assert_eq!(dest_len, right_len);
+    assert_eq!(dest_len % 2, 0);
+
+    let middle = dest_len / 2;
+    for i in 0..dest_len {
+        let (j, src) =
+            if i < middle { (i, &left) } else { (i.checked_sub(middle).unwrap(), &right) };
+        let base_i = j.checked_mul(2).unwrap();
+        let lhs = this.read_immediate(&this.project_index(src, base_i)?)?;
+        let rhs = this.read_immediate(&this.project_index(src, base_i.checked_add(1).unwrap())?)?;
+
+        let res = if saturating {
+            Immediate::from(this.saturating_arith(which, &lhs, &rhs)?)
+        } else {
+            let (res, _overflow) = this.overflowing_binary_op(which, &lhs, &rhs)?;
+            *res
+        };
+
+        this.write_immediate(res, &this.project_index(&dest, i)?)?;
+    }
+
+    Ok(())
+}
diff --git a/src/shims/x86/sse3.rs b/src/shims/x86/sse3.rs
index 39842fb2d7..cacfcbfe40 100644
--- a/src/shims/x86/sse3.rs
+++ b/src/shims/x86/sse3.rs
@@ -3,6 +3,7 @@ use rustc_span::Symbol;
 use rustc_target::abi::Align;
 use rustc_target::spec::abi::Abi;
 
+use super::horizontal_bin_op;
 use crate::*;
 use shims::foreign_items::EmulateByNameResult;
 
@@ -55,43 +56,13 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                 let [left, right] =
                     this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
 
-                let (left, left_len) = this.operand_to_simd(left)?;
-                let (right, right_len) = this.operand_to_simd(right)?;
-                let (dest, dest_len) = this.place_to_simd(dest)?;
-
-                assert_eq!(dest_len, left_len);
-                assert_eq!(dest_len, right_len);
-                assert_eq!(dest_len % 2, 0);
-
-                let op = match unprefixed_name {
+                let which = match unprefixed_name {
                     "hadd.ps" | "hadd.pd" => mir::BinOp::Add,
                     "hsub.ps" | "hsub.pd" => mir::BinOp::Sub,
                     _ => unreachable!(),
                 };
 
-                let middle = dest_len / 2;
-                for i in 0..dest_len {
-                    let (lhs, rhs) = if i < middle {
-                        let base_i = i.checked_mul(2).unwrap();
-                        (
-                            this.read_immediate(&this.project_index(&left, base_i)?)?,
-                            this.read_immediate(
-                                &this.project_index(&left, base_i.checked_add(1).unwrap())?,
-                            )?,
-                        )
-                    } else {
-                        let base_i = i.checked_sub(middle).unwrap().checked_mul(2).unwrap();
-                        (
-                            this.read_immediate(&this.project_index(&right, base_i)?)?,
-                            this.read_immediate(
-                                &this.project_index(&right, base_i.checked_add(1).unwrap())?,
-                            )?,
-                        )
-                    };
-                    let (res, _overflow) = this.overflowing_binary_op(op, &lhs, &rhs)?;
-
-                    this.write_immediate(*res, &this.project_index(&dest, i)?)?;
-                }
+                horizontal_bin_op(this, which, /*saturating*/ false, left, right, dest)?;
             }
             // Used to implement the _mm_lddqu_si128 function.
             // Reads a 128-bit vector from an unaligned pointer. This intrinsic
diff --git a/src/shims/x86/ssse3.rs b/src/shims/x86/ssse3.rs
new file mode 100644
index 0000000000..3eb641affb
--- /dev/null
+++ b/src/shims/x86/ssse3.rs
@@ -0,0 +1,206 @@
+use rustc_middle::mir;
+use rustc_span::Symbol;
+use rustc_target::spec::abi::Abi;
+
+use super::horizontal_bin_op;
+use crate::*;
+use shims::foreign_items::EmulateByNameResult;
+
+impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
+pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
+    crate::MiriInterpCxExt<'mir, 'tcx>
+{
+    fn emulate_x86_ssse3_intrinsic(
+        &mut self,
+        link_name: Symbol,
+        abi: Abi,
+        args: &[OpTy<'tcx, Provenance>],
+        dest: &PlaceTy<'tcx, Provenance>,
+    ) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> {
+        let this = self.eval_context_mut();
+        // Prefix should have already been checked.
+        let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.ssse3.").unwrap();
+
+        match unprefixed_name {
+            // Used to implement the _mm_abs_epi{8,16,32} functions.
+            // Calculates the absolute value of packed 8/16/32-bit integers.
+            "pabs.b.128" | "pabs.w.128" | "pabs.d.128" => {
+                let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (op, op_len) = this.operand_to_simd(op)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(op_len, dest_len);
+
+                for i in 0..dest_len {
+                    let op = this.read_scalar(&this.project_index(&op, i)?)?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    let res = op.to_int(dest.layout.size)?.unsigned_abs();
+
+                    this.write_scalar(Scalar::from_uint(res, dest.layout.size), &dest)?;
+                }
+            }
+            // Used to implement the _mm_shuffle_epi8 intrinsic.
+            // Shuffles bytes from `left` using `right` as pattern.
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
+            "pshuf.b.128" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+                assert_eq!(dest_len, right_len);
+
+                for i in 0..dest_len {
+                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_u8()?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    let res = if right & 0x80 == 0 {
+                        let j = right % 16; // index wraps around
+                        this.read_scalar(&this.project_index(&left, j.into())?)?
+                    } else {
+                        // If the highest bit in `right` is 1, write zero.
+                        Scalar::from_u8(0)
+                    };
+
+                    this.write_scalar(res, &dest)?;
+                }
+            }
+            // Used to implement the _mm_h{add,adds,sub}_epi{16,32} functions.
+            // Horizontally add/add with saturation/subtract adjacent 16/32-bit
+            // integer values in `left` and `right`.
+            "phadd.w.128" | "phadd.sw.128" | "phadd.d.128" | "phsub.w.128" | "phsub.sw.128"
+            | "phsub.d.128" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (which, saturating) = match unprefixed_name {
+                    "phadd.w.128" | "phadd.d.128" => (mir::BinOp::Add, false),
+                    "phadd.sw.128" => (mir::BinOp::Add, true),
+                    "phsub.w.128" | "phsub.d.128" => (mir::BinOp::Sub, false),
+                    "phsub.sw.128" => (mir::BinOp::Sub, true),
+                    _ => unreachable!(),
+                };
+
+                horizontal_bin_op(this, which, saturating, left, right, dest)?;
+            }
+            // Used to implement the _mm_maddubs_epi16 function.
+            // Multiplies corresponding pairs of packed 8-bit unsigned integer
+            // values contained in the first source operand and packed 8-bit signed
+            // integer values contained in the second source operand, add pairs of
+            // contiguous products with signed saturation, and writes the 16-bit sums to
+            // the corresponding bits in the destination.
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
+            "pmadd.ub.sw.128" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(left_len, right_len);
+                assert_eq!(dest_len, left_len / 2);
+
+                for i in 0..dest_len {
+                    let j1 = i.checked_mul(2).unwrap();
+                    let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_u8()?;
+                    let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i8()?;
+
+                    let j2 = j1.checked_add(1).unwrap();
+                    let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_u8()?;
+                    let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i8()?;
+
+                    let dest = this.project_index(&dest, i)?;
+
+                    let mul1 = i16::from(left1).checked_mul(right1.into()).unwrap();
+                    let mul2 = i16::from(left2).checked_mul(right2.into()).unwrap();
+                    let res = mul1.saturating_add(mul2);
+
+                    this.write_scalar(Scalar::from_i16(res), &dest)?;
+                }
+            }
+            // Used to implement the _mm_mulhrs_epi16 function.
+            // Multiplies packed 16-bit signed integer values, truncates the 32-bit
+            // product to the 18 most significant bits by right-shifting, rounds the
+            // truncated value by adding 1, and writes bits `[16:1]` to the destination.
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
+            "pmul.hr.sw.128" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+                assert_eq!(dest_len, right_len);
+
+                for i in 0..dest_len {
+                    let left = this.read_scalar(&this.project_index(&left, i)?)?.to_i16()?;
+                    let right = this.read_scalar(&this.project_index(&right, i)?)?.to_i16()?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    let mul = i32::from(left).checked_mul(right.into()).unwrap();
+                    let res = (mul >> 14).checked_add(1).unwrap() >> 1;
+                    // `res` is 0x8000 when both inputs are -0x8000; wrap to 16 bits like hardware.
+                    #[allow(clippy::cast_possible_truncation)]
+                    let res = res as i16;
+                    this.write_scalar(Scalar::from_i16(res), &dest)?;
+                }
+            }
+            // Used to implement the _mm_sign_epi{8,16,32} functions.
+            // Negates elements from `left` when the corresponding element from
+            // `right` is negative. If an element from `right` is zero, zero
+            // is written to the corresponding output element.
+            "psign.b.128" | "psign.w.128" | "psign.d.128" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+                assert_eq!(dest_len, right_len);
+
+                for i in 0..dest_len {
+                    let dest = this.project_index(&dest, i)?;
+                    let left = this
+                        .read_scalar(&this.project_index(&left, i)?)?
+                        .to_int(dest.layout.size)?;
+                    let right = this
+                        .read_scalar(&this.project_index(&right, i)?)?
+                        .to_int(dest.layout.size)?;
+
+                    let res = if right == 0 {
+                        0
+                    } else if right < 0 {
+                        if left == dest.layout.size.signed_int_min() {
+                            // Make sure `Scalar::from_int` does not get an overflowed value.
+                            //
+                            // For example, with 16-bit value -0x8000, negating after sign
+                            // extension to i128 will produce 0x8000. However, `Scalar::from_int`
+                            // will fail because 0x8000 is outside the range of signed 16-bit
+                            // integers. If the negation had been done directly with wrapping 16-bit
+                            // arithmetic, the result would be -0x8000, so we simulate that.
+                            left
+                        } else {
+                            left.checked_neg().unwrap()
+                        }
+                    } else {
+                        left
+                    };
+
+                    this.write_scalar(Scalar::from_int(res, dest.layout.size), &dest)?;
+                }
+            }
+            _ => return Ok(EmulateByNameResult::NotSupported),
+        }
+        Ok(EmulateByNameResult::NeedsJumping)
+    }
+}
diff --git a/tests/pass/intrinsics-x86-sse3-ssse3.rs b/tests/pass/intrinsics-x86-sse3-ssse3.rs
new file mode 100644
index 0000000000..72c8480c53
--- /dev/null
+++ b/tests/pass/intrinsics-x86-sse3-ssse3.rs
@@ -0,0 +1,262 @@
+// Ignore everything except x86 and x86_64
+// Any additional targets that are added to CI should be ignored here
+// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
+//@ignore-target-aarch64
+//@ignore-target-arm
+//@ignore-target-avr
+//@ignore-target-s390x
+//@ignore-target-thumbv7em
+//@ignore-target-wasm32
+// SSSE3 implicitly enables SSE3
+//@compile-flags: -C target-feature=+ssse3
+
+use core::mem::transmute;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+fn main() {
+    assert!(is_x86_feature_detected!("ssse3"));
+
+    unsafe {
+        test_sse3();
+        test_ssse3();
+    }
+}
+
+#[target_feature(enable = "sse3")]
+unsafe fn test_sse3() {
+    // Mostly copied from library/stdarch/crates/core_arch/src/x86/sse3.rs
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_addsub_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_addsub_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
+    }
+    test_mm_addsub_ps();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_addsub_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_addsub_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
+    }
+    test_mm_addsub_pd();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hadd_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_hadd_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
+    }
+    test_mm_hadd_ps();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hadd_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_hadd_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
+    }
+    test_mm_hadd_pd();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hsub_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_hsub_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
+    }
+    test_mm_hsub_ps();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hsub_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_hsub_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
+    }
+    test_mm_hsub_pd();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_lddqu_si128() {
+        let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let r = _mm_lddqu_si128(&a);
+        assert_eq_m128i(a, r);
+    }
+    test_mm_lddqu_si128();
+}
+
+#[target_feature(enable = "ssse3")]
+unsafe fn test_ssse3() {
+    // Mostly copied from library/stdarch/crates/core_arch/src/x86/ssse3.rs
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi8() {
+        let r = _mm_abs_epi8(_mm_set1_epi8(-5));
+        assert_eq_m128i(r, _mm_set1_epi8(5));
+    }
+    test_mm_abs_epi8();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi16() {
+        let r = _mm_abs_epi16(_mm_set1_epi16(-5));
+        assert_eq_m128i(r, _mm_set1_epi16(5));
+    }
+    test_mm_abs_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_abs_epi32() {
+        let r = _mm_abs_epi32(_mm_set1_epi32(-5));
+        assert_eq_m128i(r, _mm_set1_epi32(5));
+    }
+    test_mm_abs_epi32();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_shuffle_epi8() {
+        let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let b = _mm_setr_epi8(4, 128_u8 as i8, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
+        let expected = _mm_setr_epi8(5, 0, 5, 4, 9, 13, 7, 4, 13, 6, 6, 11, 5, 2, 9, 1);
+        let r = _mm_shuffle_epi8(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_shuffle_epi8();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_hadd_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 36, 25);
+        let r = _mm_hadd_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_hadd_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_hadds_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, 1, -32768, -1);
+        let expected = _mm_setr_epi16(3, 7, 11, 15, 132, 7, 32767, -32768);
+        let r = _mm_hadds_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_hadds_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_hadd_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(4, 128, 4, 3);
+        let expected = _mm_setr_epi32(3, 7, 132, 7);
+        let r = _mm_hadd_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_hadd_epi32();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_hsub_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 24, 12, 6, 19);
+        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 12, -13);
+        let r = _mm_hsub_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_hsub_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_hsubs_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+        let expected = _mm_setr_epi16(-1, -1, -1, -1, -124, 1, 32767, -32768);
+        let r = _mm_hsubs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_hsubs_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_hsub_epi32() {
+        let a = _mm_setr_epi32(1, 2, 3, 4);
+        let b = _mm_setr_epi32(4, 128, 4, 3);
+        let expected = _mm_setr_epi32(-1, -1, -124, 1);
+        let r = _mm_hsub_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_hsub_epi32();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_maddubs_epi16() {
+        let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let b = _mm_setr_epi8(4, 63, 4, 3, 24, 12, 6, 19, 12, 5, 5, 10, 4, 1, 8, 0);
+        let expected = _mm_setr_epi16(130, 24, 192, 194, 158, 175, 66, 120);
+        let r = _mm_maddubs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_maddubs_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_mulhrs_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 4, 3, 32767, -1, -32768, 1);
+        let expected = _mm_setr_epi16(0, 0, 0, 0, 5, 0, -7, 0);
+        let r = _mm_mulhrs_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_mulhrs_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi8() {
+        let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -14, -15, 16);
+        let b = _mm_setr_epi8(4, 63, -4, 3, 24, 12, -6, -19, 12, 5, -5, 10, 4, 1, -8, 0);
+        let expected = _mm_setr_epi8(1, 2, -3, 4, 5, 6, -7, -8, 9, 10, -11, 12, 13, -14, 15, 0);
+        let r = _mm_sign_epi8(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_sign_epi8();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi16() {
+        let a = _mm_setr_epi16(1, 2, 3, 4, -5, -6, 7, 8);
+        let b = _mm_setr_epi16(4, 128, 0, 3, 1, -1, -2, 1);
+        let expected = _mm_setr_epi16(1, 2, 0, 4, -5, 6, -7, 8);
+        let r = _mm_sign_epi16(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_sign_epi16();
+
+    #[target_feature(enable = "ssse3")]
+    unsafe fn test_mm_sign_epi32() {
+        let a = _mm_setr_epi32(-1, 2, 3, 4);
+        let b = _mm_setr_epi32(1, -1, 1, 0);
+        let expected = _mm_setr_epi32(-1, -2, 3, 0);
+        let r = _mm_sign_epi32(a, b);
+        assert_eq_m128i(r, expected);
+    }
+    test_mm_sign_epi32();
+}
+
+#[track_caller]
+#[target_feature(enable = "sse")]
+unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+    let r = _mm_cmpeq_ps(a, b);
+    if _mm_movemask_ps(r) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}
diff --git a/tests/pass/intrinsics-x86-sse3.rs b/tests/pass/intrinsics-x86-sse3.rs
deleted file mode 100644
index ea92bf45ac..0000000000
--- a/tests/pass/intrinsics-x86-sse3.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-// Ignore everything except x86 and x86_64
-// Any additional target are added to CI should be ignored here
-// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
-//@ignore-target-aarch64
-//@ignore-target-arm
-//@ignore-target-avr
-//@ignore-target-s390x
-//@ignore-target-thumbv7em
-//@ignore-target-wasm32
-//@compile-flags: -C target-feature=+sse3
-
-use core::mem::transmute;
-#[cfg(target_arch = "x86")]
-use std::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
-
-fn main() {
-    assert!(is_x86_feature_detected!("sse3"));
-
-    unsafe {
-        test_sse3();
-    }
-}
-
-#[target_feature(enable = "sse3")]
-unsafe fn test_sse3() {
-    // Mostly copied from library/stdarch/crates/core_arch/src/x86/sse3.rs
-
-    #[target_feature(enable = "sse3")]
-    unsafe fn test_mm_addsub_ps() {
-        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
-        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
-        let r = _mm_addsub_ps(a, b);
-        assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
-    }
-    test_mm_addsub_ps();
-
-    #[target_feature(enable = "sse3")]
-    unsafe fn test_mm_addsub_pd() {
-        let a = _mm_setr_pd(-1.0, 5.0);
-        let b = _mm_setr_pd(-100.0, 20.0);
-        let r = _mm_addsub_pd(a, b);
-        assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
-    }
-    test_mm_addsub_pd();
-
-    #[target_feature(enable = "sse3")]
-    unsafe fn test_mm_hadd_ps() {
-        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
-        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
-        let r = _mm_hadd_ps(a, b);
-        assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
-    }
-    test_mm_hadd_ps();
-
-    #[target_feature(enable = "sse3")]
-    unsafe fn test_mm_hadd_pd() {
-        let a = _mm_setr_pd(-1.0, 5.0);
-        let b = _mm_setr_pd(-100.0, 20.0);
-        let r = _mm_hadd_pd(a, b);
-        assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
-    }
-    test_mm_hadd_pd();
-
-    #[target_feature(enable = "sse3")]
-    unsafe fn test_mm_hsub_ps() {
-        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
-        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
-        let r = _mm_hsub_ps(a, b);
-        assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
-    }
-    test_mm_hsub_ps();
-
-    #[target_feature(enable = "sse3")]
-    unsafe fn test_mm_hsub_pd() {
-        let a = _mm_setr_pd(-1.0, 5.0);
-        let b = _mm_setr_pd(-100.0, 20.0);
-        let r = _mm_hsub_pd(a, b);
-        assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
-    }
-    test_mm_hsub_pd();
-
-    #[target_feature(enable = "sse3")]
-    unsafe fn test_mm_lddqu_si128() {
-        let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let r = _mm_lddqu_si128(&a);
-        assert_eq_m128i(a, r);
-    }
-    test_mm_lddqu_si128();
-}
-
-#[track_caller]
-#[target_feature(enable = "sse")]
-unsafe fn assert_eq_m128(a: __m128, b: __m128) {
-    let r = _mm_cmpeq_ps(a, b);
-    if _mm_movemask_ps(r) != 0b1111 {
-        panic!("{:?} != {:?}", a, b);
-    }
-}
-
-#[track_caller]
-#[target_feature(enable = "sse2")]
-unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
-    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
-        panic!("{:?} != {:?}", a, b);
-    }
-}
-
-#[track_caller]
-#[target_feature(enable = "sse2")]
-pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
-    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
-}