Implement llvm.x86.sse3.* intrinsics

rust-lang · Sep 26, 2023 · 3f3f64d · 3f3f64d
1 parent 1a82975
commit 3f3f64d
Show file tree

Hide file tree

Showing 3 changed files with 241 additions and 0 deletions.
diff --git a/src/shims/x86/mod.rs b/src/shims/x86/mod.rs
@@ -9,6 +9,7 @@ use shims::foreign_items::EmulateByNameResult;
 
 mod sse;
 mod sse2;
+mod sse3;
 
 impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
 pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
@@ -88,6 +89,11 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     this, link_name, abi, args, dest,
                 );
             }
+            name if name.starts_with("sse3.") => {
+                return sse3::EvalContextExt::emulate_x86_sse3_intrinsic(
+                    this, link_name, abi, args, dest,
+                );
+            }
             _ => return Ok(EmulateByNameResult::NotSupported),
         }
         Ok(EmulateByNameResult::NeedsJumping)

diff --git a/src/shims/x86/sse3.rs b/src/shims/x86/sse3.rs
@@ -0,0 +1,121 @@
+use rustc_middle::mir;
+use rustc_span::Symbol;
+use rustc_target::abi::Align;
+use rustc_target::spec::abi::Abi;
+
+use crate::*;
+use shims::foreign_items::EmulateByNameResult;
+
+impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {}
+pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
+    crate::MiriInterpCxExt<'mir, 'tcx>
+{
+    fn emulate_x86_sse3_intrinsic(
+        &mut self,
+        link_name: Symbol,
+        abi: Abi,
+        args: &[OpTy<'tcx, Provenance>],
+        dest: &PlaceTy<'tcx, Provenance>,
+    ) -> InterpResult<'tcx, EmulateByNameResult<'mir, 'tcx>> {
+        let this = self.eval_context_mut();
+        // Prefix should have already been checked.
+        let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.sse3.").unwrap();
+
+        match unprefixed_name {
+            // Used to implement the _mm_addsub_ps and _mm_addsub_pd functions.
+            // Alternately subtract and add the floating point (f32 or f64)
+            // elements of `left` and `right`.
+            "addsub.ps" | "addsub.pd" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+                assert_eq!(dest_len, right_len);
+
+                for i in 0..dest_len {
+                    let left = this.read_immediate(&this.project_index(&left, i)?)?;
+                    let right = this.read_immediate(&this.project_index(&right, i)?)?;
+                    let dest = this.project_index(&dest, i)?;
+
+                    // Even elements are subtracted and odd elements are added.
+                    let op = if i % 2 == 0 { mir::BinOp::Sub } else { mir::BinOp::Add };
+                    let (res, _overflow) = this.overflowing_binary_op(op, &left, &right)?;
+
+                    this.write_immediate(*res, &dest)?;
+                }
+            }
+            // Used to implement the _mm_h{add,sub}_p{s,d} functions.
+            // Horizontally add/subtract adjacent floating point values
+            // in `left` and `right`.
+            "hadd.ps" | "hadd.pd" | "hsub.ps" | "hsub.pd" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(dest_len, left_len);
+                assert_eq!(dest_len, right_len);
+                assert_eq!(dest_len % 2, 0);
+
+                let op = match unprefixed_name {
+                    "hadd.ps" | "hadd.pd" => mir::BinOp::Add,
+                    "hsub.ps" | "hsub.pd" => mir::BinOp::Sub,
+                    _ => unreachable!(),
+                };
+
+                let middle = dest_len / 2;
+                for i in 0..dest_len {
+                    let (lhs, rhs) = if i < middle {
+                        let base_i = i.checked_mul(2).unwrap();
+                        (
+                            this.read_immediate(&this.project_index(&left, base_i)?)?,
+                            this.read_immediate(
+                                &this.project_index(&left, base_i.checked_add(1).unwrap())?,
+                            )?,
+                        )
+                    } else {
+                        let base_i = i.checked_sub(middle).unwrap().checked_mul(2).unwrap();
+                        (
+                            this.read_immediate(&this.project_index(&right, base_i)?)?,
+                            this.read_immediate(
+                                &this.project_index(&right, base_i.checked_add(1).unwrap())?,
+                            )?,
+                        )
+                    };
+                    let (res, _overflow) = this.overflowing_binary_op(op, &lhs, &rhs)?;
+
+                    this.write_immediate(*res, &this.project_index(&dest, i)?)?;
+                }
+            }
+            // Used to implement the _mm_lddqu_si128 function.
+            // Reads a 128-bit vector from an unaligned pointer. This intrinsic
+            // is expected to perform better than a regular unaligned read when
+            // the data crosses a cache line, but for Miri this is just a regular
+            // unaligned read.
+            "ldu.dq" => {
+                let [src_ptr] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+                let dest = dest.force_mplace(this)?;
+
+                let src_ptr = this.read_pointer(src_ptr)?;
+                let dest_ptr = this.read_pointer(&this.mplace_to_ref(&dest)?)?;
+
+                this.mem_copy(
+                    src_ptr,
+                    Align::ONE,
+                    dest_ptr,
+                    Align::ONE,
+                    dest.layout.size,
+                    /*nonoverlapping*/ true,
+                )?;
+            }
+            _ => return Ok(EmulateByNameResult::NotSupported),
+        }
+        Ok(EmulateByNameResult::NeedsJumping)
+    }
+}
diff --git a/tests/pass/intrinsics-x86-sse3.rs b/tests/pass/intrinsics-x86-sse3.rs
@@ -0,0 +1,114 @@
+// Ignore everything except x86 and x86_64
+// Any additional targets that are added to CI should be ignored here
+// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
+//@ignore-target-aarch64
+//@ignore-target-arm
+//@ignore-target-avr
+//@ignore-target-s390x
+//@ignore-target-thumbv7em
+//@ignore-target-wasm32
+//@compile-flags: -C target-feature=+sse3
+
+use core::mem::transmute;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+fn main() {
+    assert!(is_x86_feature_detected!("sse3"));
+
+    unsafe {
+        test_sse3();
+    }
+}
+
+#[target_feature(enable = "sse3")]
+unsafe fn test_sse3() {
+    // Mostly copied from library/stdarch/crates/core_arch/src/x86/sse3.rs
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_addsub_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_addsub_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(99.0, 25.0, 0.0, -15.0));
+    }
+    test_mm_addsub_ps();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_addsub_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_addsub_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(99.0, 25.0));
+    }
+    test_mm_addsub_pd();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hadd_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_hadd_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(4.0, -10.0, -80.0, -5.0));
+    }
+    test_mm_hadd_ps();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hadd_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_hadd_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(4.0, -80.0));
+    }
+    test_mm_hadd_pd();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hsub_ps() {
+        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
+        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
+        let r = _mm_hsub_ps(a, b);
+        assert_eq_m128(r, _mm_setr_ps(-6.0, 10.0, -120.0, 5.0));
+    }
+    test_mm_hsub_ps();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_hsub_pd() {
+        let a = _mm_setr_pd(-1.0, 5.0);
+        let b = _mm_setr_pd(-100.0, 20.0);
+        let r = _mm_hsub_pd(a, b);
+        assert_eq_m128d(r, _mm_setr_pd(-6.0, -120.0));
+    }
+    test_mm_hsub_pd();
+
+    #[target_feature(enable = "sse3")]
+    unsafe fn test_mm_lddqu_si128() {
+        let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
+        let r = _mm_lddqu_si128(&a);
+        assert_eq_m128i(a, r);
+    }
+    test_mm_lddqu_si128();
+}
+
+#[track_caller]
+#[target_feature(enable = "sse")]
+unsafe fn assert_eq_m128(a: __m128, b: __m128) {
+    let r = _mm_cmpeq_ps(a, b);
+    if _mm_movemask_ps(r) != 0b1111 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
+    if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 {
+        panic!("{:?} != {:?}", a, b);
+    }
+}
+
+#[track_caller]
+#[target_feature(enable = "sse2")]
+pub unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) {
+    assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b))
+}