From 0b7fafbe4f0de22fe8327923b24b20547ae1f55b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?=
Date: Thu, 28 Sep 2023 18:13:59 +0200
Subject: [PATCH] Implement the `llvm.x86.sse2.pmadd.wd` intrinsic

---
 src/shims/x86/sse2.rs             | 36 +++++++++++++++++++++++++++++++
 tests/pass/intrinsics-x86-sse2.rs | 18 ++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/src/shims/x86/sse2.rs b/src/shims/x86/sse2.rs
index 2ca882167b..2ef6a9b59e 100644
--- a/src/shims/x86/sse2.rs
+++ b/src/shims/x86/sse2.rs
@@ -82,6 +82,42 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>:
                     this.write_immediate(*res, &dest)?;
                 }
             }
+            // Used to implement the _mm_madd_epi16 function.
+            // Multiplies packed signed 16-bit integers in `left` and `right`, producing
+            // intermediate signed 32-bit integers. Horizontally adds adjacent pairs of
+            // intermediate 32-bit integers, and packs the results in `dest`.
+            "pmadd.wd" => {
+                let [left, right] =
+                    this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
+
+                let (left, left_len) = this.operand_to_simd(left)?;
+                let (right, right_len) = this.operand_to_simd(right)?;
+                let (dest, dest_len) = this.place_to_simd(dest)?;
+
+                assert_eq!(left_len, right_len);
+                assert_eq!(dest_len.checked_mul(2).unwrap(), left_len);
+
+                for i in 0..dest_len {
+                    let j1 = i.checked_mul(2).unwrap();
+                    let left1 = this.read_scalar(&this.project_index(&left, j1)?)?.to_i16()?;
+                    let right1 = this.read_scalar(&this.project_index(&right, j1)?)?.to_i16()?;
+
+                    let j2 = j1.checked_add(1).unwrap();
+                    let left2 = this.read_scalar(&this.project_index(&left, j2)?)?.to_i16()?;
+                    let right2 = this.read_scalar(&this.project_index(&right, j2)?)?.to_i16()?;
+
+                    let dest = this.project_index(&dest, i)?;
+
+                    // Multiplications are i16*i16->i32, which will not overflow.
+                    let mul1 = i32::from(left1).checked_mul(right1.into()).unwrap();
+                    let mul2 = i32::from(left2).checked_mul(right2.into()).unwrap();
+                    // However, this addition can overflow in the most extreme case
+                    // (-0x8000)*(-0x8000)+(-0x8000)*(-0x8000) = 0x80000000 (wraps to i32::MIN).
+                    let res = mul1.wrapping_add(mul2);
+
+                    this.write_scalar(Scalar::from_i32(res), &dest)?;
+                }
+            }
             // Used to implement the _mm_mulhi_epi16 and _mm_mulhi_epu16 functions.
             "pmulh.w" | "pmulhu.w" => {
                 let [left, right] =
diff --git a/tests/pass/intrinsics-x86-sse2.rs b/tests/pass/intrinsics-x86-sse2.rs
index fa9df04d36..2c7665bc73 100644
--- a/tests/pass/intrinsics-x86-sse2.rs
+++ b/tests/pass/intrinsics-x86-sse2.rs
@@ -70,6 +70,24 @@ mod tests {
         }
         test_mm_avg_epu16();
 
+        #[target_feature(enable = "sse2")]
+        unsafe fn test_mm_madd_epi16() {
+            let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+            let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
+            let r = _mm_madd_epi16(a, b);
+            let e = _mm_setr_epi32(29, 81, 149, 233);
+            assert_eq_m128i(r, e);
+
+            let a =
+                _mm_setr_epi16(i16::MAX, i16::MAX, i16::MIN, i16::MIN, i16::MIN, i16::MAX, 0, 0);
+            let b =
+                _mm_setr_epi16(i16::MAX, i16::MAX, i16::MIN, i16::MIN, i16::MAX, i16::MIN, 0, 0);
+            let r = _mm_madd_epi16(a, b);
+            let e = _mm_setr_epi32(0x7FFE0002, i32::MIN, -0x7FFF0000, 0);
+            assert_eq_m128i(r, e);
+        }
+        test_mm_madd_epi16();
+
         #[target_feature(enable = "sse2")]
         unsafe fn test_mm_mulhi_epi16() {
             let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));