From f50f06232399dd5a2bd1106577b16410171dc0d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Thu, 21 Sep 2023 18:25:41 +0200 Subject: [PATCH] Implement `llvm.ctpop.v*` intrinsics Tested through x86 avx512vpopcntdq and avx512bitalg functions. --- src/shims/foreign_items.rs | 23 ++++ tests/pass/intrinsics-x86-avx512bitalg.rs | 133 +++++++++++++++++++ tests/pass/intrinsics-x86-avx512vpopcntdq.rs | 124 +++++++++++++++++ 3 files changed, 280 insertions(+) create mode 100644 tests/pass/intrinsics-x86-avx512bitalg.rs create mode 100644 tests/pass/intrinsics-x86-avx512vpopcntdq.rs diff --git a/src/shims/foreign_items.rs b/src/shims/foreign_items.rs index 0e8eaf450e..f596df1f7a 100644 --- a/src/shims/foreign_items.rs +++ b/src/shims/foreign_items.rs @@ -1032,6 +1032,29 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { } } + // Used to implement the x86 `_mm{,256,512}_popcnt_epi{8,16,32,64}` and wasm + // `{i,u}8x16_popcnt` functions. + name if name.starts_with("llvm.ctpop.v") => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (op, op_len) = this.operand_to_simd(op)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, op_len); + + for i in 0..dest_len { + let op = this.read_immediate(&this.project_index(&op, i)?)?; + // Use `to_uint` to get a zero-extended `u128`. Those + // extra zeros will not affect `count_ones`. + let res = op.to_scalar().to_uint(op.layout.size)?.count_ones(); + + this.write_scalar( + Scalar::from_uint(res, op.layout.size), + &this.project_index(&dest, i)?, + )?; + } + } + name if name.starts_with("llvm.x86.sse.") => { return shims::x86::sse::EvalContextExt::emulate_x86_sse_intrinsic( this, link_name, abi, args, dest, diff --git a/tests/pass/intrinsics-x86-avx512bitalg.rs b/tests/pass/intrinsics-x86-avx512bitalg.rs new file mode 100644 index 0000000000..19f9b94ff0 --- /dev/null +++ b/tests/pass/intrinsics-x86-avx512bitalg.rs @@ -0,0 +1,133 @@ +// Ignore everything except x86 and x86_64 +// Any additional target are added to CI should be ignored here +//@ignore-target-aarch64 +//@ignore-target-arm +//@ignore-target-avr +//@ignore-target-s390x +//@ignore-target-thumbv7em +//@ignore-target-wasm32 +//@compile-flags: -C target-feature=+avx512bitalg,+avx512f,+avx512vl + +#![feature(avx512_target_feature)] +#![feature(stdsimd)] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::mem::transmute; + +fn main() { + assert!(is_x86_feature_detected!("avx512bitalg")); + assert!(is_x86_feature_detected!("avx512f")); + assert!(is_x86_feature_detected!("avx512vl")); + + unsafe { + test_avx512bitalg(); + } +} + +// Some of the constants in the tests below are just bit patterns. They should not +// be interpreted as integers; signedness does not make sense for them, but +// __mXXXi happens to be defined in terms of signed integers. +#[allow(overflowing_literals)] +#[target_feature(enable = "avx512bitalg,avx512f,avx512vl")] +unsafe fn test_avx512bitalg() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/avx512bitalg.rs + + #[target_feature(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_popcnt_epi16() { + let test_data = _mm512_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, 0xFF_FF, -1, -100, 255, 256, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024, 2048, + ); + let actual_result = _mm512_popcnt_epi16(test_data); + let reference_result = _mm512_set_epi16( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 12, 8, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ); + assert_eq_m512i(actual_result, reference_result); + } + test_mm512_popcnt_epi16(); + + #[target_feature(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi16() { + let test_data = _mm256_set_epi16( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1_FF, 0x3_FF, 0x7_FF, 0xF_FF, 0x1F_FF, + 0x3F_FF, 0x7F_FF, + ); + let actual_result = _mm256_popcnt_epi16(test_data); + let reference_result = + _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m256i(actual_result, reference_result); + } + test_mm256_popcnt_epi16(); + + #[target_feature(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi16() { + let test_data = _mm_set_epi16(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F); + let actual_result = _mm_popcnt_epi16(test_data); + let reference_result = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m128i(actual_result, reference_result); + } + test_mm_popcnt_epi16(); + + #[target_feature(enable = "avx512bitalg,avx512f")] + unsafe fn test_mm512_popcnt_epi8() { + let test_data = _mm512_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, 183, 154, 84, 56, 227, 189, + 140, 35, 117, 219, 169, 226, 170, 13, 22, 159, 251, 73, 121, 143, 145, 85, 91, 137, 90, + 225, 21, 249, 211, 155, 228, 70, + ); + let actual_result = _mm512_popcnt_epi8(test_data); + let reference_result = _mm512_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, 6, 4, 3, 3, 5, 6, 3, 3, 5, 6, 4, 4, 4, 3, 3, 6, 7, 3, 5, 5, 3, 4, 5, 3, 4, 4, + 3, 6, 5, 5, 4, 3, + ); + assert_eq_m512i(actual_result, reference_result); + } + test_mm512_popcnt_epi8(); + + #[target_feature(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi8() { + let test_data = _mm256_set_epi8( + 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64, 128, 171, 206, 100, + 217, 109, 253, 190, 177, 254, 179, 215, 230, 68, 201, 172, + ); + let actual_result = _mm256_popcnt_epi8(test_data); + let reference_result = _mm256_set_epi8( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1, 1, 5, 5, 3, 5, 5, 7, 6, 4, 7, 5, 6, 5, + 2, 4, 4, + ); + assert_eq_m256i(actual_result, reference_result); + } + test_mm256_popcnt_epi8(); + + #[target_feature(enable = "avx512bitalg,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi8() { + let test_data = + _mm_set_epi8(0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, -1, 2, 4, 8, 16, 32, 64); + let actual_result = _mm_popcnt_epi8(test_data); + let reference_result = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 1, 1, 1, 1, 1, 1); + assert_eq_m128i(actual_result, reference_result); + } + test_mm_popcnt_epi8(); +} + +#[track_caller] +unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { + assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b)) +} + +#[track_caller] +unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) +} + +#[track_caller] +unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +} diff --git a/tests/pass/intrinsics-x86-avx512vpopcntdq.rs b/tests/pass/intrinsics-x86-avx512vpopcntdq.rs new file mode 100644 index 0000000000..5c8821305e --- /dev/null +++ b/tests/pass/intrinsics-x86-avx512vpopcntdq.rs @@ -0,0 +1,124 @@ +// Ignore everything except x86 and x86_64 +// Any additional target are added to CI should be ignored here +//@ignore-target-aarch64 +//@ignore-target-arm +//@ignore-target-avr +//@ignore-target-s390x +//@ignore-target-thumbv7em +//@ignore-target-wasm32 +//@compile-flags: -C target-feature=+avx512vpopcntdq,+avx512f,+avx512vl + +#![feature(avx512_target_feature)] +#![feature(stdsimd)] + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +use std::mem::transmute; + +fn main() { + assert!(is_x86_feature_detected!("avx512vpopcntdq")); + assert!(is_x86_feature_detected!("avx512f")); + assert!(is_x86_feature_detected!("avx512vl")); + + unsafe { + test_avx512vpopcntdq(); + } +} + +#[target_feature(enable = "avx512vpopcntdq,avx512f,avx512vl")] +unsafe fn test_avx512vpopcntdq() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/avx512vpopcntdq.rs + + #[target_feature(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_popcnt_epi32() { + let test_data = _mm512_set_epi32( + 0, + 1, + -1, + 2, + 7, + 0xFF_FE, + 0x7F_FF_FF_FF, + -100, + 0x40_00_00_00, + 103, + 371, + 552, + 432_948, + 818_826_998, + 255, + 256, + ); + let actual_result = _mm512_popcnt_epi32(test_data); + let reference_result = + _mm512_set_epi32(0, 1, 32, 1, 3, 15, 31, 28, 1, 5, 6, 3, 10, 17, 8, 1); + assert_eq_m512i(actual_result, reference_result); + } + test_mm512_popcnt_epi32(); + + #[target_feature(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm256_popcnt_epi32() { + let test_data = _mm256_set_epi32(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF, -100); + let actual_result = _mm256_popcnt_epi32(test_data); + let reference_result = _mm256_set_epi32(0, 1, 32, 1, 3, 15, 31, 28); + assert_eq_m256i(actual_result, reference_result); + } + test_mm256_popcnt_epi32(); + + #[target_feature(enable = "avx512vpopcntdq,avx512f,avx512vl")] + unsafe fn test_mm_popcnt_epi32() { + let test_data = _mm_set_epi32(0, 1, -1, -100); + let actual_result = _mm_popcnt_epi32(test_data); + let reference_result = _mm_set_epi32(0, 1, 32, 28); + assert_eq_m128i(actual_result, reference_result); + } + test_mm_popcnt_epi32(); + + #[target_feature(enable = "avx512vpopcntdq,avx512f")] + unsafe fn test_mm512_popcnt_epi64() { + let test_data = _mm512_set_epi64(0, 1, -1, 2, 7, 0xFF_FE, 0x7F_FF_FF_FF_FF_FF_FF_FF, -100); + let actual_result = _mm512_popcnt_epi64(test_data); + let reference_result = _mm512_set_epi64(0, 1, 64, 1, 3, 15, 63, 60); + assert_eq_m512i(actual_result, reference_result); + } + test_mm512_popcnt_epi64(); + + #[target_feature(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm256_popcnt_epi64() { + let test_data = _mm256_set_epi64x(0, 1, -1, -100); + let actual_result = _mm256_popcnt_epi64(test_data); + let reference_result = _mm256_set_epi64x(0, 1, 64, 60); + assert_eq_m256i(actual_result, reference_result); + } + test_mm256_popcnt_epi64(); + + #[target_feature(enable = "avx512vpopcntdq,avx512vl")] + unsafe fn test_mm_popcnt_epi64() { + let test_data = _mm_set_epi64x(0, 1); + let actual_result = _mm_popcnt_epi64(test_data); + let reference_result = _mm_set_epi64x(0, 1); + assert_eq_m128i(actual_result, reference_result); + let test_data = _mm_set_epi64x(-1, -100); + let actual_result = _mm_popcnt_epi64(test_data); + let reference_result = _mm_set_epi64x(64, 60); + assert_eq_m128i(actual_result, reference_result); + } + test_mm_popcnt_epi64(); +} + +#[track_caller] +unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { + assert_eq!(transmute::<_, [i32; 16]>(a), transmute::<_, [i32; 16]>(b)) +} + +#[track_caller] +unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) +} + +#[track_caller] +unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +}