From cff1af907ffecdcfc0a45fde17472084f4492645 Mon Sep 17 00:00:00 2001 From: redzic Date: Wed, 15 Sep 2021 14:44:13 -0500 Subject: [PATCH] Add AVX2 implementation of `first_max_element` This also now requires BMI1 and BMI2 for AVX2 in `CpuFeatureLevel`. --- src/asm/x86/dist/hbd.rs | 4 +- src/asm/x86/lrf.rs | 20 ++++---- src/asm/x86/quantize.rs | 2 +- src/asm/x86/transform/forward.rs | 38 +++++++-------- src/cdef.rs | 82 +++++++++++++++++++++++++------- src/cpu_features/x86.rs | 5 +- 6 files changed, 100 insertions(+), 51 deletions(-) diff --git a/src/asm/x86/dist/hbd.rs b/src/asm/x86/dist/hbd.rs index 43101874d1..210dabd0b6 100644 --- a/src/asm/x86/dist/hbd.rs +++ b/src/asm/x86/dist/hbd.rs @@ -5,7 +5,7 @@ macro_rules! satd_hbd_avx2 { ($(($W:expr, $H:expr)),*) => { $( paste::item! { - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] pub(crate) unsafe extern fn []( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> u32 { @@ -43,7 +43,7 @@ macro_rules! satd_kernel_hbd_avx2 { ($(($W:expr, $H:expr)),*) => { $( paste::item! { - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] unsafe extern fn []( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> u64 { diff --git a/src/asm/x86/lrf.rs b/src/asm/x86/lrf.rs index d1cc0dd237..a052b6b522 100644 --- a/src/asm/x86/lrf.rs +++ b/src/asm/x86/lrf.rs @@ -158,7 +158,7 @@ static X_BY_XPLUS1: [u32; 256] = [ ]; #[inline] -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn sgrproj_box_ab_8_avx2( r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize, @@ -169,7 +169,7 @@ unsafe fn sgrproj_box_ab_8_avx2( // Using an integral image, compute the sum of a square region #[inline] - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn get_integral_square_avx2( iimg: &[u32], stride: usize, x: usize, y: usize, size: usize, ) -> __m256i { @@ -234,7 +234,7 @@ unsafe fn sgrproj_box_ab_8_avx2( _mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b); } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, @@ -293,7 +293,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( } } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, @@ -353,7 +353,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( } #[inline] -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn sgrproj_box_f_r0_8_avx2( f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice, ) { @@ -374,7 +374,7 @@ unsafe fn sgrproj_box_f_r0_8_avx2( ); } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] pub(crate) unsafe fn sgrproj_box_f_r0_avx2( f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, ) { @@ -396,7 +396,7 @@ pub(crate) unsafe fn sgrproj_box_f_r0_avx2( } #[inline] -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn sgrproj_box_f_r1_8_avx2( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice, @@ -496,7 +496,7 @@ unsafe fn sgrproj_box_f_r1_8_avx2( ); } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] pub(crate) unsafe fn sgrproj_box_f_r1_avx2( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, @@ -519,7 +519,7 @@ pub(crate) unsafe fn sgrproj_box_f_r1_avx2( } #[inline] -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn sgrproj_box_f_r2_8_avx2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice, @@ -618,7 +618,7 @@ unsafe fn sgrproj_box_f_r2_8_avx2( ); } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] pub(crate) unsafe fn sgrproj_box_f_r2_avx2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index 83e3986a81..878c657ba1 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -85,7 +85,7 @@ pub fn dequantize( } } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn dequantize_avx2( qindex: u8, coeffs_ptr: *const i16, _eob: usize, rcoeffs_ptr: *mut i16, tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs index 5f6c9e8d05..e0402139b5 100644 --- a/src/asm/x86/transform/forward.rs +++ b/src/asm/x86/transform/forward.rs @@ -63,13 +63,13 @@ struct I32X8 { } impl I32X8 { - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] const unsafe fn vec(self) -> __m256i { self.data } - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] const unsafe fn new(a: __m256i) -> I32X8 { I32X8 { data: a } @@ -77,13 +77,13 @@ impl I32X8 { } impl TxOperations for I32X8 { - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn zero() -> Self { I32X8::new(_mm256_setzero_si256()) } - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn tx_mul(self, mul: (i32, i32)) -> Self { I32X8::new(_mm256_srav_epi32( @@ -95,7 +95,7 @@ impl TxOperations for I32X8 { )) } - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn rshift1(self) -> Self { I32X8::new(_mm256_srai_epi32( @@ -107,34 +107,34 @@ impl TxOperations for I32X8 { )) } - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn add(self, b: Self) -> Self { I32X8::new(_mm256_add_epi32(self.vec(), b.vec())) } - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn sub(self, b: Self) -> Self { I32X8::new(_mm256_sub_epi32(self.vec(), b.vec())) } - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn add_avg(self, b: Self) -> Self { I32X8::new(_mm256_srai_epi32(_mm256_add_epi32(self.vec(), b.vec()), 1)) } - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn sub_avg(self, b: Self) -> Self { I32X8::new(_mm256_srai_epi32(_mm256_sub_epi32(self.vec(), b.vec()), 1)) } } -impl_1d_tx!(target_feature(enable = "avx2"), unsafe); +impl_1d_tx!(target_feature(enable = "avx2,bmi1,bmi2"), unsafe); -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn transpose_8x8_avx2( input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8), ) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) { @@ -175,7 +175,7 @@ unsafe fn transpose_8x8_avx2( ) } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn transpose_8x4_avx2( input: (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8), ) -> (I32X8, I32X8, I32X8, I32X8) { @@ -213,7 +213,7 @@ unsafe fn transpose_8x4_avx2( ) } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn transpose_4x8_avx2( input: (I32X8, I32X8, I32X8, I32X8), ) -> (I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8, I32X8) { @@ -246,7 +246,7 @@ unsafe fn transpose_4x8_avx2( ) } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn transpose_4x4_avx2( input: (I32X8, I32X8, I32X8, I32X8), ) -> (I32X8, I32X8, I32X8, I32X8) { @@ -265,13 +265,13 @@ unsafe fn transpose_4x4_avx2( ) } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn shift_left(a: I32X8, shift: u8) -> I32X8 { I32X8::new(_mm256_sllv_epi32(a.vec(), _mm256_set1_epi32(shift as i32))) } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 { I32X8::new(_mm256_srav_epi32( @@ -280,7 +280,7 @@ unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 { )) } -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn round_shift_array_avx2(arr: &mut [I32X8], size: usize, bit: i8) { if bit == 0 { @@ -328,7 +328,7 @@ impl SizeClass1D { } #[allow(clippy::identity_op, clippy::erasing_op)] -#[target_feature(enable = "avx2")] +#[target_feature(enable = "avx2,bmi1,bmi2")] unsafe fn forward_transform_avx2( input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize, tx_type: TxType, bd: usize, @@ -355,7 +355,7 @@ unsafe fn forward_transform_avx2( // Columns for cg in (0..txfm_size_col).step_by(8) { let shift = cfg.shift[0] as u8; - #[target_feature(enable = "avx2")] + #[target_feature(enable = "avx2,bmi1,bmi2")] #[inline] unsafe fn load_columns(input_ptr: *const i16, shift: u8) -> I32X8 { // TODO: load 64-bits for x4 wide columns diff --git a/src/cdef.rs b/src/cdef.rs index 08a222677b..00b6eee6e6 100644 --- a/src/cdef.rs +++ b/src/cdef.rs @@ -59,20 +59,57 @@ pub(crate) mod rust { /// /// # Arguments /// - /// * `elems` - A non-empty slice of integers - /// - /// # Panics - /// - /// Panics if `elems` is empty + /// * `elems` - A slice of 8 `i32`s #[inline] - fn first_max_element(elems: &[i32]) -> (usize, i32) { - // In case of a tie, the first element must be selected. - let (max_idx, max_value) = elems - .iter() - .enumerate() - .max_by_key(|&(i, v)| (v, -(i as isize))) - .unwrap(); - (max_idx, *max_value) + fn first_max_element( + elems: &[i32; 8], cpu: CpuFeatureLevel, + ) -> (usize, i32) { + // Same as `first_max_element`, but implemented with AVX2 intrinsics + #[inline] + #[cfg(nasm_x86_64)] + #[target_feature(enable = "avx2,bmi1,bmi2")] + unsafe fn first_max_element_avx2(elems: &[i32; 8]) -> (usize, i32) { + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + // the compiler autovectorizes this + let max_val = *elems.iter().max().unwrap(); + + let cmp = _mm256_cmpeq_epi32( + _mm256_loadu_si256(elems as *const i32 as *const _), + _mm256_set1_epi32(max_val), + ); + // this intrinsic is supposed to be for floating point, but it works + // fine on integer data as well + let mask = _mm256_movemask_ps(std::mem::transmute(cmp)); + + (mask.trailing_zeros() as usize, max_val) + } + + #[inline] + fn first_max_element(elems: &[i32; 8]) -> (usize, i32) { + // In case of a tie, the first element must be selected. + let (max_idx, max_value) = elems + .iter() + .enumerate() + .max_by_key(|&(i, v)| (v, -(i as isize))) + .unwrap(); + (max_idx, *max_value) + } + + #[cfg(nasm_x86_64)] + if cpu >= CpuFeatureLevel::AVX2 { + let result = unsafe { first_max_element_avx2(elems) }; + + #[cfg(feature = "check_asm")] + assert_eq!(result, first_max_element(elems)); + + return result; + } + + first_max_element(elems) } // Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on. @@ -84,7 +121,7 @@ pub(crate) mod rust { // http://jmvalin.ca/notes/intra_paint.pdf pub fn cdef_find_dir( img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize, - _cpu: CpuFeatureLevel, + cpu: CpuFeatureLevel, ) -> i32 { let mut cost: [i32; 8] = [0; 8]; let mut partial: [[i32; 15]; 8] = [[0; 15]; 8]; @@ -133,7 +170,7 @@ pub(crate) mod rust { } } - let (best_dir, best_cost) = first_max_element(&cost); + let (best_dir, best_cost) = first_max_element(&cost, cpu); // Difference between the optimal variance and the variance along the // orthogonal direction. Again, the sum(x^2) terms cancel out. // We'd normally divide by 840, but dividing by 1024 is close enough @@ -305,9 +342,18 @@ pub(crate) mod rust { #[test] fn check_max_element() { - assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6]), (6, 6)); - assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6]), (6, 7)); - assert_eq!(first_max_element(&[0, 0]), (0, 0)); + assert_eq!( + first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6], CpuFeatureLevel::RUST), + (6, 6) + ); + assert_eq!( + first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6], CpuFeatureLevel::RUST), + (6, 7) + ); + assert_eq!( + first_max_element(&[0, 0, 0, 0, 0, 0, 0, 0], CpuFeatureLevel::RUST), + (0, 0) + ); } } } diff --git a/src/cpu_features/x86.rs b/src/cpu_features/x86.rs index eb47cf9f43..6a3113c54a 100644 --- a/src/cpu_features/x86.rs +++ b/src/cpu_features/x86.rs @@ -62,7 +62,10 @@ impl Default for CpuFeatureLevel { CpuFeatureLevel::AVX512ICL } else if avx512_detected() { CpuFeatureLevel::AVX512 - } else if is_x86_feature_detected!("avx2") { + } else if is_x86_feature_detected!("avx2") + && is_x86_feature_detected!("bmi1") + && is_x86_feature_detected!("bmi2") + { CpuFeatureLevel::AVX2 } else if is_x86_feature_detected!("sse4.1") { CpuFeatureLevel::SSE4_1