//----------------------------------------------------------------------------------------------------------------------
// Per-lane frexp: splits x into a fraction and a power-of-two exponent so that x = fraction * 2^exponent.
//
//   x         input lanes (binary32)
//   exponent  out: (biased exponent field - 126) converted to float, forced to 0 in zero lanes
//   returns   x with its exponent field replaced by 126, i.e. magnitude in [0.5, 1) for normal inputs
//
// Only zero gets special treatment; denormals, infinities and NaN go through the same bit manipulation as
// normal numbers.
// NOTE(review): assumes simd_select(a, b, mask) takes a simd_vector mask and yields b where the mask is set,
// so zero lanes return x unchanged -- confirm against its definition.
static inline simd_vector simd_frexp(simd_vector x, simd_vector* exponent)
{
    const uint32x4_t bits = vreinterpretq_u32_f32(x);

    // 8-bit biased exponent field (bits 23..30)
    const uint32x4_t biased = vandq_u32(vshrq_n_u32(bits, 23), vdupq_n_u32(0xffu));

    // all-ones in lanes whose exponent field is 0 and which compare equal to zero
    const uint32x4_t is_zero = vandq_u32(vceqq_u32(biased, vdupq_n_u32(0u)),
                                         vceqq_f32(x, simd_splat_zero()));

    // remove the bias (126 rather than 127 because the fraction lives in [0.5, 1)), then clear zero lanes
    int32x4_t e = vsubq_s32(vreinterpretq_s32_u32(biased), vdupq_n_s32(0x7e));
    e = vbicq_s32(e, vreinterpretq_s32_u32(is_zero));

    // keep sign and mantissa (0x807fffff), force the exponent field to 126 (0x3f000000)
    const uint32x4_t fraction = vorrq_u32(vandq_u32(bits, vdupq_n_u32(0x807fffffu)),
                                          vdupq_n_u32(0x3f000000u));

    *exponent = vcvtq_f32_s32(e);
    return simd_select(vreinterpretq_f32_u32(fraction), x, vreinterpretq_f32_u32(is_zero));
}

//----------------------------------------------------------------------------------------------------------------------
// Per-lane ldexp: scales x by 2^pw2 by adding pw2 to the exponent field of x.
//
//   x        input lanes (binary32)
//   pw2      power of two to apply; converted to integer with truncation toward zero (vcvtq_s32_f32)
//   returns  x with the adjusted exponent field; lanes where x compares equal to zero are masked through
//            simd_andnot
//
// The new exponent field is reduced modulo 256, so results outside the representable exponent range wrap
// instead of saturating. When the new field is exactly 255 the mantissa is cleared, so the lane becomes a
// signed infinity rather than a NaN.
static inline simd_vector simd_ldexp(simd_vector x, simd_vector pw2)
{
    const uint32x4_t exponent_mask = vdupq_n_u32(0xffu);
    const uint32x4_t bits = vreinterpretq_u32_f32(x);

    // current biased exponent, then the shifted one (kept to 8 bits)
    const int32x4_t biased = vreinterpretq_s32_u32(vandq_u32(vshrq_n_u32(bits, 23), exponent_mask));
    const int32x4_t summed = vaddq_s32(biased, vcvtq_s32_f32(pw2));
    const uint32x4_t shifted = vandq_u32(vreinterpretq_u32_s32(summed), exponent_mask);

    // lanes that landed on the infinity/NaN exponent keep only their sign bit, others keep sign + mantissa
    const uint32x4_t is_infinity = vceqq_u32(shifted, exponent_mask);
    const uint32x4_t keep = vbslq_u32(is_infinity, vdupq_n_u32(0x80000000u), vdupq_n_u32(0x807fffffu));

    const uint32x4_t result = vorrq_u32(vshlq_n_u32(shifted, 23), vandq_u32(bits, keep));

    simd_vector equal_to_zero = simd_cmp_eq(x, simd_splat_zero());
    return simd_andnot(vreinterpretq_f32_u32(result), equal_to_zero);
}