From b0dc2adebf8ecf9c8f127d9f1ed78a1875719e51 Mon Sep 17 00:00:00 2001 From: kennytm Date: Thu, 18 Jan 2024 18:43:04 +0800 Subject: [PATCH] pclmulqdq: Upgrade rust version to use stabilized AArch64 intrinsics. (#10) * pclmulqdq: Upgrade rust version to use stabilized AArch64 intrinsics. Signed-off-by: kennytm * test: upgrade crc dependency from 1.8 to 3.0 Benchmarking shows the throughput of crc v3 on x86 is essentially unchanged. Not going to update the numbers. Signed-off-by: kennytm --------- Signed-off-by: kennytm --- Cargo.toml | 14 ++++---- README.md | 15 ++------- benches/benchmark.rs | 14 ++++---- src/lib.rs | 11 +++---- src/pclmulqdq/aarch64.rs | 70 ++++++---------------------------------- src/pclmulqdq/mod.rs | 6 ++-- 6 files changed, 35 insertions(+), 95 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5832066..f4424d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,27 +1,29 @@ [package] name = "crc64fast" -version = "1.0.0" +version = "1.1.0" authors = ["The TiKV Project Developers"] license = "MIT OR Apache-2.0" -edition = "2018" +edition = "2021" keywords = ["crc", "crc64", "simd", "checksum"] repository = "https://github.com/tikv/crc64fast" description = "SIMD accelerated CRC64 calculation" exclude = ["build_table.rs"] readme = "README.md" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +# Note: Rust 1.70 upgraded LLVM version to 16 (in particular https://reviews.llvm.org/D131047) +# Before that, the compiler was unwilling to generate the PMULL2 instruction on AArch64. +rust-version = "1.70.0" [dependencies] [dev-dependencies] -crc = "1" +crc = "3" proptest = "1" -criterion = "0.3" +criterion = "0.5" rand = "0.8" [features] -pmull = [] +pmull = [] # deprecated, no longer has any effect. 
fake-simd = [] [[bench]] diff --git a/README.md b/README.md index 70745d1..96321dd 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ crc64fast [![Latest Version](https://img.shields.io/crates/v/crc64fast.svg)](https://crates.io/crates/crc64fast) [![Documentation](https://img.shields.io/badge/api-rustdoc-blue.svg)](https://docs.rs/crc64fast) -SIMD-accelerated CRC-64-ECMA computation +SIMD-accelerated CRC-64/XZ (a.k.a. CRC-64/GO-ECMA) computation (similar to [`crc32fast`](https://crates.io/crates/crc32fast)). ## Usage @@ -32,20 +32,11 @@ be chosen based on CPU feature at runtime. | Algorithm | Throughput (x86_64) | Throughput (aarch64) | |:------------------|--------------------:|---------------------:| -| [crc 1.8.1] | 0.5 GiB/s | 0.3 GiB/s | +| [crc 3.0.1] | 0.5 GiB/s | 0.3 GiB/s | | crc64fast (table) | 2.3 GiB/s | 1.8 GiB/s | | crc64fast (simd) | 28.2 GiB/s | 20.0 GiB/s | -[crc 1.8.1]: https://crates.io/crates/crc - -> **Note:** Since Rust has not stabilized SIMD support on AArch64, you need a -> nightly compiler and enable the `pmull` feature to use the SIMD-based -> implementation: -> -> ```toml -> [dependencies] -> crc64fast = { version = "1.0", features = ["pmull"] } -> ``` +[crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html ## TODO diff --git a/benches/benchmark.rs b/benches/benchmark.rs index b1dd75f..bb74fcc 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -1,9 +1,11 @@ // Copyright 2019 TiKV Project Authors. Licensed under MIT or Apache-2.0. 
-use crc::crc64::{self, Hasher64}; +use crc::{Crc, CRC_64_XZ}; use criterion::*; use rand::{thread_rng, RngCore}; +const CRC: Crc<u64> = Crc::<u64>::new(&CRC_64_XZ); + fn bench_crc(c: &mut Criterion) { let mut group = c.benchmark_group("CRC64"); let mut rng = thread_rng(); @@ -15,11 +17,11 @@ fn bench_crc(c: &mut Criterion) { group.throughput(Throughput::Bytes(3 << size)); group.bench_with_input(BenchmarkId::new("crc::crc64", size), &buf, |b, buf| { b.iter(|| { - let mut digest = crc64::Digest::new(crc64::ECMA); - digest.write(&buf[..(1 << size)]); - digest.write(&buf[(1 << size)..(2 << size)]); - digest.write(&buf[(2 << size)..]); - digest.sum64() + let mut digest = CRC.digest(); + digest.update(&buf[..(1 << size)]); + digest.update(&buf[(1 << size)..(2 << size)]); + digest.update(&buf[(2 << size)..]); + digest.finalize() }) }); group.bench_with_input(BenchmarkId::new("crc64fast::simd", size), &buf, |b, buf| { diff --git a/src/lib.rs b/src/lib.rs index 847f2c1..98b9c61 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,11 +18,6 @@ //! assert_eq!(checksum, 0x8483_c0fa_3260_7d61); //! ``` -#![cfg_attr( - feature = "pmull", - feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm) -)] - mod pclmulqdq; mod table; @@ -75,10 +70,12 @@ impl Default for Digest { #[cfg(test)] mod tests { use super::Digest; - use crc::crc64::checksum_ecma; + use crc::{Crc, CRC_64_XZ}; use proptest::collection::size_range; use proptest::prelude::*; + const CRC: Crc<u64> = Crc::<u64>::new(&CRC_64_XZ); + #[test] fn test_standard_vectors() { static CASES: &[(&[u8], u64)] = &[ @@ -120,7 +117,7 @@ mod tests { fn equivalent_to_crc(bytes in any_buffer()) { let mut hasher = Digest::new(); hasher.write(&bytes); - prop_assert_eq!(hasher.sum64(), checksum_ecma(&bytes)); + prop_assert_eq!(hasher.sum64(), CRC.checksum(&bytes)); } #[test] diff --git a/src/pclmulqdq/aarch64.rs b/src/pclmulqdq/aarch64.rs index c368200..fb7d2ae 100644 --- a/src/pclmulqdq/aarch64.rs +++ b/src/pclmulqdq/aarch64.rs @@ -2,7 +2,7 @@ //! 
AArch64 implementation of the PCLMULQDQ-based CRC calculation. -use std::arch::aarch64::*; +use std::arch::{aarch64::*, is_aarch64_feature_detected}; use std::mem::transmute; use std::ops::BitXor; @@ -10,6 +10,9 @@ use std::ops::BitXor; #[derive(Copy, Clone, Debug)] pub struct Simd(uint8x16_t); +#[allow(non_camel_case_types)] +type poly64_t = u64; + impl Simd { #[inline] #[target_feature(enable = "neon")] @@ -52,34 +55,12 @@ impl super::SimdExt for Simd { } #[inline] - #[target_feature(enable = "crypto", enable = "neon")] + #[target_feature(enable = "aes", enable = "neon")] unsafe fn fold_16(self, coeff: Self) -> Self { - let h: Self; - let l: Self; - - // FIXME: When used as a single function, this branch is equivalent to - // the ASM below. However, when fold_16 is called inside a loop, for - // some reason LLVM replaces the PMULL2 call with a plain PMULL, which - // leads unnecessary FMOV calls and slows down the throughput from - // 20 GiB/s to 14 GiB/s. This bug does not exist with GCC. Delete the - // ASM code once this misoptimization is fixed. - #[cfg(slow)] - { - let [x0, x1] = self.into_poly64s(); - let [c0, c1] = coeff.into_poly64s(); - h = Self::from_mul(c0, x0); - l = Self::from_mul(c1, x1); - } - #[cfg(not(slow))] - { - llvm_asm!( - "pmull $0.1q, $2.1d, $3.1d - pmull2 $1.1q, $2.2d, $3.2d" - : "=&w"(l), "=w"(h) - : "w"(self), "w"(coeff) - ); - } - + let [x0, x1] = self.into_poly64s(); + let [c0, c1] = coeff.into_poly64s(); + let h = Self::from_mul(c0, x0); + let l = Self::from_mul(c1, x1); h ^ l } @@ -110,36 +91,3 @@ impl BitXor for Simd { unsafe { Self(veorq_u8(self.0, other.0)) } } } - -//------------------------------------------------------------------------------ -// -// Below are intrinsics not yet included in Rust. 
- -extern "platform-intrinsic" { - fn simd_extract<T, U>(x: T, idx: u32) -> U; -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vgetq_lane_p64(a: poly64x2_t, idx: u32) -> poly64_t { - let elem: i64 = simd_extract(a, idx); - transmute(elem) -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vreinterpretq_u8_p128(a: poly128_t) -> uint8x16_t { - transmute(a) -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t { - transmute(a) -} - -#[inline] -#[target_feature(enable = "neon")] -unsafe fn vcreate_u8(value: u64) -> uint8x8_t { - transmute(value) -} diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs index b329808..a8efe32 100644 --- a/src/pclmulqdq/mod.rs +++ b/src/pclmulqdq/mod.rs @@ -9,7 +9,7 @@ #[cfg(not(feature = "fake-simd"))] #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")] -#[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")] +#[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")] mod arch; #[cfg(feature = "fake-simd")] @@ -93,8 +93,8 @@ fn update(mut state: u64, bytes: &[u8]) -> u64 { target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1") )] #[cfg_attr( - all(target_arch = "aarch64", feature = "pmull"), - target_feature(enable = "crypto", enable = "neon") + target_arch = "aarch64", + target_feature(enable = "aes", enable = "neon") )] unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64 { // receive the initial 128 bytes of data