From b0dc2adebf8ecf9c8f127d9f1ed78a1875719e51 Mon Sep 17 00:00:00 2001
From: kennytm <kennytm@gmail.com>
Date: Thu, 18 Jan 2024 18:43:04 +0800
Subject: [PATCH] pclmulqdq: Upgrade rust version to use stabilized AArch64
 intrinsics. (#10)

* pclmulqdq: Upgrade rust version to use stabilized AArch64 intrinsics.

Signed-off-by: kennytm <kennytm@gmail.com>

* test: upgrade crc dependency from 1.8 to 3.0

Benchmarking shows the throughput of crc v3 on x86 is essentially
unchanged, so the throughput numbers in the README are not updated.

Signed-off-by: kennytm <kennytm@gmail.com>

---------

Signed-off-by: kennytm <kennytm@gmail.com>
---
 Cargo.toml               | 14 ++++----
 README.md                | 15 ++-------
 benches/benchmark.rs     | 14 ++++----
 src/lib.rs               | 11 +++----
 src/pclmulqdq/aarch64.rs | 70 ++++++----------------------------------
 src/pclmulqdq/mod.rs     |  6 ++--
 6 files changed, 35 insertions(+), 95 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 5832066..f4424d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,27 +1,29 @@
 [package]
 name = "crc64fast"
-version = "1.0.0"
+version = "1.1.0"
 authors = ["The TiKV Project Developers"]
 license = "MIT OR Apache-2.0"
-edition = "2018"
+edition = "2021"
 keywords = ["crc", "crc64", "simd", "checksum"]
 repository = "https://github.com/tikv/crc64fast"
 description = "SIMD accelerated CRC64 calculation"
 exclude = ["build_table.rs"]
 readme = "README.md"
 
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+# Note: Rust 1.70 upgraded LLVM to version 16 (in particular, https://reviews.llvm.org/D131047).
+# Before that, the compiler was unwilling to generate the PMULL2 instruction on AArch64.
+rust-version = "1.70.0"
 
 [dependencies]
 
 [dev-dependencies]
-crc = "1"
+crc = "3"
 proptest = "1"
-criterion = "0.3"
+criterion = "0.5"
 rand = "0.8"
 
 [features]
-pmull = []
+pmull = [] # deprecated, no longer has any effect.
 fake-simd = []
 
 [[bench]]
diff --git a/README.md b/README.md
index 70745d1..96321dd 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ crc64fast
 [![Latest Version](https://img.shields.io/crates/v/crc64fast.svg)](https://crates.io/crates/crc64fast)
 [![Documentation](https://img.shields.io/badge/api-rustdoc-blue.svg)](https://docs.rs/crc64fast)
 
-SIMD-accelerated CRC-64-ECMA computation
+SIMD-accelerated CRC-64/XZ (a.k.a. CRC-64/GO-ECMA) computation
 (similar to [`crc32fast`](https://crates.io/crates/crc32fast)).
 
 ## Usage
@@ -32,20 +32,11 @@ be chosen based on CPU feature at runtime.
 
 | Algorithm         | Throughput (x86_64) | Throughput (aarch64) |
 |:------------------|--------------------:|---------------------:|
-| [crc 1.8.1]       |  0.5 GiB/s          |  0.3 GiB/s           |
+| [crc 3.0.1]       |  0.5 GiB/s          |  0.3 GiB/s           |
 | crc64fast (table) |  2.3 GiB/s          |  1.8 GiB/s           |
 | crc64fast (simd)  | 28.2 GiB/s          | 20.0 GiB/s           |
 
-[crc 1.8.1]: https://crates.io/crates/crc
-
-> **Note:** Since Rust has not stabilized SIMD support on AArch64, you need a
-> nightly compiler and enable the `pmull` feature to use the SIMD-based
-> implementation:
->
-> ```toml
-> [dependencies]
-> crc64fast = { version = "1.0", features = ["pmull"] }
-> ```
+[crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html
 
 ## TODO
 
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
index b1dd75f..bb74fcc 100644
--- a/benches/benchmark.rs
+++ b/benches/benchmark.rs
@@ -1,9 +1,11 @@
 // Copyright 2019 TiKV Project Authors. Licensed under MIT or Apache-2.0.
 
-use crc::crc64::{self, Hasher64};
+use crc::{Crc, CRC_64_XZ};
 use criterion::*;
 use rand::{thread_rng, RngCore};
 
+const CRC: Crc<u64> = Crc::<u64>::new(&CRC_64_XZ);
+
 fn bench_crc(c: &mut Criterion) {
     let mut group = c.benchmark_group("CRC64");
     let mut rng = thread_rng();
@@ -15,11 +17,11 @@ fn bench_crc(c: &mut Criterion) {
         group.throughput(Throughput::Bytes(3 << size));
         group.bench_with_input(BenchmarkId::new("crc::crc64", size), &buf, |b, buf| {
             b.iter(|| {
-                let mut digest = crc64::Digest::new(crc64::ECMA);
-                digest.write(&buf[..(1 << size)]);
-                digest.write(&buf[(1 << size)..(2 << size)]);
-                digest.write(&buf[(2 << size)..]);
-                digest.sum64()
+                let mut digest = CRC.digest();
+                digest.update(&buf[..(1 << size)]);
+                digest.update(&buf[(1 << size)..(2 << size)]);
+                digest.update(&buf[(2 << size)..]);
+                digest.finalize()
             })
         });
         group.bench_with_input(BenchmarkId::new("crc64fast::simd", size), &buf, |b, buf| {
diff --git a/src/lib.rs b/src/lib.rs
index 847f2c1..98b9c61 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,11 +18,6 @@
 //! assert_eq!(checksum, 0x8483_c0fa_3260_7d61);
 //! ```
 
-#![cfg_attr(
-    feature = "pmull",
-    feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm)
-)]
-
 mod pclmulqdq;
 mod table;
 
@@ -75,10 +70,12 @@ impl Default for Digest {
 #[cfg(test)]
 mod tests {
     use super::Digest;
-    use crc::crc64::checksum_ecma;
+    use crc::{Crc, CRC_64_XZ};
     use proptest::collection::size_range;
     use proptest::prelude::*;
 
+    const CRC: Crc<u64> = Crc::<u64>::new(&CRC_64_XZ);
+
     #[test]
     fn test_standard_vectors() {
         static CASES: &[(&[u8], u64)] = &[
@@ -120,7 +117,7 @@ mod tests {
         fn equivalent_to_crc(bytes in any_buffer()) {
             let mut hasher = Digest::new();
             hasher.write(&bytes);
-            prop_assert_eq!(hasher.sum64(), checksum_ecma(&bytes));
+            prop_assert_eq!(hasher.sum64(), CRC.checksum(&bytes));
         }
 
         #[test]
diff --git a/src/pclmulqdq/aarch64.rs b/src/pclmulqdq/aarch64.rs
index c368200..fb7d2ae 100644
--- a/src/pclmulqdq/aarch64.rs
+++ b/src/pclmulqdq/aarch64.rs
@@ -2,7 +2,7 @@
 
 //! AArch64 implementation of the PCLMULQDQ-based CRC calculation.
 
-use std::arch::aarch64::*;
+use std::arch::{aarch64::*, is_aarch64_feature_detected};
 use std::mem::transmute;
 use std::ops::BitXor;
 
@@ -10,6 +10,9 @@ use std::ops::BitXor;
 #[derive(Copy, Clone, Debug)]
 pub struct Simd(uint8x16_t);
 
+#[allow(non_camel_case_types)]
+type poly64_t = u64;
+
 impl Simd {
     #[inline]
     #[target_feature(enable = "neon")]
@@ -52,34 +55,12 @@ impl super::SimdExt for Simd {
     }
 
     #[inline]
-    #[target_feature(enable = "crypto", enable = "neon")]
+    #[target_feature(enable = "aes", enable = "neon")]
     unsafe fn fold_16(self, coeff: Self) -> Self {
-        let h: Self;
-        let l: Self;
-
-        // FIXME: When used as a single function, this branch is equivalent to
-        // the ASM below. However, when fold_16 is called inside a loop, for
-        // some reason LLVM replaces the PMULL2 call with a plain PMULL, which
-        // leads unnecessary FMOV calls and slows down the throughput from
-        // 20 GiB/s to 14 GiB/s. This bug does not exist with GCC. Delete the
-        // ASM code once this misoptimization is fixed.
-        #[cfg(slow)]
-        {
-            let [x0, x1] = self.into_poly64s();
-            let [c0, c1] = coeff.into_poly64s();
-            h = Self::from_mul(c0, x0);
-            l = Self::from_mul(c1, x1);
-        }
-        #[cfg(not(slow))]
-        {
-            llvm_asm!(
-                "pmull $0.1q, $2.1d, $3.1d
-                pmull2 $1.1q, $2.2d, $3.2d"
-                : "=&w"(l), "=w"(h)
-                : "w"(self), "w"(coeff)
-            );
-        }
-
+        let [x0, x1] = self.into_poly64s();
+        let [c0, c1] = coeff.into_poly64s();
+        let h = Self::from_mul(c0, x0);
+        let l = Self::from_mul(c1, x1);
         h ^ l
     }
 
@@ -110,36 +91,3 @@ impl BitXor for Simd {
         unsafe { Self(veorq_u8(self.0, other.0)) }
     }
 }
-
-//------------------------------------------------------------------------------
-//
-// Below are intrinsics not yet included in Rust.
-
-extern "platform-intrinsic" {
-    fn simd_extract<T, U>(x: T, idx: u32) -> U;
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vgetq_lane_p64(a: poly64x2_t, idx: u32) -> poly64_t {
-    let elem: i64 = simd_extract(a, idx);
-    transmute(elem)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vreinterpretq_u8_p128(a: poly128_t) -> uint8x16_t {
-    transmute(a)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t {
-    transmute(a)
-}
-
-#[inline]
-#[target_feature(enable = "neon")]
-unsafe fn vcreate_u8(value: u64) -> uint8x8_t {
-    transmute(value)
-}
diff --git a/src/pclmulqdq/mod.rs b/src/pclmulqdq/mod.rs
index b329808..a8efe32 100644
--- a/src/pclmulqdq/mod.rs
+++ b/src/pclmulqdq/mod.rs
@@ -9,7 +9,7 @@
 
 #[cfg(not(feature = "fake-simd"))]
 #[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")]
-#[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")]
+#[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
 mod arch;
 
 #[cfg(feature = "fake-simd")]
@@ -93,8 +93,8 @@ fn update(mut state: u64, bytes: &[u8]) -> u64 {
     target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")
 )]
 #[cfg_attr(
-    all(target_arch = "aarch64", feature = "pmull"),
-    target_feature(enable = "crypto", enable = "neon")
+    target_arch = "aarch64",
+    target_feature(enable = "aes", enable = "neon")
 )]
 unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64 {
     // receive the initial 128 bytes of data