Skip to content

Commit

Permalink
pclmulqdq: Upgrade rust version to use stabilized AArch64 intrinsics. (#10)
Browse files Browse the repository at this point in the history

* pclmulqdq: Upgrade rust version to use stabilized AArch64 intrinsics.

Signed-off-by: kennytm <[email protected]>

* test: upgrade crc dependency from 1.8 to 3.0

Benchmarking shows the throughput of crc v3 on x86 is essentially
unchanged, so the benchmark numbers in the README are not updated.

Signed-off-by: kennytm <[email protected]>

---------

Signed-off-by: kennytm <[email protected]>
  • Loading branch information
kennytm authored Jan 18, 2024
1 parent baedd67 commit b0dc2ad
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 95 deletions.
14 changes: 8 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
[package]
name = "crc64fast"
version = "1.0.0"
version = "1.1.0"
authors = ["The TiKV Project Developers"]
license = "MIT OR Apache-2.0"
edition = "2018"
edition = "2021"
keywords = ["crc", "crc64", "simd", "checksum"]
repository = "https://github.com/tikv/crc64fast"
description = "SIMD accelerated CRC64 calculation"
exclude = ["build_table.rs"]
readme = "README.md"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
# Note: Rust 1.70 upgraded its LLVM version to 16 (in particular https://reviews.llvm.org/D131047).
# Before that, the compiler was unwilling to generate the PMULL2 instruction on AArch64.
rust-version = "1.70.0"

[dependencies]

[dev-dependencies]
crc = "1"
crc = "3"
proptest = "1"
criterion = "0.3"
criterion = "0.5"
rand = "0.8"

[features]
pmull = []
pmull = [] # deprecated, no longer has any effect.
fake-simd = []

[[bench]]
Expand Down
15 changes: 3 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ crc64fast
[![Latest Version](https://img.shields.io/crates/v/crc64fast.svg)](https://crates.io/crates/crc64fast)
[![Documentation](https://img.shields.io/badge/api-rustdoc-blue.svg)](https://docs.rs/crc64fast)

SIMD-accelerated CRC-64-ECMA computation
SIMD-accelerated CRC-64/XZ (a.k.a. CRC-64/GO-ECMA) computation
(similar to [`crc32fast`](https://crates.io/crates/crc32fast)).

## Usage
Expand All @@ -32,20 +32,11 @@ be chosen based on CPU feature at runtime.

| Algorithm | Throughput (x86_64) | Throughput (aarch64) |
|:------------------|--------------------:|---------------------:|
| [crc 1.8.1] | 0.5 GiB/s | 0.3 GiB/s |
| [crc 3.0.1] | 0.5 GiB/s | 0.3 GiB/s |
| crc64fast (table) | 2.3 GiB/s | 1.8 GiB/s |
| crc64fast (simd) | 28.2 GiB/s | 20.0 GiB/s |

[crc 1.8.1]: https://crates.io/crates/crc

> **Note:** Since Rust has not stabilized SIMD support on AArch64, you need a
> nightly compiler and enable the `pmull` feature to use the SIMD-based
> implementation:
>
> ```toml
> [dependencies]
> crc64fast = { version = "1.0", features = ["pmull"] }
> ```
[crc 3.0.1]: https://docs.rs/crc/3.0.1/crc/index.html

## TODO

Expand Down
14 changes: 8 additions & 6 deletions benches/benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
// Copyright 2019 TiKV Project Authors. Licensed under MIT or Apache-2.0.

use crc::crc64::{self, Hasher64};
use crc::{Crc, CRC_64_XZ};
use criterion::*;
use rand::{thread_rng, RngCore};

const CRC: Crc<u64> = Crc::<u64>::new(&CRC_64_XZ);

fn bench_crc(c: &mut Criterion) {
let mut group = c.benchmark_group("CRC64");
let mut rng = thread_rng();
Expand All @@ -15,11 +17,11 @@ fn bench_crc(c: &mut Criterion) {
group.throughput(Throughput::Bytes(3 << size));
group.bench_with_input(BenchmarkId::new("crc::crc64", size), &buf, |b, buf| {
b.iter(|| {
let mut digest = crc64::Digest::new(crc64::ECMA);
digest.write(&buf[..(1 << size)]);
digest.write(&buf[(1 << size)..(2 << size)]);
digest.write(&buf[(2 << size)..]);
digest.sum64()
let mut digest = CRC.digest();
digest.update(&buf[..(1 << size)]);
digest.update(&buf[(1 << size)..(2 << size)]);
digest.update(&buf[(2 << size)..]);
digest.finalize()
})
});
group.bench_with_input(BenchmarkId::new("crc64fast::simd", size), &buf, |b, buf| {
Expand Down
11 changes: 4 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,6 @@
//! assert_eq!(checksum, 0x8483_c0fa_3260_7d61);
//! ```
#![cfg_attr(
feature = "pmull",
feature(stdsimd, platform_intrinsics, aarch64_target_feature, llvm_asm)
)]

mod pclmulqdq;
mod table;

Expand Down Expand Up @@ -75,10 +70,12 @@ impl Default for Digest {
#[cfg(test)]
mod tests {
use super::Digest;
use crc::crc64::checksum_ecma;
use crc::{Crc, CRC_64_XZ};
use proptest::collection::size_range;
use proptest::prelude::*;

const CRC: Crc<u64> = Crc::<u64>::new(&CRC_64_XZ);

#[test]
fn test_standard_vectors() {
static CASES: &[(&[u8], u64)] = &[
Expand Down Expand Up @@ -120,7 +117,7 @@ mod tests {
fn equivalent_to_crc(bytes in any_buffer()) {
let mut hasher = Digest::new();
hasher.write(&bytes);
prop_assert_eq!(hasher.sum64(), checksum_ecma(&bytes));
prop_assert_eq!(hasher.sum64(), CRC.checksum(&bytes));
}

#[test]
Expand Down
70 changes: 9 additions & 61 deletions src/pclmulqdq/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

//! AArch64 implementation of the PCLMULQDQ-based CRC calculation.
use std::arch::aarch64::*;
use std::arch::{aarch64::*, is_aarch64_feature_detected};
use std::mem::transmute;
use std::ops::BitXor;

#[repr(transparent)]
#[derive(Copy, Clone, Debug)]
pub struct Simd(uint8x16_t);

#[allow(non_camel_case_types)]
type poly64_t = u64;

impl Simd {
#[inline]
#[target_feature(enable = "neon")]
Expand Down Expand Up @@ -52,34 +55,12 @@ impl super::SimdExt for Simd {
}

#[inline]
#[target_feature(enable = "crypto", enable = "neon")]
#[target_feature(enable = "aes", enable = "neon")]
unsafe fn fold_16(self, coeff: Self) -> Self {
let h: Self;
let l: Self;

// FIXME: When used as a single function, this branch is equivalent to
// the ASM below. However, when fold_16 is called inside a loop, for
// some reason LLVM replaces the PMULL2 call with a plain PMULL, which
// leads unnecessary FMOV calls and slows down the throughput from
// 20 GiB/s to 14 GiB/s. This bug does not exist with GCC. Delete the
// ASM code once this misoptimization is fixed.
#[cfg(slow)]
{
let [x0, x1] = self.into_poly64s();
let [c0, c1] = coeff.into_poly64s();
h = Self::from_mul(c0, x0);
l = Self::from_mul(c1, x1);
}
#[cfg(not(slow))]
{
llvm_asm!(
"pmull $0.1q, $2.1d, $3.1d
pmull2 $1.1q, $2.2d, $3.2d"
: "=&w"(l), "=w"(h)
: "w"(self), "w"(coeff)
);
}

let [x0, x1] = self.into_poly64s();
let [c0, c1] = coeff.into_poly64s();
let h = Self::from_mul(c0, x0);
let l = Self::from_mul(c1, x1);
h ^ l
}

Expand Down Expand Up @@ -110,36 +91,3 @@ impl BitXor for Simd {
unsafe { Self(veorq_u8(self.0, other.0)) }
}
}

//------------------------------------------------------------------------------
//
// Below are intrinsics not yet included in Rust.

extern "platform-intrinsic" {
fn simd_extract<T, U>(x: T, idx: u32) -> U;
}

#[inline]
#[target_feature(enable = "neon")]
unsafe fn vgetq_lane_p64(a: poly64x2_t, idx: u32) -> poly64_t {
let elem: i64 = simd_extract(a, idx);
transmute(elem)
}

#[inline]
#[target_feature(enable = "neon")]
unsafe fn vreinterpretq_u8_p128(a: poly128_t) -> uint8x16_t {
transmute(a)
}

#[inline]
#[target_feature(enable = "neon")]
unsafe fn vreinterpretq_p64_u8(a: uint8x16_t) -> poly64x2_t {
transmute(a)
}

#[inline]
#[target_feature(enable = "neon")]
unsafe fn vcreate_u8(value: u64) -> uint8x8_t {
transmute(value)
}
6 changes: 3 additions & 3 deletions src/pclmulqdq/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#[cfg(not(feature = "fake-simd"))]
#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), path = "x86.rs")]
#[cfg_attr(all(target_arch = "aarch64", feature = "pmull"), path = "aarch64.rs")]
#[cfg_attr(target_arch = "aarch64", path = "aarch64.rs")]
mod arch;

#[cfg(feature = "fake-simd")]
Expand Down Expand Up @@ -93,8 +93,8 @@ fn update(mut state: u64, bytes: &[u8]) -> u64 {
target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")
)]
#[cfg_attr(
all(target_arch = "aarch64", feature = "pmull"),
target_feature(enable = "crypto", enable = "neon")
target_arch = "aarch64",
target_feature(enable = "aes", enable = "neon")
)]
unsafe fn update_simd(state: u64, first: &[Simd; 8], rest: &[[Simd; 8]]) -> u64 {
// receive the initial 128 bytes of data
Expand Down

0 comments on commit b0dc2ad

Please sign in to comment.