diff --git a/core/src/parquet/util/hash_util.rs b/core/src/parquet/util/hash_util.rs deleted file mode 100644 index 102e01ffc..000000000 --- a/core/src/parquet/util/hash_util.rs +++ /dev/null @@ -1,167 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -fn hash_(data: &[u8], seed: u32) -> u32 { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - unsafe { - if is_x86_feature_detected!("sse4.2") { - crc32_hash(data, seed) - } else { - murmur_hash2_64a(data, seed as u64) as u32 - } - } - - #[cfg(any( - target_arch = "aarch64", - target_arch = "arm", - target_arch = "riscv64", - target_arch = "wasm32" - ))] - unsafe { - murmur_hash2_64a(data, seed as u64) as u32 - } -} - -const MURMUR_PRIME: u64 = 0xc6a4a7935bd1e995; -const MURMUR_R: i32 = 47; - -/// Rust implementation of MurmurHash2, 64-bit version for 64-bit platforms -/// -/// SAFTETY Only safe on platforms which support unaligned loads (like x86_64) -unsafe fn murmur_hash2_64a(data_bytes: &[u8], seed: u64) -> u64 { - let len = data_bytes.len(); - let len_64 = (len / 8) * 8; - let data_bytes_64 = - std::slice::from_raw_parts(&data_bytes[0..len_64] as *const [u8] as *const u64, len / 8); - - let mut h = seed ^ (MURMUR_PRIME.wrapping_mul(data_bytes.len() as u64)); - for v in data_bytes_64 { - let mut k = *v; - k = k.wrapping_mul(MURMUR_PRIME); - k ^= k >> MURMUR_R; - k = k.wrapping_mul(MURMUR_PRIME); - h ^= k; - h = h.wrapping_mul(MURMUR_PRIME); - } - - let data2 = &data_bytes[len_64..]; - - let v = len & 7; - if v == 7 { - h ^= (data2[6] as u64) << 48; - } - if v >= 6 { - h ^= (data2[5] as u64) << 40; - } - if v >= 5 { - h ^= (data2[4] as u64) << 32; - } - if v >= 4 { - h ^= (data2[3] as u64) << 24; - } - if v >= 3 { - h ^= (data2[2] as u64) << 16; - } - if v >= 2 { - h ^= (data2[1] as u64) << 8; - } - if v >= 1 { - h ^= data2[0] as u64; - } - if v > 0 { - h = h.wrapping_mul(MURMUR_PRIME); - } - - h ^= h >> MURMUR_R; - h = h.wrapping_mul(MURMUR_PRIME); - h ^= h >> MURMUR_R; - h -} - -/// CRC32 hash implementation using SSE4 instructions. Borrowed from Impala. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[target_feature(enable = "sse4.2")] -unsafe fn crc32_hash(bytes: &[u8], seed: u32) -> u32 { - #[cfg(target_arch = "x86")] - use std::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use std::arch::x86_64::*; - - let u32_num_bytes = std::mem::size_of::(); - let mut num_bytes = bytes.len(); - let num_words = num_bytes / u32_num_bytes; - num_bytes %= u32_num_bytes; - - let bytes_u32: &[u32] = std::slice::from_raw_parts( - &bytes[0..num_words * u32_num_bytes] as *const [u8] as *const u32, - num_words, - ); - - let mut offset = 0; - let mut hash = seed; - while offset < num_words { - hash = _mm_crc32_u32(hash, bytes_u32[offset]); - offset += 1; - } - - offset = num_words * u32_num_bytes; - while offset < num_bytes { - hash = _mm_crc32_u8(hash, bytes[offset]); - offset += 1; - } - - // The lower half of the CRC hash has poor uniformity, so swap the halves - // for anyone who only uses the first several bits of the hash. - hash = (hash << 16) | (hash >> 16); - hash -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_murmur2_64a() { - unsafe { - let result = murmur_hash2_64a(b"hello", 123); - assert_eq!(result, 2597646618390559622); - - let result = murmur_hash2_64a(b"helloworld", 123); - assert_eq!(result, 4934371746140206573); - - let result = murmur_hash2_64a(b"helloworldparquet", 123); - assert_eq!(result, 2392198230801491746); - } - } - - #[test] - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - fn test_crc32() { - if is_x86_feature_detected!("sse4.2") { - unsafe { - let result = crc32_hash(b"hello", 123); - assert_eq!(result, 2927487359); - - let result = crc32_hash(b"helloworld", 123); - assert_eq!(result, 314229527); - - let result = crc32_hash(b"helloworldparquet", 123); - assert_eq!(result, 667078870); - } - } - } -} diff --git a/core/src/parquet/util/mod.rs b/core/src/parquet/util/mod.rs index 7a37b786d..e71a6e2c7 100644 --- a/core/src/parquet/util/mod.rs +++ b/core/src/parquet/util/mod.rs @@ -16,7 +16,6 @@ // under the License. pub(crate) mod bit_packing; -pub mod hash_util; pub mod jni; pub mod memory;