From 039bafe4706ce2492b681d4e28a44a643b327ed0 Mon Sep 17 00:00:00 2001 From: Andy Grove <agrove@apache.org> Date: Sun, 9 Jun 2024 10:14:57 -0600 Subject: [PATCH] remove unsafe read_num_bytes_u32 and TODO comments for more unsafe code that needs to be fixed --- core/src/common/bit.rs | 25 ++----------------------- core/src/parquet/mutable_vector.rs | 2 +- core/src/parquet/read/levels.rs | 4 ++-- core/src/parquet/read/values.rs | 1 + 4 files changed, 6 insertions(+), 26 deletions(-) diff --git a/core/src/common/bit.rs b/core/src/common/bit.rs index 2cd1871dd9..0c2bae8358 100644 --- a/core/src/common/bit.rs +++ b/core/src/common/bit.rs @@ -118,18 +118,6 @@ pub fn read_num_bytes_u64(size: usize, src: &[u8]) -> u64 { trailing_bits(v, size * 8) } -/// u32 specific version of read_num_bytes! -/// This is faster than read_num_bytes! because this method avoids buffer copies. -#[inline] -pub fn read_num_bytes_u32(size: usize, src: &[u8]) -> u32 { - debug_assert!(size <= src.len()); - if unlikely(src.len() < 4) { - return read_num_bytes!(u32, size, src); - } - let in_ptr = src as *const [u8] as *const u8 as *const u32; - let v = unsafe { in_ptr.read_unaligned() }; - trailing_bits(v as u64, size * 8) as u32 -} /// Similar to the `read_num_bytes` but read nums from bytes in big-endian order /// This is used to read bytes from Java's OutputStream which writes bytes in big-endian @@ -795,6 +783,7 @@ impl BitReader { } let in_buf = &self.buffer.as_slice()[self.byte_offset..]; + // TODO this is unsafe let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32; while total - i >= 32 { in_ptr = unpack32(in_ptr, dst, num_bits); @@ -836,6 +825,7 @@ impl BitReader { unsafe { let in_buf = &self.buffer.as_slice()[self.byte_offset..]; + // TODO this is unsafe let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32; // FIXME assert!(memory::is_ptr_aligned(in_ptr)); if size_of::<T>() == 4 { @@ -1017,17 +1007,6 @@ mod tests { } } - #[test] - fn test_read_num_bytes_u32() { - let buffer: Vec<u8> = vec![0, 1, 2, 3]; - for size in 0..buffer.len() { - assert_eq!( - read_num_bytes_u32(size, &buffer), - read_num_bytes!(u32, size, &buffer), - ); - } - } - #[test] fn test_ceil() { assert_eq!(ceil(0, 1), 0); diff --git a/core/src/parquet/mutable_vector.rs b/core/src/parquet/mutable_vector.rs index f1428fd396..090f0d3d68 100644 --- a/core/src/parquet/mutable_vector.rs +++ b/core/src/parquet/mutable_vector.rs @@ -154,7 +154,7 @@ impl ParquetMutableVector { // We need to update offset buffer for binary. if Self::is_binary_type(&self.arrow_type) { let mut offset = self.num_values * 4; - let prev_offset_value = bit::read_num_bytes_u32(4, &self.value_buffer[offset..]); + let prev_offset_value = read_num_bytes!(u32, 4, &self.value_buffer[offset..]); offset += 4; (0..n).for_each(|_| { bit::memcpy_value(&prev_offset_value, 4, &mut self.value_buffer[offset..]); diff --git a/core/src/parquet/read/levels.rs b/core/src/parquet/read/levels.rs index d43c286842..885ef0f110 100644 --- a/core/src/parquet/read/levels.rs +++ b/core/src/parquet/read/levels.rs @@ -22,7 +22,7 @@ use parquet::schema::types::ColumnDescPtr; use super::values::Decoder; use crate::{ - common::bit::{self, read_num_bytes_u32, BitReader}, + common::bit::{self, BitReader}, parquet::ParquetMutableVector, unlikely, }; @@ -89,7 +89,7 @@ impl LevelDecoder { 0 } else if self.need_length { let u32_size = mem::size_of::<u32>(); - let data_size = read_num_bytes_u32(u32_size, page_data.as_slice()) as usize; + let data_size = read_num_bytes!(u32, u32_size, page_data.as_slice()) as usize; self.bit_reader = Some(BitReader::new(page_data.slice(u32_size), data_size)); u32_size + data_size } else { diff --git a/core/src/parquet/read/values.rs b/core/src/parquet/read/values.rs index 9d9bbb3c9e..155e6b1265 100644 --- a/core/src/parquet/read/values.rs +++ b/core/src/parquet/read/values.rs @@ -455,6 +455,7 @@ macro_rules! make_int_variant_impl { while num - i >= 32 { unsafe { + // TODO this is unsafe let in_slice = std::slice::from_raw_parts(in_ptr, 32); for n in 0..32 {