From 79d4e8f43f0a3275f97706fe4ac2b6669114beea Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 11 Jun 2024 12:45:29 -0600 Subject: [PATCH] remove more unsafe --- core/src/parquet/read/values.rs | 125 ++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 32 deletions(-) diff --git a/core/src/parquet/read/values.rs b/core/src/parquet/read/values.rs index 402670fc3..0b2f58388 100644 --- a/core/src/parquet/read/values.rs +++ b/core/src/parquet/read/values.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::{marker::PhantomData, mem, ptr::copy_nonoverlapping}; +use std::{marker::PhantomData, mem}; use arrow::buffer::Buffer; use bytes::Buf; @@ -452,6 +452,20 @@ impl PlainDecoding for Int8Type { } } +impl PlainDecoding for UInt8Type { + fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) { + let src_data = &src.data; + let dst_slice = dst.value_buffer.as_slice_mut(); + let dst_offset = dst.num_values; + copy_i32_to_u8(&src_data[src.offset..], &mut dst_slice[dst_offset..], num); + src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes + } + + fn skip(src: &mut PlainDecoderInner, num: usize) { + src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes + } +} + impl PlainDecoding for Int16Type { fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) { let src_data = &src.data; @@ -466,39 +480,34 @@ impl PlainDecoding for Int16Type { } } -// Shared implementation for int variants such as Int8 and Int16 -macro_rules! make_int_variant_impl { - ($ty: ident, $native_ty: ty, $type_size: expr) => { - impl PlainDecoding for $ty { - fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) { - let src_data = &src.data; - let dst_slice = dst.value_buffer.as_slice_mut(); - let mut dst_offset = dst.num_values * $type_size; - for _ in 0..num { - unsafe { - copy_nonoverlapping( - &src_data[src.offset..] as *const [u8] as *const u8 - as *const $native_ty, - &mut dst_slice[dst_offset] as *mut u8 as *mut $native_ty, - 1, - ); - } - src.offset += 4; // Parquet stores Int8/Int16 using 4 bytes - dst_offset += $type_size; - } - } +impl PlainDecoding for UInt16Type { + fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) { + let src_data = &src.data; + let dst_slice = dst.value_buffer.as_slice_mut(); + let dst_offset = dst.num_values * 2; + copy_i32_to_u16(&src_data[src.offset..], &mut dst_slice[dst_offset..], num); + src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes + } - fn skip(src: &mut PlainDecoderInner, num: usize) { - let num_bytes = 4 * num; // Parquet stores Int8/Int16 using 4 bytes - src.offset += num_bytes; - } - } - }; + fn skip(src: &mut PlainDecoderInner, num: usize) { + src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes + } +} + +impl PlainDecoding for UInt32Type { + fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) { + let src_data = &src.data; + let dst_slice = dst.value_buffer.as_slice_mut(); + let dst_offset = dst.num_values * 4; + copy_i32_to_u32(&src_data[src.offset..], &mut dst_slice[dst_offset..], num); + src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes + } + + fn skip(src: &mut PlainDecoderInner, num: usize) { + src.offset += 4 * num; + } } -make_int_variant_impl!(UInt8Type, u8, 2);- -make_int_variant_impl!(UInt16Type, u16, 4); -make_int_variant_impl!(UInt32Type, u32, 8); // Shared implementation for variants of Binary type macro_rules! make_plain_binary_impl { @@ -1013,7 +1022,7 @@ fn copy_i32_to_i8(src: &[u8], dst: &mut [u8], num: usize) { let i32_value = i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]); - // Downcast to i16, potentially losing data + // Downcast to i8, potentially losing data let i8_value = i32_value as i8; let i8_bytes = i8_value.to_le_bytes(); @@ -1021,6 +1030,22 @@ fn copy_i32_to_i8(src: &[u8], dst: &mut [u8], num: usize) { } } +fn copy_i32_to_u8(src: &[u8], dst: &mut [u8], num: usize) { + debug_assert!(src.len() >= num * 4, "Source slice is too small"); + debug_assert!(dst.len() >= num * 1, "Destination slice is too small"); + + for i in 0..num { + let i32_value = + i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]); + + // Downcast to u8, potentially losing data + let u8_value = i32_value as u8; + let u8_bytes = u8_value.to_le_bytes(); + + dst[i] = u8_bytes[0]; + } +} + fn copy_i32_to_i16(src: &[u8], dst: &mut [u8], num: usize) { debug_assert!(src.len() >= num * 4, "Source slice is too small"); debug_assert!(dst.len() >= num * 2, "Destination slice is too small"); @@ -1038,6 +1063,42 @@ fn copy_i32_to_i16(src: &[u8], dst: &mut [u8], num: usize) { } } +fn copy_i32_to_u16(src: &[u8], dst: &mut [u8], num: usize) { + debug_assert!(src.len() >= num * 4, "Source slice is too small"); + debug_assert!(dst.len() >= num * 2, "Destination slice is too small"); + + for i in 0..num { + let i32_value = + i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]); + + // Downcast to u16, potentially losing data + let u16_value = i32_value as u16; + let u16_bytes = u16_value.to_le_bytes(); + + dst[i * 2] = u16_bytes[0]; + dst[i * 2 + 1] = u16_bytes[1]; + } +} + +fn copy_i32_to_u32(src: &[u8], dst: &mut [u8], num: usize) { + debug_assert!(src.len() >= num * 4, "Source slice is too small"); + debug_assert!(dst.len() >= num * 2, "Destination slice is too small"); + + for i in 0..num { + let i32_value = + i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]); + + // Downcast to u32, potentially losing data + let u32_value = i32_value as u32; + let u32_bytes = u32_value.to_le_bytes(); + + dst[i * 2] = u32_bytes[0]; + dst[i * 2 + 1] = u32_bytes[1]; + dst[i * 2 + 2] = u32_bytes[2]; + dst[i * 2 + 3] = u32_bytes[3]; + } +} + #[cfg(test)] mod test { use super::*;