From 79d4e8f43f0a3275f97706fe4ac2b6669114beea Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Tue, 11 Jun 2024 12:45:29 -0600
Subject: [PATCH] remove more unsafe

---
 core/src/parquet/read/values.rs | 125 ++++++++++++++++++++++++--------
 1 file changed, 93 insertions(+), 32 deletions(-)

diff --git a/core/src/parquet/read/values.rs b/core/src/parquet/read/values.rs
index 402670fc3..0b2f58388 100644
--- a/core/src/parquet/read/values.rs
+++ b/core/src/parquet/read/values.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use std::{marker::PhantomData, mem, ptr::copy_nonoverlapping};
+use std::{marker::PhantomData, mem};
 
 use arrow::buffer::Buffer;
 use bytes::Buf;
@@ -452,6 +452,20 @@ impl PlainDecoding for Int8Type {
     }
 }
 
+impl PlainDecoding for UInt8Type {
+    fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) {
+        let src_data = &src.data;
+        let dst_slice = dst.value_buffer.as_slice_mut();
+        let dst_offset = dst.num_values;
+        copy_i32_to_u8(&src_data[src.offset..], &mut dst_slice[dst_offset..], num);
+        src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes
+    }
+
+    fn skip(src: &mut PlainDecoderInner, num: usize) {
+        src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes
+    }
+}
+
 impl PlainDecoding for Int16Type {
     fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) {
         let src_data = &src.data;
@@ -466,39 +480,34 @@ impl PlainDecoding for Int16Type {
     }
 }
 
-// Shared implementation for int variants such as Int8 and Int16
-macro_rules! make_int_variant_impl {
-    ($ty: ident, $native_ty: ty, $type_size: expr) => {
-        impl PlainDecoding for $ty {
-            fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) {
-                let src_data = &src.data;
-                let dst_slice = dst.value_buffer.as_slice_mut();
-                let mut dst_offset = dst.num_values * $type_size;
-                for _ in 0..num {
-                    unsafe {
-                        copy_nonoverlapping(
-                            &src_data[src.offset..] as *const [u8] as *const u8
-                                as *const $native_ty,
-                            &mut dst_slice[dst_offset] as *mut u8 as *mut $native_ty,
-                            1,
-                        );
-                    }
-                    src.offset += 4; // Parquet stores Int8/Int16 using 4 bytes
-                    dst_offset += $type_size;
-                }
-            }
+impl PlainDecoding for UInt16Type {
+    fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) {
+        let src_data = &src.data;
+        let dst_slice = dst.value_buffer.as_slice_mut();
+        let dst_offset = dst.num_values * 2;
+        copy_i32_to_u16(&src_data[src.offset..], &mut dst_slice[dst_offset..], num);
+        src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes
+    }
 
-            fn skip(src: &mut PlainDecoderInner, num: usize) {
-                let num_bytes = 4 * num; // Parquet stores Int8/Int16 using 4 bytes
-                src.offset += num_bytes;
-            }
-        }
-    };
+    fn skip(src: &mut PlainDecoderInner, num: usize) {
+        src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes
+    }
+}
+
+impl PlainDecoding for UInt32Type {
+    fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) {
+        let src_data = &src.data;
+        let dst_slice = dst.value_buffer.as_slice_mut();
+        let dst_offset = dst.num_values * 4;
+        copy_i32_to_u32(&src_data[src.offset..], &mut dst_slice[dst_offset..], num);
+        src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes
+    }
+
+    fn skip(src: &mut PlainDecoderInner, num: usize) {
+        src.offset += 4 * num;
+    }
 }
 
-make_int_variant_impl!(UInt8Type, u8, 2);-
-make_int_variant_impl!(UInt16Type, u16, 4);
-make_int_variant_impl!(UInt32Type, u32, 8);
 
 // Shared implementation for variants of Binary type
 macro_rules! make_plain_binary_impl {
@@ -1013,7 +1022,7 @@ fn copy_i32_to_i8(src: &[u8], dst: &mut [u8], num: usize) {
         let i32_value =
             i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]);
 
-        // Downcast to i16, potentially losing data
+        // Downcast to i8, potentially losing data
         let i8_value = i32_value as i8;
         let i8_bytes = i8_value.to_le_bytes();
 
@@ -1021,6 +1030,22 @@ fn copy_i32_to_i8(src: &[u8], dst: &mut [u8], num: usize) {
     }
 }
 
+fn copy_i32_to_u8(src: &[u8], dst: &mut [u8], num: usize) {
+    debug_assert!(src.len() >= num * 4, "Source slice is too small");
+    debug_assert!(dst.len() >= num * 1, "Destination slice is too small");
+
+    for i in 0..num {
+        let i32_value =
+            i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]);
+
+        // Downcast to u8, potentially losing data
+        let u8_value = i32_value as u8;
+        let u8_bytes = u8_value.to_le_bytes();
+
+        dst[i] = u8_bytes[0];
+    }
+}
+
 fn copy_i32_to_i16(src: &[u8], dst: &mut [u8], num: usize) {
     debug_assert!(src.len() >= num * 4, "Source slice is too small");
     debug_assert!(dst.len() >= num * 2, "Destination slice is too small");
@@ -1038,6 +1063,42 @@ fn copy_i32_to_i16(src: &[u8], dst: &mut [u8], num: usize) {
     }
 }
 
+fn copy_i32_to_u16(src: &[u8], dst: &mut [u8], num: usize) {
+    debug_assert!(src.len() >= num * 4, "Source slice is too small");
+    debug_assert!(dst.len() >= num * 2, "Destination slice is too small");
+
+    for i in 0..num {
+        let i32_value =
+            i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]);
+
+        // Downcast to u16, potentially losing data
+        let u16_value = i32_value as u16;
+        let u16_bytes = u16_value.to_le_bytes();
+
+        dst[i * 2] = u16_bytes[0];
+        dst[i * 2 + 1] = u16_bytes[1];
+    }
+}
+
+fn copy_i32_to_u32(src: &[u8], dst: &mut [u8], num: usize) {
+    debug_assert!(src.len() >= num * 4, "Source slice is too small");
+    debug_assert!(dst.len() >= num * 2, "Destination slice is too small");
+
+    for i in 0..num {
+        let i32_value =
+            i32::from_le_bytes([src[i * 4], src[i * 4 + 1], src[i * 4 + 2], src[i * 4 + 3]]);
+
+        // Downcast to u32, potentially losing data
+        let u32_value = i32_value as u32;
+        let u32_bytes = u32_value.to_le_bytes();
+
+        dst[i * 2] = u32_bytes[0];
+        dst[i * 2 + 1] = u32_bytes[1];
+        dst[i * 2 + 2] = u32_bytes[2];
+        dst[i * 2 + 3] = u32_bytes[3];
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;