From 039bafe4706ce2492b681d4e28a44a643b327ed0 Mon Sep 17 00:00:00 2001
From: Andy Grove <agrove@apache.org>
Date: Sun, 9 Jun 2024 10:14:57 -0600
Subject: [PATCH] remove unsafe read_num_bytes_u32 and TODO comments for more
 unsafe code that needs to be fixed

---
 core/src/common/bit.rs             | 25 ++-----------------------
 core/src/parquet/mutable_vector.rs |  2 +-
 core/src/parquet/read/levels.rs    |  4 ++--
 core/src/parquet/read/values.rs    |  1 +
 4 files changed, 6 insertions(+), 26 deletions(-)

diff --git a/core/src/common/bit.rs b/core/src/common/bit.rs
index 2cd1871dd9..0c2bae8358 100644
--- a/core/src/common/bit.rs
+++ b/core/src/common/bit.rs
@@ -118,18 +118,6 @@ pub fn read_num_bytes_u64(size: usize, src: &[u8]) -> u64 {
     trailing_bits(v, size * 8)
 }
 
-/// u32 specific version of read_num_bytes!
-/// This is faster than read_num_bytes! because this method avoids buffer copies.
-#[inline]
-pub fn read_num_bytes_u32(size: usize, src: &[u8]) -> u32 {
-    debug_assert!(size <= src.len());
-    if unlikely(src.len() < 4) {
-        return read_num_bytes!(u32, size, src);
-    }
-    let in_ptr = src as *const [u8] as *const u8 as *const u32;
-    let v = unsafe { in_ptr.read_unaligned() };
-    trailing_bits(v as u64, size * 8) as u32
-}
 
 /// Similar to the `read_num_bytes` but read nums from bytes in big-endian order
 /// This is used to read bytes from Java's OutputStream which writes bytes in big-endian
@@ -795,6 +783,7 @@ impl BitReader {
         }
 
         let in_buf = &self.buffer.as_slice()[self.byte_offset..];
+        // TODO this is unsafe
         let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32;
         while total - i >= 32 {
             in_ptr = unpack32(in_ptr, dst, num_bits);
@@ -836,6 +825,7 @@ impl BitReader {
 
         unsafe {
             let in_buf = &self.buffer.as_slice()[self.byte_offset..];
+            // TODO this is unsafe
             let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32;
             // FIXME assert!(memory::is_ptr_aligned(in_ptr));
             if size_of::<T>() == 4 {
@@ -1017,17 +1007,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_read_num_bytes_u32() {
-        let buffer: Vec<u8> = vec![0, 1, 2, 3];
-        for size in 0..buffer.len() {
-            assert_eq!(
-                read_num_bytes_u32(size, &buffer),
-                read_num_bytes!(u32, size, &buffer),
-            );
-        }
-    }
-
     #[test]
     fn test_ceil() {
         assert_eq!(ceil(0, 1), 0);
diff --git a/core/src/parquet/mutable_vector.rs b/core/src/parquet/mutable_vector.rs
index f1428fd396..090f0d3d68 100644
--- a/core/src/parquet/mutable_vector.rs
+++ b/core/src/parquet/mutable_vector.rs
@@ -154,7 +154,7 @@ impl ParquetMutableVector {
         // We need to update offset buffer for binary.
         if Self::is_binary_type(&self.arrow_type) {
             let mut offset = self.num_values * 4;
-            let prev_offset_value = bit::read_num_bytes_u32(4, &self.value_buffer[offset..]);
+            let prev_offset_value = read_num_bytes!(u32, 4, &self.value_buffer[offset..]);
             offset += 4;
             (0..n).for_each(|_| {
                 bit::memcpy_value(&prev_offset_value, 4, &mut self.value_buffer[offset..]);
diff --git a/core/src/parquet/read/levels.rs b/core/src/parquet/read/levels.rs
index d43c286842..885ef0f110 100644
--- a/core/src/parquet/read/levels.rs
+++ b/core/src/parquet/read/levels.rs
@@ -22,7 +22,7 @@ use parquet::schema::types::ColumnDescPtr;
 
 use super::values::Decoder;
 use crate::{
-    common::bit::{self, read_num_bytes_u32, BitReader},
+    common::bit::{self, BitReader},
     parquet::ParquetMutableVector,
     unlikely,
 };
@@ -89,7 +89,7 @@ impl LevelDecoder {
             0
         } else if self.need_length {
             let u32_size = mem::size_of::<u32>();
-            let data_size = read_num_bytes_u32(u32_size, page_data.as_slice()) as usize;
+            let data_size = read_num_bytes!(u32, u32_size, page_data.as_slice()) as usize;
             self.bit_reader = Some(BitReader::new(page_data.slice(u32_size), data_size));
             u32_size + data_size
         } else {
diff --git a/core/src/parquet/read/values.rs b/core/src/parquet/read/values.rs
index 9d9bbb3c9e..155e6b1265 100644
--- a/core/src/parquet/read/values.rs
+++ b/core/src/parquet/read/values.rs
@@ -455,6 +455,7 @@ macro_rules! make_int_variant_impl {
 
                 while num - i >= 32 {
                     unsafe {
+                        // TODO this is unsafe
                         let in_slice = std::slice::from_raw_parts(in_ptr, 32);
 
                         for n in 0..32 {