From cc56f12f408417e1b6d6cc5fbd1ba0cb8d70400e Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 31 Dec 2024 11:08:22 +0800 Subject: [PATCH] GH-45126: [C++][Parquet] Fix undefined behavior in the FormatStatValue (#45127) ### Rationale for this change FormatStatValue function in the parquet/types.cc employs reinterpret_cast to cast bytes to specific data types. It is undefined behavior when the bytes are unaligned. ### What changes are included in this PR? Use std::memcpy to replace reinterpret_cast. ### Are these changes tested? Pass CI. ### Are there any user-facing changes? No. * GitHub Issue: #45126 Authored-by: Gang Wu Signed-off-by: Gang Wu --- cpp/src/parquet/types.cc | 48 ++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 7b50ed48d06b0..bee75c335afd5 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -95,31 +96,46 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) { const char* bytes = val.data(); switch (parquet_type) { - case Type::BOOLEAN: - result << reinterpret_cast(bytes)[0]; + case Type::BOOLEAN: { + bool value{}; + std::memcpy(&value, bytes, sizeof(bool)); + result << value; break; - case Type::INT32: - result << reinterpret_cast(bytes)[0]; + } + case Type::INT32: { + int32_t value{}; + std::memcpy(&value, bytes, sizeof(int32_t)); + result << value; break; - case Type::INT64: - result << reinterpret_cast(bytes)[0]; + } + case Type::INT64: { + int64_t value{}; + std::memcpy(&value, bytes, sizeof(int64_t)); + result << value; break; - case Type::DOUBLE: - result << reinterpret_cast(bytes)[0]; + } + case Type::DOUBLE: { + double value{}; + std::memcpy(&value, bytes, sizeof(double)); + result << value; break; - case Type::FLOAT: - result << reinterpret_cast(bytes)[0]; + } + case Type::FLOAT: { + float value{}; + std::memcpy(&value, bytes, sizeof(float)); + result << value; break; + } case Type::INT96: { - auto const i32_val = reinterpret_cast(bytes); - result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2]; + std::array values{}; + std::memcpy(values.data(), bytes, 3 * sizeof(int32_t)); + result << values[0] << " " << values[1] << " " << values[2]; break; } - case Type::BYTE_ARRAY: { - return std::string(val); - } + case Type::BYTE_ARRAY: case Type::FIXED_LEN_BYTE_ARRAY: { - return std::string(val); + result << val; + break; } case Type::UNDEFINED: default: