From ee81a96c9ead5975e70208d5ba9f4a7a09e6398c Mon Sep 17 00:00:00 2001 From: ijl Date: Mon, 15 Apr 2024 13:59:17 +0000 Subject: [PATCH] numpy serialization rejects non-native endianness --- README.md | 8 ++++++-- src/serialize/error.rs | 5 +++++ src/serialize/per_type/numpy.rs | 13 ++++++++++++- test/test_numpy.py | 11 +++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1bbcad6f..724bde5e 100644 --- a/README.md +++ b/README.md @@ -864,8 +864,12 @@ b'"2021-01-01T00:00:00+00:00"' If an array is not a contiguous C array, contains an unsupported datatype, or contains a `numpy.datetime64` using an unsupported representation (e.g., picoseconds), orjson falls through to `default`. In `default`, -`obj.tolist()` can be specified. If an array is malformed, which -is not expected, `orjson.JSONEncodeError` is raised. +`obj.tolist()` can be specified. + +If an array is not in the native endianness, e.g., an array of big-endian values +on a little-endian system, `orjson.JSONEncodeError` is raised. + +If an array is malformed, `orjson.JSONEncodeError` is raised. 
This measures serializing 92MiB of JSON from an `numpy.ndarray` with dimensions of `(50000, 100)` and `numpy.float64` values: diff --git a/src/serialize/error.rs b/src/serialize/error.rs index f1acd629..744404be 100644 --- a/src/serialize/error.rs +++ b/src/serialize/error.rs @@ -17,6 +17,7 @@ pub enum SerializeError { DictKeyInvalidType, NumpyMalformed, NumpyNotCContiguous, + NumpyNotNativeEndian, NumpyUnsupportedDatatype, UnsupportedType(NonNull), } @@ -48,6 +49,10 @@ impl std::fmt::Display for SerializeError { f, "numpy array is not C contiguous; use ndarray.tolist() in default" ), + SerializeError::NumpyNotNativeEndian => write!( + f, + "numpy array is not native-endianness" + ), SerializeError::NumpyUnsupportedDatatype => { write!(f, "unsupported datatype in numpy array") } diff --git a/src/serialize/per_type/numpy.rs b/src/serialize/per_type/numpy.rs index dede7996..045f9553 100644 --- a/src/serialize/per_type/numpy.rs +++ b/src/serialize/per_type/numpy.rs @@ -42,6 +42,9 @@ impl<'a> Serialize for NumpySerializer<'a> { Err(PyArrayError::NotContiguous) => { err!(SerializeError::NumpyNotCContiguous) } + Err(PyArrayError::NotNativeEndian) => { + err!(SerializeError::NumpyNotNativeEndian) + } Err(PyArrayError::UnsupportedDataType) => { err!(SerializeError::NumpyUnsupportedDatatype) } @@ -101,6 +104,9 @@ pub struct PyCapsule { // https://docs.scipy.org/doc/numpy/reference/arrays.interface.html#c.__array_struct__ +const NPY_ARRAY_C_CONTIGUOUS: c_int = 0x1; +const NPY_ARRAY_NOTSWAPPED: c_int = 0x200; + #[repr(C)] pub struct PyArrayInterface { pub two: c_int, @@ -154,9 +160,11 @@ impl ItemType { } } } + pub enum PyArrayError { Malformed, NotContiguous, + NotNativeEndian, UnsupportedDataType, } @@ -187,9 +195,12 @@ impl NumpyArray { if unsafe { (*array).two != 2 } { ffi!(Py_DECREF(capsule)); Err(PyArrayError::Malformed) - } else if unsafe { (*array).flags } & 0x1 != 0x1 { + } else if unsafe { (*array).flags } & NPY_ARRAY_C_CONTIGUOUS != NPY_ARRAY_C_CONTIGUOUS { 
ffi!(Py_DECREF(capsule)); Err(PyArrayError::NotContiguous) + } else if unsafe { (*array).flags } & NPY_ARRAY_NOTSWAPPED != NPY_ARRAY_NOTSWAPPED { + ffi!(Py_DECREF(capsule)); + Err(PyArrayError::NotNativeEndian) } else { let num_dimensions = unsafe { (*array).nd as usize }; if num_dimensions == 0 { diff --git a/test/test_numpy.py b/test/test_numpy.py index e997ae76..28eacd5a 100644 --- a/test/test_numpy.py +++ b/test/test_numpy.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: (Apache-2.0 OR MIT) +import sys + import pytest import orjson @@ -864,3 +866,12 @@ def test_numpy_float64(self): [-1.7976931348623157e308, 1.7976931348623157e308], numpy.float64 ) ) + + +@pytest.mark.skipif(numpy is None, reason="numpy is not installed") +class TestNumpyEndianness: + def test_numpy_array_non_native_endianness(self): + wrong_endianness = ">" if sys.byteorder == "little" else "<" + array = numpy.array([0, 1, 0.4, 5.7], dtype=f"{wrong_endianness}f8") + with pytest.raises(orjson.JSONEncodeError): + orjson.dumps(array, option=orjson.OPT_SERIALIZE_NUMPY)