diff --git a/.github/workflows/debug.yaml b/.github/workflows/debug.yaml index 19372a0d..9ebc6ebf 100644 --- a/.github/workflows/debug.yaml +++ b/.github/workflows/debug.yaml @@ -10,11 +10,12 @@ jobs: profile: [ { rust: "1.65", features: "" }, { rust: "1.65", features: "--features=yyjson" }, - { rust: "nightly-2024-01-17", features: "--features=yyjson,unstable-simd" }, + { rust: "nightly-2024-02-01", features: "--features=yyjson,unstable-simd" }, ] python: [ - { version: '3.12', abi: 'cp312-cp312' }, - { version: '3.8', abi: 'cp38-cp38' }, + { version: '3.13' }, + { version: '3.12' }, + { version: '3.8' }, ] env: CC: "gcc" @@ -26,7 +27,9 @@ jobs: - uses: actions/setup-python@v5 with: + allow-prereleases: true python-version: '${{ matrix.python.version }}' + - run: python -m pip install --user --upgrade pip "maturin>=1,<2" wheel - uses: actions/checkout@v4 @@ -43,9 +46,15 @@ jobs: - run: python -m pip install --user -r test/requirements.txt -r integration/requirements.txt - run: pytest -s -rxX -v test + timeout-minutes: 4 env: PYTHONMALLOC: "debug" - run: ./integration/run thread + timeout-minutes: 2 + - run: ./integration/run http + timeout-minutes: 2 + - run: ./integration/run init + timeout-minutes: 2 diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 744bef39..2f84f0c2 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -71,8 +71,8 @@ jobs: options: --user 0 steps: - run: yum install -y clang lld - - run: curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain nightly-2024-01-17 --profile minimal -y - - run: rustup component add rust-src --toolchain nightly-2024-01-17-x86_64-unknown-linux-gnu + - run: curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain nightly-2024-02-01 --profile minimal -y + - run: rustup component add rust-src --toolchain nightly-2024-02-01-x86_64-unknown-linux-gnu - uses: actions/checkout@v4 - name: build-std @@ -143,11 +143,11 @@ jobs: RUSTFLAGS: "-C 
target-feature=-crt-static" CARGO_UNSTABLE_SPARSE_REGISTRY: "true" with: - rust-toolchain: nightly-2024-01-17 + rust-toolchain: nightly-2024-02-01 rustup-components: rust-src target: ${{ matrix.platform.target }} manylinux: musllinux_1_1 - args: --release --strip --out=dist --features=no-panic,yyjson -i python${{ matrix.python.version }} + args: --release --strip --out=dist --features=encoding_rs/simd-accel,no-panic,unstable-simd,yyjson -i python${{ matrix.python.version }} - name: Set up QEMU if: matrix.platform.arch != 'x86_64' @@ -196,28 +196,28 @@ jobs: arch: 'aarch64', cflags: '-O2 -flto', features: 'encoding_rs/simd-accel,no-panic,unstable-simd,yyjson', - rustflags: '-Z mir-opt-level=4 -D warnings', + rustflags: '-D warnings', target: 'aarch64-unknown-linux-gnu', }, { arch: 'armv7', cflags: '-Os -flto -fstrict-aliasing', features: 'no-panic,yyjson', # no SIMD - rustflags: '-C opt-level=s -Z mir-opt-level=4 -D warnings', + rustflags: '-C opt-level=s -D warnings', target: 'armv7-unknown-linux-gnueabihf', }, { arch: 'ppc64le', cflags: '-O2 -flto', features: 'no-panic,unstable-simd,yyjson', - rustflags: '-Z mir-opt-level=4 -D warnings', + rustflags: '-D warnings', target: 'powerpc64le-unknown-linux-gnu', }, { arch: 's390x', cflags: '-O2 -flto -march=z10', features: 'no-panic,unstable-simd,yyjson', - rustflags: '-Z mir-opt-level=4 -C target-cpu=z10 -D warnings', + rustflags: '-C target-cpu=z10 -D warnings', target: 's390x-unknown-linux-gnu', }, ] @@ -238,7 +238,7 @@ jobs: RUSTFLAGS: "${{ matrix.target.rustflags }}" with: target: ${{ matrix.target.target }} - rust-toolchain: nightly-2024-01-17 + rust-toolchain: nightly-2024-02-01 rustup-components: rust-src manylinux: auto args: --release --strip --out=dist --features=${{ matrix.target.features }} -i python${{ matrix.python.version }} diff --git a/.gitignore b/.gitignore index fdf3c2de..40769adc 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ /target /vendor __pycache__ +corpus +data/yelp diff --git 
a/Cargo.lock b/Cargo.lock index 838bfcf7..10865605 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,9 +77,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" dependencies = [ "num-traits", ] @@ -161,15 +161,15 @@ checksum = "9028f49264629065d057f340a86acb84867925865f73bbf8d47b4d149a7e88b8" [[package]] name = "libc" -version = "0.2.152" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "no-panic" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc56831a2ae584dc43a8b0b33f496e71fb4d43cf8c1c0a3fd932e6340bea1f81" +checksum = "c711522eedec2a96bb3672ad60a03561cb28934ab1e9b97d2ecb58e07c79ef52" dependencies = [ "proc-macro2", "quote", @@ -219,9 +219,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -272,18 +272,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.195" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" +checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" dependencies = [ "serde_derive", ] [[package]] name = 
"serde_derive" -version = "1.0.195" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" +checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" dependencies = [ "proc-macro2", "quote", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.111" +version = "1.0.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "176e46fa42316f18edd598015a5166857fc835ec732f5215eac6b7bdbf0a84f4" +checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" dependencies = [ "itoa", "ryu", @@ -309,9 +309,9 @@ checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a" [[package]] name = "smallvec" -version = "1.12.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2593d31f82ead8df961d8bd23a64c2ccf2eb5dd34b0a34bfb4dd54011c72009e" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "static_assertions" diff --git a/Cargo.toml b/Cargo.toml index c2cf4578..08fff9e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,13 +14,13 @@ keywords = ["fast", "json", "dataclass", "dataclasses", "datetime", "rfc", "8259 include = [ "Cargo.toml", "CHANGELOG.md", - "data/*", + "data", "include", "LICENSE-APACHE", "LICENSE-MIT", "pyproject.toml", "README.md", - "src/*", + "src", "test/*.py", "test/requirements.txt", ] diff --git a/README.md b/README.md index ff72e3e5..685d6da8 100644 --- a/README.md +++ b/README.md @@ -1196,7 +1196,7 @@ It benefits from also having a C build environment to compile a faster deserialization backend. See this project's `manylinux_2_28` builds for an example using clang and LTO. -The project's own CI tests against `nightly-2024-01-17` and stable 1.65. It +The project's own CI tests against `nightly-2024-02-01` and stable 1.65. 
It is prudent to pin the nightly version because that channel can introduce breaking changes. diff --git a/ci/azure-macos.yml b/ci/azure-macos.yml index f490383f..c395c1b8 100644 --- a/ci/azure-macos.yml +++ b/ci/azure-macos.yml @@ -30,7 +30,7 @@ steps: LDFLAGS: "-O2 -fstrict-aliasing -flto=full -Wl,--as-needed" CFLAGS_x86_64_apple_darwin: "-O2 -fstrict-aliasing -flto=full -march=x86-64-v2 -mtune=generic" CFLAGS_aarch64_apple_darwin: "-O2 -fstrict-aliasing -flto=full -mcpu=apple-m1 -mtune=generic" - RUSTFLAGS: "-C lto=fat -Z mir-opt-level=4 -Z virtual-function-elimination -D warnings" + RUSTFLAGS: "-C lto=fat -Z virtual-function-elimination -D warnings" CARGO_UNSTABLE_SPARSE_REGISTRY: "true" displayName: build universal2 diff --git a/ci/azure-pipelines.yml b/ci/azure-pipelines.yml index c47945bd..a20411fd 100644 --- a/ci/azure-pipelines.yml +++ b/ci/azure-pipelines.yml @@ -1,5 +1,5 @@ variables: - toolchain: nightly-2024-01-17 + toolchain: nightly-2024-02-01 jobs: diff --git a/integration/run b/integration/run index 477f7076..b3abf62a 100755 --- a/integration/run +++ b/integration/run @@ -29,3 +29,7 @@ fi if [[ $to_run == *"init"* ]]; then "${_dir}"/init fi + +if [[ $to_run == *"yelp"* ]]; then + "${_dir}"/yelp.py +fi diff --git a/integration/yelp.py b/integration/yelp.py new file mode 100755 index 00000000..1c3f58a0 --- /dev/null +++ b/integration/yelp.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +import sys +from pathlib import Path + +import orjson + +# https://www.yelp.com/dataset, ~8.6GiB +FILES = { + "yelp_academic_dataset_business.json", + "yelp_academic_dataset_checkin.json", + "yelp_academic_dataset_review.json", + "yelp_academic_dataset_tip.json", + "yelp_academic_dataset_user.json", +} + +for filename in FILES: + message = f"Processing {filename} ..." 
+ sys.stdout.buffer.write(f"{message}".encode("ascii")) + count = 0 + data = Path(f"data/yelp/{filename}").read_bytes() + for line in data.split(b"\n"): + if not line: + continue + count += 1 + deserialized = orjson.loads(line) + assert orjson.loads(orjson.dumps(deserialized)) == deserialized + if count % 100 == 0: + sys.stdout.buffer.write(f"\r{message} {count} entries".encode("ascii")) + + sys.stdout.buffer.write(f"\r{message} {count} entries\n".encode("ascii")) diff --git a/script/lint index 15eed6ec..c52fe14b 100755 --- a/script/lint +++ b/script/lint @@ -4,7 +4,7 @@ set -eou pipefail to_lint="./bench/*.py ./pysrc/orjson/__init__.pyi ./test/*.py script/pydataclass script/pymem script/pysort script/pynumpy script/pynonstr script/pycorrectness script/graph integration/init -integration/wsgi.py integration/typestubs.py integration/thread" +integration/wsgi.py integration/typestubs.py integration/thread integration/yelp.py" ruff ${to_lint} --fix ruff format ${to_lint} diff --git a/src/util.rs b/src/util.rs index a51dc633..1e21b144 100644 --- a/src/util.rs +++ b/src/util.rs @@ -238,12 +238,20 @@ macro_rules! use_immortal { }; } +#[cfg(not(Py_3_13))] macro_rules! pydict_next { ($obj1:expr, $obj2:expr, $obj3:expr, $obj4:expr) => { unsafe { pyo3_ffi::_PyDict_Next($obj1, $obj2, $obj3, $obj4, std::ptr::null_mut()) } }; } +#[cfg(Py_3_13)] +macro_rules! pydict_next { + ($obj1:expr, $obj2:expr, $obj3:expr, $obj4:expr) => { + unsafe { pyo3_ffi::PyDict_Next($obj1, $obj2, $obj3, $obj4) } + }; +} + macro_rules! 
reserve_minimum { ($writer:expr) => { $writer.reserve(64); diff --git a/test/test_dict.py b/test/test_dict.py index 02908f6b..36bbef02 100644 --- a/test/test_dict.py +++ b/test/test_dict.py @@ -1,11 +1,103 @@ # SPDX-License-Identifier: (Apache-2.0 OR MIT) +import pytest + import orjson class TestDict: + def test_dict(self): + """ + dict + """ + obj = {"key": "value"} + ref = '{"key":"value"}' + assert orjson.dumps(obj) == ref.encode("utf-8") + assert orjson.loads(ref) == obj + + def test_dict_duplicate_loads(self): + assert orjson.loads(b'{"1":true,"1":false}') == {"1": False} + + def test_dict_empty(self): + obj = [{"key": [{}] * 4096}] * 4096 # type:ignore + assert orjson.loads(orjson.dumps(obj)) == obj + + def test_dict_large_dict(self): + """ + dict with >512 keys + """ + obj = {"key_%s" % idx: [{}, {"a": [{}, {}, {}]}, {}] for idx in range(513)} + assert len(obj) == 513 + assert orjson.loads(orjson.dumps(obj)) == obj + + def test_dict_large_4096(self): + """ + dict with >4096 keys + """ + obj = {"key_%s" % idx: "value_%s" % idx for idx in range(4097)} + assert len(obj) == 4097 + assert orjson.loads(orjson.dumps(obj)) == obj + + def test_dict_large_65536(self): + """ + dict with >65536 keys + """ + obj = {"key_%s" % idx: "value_%s" % idx for idx in range(65537)} + assert len(obj) == 65537 + assert orjson.loads(orjson.dumps(obj)) == obj + + def test_dict_large_keys(self): + """ + dict with keys too large to cache + """ + obj = { + "keeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeey": "value" + } + ref = '{"keeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeey":"value"}' + assert orjson.dumps(obj) == ref.encode("utf-8") + assert orjson.loads(ref) == obj + + def test_dict_unicode(self): + """ + dict unicode keys + """ + obj = {"🐈": "value"} + ref = b'{"\xf0\x9f\x90\x88":"value"}' + assert orjson.dumps(obj) == ref + assert orjson.loads(ref) == obj + assert orjson.loads(ref)["🐈"] == "value" + + def test_dict_invalid_key_dumps(self): + """ 
+ dict invalid key dumps() + """ + with pytest.raises(orjson.JSONEncodeError): + orjson.dumps({1: "value"}) + with pytest.raises(orjson.JSONEncodeError): + orjson.dumps({b"key": "value"}) + + def test_dict_invalid_key_loads(self): + """ + dict invalid key loads() + """ + with pytest.raises(orjson.JSONDecodeError): + orjson.loads('{1:"value"}') + with pytest.raises(orjson.JSONDecodeError): + orjson.loads('{{"a":true}:true}') + + def test_dict_similar_keys(self): + """ + loads() similar keys + + This was a regression in 3.4.2 caused by using + the implementation in wy instead of wyhash. + """ + assert orjson.loads( + '{"cf_status_firefox67": "---", "cf_status_firefox57": "verified"}' + ) == {"cf_status_firefox57": "verified", "cf_status_firefox67": "---"} + def test_dict_pop_replace_first(self): - """Test pop and replace a first key in a dict with other keys.""" + "Test pop and replace a first key in a dict with other keys." data = {"id": "any", "other": "any"} data.pop("id") assert orjson.dumps(data) == b'{"other":"any"}' @@ -13,7 +105,7 @@ def test_dict_pop_replace_first(self): assert orjson.dumps(data) == b'{"other":"any","id":"new"}' def test_dict_pop_replace_last(self): - """Test pop and replace a last key in a dict with other keys.""" + "Test pop and replace a last key in a dict with other keys." data = {"other": "any", "id": "any"} data.pop("id") assert orjson.dumps(data) == b'{"other":"any"}' @@ -21,7 +113,7 @@ def test_dict_pop_replace_last(self): assert orjson.dumps(data) == b'{"other":"any","id":"new"}' def test_dict_pop(self): - """Test pop and replace a key in a dict with no other keys.""" + "Test pop and replace a key in a dict with no other keys." 
data = {"id": "any"} data.pop("id") assert orjson.dumps(data) == b"{}" @@ -29,21 +121,30 @@ def test_dict_pop(self): assert orjson.dumps(data) == b'{"id":"new"}' def test_in_place(self): - """Mutate dict in-place""" + "Mutate dict in-place" data = {"id": "any", "static": "msg"} data["id"] = "new" assert orjson.dumps(data) == b'{"id":"new","static":"msg"}' def test_dict_0xff(self): - """dk_size <= 0xff""" + "dk_size <= 0xff" data = {str(idx): idx for idx in range(0, 0xFF)} data.pop("112") data["112"] = 1 data["113"] = 2 assert orjson.loads(orjson.dumps(data)) == data + def test_dict_0xff_repeated(self): + "dk_size <= 0xff repeated" + for _ in range(0, 100): + data = {str(idx): idx for idx in range(0, 0xFF)} + data.pop("112") + data["112"] = 1 + data["113"] = 2 + assert orjson.loads(orjson.dumps(data)) == data + def test_dict_0xffff(self): - """dk_size <= 0xffff""" + "dk_size <= 0xffff" data = {str(idx): idx for idx in range(0, 0xFFFF)} data.pop("112") data["112"] = 1 diff --git a/test/test_memory.py b/test/test_memory.py index 9aed19c2..c4ccc829 100644 --- a/test/test_memory.py +++ b/test/test_memory.py @@ -61,7 +61,7 @@ class Object: for i in range(100000, 101000) ] -MAX_INCREASE = 2097152 # 2MiB +MAX_INCREASE = 4194304 # 4MiB class Unsupported: diff --git a/test/test_type.py b/test/test_type.py index 35ca8aeb..5eb9a954 100644 --- a/test/test_type.py +++ b/test/test_type.py @@ -49,6 +49,10 @@ def test_str_long(self): for obj in ("aaaa" * 1024, "ΓΌΓ½ΓΎΓΏ" * 1024, "ε₯½" * 1024, "οΏ½" * 1024): assert orjson.loads(orjson.dumps(obj)) == obj + def test_str_2mib(self): + ref = '🐈🐈🐈🐈🐈"ΓΌΓ½a0s9999🐈🐈🐈🐈🐈9\0999\\9999' * 1024 * 50 + assert orjson.loads(orjson.dumps(ref)) == ref + def test_str_very_long(self): """ str long enough to trigger overflow in bytecount @@ -512,79 +516,9 @@ def test_tuple(self): assert orjson.dumps(obj) == ref.encode("utf-8") assert orjson.loads(ref) == list(obj) - def test_dict(self): - """ - dict - """ - obj = {"key": "value"} - ref = 
'{"key":"value"}' - assert orjson.dumps(obj) == ref.encode("utf-8") - assert orjson.loads(ref) == obj - - def test_dict_duplicate_loads(self): - assert orjson.loads(b'{"1":true,"1":false}') == {"1": False} - - def test_dict_large(self): - """ - dict with >512 keys - """ - obj = {"key_%s" % idx: "value" for idx in range(513)} - assert len(obj) == 513 - assert orjson.loads(orjson.dumps(obj)) == obj - - def test_dict_large_keys(self): - """ - dict with keys too large to cache - """ - obj = { - "keeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeey": "value" - } - ref = '{"keeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeey":"value"}' - assert orjson.dumps(obj) == ref.encode("utf-8") - assert orjson.loads(ref) == obj - - def test_dict_unicode(self): - """ - dict unicode keys - """ - obj = {"🐈": "value"} - ref = b'{"\xf0\x9f\x90\x88":"value"}' - assert orjson.dumps(obj) == ref - assert orjson.loads(ref) == obj - assert orjson.loads(ref)["🐈"] == "value" - - def test_dict_invalid_key_dumps(self): - """ - dict invalid key dumps() - """ - with pytest.raises(orjson.JSONEncodeError): - orjson.dumps({1: "value"}) - with pytest.raises(orjson.JSONEncodeError): - orjson.dumps({b"key": "value"}) - - def test_dict_invalid_key_loads(self): - """ - dict invalid key loads() - """ - with pytest.raises(orjson.JSONDecodeError): - orjson.loads('{1:"value"}') - with pytest.raises(orjson.JSONDecodeError): - orjson.loads('{{"a":true}:true}') - def test_object(self): """ object() dumps() """ with pytest.raises(orjson.JSONEncodeError): orjson.dumps(object()) - - def test_dict_similar_keys(self): - """ - loads() similar keys - - This was a regression in 3.4.2 caused by using - the implementation in wy instead of wyhash. - """ - assert orjson.loads( - '{"cf_status_firefox67": "---", "cf_status_firefox57": "verified"}' - ) == {"cf_status_firefox57": "verified", "cf_status_firefox67": "---"}