From 000c68ffb20f3423ad79184e7ac408c224503fd9 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Fri, 1 Nov 2024 15:25:13 -0700 Subject: [PATCH] Passing compression test --- conftest.py | 16 ++++++----- .../test_writers/test_icechunk_append.py | 28 +++++++++---------- virtualizarr/writers/icechunk.py | 2 -- virtualizarr/zarr.py | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/conftest.py b/conftest.py index 9f9da924..7cacb3f0 100644 --- a/conftest.py +++ b/conftest.py @@ -37,20 +37,22 @@ def netcdf4_file(tmpdir): @pytest.fixture def compressed_netcdf4_files(tmpdir): - ds = xr.tutorial.open_dataset("air_temperature") + # without chunks={} we get a compression error: zlib.error: Error -3 while decompressing data: incorrect header check + ds = xr.tutorial.open_dataset("air_temperature", chunks={}) + ds1 = ds.isel(time=slice(None, 1460)) + ds2 = ds.isel(time=slice(1460, None)) # Define compression options for NetCDF encoding = { - var: dict(compression="gzip", compression_opts=4) for var in ds.data_vars + # without encoding the chunksizes, irregular ones are chosen + var: dict(zlib=True, complevel=4, chunksizes=(1460, 25, 53)) + for var in ds.data_vars } - ds1 = ds.isel(time=slice(None, 1460)) - ds2 = ds.isel(time=slice(1460, None)) - # Save it to disk as netCDF (in temporary directory) filepath1 = f"{tmpdir}/air1_compressed.nc" filepath2 = f"{tmpdir}/air2_compressed.nc" - ds1.to_netcdf(filepath1, engine="h5netcdf", encoding=encoding) - ds2.to_netcdf(filepath2, engine="h5netcdf", encoding=encoding) + ds1.to_netcdf(filepath1, encoding=encoding, engine="h5netcdf") + ds2.to_netcdf(filepath2, encoding=encoding, engine="h5netcdf") ds1.close() ds2.close() diff --git a/virtualizarr/tests/test_writers/test_icechunk_append.py b/virtualizarr/tests/test_writers/test_icechunk_append.py index 1a9cd659..07938f1a 100644 --- a/virtualizarr/tests/test_writers/test_icechunk_append.py +++ b/virtualizarr/tests/test_writers/test_icechunk_append.py @@ -159,7 +159,7 @@ def test_append_virtual_ref_with_encoding( shape=(1460, 25, 53), chunk_shape=(1460, 25, 53), dims=["time", "lat", "lon"], - dtype=np.dtype("int16"), + dtype=np.dtype("float64"), variable_name="air", encoding={"scale_factor": scale_factor}, base_offset=15419, @@ -170,7 +170,7 @@ def test_append_virtual_ref_with_encoding( shape=(1460, 25, 53), chunk_shape=(1460, 25, 53), dims=["time", "lat", "lon"], - dtype=np.dtype("int16"), + dtype=np.dtype("float64"), variable_name="air", encoding={"scale_factor": scale_factor}, base_offset=15419, @@ -201,7 +201,6 @@ def test_append_virtual_ref_with_encoding( ## When appending to a virtual ref with compression, it succeeds -@pytest.mark.skip(reason="Failing with gzip.BadGzipFile: Not a gzipped file") def test_append_with_compression_succeeds( icechunk_storage: "StorageConfig", compressed_netcdf4_files: str ): @@ -213,24 +212,25 @@ def test_append_with_compression_succeeds( vds1, vds2 = ( gen_virtual_dataset( file_uri=file1, - # https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata - compressor={"id": "gzip", "level": 4}, - dtype=np.dtype("int16"), - variable_name="air", shape=(1460, 25, 53), chunk_shape=(1460, 25, 53), - base_offset=15419, - length=3869000, + compressor={"id": "zlib", "level": 4}, + dims=["time", "lat", "lon"], + dtype=np.dtype("float64"), + variable_name="air", + base_offset=23214, + length=3936114, ), gen_virtual_dataset( file_uri=file2, - compressor={"id": "gzip", "level": 4}, - dtype=np.dtype("int16"), - variable_name="air", shape=(1460, 25, 53), chunk_shape=(1460, 25, 53), - base_offset=15419, - length=3869000, + compressor={"id": "zlib", "level": 4}, + dims=["time", "lat", "lon"], + dtype=np.dtype("float64"), + variable_name="air", + base_offset=23214, + length=3938672, ), ) diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 119f3f37..c3ed72b5 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -188,8 +188,6 @@ def write_virtual_variable_to_icechunk( zarray = ma.zarray mode = store.mode.str - # Aimee: resize the array if it already exists - # TODO: assert chunking and encoding is the same dims = var.dims append_axis, existing_num_chunks, arr = None, None, None if mode == "a" and append_dim in dims: diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 169c03ac..e339a3f4 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -227,5 +227,5 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict: return num_codec num_codec_copy = num_codec.copy() - name = num_codec_copy.pop("id") + name = "numcodecs." + num_codec_copy.pop("id") return {"name": name, "configuration": num_codec_copy}