From 000c68ffb20f3423ad79184e7ac408c224503fd9 Mon Sep 17 00:00:00 2001
From: Aimee Barciauskas <aimee@developmentseed.org>
Date: Fri, 1 Nov 2024 15:25:13 -0700
Subject: [PATCH] Passing compression test

---
 conftest.py                                   | 16 ++++++-----
 .../test_writers/test_icechunk_append.py      | 28 +++++++++----------
 virtualizarr/writers/icechunk.py              |  2 --
 virtualizarr/zarr.py                          |  2 +-
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/conftest.py b/conftest.py
index 9f9da924..7cacb3f0 100644
--- a/conftest.py
+++ b/conftest.py
@@ -37,20 +37,22 @@ def netcdf4_file(tmpdir):
 
 @pytest.fixture
 def compressed_netcdf4_files(tmpdir):
-    ds = xr.tutorial.open_dataset("air_temperature")
+    # chunks={} is required here; without it we hit a decompression error: zlib.error: Error -3 while decompressing data: incorrect header check
+    ds = xr.tutorial.open_dataset("air_temperature", chunks={})
+    ds1 = ds.isel(time=slice(None, 1460))
+    ds2 = ds.isel(time=slice(1460, None))
     # Define compression options for NetCDF
     encoding = {
-        var: dict(compression="gzip", compression_opts=4) for var in ds.data_vars
+        # chunksizes must be set explicitly; otherwise irregular chunk sizes are chosen
+        var: dict(zlib=True, complevel=4, chunksizes=(1460, 25, 53))
+        for var in ds.data_vars
     }
 
-    ds1 = ds.isel(time=slice(None, 1460))
-    ds2 = ds.isel(time=slice(1460, None))
-
     # Save it to disk as netCDF (in temporary directory)
     filepath1 = f"{tmpdir}/air1_compressed.nc"
     filepath2 = f"{tmpdir}/air2_compressed.nc"
-    ds1.to_netcdf(filepath1, engine="h5netcdf", encoding=encoding)
-    ds2.to_netcdf(filepath2, engine="h5netcdf", encoding=encoding)
+    ds1.to_netcdf(filepath1, encoding=encoding, engine="h5netcdf")
+    ds2.to_netcdf(filepath2, encoding=encoding, engine="h5netcdf")
     ds1.close()
     ds2.close()
 
diff --git a/virtualizarr/tests/test_writers/test_icechunk_append.py b/virtualizarr/tests/test_writers/test_icechunk_append.py
index 1a9cd659..07938f1a 100644
--- a/virtualizarr/tests/test_writers/test_icechunk_append.py
+++ b/virtualizarr/tests/test_writers/test_icechunk_append.py
@@ -159,7 +159,7 @@ def test_append_virtual_ref_with_encoding(
             shape=(1460, 25, 53),
             chunk_shape=(1460, 25, 53),
             dims=["time", "lat", "lon"],
-            dtype=np.dtype("int16"),
+            dtype=np.dtype("float64"),
             variable_name="air",
             encoding={"scale_factor": scale_factor},
             base_offset=15419,
@@ -170,7 +170,7 @@ def test_append_virtual_ref_with_encoding(
             shape=(1460, 25, 53),
             chunk_shape=(1460, 25, 53),
             dims=["time", "lat", "lon"],
-            dtype=np.dtype("int16"),
+            dtype=np.dtype("float64"),
             variable_name="air",
             encoding={"scale_factor": scale_factor},
             base_offset=15419,
@@ -201,7 +201,6 @@ def test_append_virtual_ref_with_encoding(
 
 
 ## When appending to a virtual ref with compression, it succeeds
-@pytest.mark.skip(reason="Failing with gzip.BadGzipFile: Not a gzipped file")
 def test_append_with_compression_succeeds(
     icechunk_storage: "StorageConfig", compressed_netcdf4_files: str
 ):
@@ -213,24 +212,25 @@ def test_append_with_compression_succeeds(
     vds1, vds2 = (
         gen_virtual_dataset(
             file_uri=file1,
-            # https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata
-            compressor={"id": "gzip", "level": 4},
-            dtype=np.dtype("int16"),
-            variable_name="air",
             shape=(1460, 25, 53),
             chunk_shape=(1460, 25, 53),
-            base_offset=15419,
-            length=3869000,
+            compressor={"id": "zlib", "level": 4},
+            dims=["time", "lat", "lon"],
+            dtype=np.dtype("float64"),
+            variable_name="air",
+            base_offset=23214,
+            length=3936114,
         ),
         gen_virtual_dataset(
             file_uri=file2,
-            compressor={"id": "gzip", "level": 4},
-            dtype=np.dtype("int16"),
-            variable_name="air",
             shape=(1460, 25, 53),
             chunk_shape=(1460, 25, 53),
-            base_offset=15419,
-            length=3869000,
+            compressor={"id": "zlib", "level": 4},
+            dims=["time", "lat", "lon"],
+            dtype=np.dtype("float64"),
+            variable_name="air",
+            base_offset=23214,
+            length=3938672,
         ),
     )
 
diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py
index 119f3f37..c3ed72b5 100644
--- a/virtualizarr/writers/icechunk.py
+++ b/virtualizarr/writers/icechunk.py
@@ -188,8 +188,6 @@ def write_virtual_variable_to_icechunk(
     zarray = ma.zarray
     mode = store.mode.str
 
-    # Aimee: resize the array if it already exists
-    # TODO: assert chunking and encoding is the same
     dims = var.dims
     append_axis, existing_num_chunks, arr = None, None, None
     if mode == "a" and append_dim in dims:
diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 169c03ac..e339a3f4 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -227,5 +227,5 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict:
         return num_codec
 
     num_codec_copy = num_codec.copy()
-    name = num_codec_copy.pop("id")
+    name = "numcodecs." + num_codec_copy.pop("id")
     return {"name": name, "configuration": num_codec_copy}