allow creating references for empty archival datasets (#260)

* raise a more user-friendly error for empty variables * add a test to make sure the error is raised * create a empty manifest array instead * also allow writing empty chunk manifests * try using an annotated variable * explicitly cast instead * switch the order of `cast` arguments * use the main constructor instead * forgotten call of `ChunkManifest.empty` * explanatory comment * rename the reader test * check that concatenating works * check that stacking works with missing chunk definitions * check that broadcasting works * use empty arrays instead of 0-sized if shape given * pass the chunk grid shape for all empty chunk manifests * don't allow empty chunks if no chunk grid shape given * move `ujson` to top-level * replace the manual floor division * release note * fix a couple of changelog entries
zarr-developers · Oct 18, 2024 · 7053bc0 · 7053bc0
1 parent e6407e0
commit 7053bc0
Show file tree

Hide file tree

Showing 7 changed files with 194 additions and 9 deletions.
diff --git a/docs/releases.rst b/docs/releases.rst
@@ -28,14 +28,17 @@ New Features
 - Load scalar variables by default. (:pull:`205`)
   By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
 
+- Support empty files (:pull:`260`)
+  By `Justus Magin <https://github.com/keewis>`_.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
 - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
   By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
 - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass
   `pydantic.BaseModel` (:pull:`210`)
-- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`)
+- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`)
 
 Deprecations
 ~~~~~~~~~~~~
@@ -55,7 +58,7 @@ Bug fixes
 Documentation
 ~~~~~~~~~~~~~
 
-- Adds virtualizarr + coiled serverless example notebook (:pull`223`)
+- Adds virtualizarr + coiled serverless example notebook (:pull:`223`)
   By `Raphael Hagen <https://github.com/norlandrhagen>`_.
 
 

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
@@ -89,7 +89,7 @@ class ChunkManifest:
     _offsets: np.ndarray[Any, np.dtype[np.uint64]]
     _lengths: np.ndarray[Any, np.dtype[np.uint64]]
 
-    def __init__(self, entries: dict) -> None:
+    def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None:
         """
         Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges.
 
@@ -105,13 +105,14 @@ def __init__(self, entries: dict) -> None:
                     "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
                 }
         """
+        if shape is None and not entries:
+            raise ValueError("need a chunk grid shape if no chunks given")
 
         # TODO do some input validation here first?
         validate_chunk_keys(entries.keys())
 
-        # TODO should we actually optionally pass chunk grid shape in,
-        # in case there are not enough chunks to give correct idea of full shape?
-        shape = get_chunk_grid_shape(entries.keys())
+        if shape is None:
+            shape = get_chunk_grid_shape(entries.keys())
 
         # Initializing to empty implies that entries with path='' are treated as missing chunks
         paths = cast(  # `np.empty` apparently is type hinted as if the output could have Any dtype
@@ -386,6 +387,9 @@ def get_ndim_from_key(key: str) -> int:
 
 
 def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]):
+    if not chunk_keys:
+        return
+
     # Check if all keys have the correct form
     for key in chunk_keys:
         if not re.match(_CHUNK_KEY, key):

diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py
@@ -13,7 +13,7 @@
     KerchunkStoreRefs,
 )
 from virtualizarr.utils import _FsspecFSFromFilepath
-from virtualizarr.zarr import ZArray, ZAttrs
+from virtualizarr.zarr import ZArray, ZAttrs, ceildiv
 
 
 # TODO shouldn't this live in backend.py? Because it's not just useful for the kerchunk-specific readers...
@@ -230,6 +230,13 @@ def dataset_from_kerchunk_refs(
     return vds
 
 
+def determine_chunk_grid_shape(zarray):
+    return tuple(
+        ceildiv(length, chunksize)
+        for length, chunksize in zip(zarray.shape, zarray.chunks)
+    )
+
+
 def variable_from_kerchunk_refs(
     refs: KerchunkStoreRefs, var_name: str, virtual_array_class
 ) -> Variable:
@@ -242,6 +249,12 @@ def variable_from_kerchunk_refs(
     if chunk_dict:
         manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict)
         varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest)
+    elif len(zarray.shape) != 0:
+        # empty variables don't have physical chunks, but zarray shows that the variable
+        # is at least 1D
+        shape = determine_chunk_grid_shape(zarray)
+        manifest = ChunkManifest(entries={}, shape=shape)
+        varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest)
     else:
         # This means we encountered a scalar variable of dimension 0,
         # very likely that it actually has no numeric value and its only purpose

diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
@@ -203,6 +203,38 @@ def test_broadcast_any_shape(self, shape, chunks, target_shape):
         for len_arr, len_chunk in zip(broadcasted_marr.shape, broadcasted_chunk_shape):
             assert len_chunk <= len_arr
 
+    @pytest.mark.parametrize(
+        "shape, chunks, grid_shape, target_shape",
+        [
+            ((1,), (1,), (1,), (3,)),
+            ((2,), (1,), (2,), (2,)),
+            ((3,), (2,), (2,), (5, 4, 3)),
+            ((3, 1), (2, 1), (2, 1), (2, 3, 4)),
+        ],
+    )
+    def test_broadcast_empty(self, shape, chunks, grid_shape, target_shape):
+        zarray = ZArray(
+            chunks=chunks,
+            compressor={"id": "zlib", "level": 1},
+            dtype=np.dtype("int32"),
+            fill_value=0.0,
+            filters=None,
+            order="C",
+            shape=shape,
+            zarr_format=2,
+        )
+        manifest = ChunkManifest(entries={}, shape=grid_shape)
+        marr = ManifestArray(zarray, manifest)
+
+        expanded = np.broadcast_to(marr, shape=target_shape)
+        assert expanded.shape == target_shape
+        assert len(expanded.chunks) == expanded.ndim
+        assert all(
+            len_chunk <= len_arr
+            for len_arr, len_chunk in zip(expanded.shape, expanded.chunks)
+        )
+        assert expanded.manifest.dict() == {}
+
 
 # TODO we really need some kind of fixtures to generate useful example data
 # The hard part is having an alternative way to get to the expected result of concatenation
@@ -250,6 +282,44 @@ def test_concat(self):
         assert result.zarray.order == zarray.order
         assert result.zarray.zarr_format == zarray.zarr_format
 
+    def test_concat_empty(self):
+        # both manifest arrays in this example have the same zarray properties
+        zarray = ZArray(
+            chunks=(5, 1, 10),
+            compressor={"id": "zlib", "level": 1},
+            dtype=np.dtype("int32"),
+            fill_value=0.0,
+            filters=None,
+            order="C",
+            shape=(5, 1, 20),
+            zarr_format=2,
+        )
+
+        chunks_dict1 = {}
+        manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 1, 2))
+        marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)
+
+        chunks_dict2 = {
+            "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100},
+            "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100},
+        }
+        manifest2 = ChunkManifest(entries=chunks_dict2)
+        marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)
+
+        result = np.concatenate([marr1, marr2], axis=1)
+
+        assert result.shape == (5, 2, 20)
+        assert result.chunks == (5, 1, 10)
+        assert result.manifest.dict() == {
+            "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
+            "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
+        }
+        assert result.zarray.compressor == zarray.compressor
+        assert result.zarray.filters == zarray.filters
+        assert result.zarray.fill_value == zarray.fill_value
+        assert result.zarray.order == zarray.order
+        assert result.zarray.zarr_format == zarray.zarr_format
+
 
 class TestStack:
     def test_stack(self):
@@ -295,6 +365,44 @@ def test_stack(self):
         assert result.zarray.order == zarray.order
         assert result.zarray.zarr_format == zarray.zarr_format
 
+    def test_stack_empty(self):
+        # both manifest arrays in this example have the same zarray properties
+        zarray = ZArray(
+            chunks=(5, 10),
+            compressor={"id": "zlib", "level": 1},
+            dtype=np.dtype("int32"),
+            fill_value=0.0,
+            filters=None,
+            order="C",
+            shape=(5, 20),
+            zarr_format=2,
+        )
+
+        chunks_dict1 = {}
+        manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 2))
+        marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)
+
+        chunks_dict2 = {
+            "0.0": {"path": "foo.nc", "offset": 300, "length": 100},
+            "0.1": {"path": "foo.nc", "offset": 400, "length": 100},
+        }
+        manifest2 = ChunkManifest(entries=chunks_dict2)
+        marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)
+
+        result = np.stack([marr1, marr2], axis=1)
+
+        assert result.shape == (5, 2, 20)
+        assert result.chunks == (5, 1, 10)
+        assert result.manifest.dict() == {
+            "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
+            "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
+        }
+        assert result.zarray.compressor == zarray.compressor
+        assert result.zarray.filters == zarray.filters
+        assert result.zarray.fill_value == zarray.fill_value
+        assert result.zarray.order == zarray.order
+        assert result.zarray.zarr_format == zarray.zarr_format
+
 
 def test_refuse_combine():
     # TODO test refusing to concatenate arrays that have conflicting shapes / chunk sizes

diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py
@@ -20,6 +20,14 @@ def test_create_manifest(self):
         manifest = ChunkManifest(entries=chunks)
         assert manifest.dict() == chunks
 
+        chunks = {}
+        manifest = ChunkManifest(entries=chunks, shape=(2, 2))
+        assert manifest.dict() == chunks
+
+    def test_create_manifest_empty_missing_shape(self):
+        with pytest.raises(ValueError, match="chunk grid shape if no chunks"):
+            ChunkManifest(entries={})
+
     def test_invalid_chunk_entries(self):
         chunks = {
             "0.0.0": {"path": "s3://bucket/foo.nc"},

diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py
@@ -1,4 +1,5 @@
 import numpy as np
+import ujson
 
 from virtualizarr.manifests import ManifestArray
 from virtualizarr.readers.kerchunk import (
@@ -45,8 +46,6 @@ def test_dataset_from_df_refs():
 
 
 def test_dataset_from_df_refs_with_filters():
-    import ujson
-
     filters = [{"elementsize": 4, "id": "shuffle"}, {"id": "zlib", "level": 4}]
     zarray = {
         "chunks": [2, 3],
@@ -62,3 +61,24 @@ def test_dataset_from_df_refs_with_filters():
     ds = dataset_from_kerchunk_refs(ds_refs)
     da = ds["a"]
     assert da.data.zarray.filters == filters
+
+
+def test_dataset_from_kerchunk_refs_empty_chunk_manifest():
+    zarray = {
+        "chunks": [50, 100],
+        "compressor": None,
+        "dtype": "<i8",
+        "fill_value": 100,
+        "filters": None,
+        "order": "C",
+        "shape": [100, 200],
+        "zarr_format": 2,
+    }
+    refs = gen_ds_refs(zarray=ujson.dumps(zarray))
+    del refs["refs"]["a/0.0"]
+
+    ds = dataset_from_kerchunk_refs(refs)
+    assert "a" in ds.variables
+    assert isinstance(ds["a"].data, ManifestArray)
+    assert ds["a"].sizes == {"x": 100, "y": 200}
+    assert ds["a"].chunksizes == {"x": 50, "y": 100}
diff --git a/virtualizarr/tests/test_writers/test_kerchunk.py b/virtualizarr/tests/test_writers/test_kerchunk.py
@@ -40,6 +40,35 @@ def test_accessor_to_kerchunk_dict(self):
         result_ds_refs = ds.virtualize.to_kerchunk(format="dict")
         assert result_ds_refs == expected_ds_refs
 
+    def test_accessor_to_kerchunk_dict_empty(self):
+        manifest = ChunkManifest(entries={}, shape=(1, 1))
+        arr = ManifestArray(
+            chunkmanifest=manifest,
+            zarray=dict(
+                shape=(2, 3),
+                dtype=np.dtype("<i8"),
+                chunks=(2, 3),
+                compressor=None,
+                filters=None,
+                fill_value=np.nan,
+                order="C",
+            ),
+        )
+        ds = Dataset({"a": (["x", "y"], arr)})
+
+        expected_ds_refs = {
+            "version": 1,
+            "refs": {
+                ".zgroup": '{"zarr_format":2}',
+                ".zattrs": "{}",
+                "a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
+                "a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
+            },
+        }
+
+        result_ds_refs = ds.virtualize.to_kerchunk(format="dict")
+        assert result_ds_refs == expected_ds_refs
+
     def test_accessor_to_kerchunk_json(self, tmp_path):
         import ujson