Skip to content

Commit

Permalink
allow creating references for empty archival datasets (#260)
Browse files Browse the repository at this point in the history
* raise a more user-friendly error for empty variables

* add a test to make sure the error is raised

* create a empty manifest array instead

* also allow writing empty chunk manifests

* try using an annotated variable

* explicitly cast instead

* switch the order of `cast` arguments

* use the main constructor instead

* forgotten call of `ChunkManifest.empty`

* explanatory comment

* rename the reader test

* check that concatenating works

* check that stacking works with missing chunk definitions

* check that broadcasting works

* use empty arrays instead of 0-sized if shape given

* pass the chunk grid shape for all empty chunk manifests

* don't allow empty chunks if no chunk grid shape given

* move `ujson` to top-level

* replace the manual floor division

* release note

* fix a couple of changelog entries
  • Loading branch information
keewis authored Oct 18, 2024
1 parent e6407e0 commit 7053bc0
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 9 deletions.
7 changes: 5 additions & 2 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,17 @@ New Features
- Load scalar variables by default. (:pull:`205`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.

- Support empty files (:pull:`260`)
By `Justus Magin <https://github.com/keewis>`_.

Breaking changes
~~~~~~~~~~~~~~~~

- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass
`pydantic.BaseModel` (:pull:`210`)
- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`)
- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`)

Deprecations
~~~~~~~~~~~~
Expand All @@ -55,7 +58,7 @@ Bug fixes
Documentation
~~~~~~~~~~~~~

- Adds virtualizarr + coiled serverless example notebook (:pull`223`)
- Adds virtualizarr + coiled serverless example notebook (:pull:`223`)
By `Raphael Hagen <https://github.com/norlandrhagen>`_.


Expand Down
12 changes: 8 additions & 4 deletions virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class ChunkManifest:
_offsets: np.ndarray[Any, np.dtype[np.uint64]]
_lengths: np.ndarray[Any, np.dtype[np.uint64]]

def __init__(self, entries: dict) -> None:
def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None:
"""
Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges.
Expand All @@ -105,13 +105,14 @@ def __init__(self, entries: dict) -> None:
"0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
}
"""
if shape is None and not entries:
raise ValueError("need a chunk grid shape if no chunks given")

# TODO do some input validation here first?
validate_chunk_keys(entries.keys())

# TODO should we actually optionally pass chunk grid shape in,
# in case there are not enough chunks to give correct idea of full shape?
shape = get_chunk_grid_shape(entries.keys())
if shape is None:
shape = get_chunk_grid_shape(entries.keys())

# Initializing to empty implies that entries with path='' are treated as missing chunks
paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype
Expand Down Expand Up @@ -386,6 +387,9 @@ def get_ndim_from_key(key: str) -> int:


def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]):
if not chunk_keys:
return

# Check if all keys have the correct form
for key in chunk_keys:
if not re.match(_CHUNK_KEY, key):
Expand Down
15 changes: 14 additions & 1 deletion virtualizarr/readers/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
KerchunkStoreRefs,
)
from virtualizarr.utils import _FsspecFSFromFilepath
from virtualizarr.zarr import ZArray, ZAttrs
from virtualizarr.zarr import ZArray, ZAttrs, ceildiv


# TODO shouldn't this live in backend.py? Because it's not just useful for the kerchunk-specific readers...
Expand Down Expand Up @@ -230,6 +230,13 @@ def dataset_from_kerchunk_refs(
return vds


def determine_chunk_grid_shape(zarray):
return tuple(
ceildiv(length, chunksize)
for length, chunksize in zip(zarray.shape, zarray.chunks)
)


def variable_from_kerchunk_refs(
refs: KerchunkStoreRefs, var_name: str, virtual_array_class
) -> Variable:
Expand All @@ -242,6 +249,12 @@ def variable_from_kerchunk_refs(
if chunk_dict:
manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict)
varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest)
elif len(zarray.shape) != 0:
# empty variables don't have physical chunks, but zarray shows that the variable
# is at least 1D
shape = determine_chunk_grid_shape(zarray)
manifest = ChunkManifest(entries={}, shape=shape)
varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest)
else:
# This means we encountered a scalar variable of dimension 0,
# very likely that it actually has no numeric value and its only purpose
Expand Down
108 changes: 108 additions & 0 deletions virtualizarr/tests/test_manifests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,38 @@ def test_broadcast_any_shape(self, shape, chunks, target_shape):
for len_arr, len_chunk in zip(broadcasted_marr.shape, broadcasted_chunk_shape):
assert len_chunk <= len_arr

@pytest.mark.parametrize(
"shape, chunks, grid_shape, target_shape",
[
((1,), (1,), (1,), (3,)),
((2,), (1,), (2,), (2,)),
((3,), (2,), (2,), (5, 4, 3)),
((3, 1), (2, 1), (2, 1), (2, 3, 4)),
],
)
def test_broadcast_empty(self, shape, chunks, grid_shape, target_shape):
zarray = ZArray(
chunks=chunks,
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
order="C",
shape=shape,
zarr_format=2,
)
manifest = ChunkManifest(entries={}, shape=grid_shape)
marr = ManifestArray(zarray, manifest)

expanded = np.broadcast_to(marr, shape=target_shape)
assert expanded.shape == target_shape
assert len(expanded.chunks) == expanded.ndim
assert all(
len_chunk <= len_arr
for len_arr, len_chunk in zip(expanded.shape, expanded.chunks)
)
assert expanded.manifest.dict() == {}


# TODO we really need some kind of fixtures to generate useful example data
# The hard part is having an alternative way to get to the expected result of concatenation
Expand Down Expand Up @@ -250,6 +282,44 @@ def test_concat(self):
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format

def test_concat_empty(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 1, 10),
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
order="C",
shape=(5, 1, 20),
zarr_format=2,
)

chunks_dict1 = {}
manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 1, 2))
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)

result = np.concatenate([marr1, marr2], axis=1)

assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
assert result.zarray.fill_value == zarray.fill_value
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format


class TestStack:
def test_stack(self):
Expand Down Expand Up @@ -295,6 +365,44 @@ def test_stack(self):
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format

def test_stack_empty(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 10),
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
order="C",
shape=(5, 20),
zarr_format=2,
)

chunks_dict1 = {}
manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 2))
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)

result = np.stack([marr1, marr2], axis=1)

assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
assert result.zarray.fill_value == zarray.fill_value
assert result.zarray.order == zarray.order
assert result.zarray.zarr_format == zarray.zarr_format


def test_refuse_combine():
# TODO test refusing to concatenate arrays that have conflicting shapes / chunk sizes
Expand Down
8 changes: 8 additions & 0 deletions virtualizarr/tests/test_manifests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ def test_create_manifest(self):
manifest = ChunkManifest(entries=chunks)
assert manifest.dict() == chunks

chunks = {}
manifest = ChunkManifest(entries=chunks, shape=(2, 2))
assert manifest.dict() == chunks

def test_create_manifest_empty_missing_shape(self):
with pytest.raises(ValueError, match="chunk grid shape if no chunks"):
ChunkManifest(entries={})

def test_invalid_chunk_entries(self):
chunks = {
"0.0.0": {"path": "s3://bucket/foo.nc"},
Expand Down
24 changes: 22 additions & 2 deletions virtualizarr/tests/test_readers/test_kerchunk.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import ujson

from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.kerchunk import (
Expand Down Expand Up @@ -45,8 +46,6 @@ def test_dataset_from_df_refs():


def test_dataset_from_df_refs_with_filters():
import ujson

filters = [{"elementsize": 4, "id": "shuffle"}, {"id": "zlib", "level": 4}]
zarray = {
"chunks": [2, 3],
Expand All @@ -62,3 +61,24 @@ def test_dataset_from_df_refs_with_filters():
ds = dataset_from_kerchunk_refs(ds_refs)
da = ds["a"]
assert da.data.zarray.filters == filters


def test_dataset_from_kerchunk_refs_empty_chunk_manifest():
zarray = {
"chunks": [50, 100],
"compressor": None,
"dtype": "<i8",
"fill_value": 100,
"filters": None,
"order": "C",
"shape": [100, 200],
"zarr_format": 2,
}
refs = gen_ds_refs(zarray=ujson.dumps(zarray))
del refs["refs"]["a/0.0"]

ds = dataset_from_kerchunk_refs(refs)
assert "a" in ds.variables
assert isinstance(ds["a"].data, ManifestArray)
assert ds["a"].sizes == {"x": 100, "y": 200}
assert ds["a"].chunksizes == {"x": 50, "y": 100}
29 changes: 29 additions & 0 deletions virtualizarr/tests/test_writers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,35 @@ def test_accessor_to_kerchunk_dict(self):
result_ds_refs = ds.virtualize.to_kerchunk(format="dict")
assert result_ds_refs == expected_ds_refs

def test_accessor_to_kerchunk_dict_empty(self):
manifest = ChunkManifest(entries={}, shape=(1, 1))
arr = ManifestArray(
chunkmanifest=manifest,
zarray=dict(
shape=(2, 3),
dtype=np.dtype("<i8"),
chunks=(2, 3),
compressor=None,
filters=None,
fill_value=np.nan,
order="C",
),
)
ds = Dataset({"a": (["x", "y"], arr)})

expected_ds_refs = {
"version": 1,
"refs": {
".zgroup": '{"zarr_format":2}',
".zattrs": "{}",
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
"a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
},
}

result_ds_refs = ds.virtualize.to_kerchunk(format="dict")
assert result_ds_refs == expected_ds_refs

def test_accessor_to_kerchunk_json(self, tmp_path):
import ujson

Expand Down

0 comments on commit 7053bc0

Please sign in to comment.