Skip to content

Commit

Permalink
Merge branch 'main' into https_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
ayushnag committed Jul 22, 2024
2 parents 17888e3 + 10bd53d commit 6c85395
Show file tree
Hide file tree
Showing 10 changed files with 251 additions and 61 deletions.
6 changes: 6 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,18 @@ New Features
Breaking changes
~~~~~~~~~~~~~~~~

- Serialize valid ZarrV3 metadata and require full compressor numcodecs config (for :pull:`193`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.

Deprecations
~~~~~~~~~~~~

Bug fixes
~~~~~~~~~

- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.

Documentation
~~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr
for chunk_key, entry in marr.manifest.dict().items()
}

zarray = marr.zarray
zarray = marr.zarray.replace(zarr_format=2)

else:
try:
Expand Down
52 changes: 37 additions & 15 deletions virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import json
import re
from collections.abc import Iterable, Iterator
from typing import Any, Callable, NewType, Tuple, Union, cast
from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast

import numpy as np
from pydantic import BaseModel, ConfigDict
from upath import UPath

from virtualizarr.types import ChunkKey

Expand All @@ -15,7 +16,13 @@
_CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period)


ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]])
class ChunkDictEntry(TypedDict):
    """Dictionary form of a single chunk reference.

    This is the value type stored in ``ChunkDict`` (keyed by ``ChunkKey``) and
    the type returned by ``ChunkEntry.dict()``.
    """

    # Location of the file holding the chunk; ChunkManifest.dict() treats
    # path == "" as an empty chunk and leaves that entry out.
    path: str
    # Start of the chunk within the file; ChunkEntry.from_kerchunk uses 0 when
    # only a path is given (presumably a byte offset - TODO confirm).
    offset: int
    # Size of the chunk; ChunkEntry.from_kerchunk uses the file's st_size when
    # only a path is given.
    length: int


ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])


class ChunkEntry(BaseModel):
Expand All @@ -35,16 +42,23 @@ def __repr__(self) -> str:
return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})"

@classmethod
def from_kerchunk(cls, path_and_byte_range_info: list[str | int]) -> "ChunkEntry":
path, offset, length = path_and_byte_range_info
def from_kerchunk(
cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
) -> "ChunkEntry":
if len(path_and_byte_range_info) == 1:
path = path_and_byte_range_info[0]
offset = 0
length = UPath(path).stat().st_size
else:
path, offset, length = path_and_byte_range_info
return ChunkEntry(path=path, offset=offset, length=length)

def to_kerchunk(self) -> list[str | int]:
def to_kerchunk(self) -> tuple[str, int, int]:
"""Write out in the format that kerchunk uses for chunk entries."""
return [self.path, self.offset, self.length]
return (self.path, self.offset, self.length)

def dict(self) -> dict[str, Union[str, int]]:
return dict(path=self.path, offset=self.offset, length=self.length)
def dict(self) -> ChunkDictEntry:
return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length)


class ChunkManifest:
Expand Down Expand Up @@ -252,7 +266,7 @@ def dict(self) -> ChunkDict:
[*coord_vectors, self._paths, self._offsets, self._lengths],
flags=("refs_ok",),
)
if path.item()[0] != "" # don't include entry if path='' (i.e. empty chunk)
if path.item() != "" # don't include entry if path='' (i.e. empty chunk)
}

return cast(
Expand Down Expand Up @@ -283,12 +297,20 @@ def to_zarr_json(self, filepath: str) -> None:
json.dump(entries, json_file, indent=4, separators=(", ", ": "))

@classmethod
def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":
chunkentries = {
cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict()
for k, v in kerchunk_chunk_dict.items()
}
return ChunkManifest(entries=cast(ChunkDict, chunkentries))
def _from_kerchunk_chunk_dict(
cls,
# The type hint requires `Dict` instead of `dict` due to
# the conflicting ChunkManifest.dict method.
kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]],
) -> "ChunkManifest":
chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
for k, v in kerchunk_chunk_dict.items():
if isinstance(v, (str, bytes)):
raise NotImplementedError("TODO: handle inlined data")
elif not isinstance(v, (tuple, list)):
raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
return ChunkManifest(entries=chunk_entries)

def rename_paths(
self,
Expand Down
4 changes: 2 additions & 2 deletions virtualizarr/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def create_manifestarray(

zarray = ZArray(
chunks=chunks,
compressor="zlib",
compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1},
dtype=np.dtype("float32"),
fill_value=0.0, # TODO change this to NaN?
fill_value=0.0,
filters=None,
order="C",
shape=shape,
Expand Down
2 changes: 1 addition & 1 deletion virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):
# regression test for GH issue #105

# set up example xarray dataset containing non-dimension coordinate variables
ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))})
ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})

# save it to disk as netCDF (in temporary directory)
ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")
Expand Down
12 changes: 6 additions & 6 deletions virtualizarr/tests/test_manifests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_create_manifestarray(self):
shape = (5, 2, 20)
zarray = ZArray(
chunks=chunks,
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -74,7 +74,7 @@ def test_equals(self):
shape = (5, 2, 20)
zarray = ZArray(
chunks=chunks,
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand All @@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 1, 10),
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -209,7 +209,7 @@ def test_concat(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 1, 10),
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -254,7 +254,7 @@ def test_stack(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 10),
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -299,7 +299,7 @@ def test_refuse_combine():

zarray_common = {
"chunks": (5, 1, 10),
"compressor": "zlib",
"compressor": {"id": "zlib", "level": 1},
"dtype": np.dtype("int32"),
"fill_value": 0.0,
"filters": None,
Expand Down
8 changes: 8 additions & 0 deletions virtualizarr/tests/test_manifests/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ def test_invalid_chunk_keys(self):
with pytest.raises(ValueError, match="Inconsistent number of dimensions"):
ChunkManifest(entries=chunks)

def test_empty_chunk_paths(self):
    """Entries whose path is the empty string are left out of ``ChunkManifest.dict()``."""
    chunks = {
        "0.0.0": {"path": "", "offset": 0, "length": 100},
        "1.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
    }
    manifest = ChunkManifest(entries=chunks)
    result = manifest.dict()

    assert len(result) == 1
    # Counting alone would still pass if the wrong chunk were dropped, so also
    # pin down which entry survives.
    assert "0.0.0" not in result
    assert "1.0.0" in result


class TestProperties:
def test_chunk_grid_info(self):
Expand Down
10 changes: 5 additions & 5 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_wrapping():
dtype = np.dtype("int32")
zarray = ZArray(
chunks=chunks,
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=dtype,
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -49,7 +49,7 @@ def test_equals(self):
shape = (5, 20)
zarray = ZArray(
chunks=chunks,
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(1, 10),
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -133,7 +133,7 @@ def test_concat_along_new_dim(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(5, 10),
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down Expand Up @@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self):
# both manifest arrays in this example have the same zarray properties
zarray = ZArray(
chunks=(10,),
compressor="zlib",
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"),
fill_value=0.0,
filters=None,
Expand Down
60 changes: 54 additions & 6 deletions virtualizarr/tests/test_zarr.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import json

import numpy as np
import pytest
import xarray as xr
import xarray.testing as xrt

from virtualizarr import ManifestArray, open_virtual_dataset
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json


def test_zarr_v3_roundtrip(tmpdir):
@pytest.fixture
def vds_with_manifest_arrays() -> xr.Dataset:
arr = ManifestArray(
chunkmanifest=ChunkManifest(
entries={"0.0": dict(path="test.nc", offset=6144, length=48)}
Expand All @@ -15,18 +20,61 @@ def test_zarr_v3_roundtrip(tmpdir):
shape=(2, 3),
dtype=np.dtype("<i8"),
chunks=(2, 3),
compressor=None,
compressor={"id": "zlib", "level": 1},
filters=None,
fill_value=np.nan,
fill_value=0,
order="C",
zarr_format=3,
),
)
original = xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})
return xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})


def isconfigurable(value: dict) -> bool:
    """
    Return True if ``value`` is a dict containing both "name" and "configuration".

    Several metadata attributes in ZarrV3 use a dictionary with keys
    "name" : str and "configuration" : dict. Only the presence of the two keys
    is checked, not the types of their values.
    """
    # Without this guard a string argument would be checked by substring
    # membership ('"name" in "some name"'), and other non-dict values such as
    # ints would raise TypeError instead of simply failing the check.
    if not isinstance(value, dict):
        return False
    return "name" in value and "configuration" in value

original.virtualize.to_zarr(tmpdir / "store.zarr")

def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset):
vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr")
roundtrip = open_virtual_dataset(
tmpdir / "store.zarr", filetype="zarr_v3", indexes={}
)

xrt.assert_identical(roundtrip, original)
xrt.assert_identical(roundtrip, vds_with_manifest_arrays)


def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset):
    """The zarray read back from the written zarr.json equals the one on variable ``a``."""
    store_path = tmpdir / "store.zarr"
    dataset_to_zarr(vds_with_manifest_arrays, store_path)

    # metadata_from_zarr_json returns three values; only the first is compared
    loaded_zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json")
    expected_zarray = vds_with_manifest_arrays.a.data.zarray
    assert loaded_zarray == expected_zarray


def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset):
    """
    Verify that the metadata written for an array variable carries the
    required attributes of the ZarrV3 core specification:
    https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata
    """
    dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr")

    # load the metadata document written for the "a" variable
    with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as json_file:
        meta = json.load(json_file)

    assert meta["zarr_format"] == 3
    assert meta["node_type"] == "array"

    shape = meta["shape"]
    assert isinstance(shape, list)
    assert all(isinstance(dim, int) for dim in shape)

    # data_type is either a plain string or a name/configuration dictionary
    data_type = meta["data_type"]
    assert isinstance(data_type, str) or isconfigurable(data_type)

    assert isconfigurable(meta["chunk_grid"])
    assert isconfigurable(meta["chunk_key_encoding"])
    assert isinstance(meta["fill_value"], (bool, int, float, str, list))

    codecs = meta["codecs"]
    assert isinstance(codecs, list)
    assert len(codecs) > 1
    assert all(isconfigurable(codec) for codec in codecs)
Loading

0 comments on commit 6c85395

Please sign in to comment.