diff --git a/README.md b/README.md index c481d542..f415b356 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ VirtualiZarr (pronounced like "virtualize" but more piratey) grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API. +You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. + _Please see the [documentation](https://virtualizarr.readthedocs.io/en/latest/)_ ### Development Status and Roadmap diff --git a/docs/faq.md b/docs/faq.md index df4af749..d273a529 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -16,6 +16,8 @@ The above steps would also be performed using the `kerchunk` library alone, but ## How do VirtualiZarr and Kerchunk compare? +You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. + Users of kerchunk may find the following comparison table useful, which shows which features of kerchunk map on to which features of VirtualiZarr. | Component / Feature | Kerchunk | VirtualiZarr | | ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | diff --git a/docs/index.md b/docs/index.md index d1beb291..0e79418f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,6 +4,8 @@ VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API. +You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. + ## Motivation The Kerchunk idea solves an incredibly important problem: accessing big archival datasets via a cloud-optimized pattern, but without copying or modifying the original data in any way. This is a win-win-win for users, data engineers, and data providers. Users see fast-opening zarr-compliant stores that work performantly with libraries like xarray and dask, data engineers can provide this speed by adding a lightweight virtualization layer on top of existing data (without having to ask anyone's permission), and data providers don't have to change anything about their legacy files for them to be used in a cloud-optimized way. diff --git a/docs/releases.rst b/docs/releases.rst index c44ff245..f472db05 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -12,12 +12,18 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) + By `Gustavo Hidalgo `_. + Deprecations ~~~~~~~~~~~~ Bug fixes ~~~~~~~~~ +- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`) + By `Gustavo Hidalgo `_. + Documentation ~~~~~~~~~~~~~ diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 6e82067d..122b86b3 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr for chunk_key, entry in marr.manifest.dict().items() } - zarray = marr.zarray + zarray = marr.zarray.replace(zarr_format=2) else: try: diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index a0983dec..e15cf7d7 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -5,7 +5,7 @@ from ..kerchunk import KerchunkArrRefs from ..zarr import ZArray -from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS +from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan from .manifest import ChunkManifest @@ -127,6 +127,8 @@ def __array_function__(self, func, types, args, kwargs) -> Any: def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs.""" + if ufunc == np.isnan: + return _isnan(self.shape) return NotImplemented def __array__(self) -> np.ndarray: diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 0ecdc023..09606978 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -356,8 +356,8 @@ def isnan(x: "ManifestArray", /) -> np.ndarray: Only implemented to get past some checks deep inside xarray, see https://github.com/TomNicholas/VirtualiZarr/issues/29. """ - return np.full( - shape=x.shape, - fill_value=False, - dtype=np.dtype(bool), - ) + return _isnan(x.shape) + + +def _isnan(shape: tuple): + return np.full(shape=shape, fill_value=False, dtype=np.dtype(bool)) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index cc196e6d..bf7c24fd 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,10 +1,11 @@ import json import re from collections.abc import Iterable, Iterator -from typing import Any, Callable, NewType, Tuple, Union, cast +from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast import numpy as np from pydantic import BaseModel, ConfigDict +from upath import UPath from virtualizarr.types import ChunkKey @@ -15,7 +16,13 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) -ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]]) +class ChunkDictEntry(TypedDict): + path: str + offset: int + length: int + + +ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry]) class ChunkEntry(BaseModel): @@ -35,16 +42,23 @@ def __repr__(self) -> str: return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})" @classmethod - def from_kerchunk(cls, path_and_byte_range_info: list[str | int]) -> "ChunkEntry": - path, offset, length = path_and_byte_range_info + def from_kerchunk( + cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] + ) -> "ChunkEntry": + if len(path_and_byte_range_info) == 1: + path = path_and_byte_range_info[0] + offset = 0 + length = UPath(path).stat().st_size + else: + path, offset, length = path_and_byte_range_info return ChunkEntry(path=path, offset=offset, length=length) - def to_kerchunk(self) -> list[str | int]: + def to_kerchunk(self) -> tuple[str, int, int]: """Write out in the format that kerchunk uses for chunk entries.""" - return [self.path, self.offset, self.length] + return (self.path, self.offset, self.length) - def dict(self) -> dict[str, Union[str, int]]: - return dict(path=self.path, offset=self.offset, length=self.length) + def dict(self) -> ChunkDictEntry: + return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length) class ChunkManifest: @@ -252,7 +266,7 @@ def dict(self) -> ChunkDict: [*coord_vectors, self._paths, self._offsets, self._lengths], flags=("refs_ok",), ) - if path.item()[0] != "" # don't include entry if path='' (i.e. empty chunk) + if path.item() != "" # don't include entry if path='' (i.e. empty chunk) } return cast( @@ -283,12 +297,20 @@ def to_zarr_json(self, filepath: str) -> None: json.dump(entries, json_file, indent=4, separators=(", ", ": ")) @classmethod - def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": - chunkentries = { - cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict() - for k, v in kerchunk_chunk_dict.items() - } - return ChunkManifest(entries=cast(ChunkDict, chunkentries)) + def _from_kerchunk_chunk_dict( + cls, + # The type hint requires `Dict` instead of `dict` due to + # the conflicting ChunkManifest.dict method. + kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]], + ) -> "ChunkManifest": + chunk_entries: dict[ChunkKey, ChunkDictEntry] = {} + for k, v in kerchunk_chunk_dict.items(): + if isinstance(v, (str, bytes)): + raise NotImplementedError("TODO: handle inlined data") + elif not isinstance(v, (tuple, list)): + raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}") + chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict() + return ChunkManifest(entries=chunk_entries) def rename_paths( self, diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 3856a6ba..7df13d10 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -48,9 +48,9 @@ def create_manifestarray( zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1}, dtype=np.dtype("float32"), - fill_value=0.0, # TODO change this to NaN? + fill_value=0.0, filters=None, order="C", shape=shape, diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 2e612de9..239316a1 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format): # regression test for GH issue #105 # set up example xarray dataset containing non-dimension coordinate variables - ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))}) + ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))}) # save it to disk as netCDF (in temporary directory) ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc") diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 459e60be..6d5ede79 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -19,7 +19,7 @@ def test_create_manifestarray(self): shape = (5, 2, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -74,7 +74,7 @@ def test_equals(self): shape = (5, 2, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -209,7 +209,7 @@ def test_concat(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -254,7 +254,7 @@ def test_stack(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -299,7 +299,7 @@ def test_refuse_combine(): zarray_common = { "chunks": (5, 1, 10), - "compressor": "zlib", + "compressor": {"id": "zlib", "level": 1}, "dtype": np.dtype("int32"), "fill_value": 0.0, "filters": None, diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index 1a2a0ae1..7ef69982 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -51,6 +51,14 @@ def test_invalid_chunk_keys(self): with pytest.raises(ValueError, match="Inconsistent number of dimensions"): ChunkManifest(entries=chunks) + def test_empty_chunk_paths(self): + chunks = { + "0.0.0": {"path": "", "offset": 0, "length": 100}, + "1.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + } + manifest = ChunkManifest(entries=chunks) + assert len(manifest.dict()) == 1 + class TestProperties: def test_chunk_grid_info(self): diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index d0fe2e3b..31ef8320 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -19,7 +19,7 @@ def test_wrapping(): dtype = np.dtype("int32") zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=dtype, fill_value=0.0, filters=None, @@ -49,7 +49,7 @@ def test_equals(self): shape = (5, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -133,7 +133,7 @@ def test_concat_along_new_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(10,), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -376,7 +376,7 @@ def test_read_from_url(self, filetype, url): with pytest.raises(NotImplementedError): vds = open_virtual_dataset(url, reader_options={}, indexes={}) else: - vds = open_virtual_dataset(url, reader_options={}, indexes={}) + vds = open_virtual_dataset(url, indexes={}) assert isinstance(vds, xr.Dataset) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 80d04b9c..7715d245 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -1,12 +1,17 @@ +import json + import numpy as np +import pytest import xarray as xr import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json -def test_zarr_v3_roundtrip(tmpdir): +@pytest.fixture +def vds_with_manifest_arrays() -> xr.Dataset: arr = ManifestArray( chunkmanifest=ChunkManifest( entries={"0.0": dict(path="test.nc", offset=6144, length=48)} @@ -15,18 +20,61 @@ def test_zarr_v3_roundtrip(tmpdir): shape=(2, 3), dtype=np.dtype(" bool: + """ + Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict + """ + return "name" in value and "configuration" in value - original.virtualize.to_zarr(tmpdir / "store.zarr") + +def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") roundtrip = open_virtual_dataset( tmpdir / "store.zarr", filetype="zarr_v3", indexes={} ) - xrt.assert_identical(roundtrip, original) + xrt.assert_identical(roundtrip, vds_with_manifest_arrays) + + +def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") + assert zarray == vds_with_manifest_arrays.a.data.zarray + + +def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset): + """ + Checks that the output metadata of an array variable conforms to this spec + for the required attributes: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata + """ + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + # read the a variable's metadata + with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: + metadata = json.loads(f.read()) + assert metadata["zarr_format"] == 3 + assert metadata["node_type"] == "array" + assert isinstance(metadata["shape"], list) and all( + isinstance(dim, int) for dim in metadata["shape"] + ) + assert isinstance(metadata["data_type"], str) or isconfigurable( + metadata["data_type"] + ) + assert isconfigurable(metadata["chunk_grid"]) + assert isconfigurable(metadata["chunk_key_encoding"]) + assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) + assert ( + isinstance(metadata["codecs"], list) + and len(metadata["codecs"]) > 1 + and all(isconfigurable(codec) for codec in metadata["codecs"]) + ) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 6200296b..4ae5700d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -10,6 +10,7 @@ import ujson # type: ignore import xarray as xr +from upath import UPath from xarray import register_dataset_accessor from xarray.backends import BackendArray from xarray.coding.times import CFDatetimeCoder @@ -125,9 +126,14 @@ def open_virtual_dataset( ) else: if reader_options is None: - reader_options = { - "storage_options": {"key": "", "secret": "", "anon": True} - } + universal_filepath = UPath(filepath) + protocol = universal_filepath.protocol + if protocol == "s3": + reader_options = { + "storage_options": {"key": "", "secret": "", "anon": True} + } + else: + reader_options = {} # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 545a86fc..e5015b36 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -8,10 +8,18 @@ Optional, ) +import numcodecs import numpy as np import ujson # type: ignore import xarray as xr -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator, + model_validator, +) +from typing_extensions import Self from virtualizarr.vendor.zarr.utils import json_dumps @@ -22,10 +30,25 @@ ZAttrs = NewType( "ZAttrs", dict[str, Any] ) # just the .zattrs (for one array or for the whole store/group) +FillValueT = bool | str | float | int | list | None + +ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = { + # numpy dtypes's hierarchy lets us avoid checking for all the widths + # https://numpy.org/doc/stable/reference/arrays.scalars.html + np.dtype("bool"): False, + np.dtype("int"): 0, + np.dtype("float"): 0.0, + np.dtype("complex"): [0.0, 0.0], +} +""" +The value and format of the fill_value depend on the `data_type` of the array. +See here for spec: +https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value +""" class Codec(BaseModel): - compressor: str | None = None + compressor: dict | None = None filters: list[dict] | None = None def __repr__(self) -> str: @@ -42,9 +65,9 @@ class ZArray(BaseModel): ) chunks: tuple[int, ...] - compressor: str | None = None + compressor: dict | None = None dtype: np.dtype - fill_value: float | int | None = np.nan # float or int? + fill_value: FillValueT = Field(default=0.0, validate_default=True) filters: list[dict] | None = None order: Literal["C", "F"] shape: tuple[int, ...] @@ -64,6 +87,12 @@ def __post_init__(self) -> None: f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" ) + @model_validator(mode="after") + def _check_fill_value(self) -> Self: + if self.fill_value is None: + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0) + return self + @property def codec(self) -> Codec: """For comparison against other arrays.""" @@ -80,11 +109,6 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": fill_value = np.nan compressor = decoded_arr_refs_zarray["compressor"] - # deal with an inconsistency in kerchunk's tiff_to_zarr function - # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream. - if compressor is not None and "id" in compressor: - compressor = compressor["id"] - return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), compressor=compressor, @@ -98,21 +122,19 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": def dict(self) -> dict[str, Any]: zarray_dict = dict(self) - zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) - - if zarray_dict["fill_value"] is np.nan: - zarray_dict["fill_value"] = None - return zarray_dict def to_kerchunk_json(self) -> str: - return ujson.dumps(self.dict()) + zarray_dict = self.dict() + if zarray_dict["fill_value"] is np.nan: + zarray_dict["fill_value"] = None + return ujson.dumps(zarray_dict) def replace( self, chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[str] = None, + compressor: Optional[dict] = None, dtype: Optional[np.dtype] = None, fill_value: Optional[float] = None, # float or int? filters: Optional[list[dict]] = None, # type: ignore[valid-type] @@ -134,6 +156,59 @@ def replace( zarr_format=zarr_format if zarr_format is not None else self.zarr_format, ) + def _v3_codec_pipeline(self) -> list: + """ + VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes + from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects. + Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943 + An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs. + Roughly, this is the mapping: + ``` + filters: Iterable[ArrayArrayCodec] #optional + compressor: ArrayBytesCodec #mandatory + post_compressor: Iterable[BytesBytesCodec] #optional + ``` + """ + if self.filters: + filter_codecs_configs = [ + numcodecs.get_codec(filter).get_config() for filter in self.filters + ] + filters = [ + dict(name=codec.pop("id"), configuration=codec) + for codec in filter_codecs_configs + ] + else: + filters = [] + + # Noting here that zarr v3 has very few codecs specificed in the official spec, + # and that there are far more codecs in `numcodecs`. We take a gamble and assume + # that the codec names and configuration are simply mapped into zarrv3 "configurables". + if self.compressor: + compressor = [_num_codec_config_to_configurable(self.compressor)] + else: + compressor = [] + + # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1 + # Either "C" or "F", defining the layout of bytes within each chunk of the array. + # "C" means row-major order, i.e., the last dimension varies fastest; + # "F" means column-major order, i.e., the first dimension varies fastest. + if self.order == "C": + order = tuple(range(len(self.shape))) + elif self.order == "F": + order = tuple(reversed(range(len(self.shape)))) + + transpose = dict(name="transpose", configuration=dict(order=order)) + # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 + # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" + bytes = dict( + name="bytes", configuration={} + ) # TODO need to handle endianess configuration + + # The order here is significant! + # [ArrayArray] -> ArrayBytes -> [BytesBytes] + codec_pipeline = [transpose, bytes] + compressor + filters + return codec_pipeline + def encode_dtype(dtype: np.dtype) -> str: # TODO not sure if there is a better way to get the ' "name": "default", "configuration": {"separator": "/"}, } - metadata["codecs"] = metadata.pop("filters") - metadata.pop("compressor") # TODO this should be entered in codecs somehow - metadata.pop("order") # TODO this should be replaced by a transpose codec + metadata["codecs"] = zarray._v3_codec_pipeline() + metadata.pop("filters") + metadata.pop("compressor") + metadata.pop("order") # indicate that we're using the manifest storage transformer ZEP metadata["storage_transformers"] = [ @@ -277,21 +353,51 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: dim_names = metadata.pop("dimension_names") chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] + shape = metadata["shape"] + zarr_format = metadata["zarr_format"] if metadata["fill_value"] is None: - fill_value = np.nan + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) else: fill_value = metadata["fill_value"] + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] zarray = ZArray( - chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], - compressor=metadata["codecs"], + chunks=chunk_shape, + compressor=compressor, dtype=np.dtype(metadata["data_type"]), fill_value=fill_value, - filters=metadata.get("filters", None), + filters=filters or None, order="C", - shape=chunk_shape, - zarr_format=3, + shape=shape, + zarr_format=zarr_format, ) return zarray, dim_names, attrs + + +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. + """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() + + +def _num_codec_config_to_configurable(num_codec: dict) -> dict: + """ + Convert a numcodecs codec into a zarr v3 configurable. + """ + num_codec_copy = num_codec.copy() + return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}