From 10ef7e54dcd5b2a74b6180fdc51304d71d8a1e79 Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo
Date: Fri, 19 Jul 2024 20:30:25 -0400
Subject: [PATCH 1/3] Exclude empty `paths` on `ChunkDict` creation (#198)

* Update docs

* handle empty paths

* reset releases

* remove experimental chunk validation skip

* add docs

---
 docs/releases.rst                                  | 3 +++
 virtualizarr/manifests/manifest.py                 | 2 +-
 virtualizarr/tests/test_manifests/test_manifest.py | 8 ++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/docs/releases.rst b/docs/releases.rst
index c44ff245..c6735bd1 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -18,6 +18,9 @@ Deprecations
 Bug fixes
 ~~~~~~~~~
 
+- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
+  By `Gustavo Hidalgo `_.
+
 Documentation
 ~~~~~~~~~~~~~
 
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index cc196e6d..70b91d5b 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -252,7 +252,7 @@ def dict(self) -> ChunkDict:
             [*coord_vectors, self._paths, self._offsets, self._lengths],
             flags=("refs_ok",),
         )
-        if path.item()[0] != ""  # don't include entry if path='' (i.e. empty chunk)
+        if path.item() != ""  # don't include entry if path='' (i.e. empty chunk)
     }
 
     return cast(
diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py
index 1a2a0ae1..7ef69982 100644
--- a/virtualizarr/tests/test_manifests/test_manifest.py
+++ b/virtualizarr/tests/test_manifests/test_manifest.py
@@ -51,6 +51,14 @@ def test_invalid_chunk_keys(self):
         with pytest.raises(ValueError, match="Inconsistent number of dimensions"):
             ChunkManifest(entries=chunks)
 
+    def test_empty_chunk_paths(self):
+        chunks = {
+            "0.0.0": {"path": "", "offset": 0, "length": 100},
+            "1.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        }
+        manifest = ChunkManifest(entries=chunks)
+        assert len(manifest.dict()) == 1
+
 
 class TestProperties:
     def test_chunk_grid_info(self):
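
Concretely, the new behavior looks like this — a minimal sketch mirroring the new test (the s3 path and byte ranges are illustrative only):

```
from virtualizarr.manifests.manifest import ChunkManifest

# One empty chunk (path == "") and one real chunk, as in the new test.
manifest = ChunkManifest(
    entries={
        "0.0.0": {"path": "", "offset": 0, "length": 100},
        "1.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
    }
)

# Empty chunks are now excluded when the manifest is converted back to a
# ChunkDict, so only the populated entry survives:
assert list(manifest.dict()) == ["1.0.0"]
```
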
From 0ad4de5c612d1d632c2acb07ecfad071756eccf4 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Mon, 22 Jul 2024 08:09:17 +0200
Subject: [PATCH 2/3] Extend refspec support to [path] entries (without offset/length) (#187)

* Improve typing of ChunkEntry

* Handle kerchunk [path] with no offset/length

* Raise NotImplementedError on inlined data

* Explain the need for Dict type hint

---
 virtualizarr/manifests/manifest.py | 50 +++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 70b91d5b..bf7c24fd 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -1,10 +1,11 @@
 import json
 import re
 from collections.abc import Iterable, Iterator
-from typing import Any, Callable, NewType, Tuple, Union, cast
+from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast
 
 import numpy as np
 from pydantic import BaseModel, ConfigDict
+from upath import UPath
 
 from virtualizarr.types import ChunkKey
 
@@ -15,7 +16,13 @@
 _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$"  # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period)
 
 
-ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]])
+class ChunkDictEntry(TypedDict):
+    path: str
+    offset: int
+    length: int
+
+
+ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])
 
 
 class ChunkEntry(BaseModel):
@@ -35,16 +42,23 @@ def __repr__(self) -> str:
         return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})"
 
     @classmethod
-    def from_kerchunk(cls, path_and_byte_range_info: list[str | int]) -> "ChunkEntry":
-        path, offset, length = path_and_byte_range_info
+    def from_kerchunk(
+        cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
+    ) -> "ChunkEntry":
+        if len(path_and_byte_range_info) == 1:
+            path = path_and_byte_range_info[0]
+            offset = 0
+            length = UPath(path).stat().st_size
+        else:
+            path, offset, length = path_and_byte_range_info
         return ChunkEntry(path=path, offset=offset, length=length)
 
-    def to_kerchunk(self) -> list[str | int]:
+    def to_kerchunk(self) -> tuple[str, int, int]:
         """Write out in the format that kerchunk uses for chunk entries."""
-        return [self.path, self.offset, self.length]
+        return (self.path, self.offset, self.length)
 
-    def dict(self) -> dict[str, Union[str, int]]:
-        return dict(path=self.path, offset=self.offset, length=self.length)
+    def dict(self) -> ChunkDictEntry:
+        return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length)
 
 
 class ChunkManifest:
@@ -283,12 +297,20 @@ def to_zarr_json(self, filepath: str) -> None:
             json.dump(entries, json_file, indent=4, separators=(", ", ": "))
 
     @classmethod
-    def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":
-        chunkentries = {
-            cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict()
-            for k, v in kerchunk_chunk_dict.items()
-        }
-        return ChunkManifest(entries=cast(ChunkDict, chunkentries))
+    def _from_kerchunk_chunk_dict(
+        cls,
+        # The type hint requires `Dict` instead of `dict` due to
+        # the conflicting ChunkManifest.dict method.
+        kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]],
+    ) -> "ChunkManifest":
+        chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
+        for k, v in kerchunk_chunk_dict.items():
+            if isinstance(v, (str, bytes)):
+                raise NotImplementedError("TODO: handle inlined data")
+            elif not isinstance(v, (tuple, list)):
+                raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
+            chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
+        return ChunkManifest(entries=chunk_entries)
 
     def rename_paths(
         self,
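
For reference, a sketch of the two kerchunk reference shapes this patch handles (the temporary file is only there so the `UPath(path).stat()` fallback has a real size to measure; names are illustrative):

```
import tempfile

from virtualizarr.manifests.manifest import ChunkEntry

with tempfile.NamedTemporaryFile(suffix=".nc") as f:
    f.write(b"\x00" * 128)
    f.flush()

    # Full [path, offset, length] reference: used as-is.
    full = ChunkEntry.from_kerchunk((f.name, 0, 100))
    assert (full.offset, full.length) == (0, 100)

    # Bare [path] reference: offset defaults to 0 and the length is read
    # from the size of the whole file on disk.
    bare = ChunkEntry.from_kerchunk((f.name,))
    assert (bare.offset, bare.length) == (0, 128)
```
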
From 10bd53dc3dae08303e57fe5aefe49804d9c4517d Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo
Date: Mon, 22 Jul 2024 12:44:56 -0400
Subject: [PATCH 3/3] Conformant ZarrV3 codecs and fill values (#193)

* Generate chunk manifest backed variable from HDF5 dataset.

* Transfer dataset attrs to variable.

* Get virtual variables dict from HDF5 file.

* Update virtual_vars_from_hdf to use fsspec and drop_variables arg.

* mypy fix to use ChunkKey and empty dimensions list.

* Extract attributes from hdf5 root group.

* Use hdf reader for netcdf4 files.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix ruff complaints.

* First steps for handling HDF5 filters.

* Initial step for hdf5plugin supported codecs.

* Small commit to check compression support in CI environment.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix mypy complaints for hdf_filters.

* Local pre-commit fix for hdf_filters.

* Use fsspec reader_options introduced in #37.

* Fix incorrect zarr_v3 if block position from merge commit ef0d7a8.

* Fix early return from hdf _extract_attrs.

* Test that _extract_attrs correctly handles multiple attributes.

* Initial attempt at scale and offset via numcodecs.

* Tests for cfcodec_from_dataset.

* Temporarily relax integration tests to assert_allclose.

* Add blosc_lz4 fixture parameterization to confirm libnetcdf environment.

* Check for compatibility with netcdf4 engine.

* Use separate fixtures for h5netcdf and netcdf4 compression styles.

* Print libhdf5 and libnetcdf4 versions to confirm compiled environment.

* Skip netcdf4 style compression tests when libhdf5 < 1.14.

* Include imagecodecs.numcodecs to support HDF5 lzf filters.

* Remove test that verifies call to read_kerchunk_references_from_file.

* Add additional codec support structures for imagecodecs and numcodecs.

* Add codec config test for Zstd.

* Include initial cf decoding tests.

* Revert typo for scale_factor retrieval.

* Update reader to use new numpy manifest representation.

* Temporarily skip test until blosc netcdf4 issue is solved.

* Fix Pydantic 2 migration warnings.

* Include hdf5plugin and imagecodecs-numcodecs in mamba test environment.

* Mamba attempt with imagecodecs rather than imagecodecs-numcodecs.

* Mamba attempt with latest imagecodecs release.

* Use correct iter_chunks callback function signature.

* Include pip based imagecodecs-numcodecs until conda-forge availability.

* Handle non-coordinate dims which are serialized to hdf as empty dataset.

* Use reader_options for filetype check and update failing kerchunk call.

* Fix chunkmanifest shaping for chunked datasets.

* Handle scale_factor attribute serialization for compressed files.

* Include chunked roundtrip fixture.

* Standardize xarray integration tests for hdf filters.

* Update reader selection logic for new filetype determination.

* Use decode_times for integration test.

* Standardize fixture names for hdf5 vs netcdf4 file types.

* Handle array add_offset property for compressed data.

* Include h5py shuffle filter.

* Make ScaleAndOffset codec last in filters list.

* Apply ScaleAndOffset codec to _FillValue since its value is now downstream.

* Coerce scale and add_offset values to native float for JSON serialization.

* Conformant ZarrV3 codecs

* Update docs

* Update virtualizarr/zarr.py

Co-authored-by: Tom Augspurger

* Update virtualizarr/zarr.py

Co-authored-by: Tom Augspurger

* Change default_fill to 0s

* Generate permutation

* Pythonic isinstance check

* Add return type to isconfigurable

Co-authored-by: Tom Augspurger

* Changes from pair programming for zarrv3 to kerchunk file reading

* Revert "Merge remote-tracking branch 'upstream/hdf5_reader' into codecs"

This reverts commit 7a65fbdc8eda1dfedaa59e90bd2d8fe652819085, reversing
changes made to c051f04523ae3d9a4244c1ece92ffc95a633498b.
* Fix unit tests

* PR comments

* Remove kwarg in dict default

---------

Co-authored-by: sharkinsspatial
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tom Augspurger
Co-authored-by: Tria McNeely
---
 docs/releases.rst                       |   3 +
 virtualizarr/kerchunk.py                |   2 +-
 virtualizarr/tests/__init__.py          |   4 +-
 virtualizarr/tests/test_integration.py  |   2 +-
 .../tests/test_manifests/test_array.py   |  12 +-
 virtualizarr/tests/test_xarray.py       |  10 +-
 virtualizarr/tests/test_zarr.py         |  60 ++++++-
 virtualizarr/zarr.py                    | 156 +++++++++++++++---
 8 files changed, 203 insertions(+), 46 deletions(-)

diff --git a/docs/releases.rst b/docs/releases.rst
index c6735bd1..f472db05 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -12,6 +12,9 @@ New Features
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
+  By `Gustavo Hidalgo `_.
+
 Deprecations
 ~~~~~~~~~~~~
 
diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
index 6e82067d..122b86b3 100644
--- a/virtualizarr/kerchunk.py
+++ b/virtualizarr/kerchunk.py
@@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkArrRefs:
             for chunk_key, entry in marr.manifest.dict().items()
         }
 
-        zarray = marr.zarray
+        zarray = marr.zarray.replace(zarr_format=2)
 
     else:
         try:
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index 3856a6ba..7df13d10 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -48,9 +48,9 @@ def create_manifestarray(
 
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1},
         dtype=np.dtype("float32"),
-        fill_value=0.0,  # TODO change this to NaN?
+        fill_value=0.0,
         filters=None,
         order="C",
         shape=shape,
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 2e612de9..239316a1 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):
         # regression test for GH issue #105
 
         # set up example xarray dataset containing non-dimension coordinate variables
-        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))})
+        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})
 
         # save it to disk as netCDF (in temporary directory)
         ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")
diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
index 459e60be..6d5ede79 100644
--- a/virtualizarr/tests/test_manifests/test_array.py
+++ b/virtualizarr/tests/test_manifests/test_array.py
@@ -19,7 +19,7 @@ def test_create_manifestarray(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -74,7 +74,7 @@ def test_equals(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -209,7 +209,7 @@ def test_concat(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -254,7 +254,7 @@ def test_stack(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -299,7 +299,7 @@ def test_refuse_combine():
 
     zarray_common = {
         "chunks": (5, 1, 10),
-        "compressor": "zlib",
+        "compressor": {"id": "zlib", "level": 1},
         "dtype": np.dtype("int32"),
         "fill_value": 0.0,
         "filters": None,
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index d0fe2e3b..7fb7a026 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -19,7 +19,7 @@ def test_wrapping():
     dtype = np.dtype("int32")
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "zlib", "level": 1},
         dtype=dtype,
         fill_value=0.0,
         filters=None,
@@ -49,7 +49,7 @@ def test_equals(self):
         shape = (5, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -133,7 +133,7 @@ def test_concat_along_new_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(10,), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 80d04b9c..7715d245 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -1,12 +1,17 @@ +import json + import numpy as np +import pytest import xarray as xr import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json -def test_zarr_v3_roundtrip(tmpdir): +@pytest.fixture +def vds_with_manifest_arrays() -> xr.Dataset: arr = ManifestArray( chunkmanifest=ChunkManifest( entries={"0.0": dict(path="test.nc", offset=6144, length=48)} @@ -15,18 +20,61 @@ def test_zarr_v3_roundtrip(tmpdir): shape=(2, 3), dtype=np.dtype(" bool: + """ + Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict + """ + return "name" in value and "configuration" in value - original.virtualize.to_zarr(tmpdir / "store.zarr") + +def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") roundtrip = open_virtual_dataset( tmpdir / "store.zarr", filetype="zarr_v3", indexes={} ) - xrt.assert_identical(roundtrip, original) + xrt.assert_identical(roundtrip, vds_with_manifest_arrays) + + +def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") + assert zarray == vds_with_manifest_arrays.a.data.zarray + + +def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset): + """ + Checks that the output metadata of an array variable conforms to this spec + for the required attributes: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata + """ + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + # read the a variable's metadata + with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: + metadata = json.loads(f.read()) + assert metadata["zarr_format"] == 3 + assert metadata["node_type"] == "array" + assert isinstance(metadata["shape"], list) and all( + isinstance(dim, int) for dim in metadata["shape"] + ) + assert isinstance(metadata["data_type"], str) or isconfigurable( + metadata["data_type"] + ) + assert isconfigurable(metadata["chunk_grid"]) + assert isconfigurable(metadata["chunk_key_encoding"]) + assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) + assert ( + isinstance(metadata["codecs"], list) + and len(metadata["codecs"]) > 1 + and all(isconfigurable(codec) for codec in metadata["codecs"]) + ) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 545a86fc..e5015b36 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -8,10 +8,18 @@ Optional, ) +import numcodecs import numpy as np import ujson # type: ignore import xarray as xr -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator, + model_validator, +) +from typing_extensions import Self from 
@@ -22,10 +30,25 @@
 ZAttrs = NewType(
     "ZAttrs", dict[str, Any]
 )  # just the .zattrs (for one array or for the whole store/group)
+FillValueT = bool | str | float | int | list | None
+
+ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = {
+    # NumPy's dtype hierarchy lets us avoid checking for all the widths
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html
+    np.dtype("bool"): False,
+    np.dtype("int"): 0,
+    np.dtype("float"): 0.0,
+    np.dtype("complex"): [0.0, 0.0],
+}
+"""
+The value and format of the fill_value depend on the `data_type` of the array.
+See here for spec:
+https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
+"""
 
 
 class Codec(BaseModel):
-    compressor: str | None = None
+    compressor: dict | None = None
     filters: list[dict] | None = None
 
     def __repr__(self) -> str:
@@ -42,9 +65,9 @@ class ZArray(BaseModel):
     )
 
     chunks: tuple[int, ...]
-    compressor: str | None = None
+    compressor: dict | None = None
     dtype: np.dtype
-    fill_value: float | int | None = np.nan  # float or int?
+    fill_value: FillValueT = Field(default=0.0, validate_default=True)
     filters: list[dict] | None = None
     order: Literal["C", "F"]
     shape: tuple[int, ...]
@@ -64,6 +87,12 @@ def __post_init__(self) -> None:
                 f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}"
             )
 
+    @model_validator(mode="after")
+    def _check_fill_value(self) -> Self:
+        if self.fill_value is None:
+            self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0)
+        return self
+
     @property
     def codec(self) -> Codec:
         """For comparison against other arrays."""
@@ -80,11 +109,6 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
             fill_value = np.nan
 
         compressor = decoded_arr_refs_zarray["compressor"]
-        # deal with an inconsistency in kerchunk's tiff_to_zarr function
-        # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream.
-        if compressor is not None and "id" in compressor:
-            compressor = compressor["id"]
 
         return ZArray(
             chunks=tuple(decoded_arr_refs_zarray["chunks"]),
             compressor=compressor,
@@ -98,21 +122,19 @@ def dict(self) -> dict[str, Any]:
         zarray_dict = dict(self)
-
         zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"])
-
-        if zarray_dict["fill_value"] is np.nan:
-            zarray_dict["fill_value"] = None
-
         return zarray_dict
 
     def to_kerchunk_json(self) -> str:
-        return ujson.dumps(self.dict())
+        zarray_dict = self.dict()
+        if zarray_dict["fill_value"] is np.nan:
+            zarray_dict["fill_value"] = None
+        return ujson.dumps(zarray_dict)
 
     def replace(
         self,
         chunks: Optional[tuple[int, ...]] = None,
-        compressor: Optional[str] = None,
+        compressor: Optional[dict] = None,
         dtype: Optional[np.dtype] = None,
         fill_value: Optional[float] = None,  # float or int?
         filters: Optional[list[dict]] = None,  # type: ignore[valid-type]
@@ -134,6 +156,59 @@ def replace(
             zarr_format=zarr_format if zarr_format is not None else self.zarr_format,
         )
 
+    def _v3_codec_pipeline(self) -> list:
+        """
+        VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes
+        from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects.
+        Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943
+        An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs.
+        Roughly, this is the mapping:
+        ```
+        filters: Iterable[ArrayArrayCodec] #optional
+        compressor: ArrayBytesCodec #mandatory
+        post_compressor: Iterable[BytesBytesCodec] #optional
+        ```
+        """
+        if self.filters:
+            filter_codecs_configs = [
+                numcodecs.get_codec(filter).get_config() for filter in self.filters
+            ]
+            filters = [
+                dict(name=codec.pop("id"), configuration=codec)
+                for codec in filter_codecs_configs
+            ]
+        else:
+            filters = []
+
+        # Noting here that zarr v3 has very few codecs specified in the official spec,
+        # and that there are far more codecs in `numcodecs`. We take a gamble and assume
+        # that the codec names and configuration are simply mapped into zarr v3 "configurables".
+        if self.compressor:
+            compressor = [_num_codec_config_to_configurable(self.compressor)]
+        else:
+            compressor = []
+
+        # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1
+        # Either "C" or "F", defining the layout of bytes within each chunk of the array.
+        # "C" means row-major order, i.e., the last dimension varies fastest;
+        # "F" means column-major order, i.e., the first dimension varies fastest.
+        if self.order == "C":
+            order = tuple(range(len(self.shape)))
+        elif self.order == "F":
+            order = tuple(reversed(range(len(self.shape))))
+
+        transpose = dict(name="transpose", configuration=dict(order=order))
+        # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
+        # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
+        bytes = dict(
+            name="bytes", configuration={}
+        )  # TODO need to handle endianness configuration
+
+        # The order here is significant!
+        # [ArrayArray] -> ArrayBytes -> [BytesBytes]
+        codec_pipeline = [transpose, bytes] + compressor + filters
+        return codec_pipeline
+
 
 def encode_dtype(dtype: np.dtype) -> str:
     # TODO not sure if there is a better way to get the '<i8' style representation of the dtype out
 
@@
     metadata["chunk_key_encoding"] = {
         "name": "default",
         "configuration": {"separator": "/"},
     }
-    metadata["codecs"] = metadata.pop("filters")
-    metadata.pop("compressor")  # TODO this should be entered in codecs somehow
-    metadata.pop("order")  # TODO this should be replaced by a transpose codec
+    metadata["codecs"] = zarray._v3_codec_pipeline()
+    metadata.pop("filters")
+    metadata.pop("compressor")
+    metadata.pop("order")
 
     # indicate that we're using the manifest storage transformer ZEP
     metadata["storage_transformers"] = [
@@ -277,21 +353,51 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
     dim_names = metadata.pop("dimension_names")
 
     chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]
+    shape = metadata["shape"]
+    zarr_format = metadata["zarr_format"]
 
     if metadata["fill_value"] is None:
-        fill_value = np.nan
+        raise ValueError(
+            "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value"
+        )
     else:
         fill_value = metadata["fill_value"]
 
+    all_codecs = [
+        codec
+        for codec in metadata["codecs"]
+        if codec["name"] not in ("transpose", "bytes")
+    ]
+    compressor, *filters = [
+        _configurable_to_num_codec_config(_filter) for _filter in all_codecs
+    ]
     zarray = ZArray(
-        chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"],
-        compressor=metadata["codecs"],
+        chunks=chunk_shape,
+        compressor=compressor,
         dtype=np.dtype(metadata["data_type"]),
         fill_value=fill_value,
-        filters=metadata.get("filters", None),
+        filters=filters or None,
         order="C",
-        shape=chunk_shape,
-        zarr_format=3,
+        shape=shape,
+        zarr_format=zarr_format,
     )
 
     return zarray, dim_names, attrs
+
+
+def _configurable_to_num_codec_config(configurable: dict) -> dict:
+    """
+    Convert a zarr v3 configurable into a numcodecs codec.
+    """
+    configurable_copy = configurable.copy()
+    codec_id = configurable_copy.pop("name")
+    configuration = configurable_copy.pop("configuration")
+    return numcodecs.get_codec({"id": codec_id, **configuration}).get_config()
+
+
+def _num_codec_config_to_configurable(num_codec: dict) -> dict:
+    """
+    Convert a numcodecs codec into a zarr v3 configurable.
+    """
+    num_codec_copy = num_codec.copy()
+    return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}
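
Taken together, a sketch of the v2-to-v3 codec mapping these changes implement (constructor arguments mirror the updated tests; the printed pipeline is indicative, exact reprs may differ):

```
import numpy as np

from virtualizarr.zarr import ZArray, _configurable_to_num_codec_config

zarray = ZArray(
    chunks=(5, 10),
    compressor={"id": "zlib", "level": 1},
    dtype=np.dtype("int32"),
    fill_value=0.0,
    filters=None,
    order="C",
    shape=(5, 20),
    zarr_format=2,
)

# transpose and bytes codecs lead the pipeline, then the compressor
# rewritten as a v3 {"name": ..., "configuration": ...} configurable.
print(zarray._v3_codec_pipeline())
# [{'name': 'transpose', 'configuration': {'order': (0, 1)}},
#  {'name': 'bytes', 'configuration': {}},
#  {'name': 'zlib', 'configuration': {'level': 1}}]

# The inverse helper recovers the numcodecs-style config:
assert _configurable_to_num_codec_config(
    {"name": "zlib", "configuration": {"level": 1}}
) == {"id": "zlib", "level": 1}
```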