From 10ef7e54dcd5b2a74b6180fdc51304d71d8a1e79 Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo
Date: Fri, 19 Jul 2024 20:30:25 -0400
Subject: [PATCH 1/3] Exclude empty `paths` on `ChunkDict` creation (#198)

* Update docs

* handle empty paths

* reset releases

* remove experimental chunk validation skip

* add docs

---
 docs/releases.rst                                  | 3 +++
 virtualizarr/manifests/manifest.py                 | 2 +-
 virtualizarr/tests/test_manifests/test_manifest.py | 8 ++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/docs/releases.rst b/docs/releases.rst
index c44ff245..c6735bd1 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -18,6 +18,9 @@ Deprecations
 Bug fixes
 ~~~~~~~~~
 
+- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
+  By `Gustavo Hidalgo `_.
+
 Documentation
 ~~~~~~~~~~~~~
 
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index cc196e6d..70b91d5b 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -252,7 +252,7 @@ def dict(self) -> ChunkDict:
             [*coord_vectors, self._paths, self._offsets, self._lengths],
             flags=("refs_ok",),
         )
-        if path.item()[0] != ""  # don't include entry if path='' (i.e. empty chunk)
+        if path.item() != ""  # don't include entry if path='' (i.e. empty chunk)
     }
 
     return cast(
diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py
index 1a2a0ae1..7ef69982 100644
--- a/virtualizarr/tests/test_manifests/test_manifest.py
+++ b/virtualizarr/tests/test_manifests/test_manifest.py
@@ -51,6 +51,14 @@ def test_invalid_chunk_keys(self):
         with pytest.raises(ValueError, match="Inconsistent number of dimensions"):
             ChunkManifest(entries=chunks)
 
+    def test_empty_chunk_paths(self):
+        chunks = {
+            "0.0.0": {"path": "", "offset": 0, "length": 100},
+            "1.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        }
+        manifest = ChunkManifest(entries=chunks)
+        assert len(manifest.dict()) == 1
+
 
 class TestProperties:
     def test_chunk_grid_info(self):
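
Concretely, the new behavior looks like this — a minimal sketch mirroring the new test (the s3 path and byte ranges are illustrative only):

```
from virtualizarr.manifests.manifest import ChunkManifest

# One empty chunk (path == "") and one real chunk, as in the new test.
manifest = ChunkManifest(
    entries={
        "0.0.0": {"path": "", "offset": 0, "length": 100},
        "1.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
    }
)

# Empty chunks are now excluded when the manifest is converted back to a
# ChunkDict, so only the populated entry survives:
assert list(manifest.dict()) == ["1.0.0"]
```
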
From 0ad4de5c612d1d632c2acb07ecfad071756eccf4 Mon Sep 17 00:00:00 2001
From: Ben Mares
Date: Mon, 22 Jul 2024 08:09:17 +0200
Subject: [PATCH 2/3] Extend refspec support to [path] entries (without offset/length) (#187)

* Improve typing of ChunkEntry

* Handle kerchunk [path] with no offset/length

* Raise NotImplementedError on inlined data

* Explain the need for Dict type hint

---
 virtualizarr/manifests/manifest.py | 50 +++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 70b91d5b..bf7c24fd 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -1,10 +1,11 @@
 import json
 import re
 from collections.abc import Iterable, Iterator
-from typing import Any, Callable, NewType, Tuple, Union, cast
+from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast
 
 import numpy as np
 from pydantic import BaseModel, ConfigDict
+from upath import UPath
 
 from virtualizarr.types import ChunkKey
 
@@ -15,7 +16,13 @@
 _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$"  # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period)
 
 
-ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]])
+class ChunkDictEntry(TypedDict):
+    path: str
+    offset: int
+    length: int
+
+
+ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])
 
 
 class ChunkEntry(BaseModel):
@@ -35,16 +42,23 @@ def __repr__(self) -> str:
         return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})"
 
     @classmethod
-    def from_kerchunk(cls, path_and_byte_range_info: list[str | int]) -> "ChunkEntry":
-        path, offset, length = path_and_byte_range_info
+    def from_kerchunk(
+        cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
+    ) -> "ChunkEntry":
+        if len(path_and_byte_range_info) == 1:
+            path = path_and_byte_range_info[0]
+            offset = 0
+            length = UPath(path).stat().st_size
+        else:
+            path, offset, length = path_and_byte_range_info
         return ChunkEntry(path=path, offset=offset, length=length)
 
-    def to_kerchunk(self) -> list[str | int]:
+    def to_kerchunk(self) -> tuple[str, int, int]:
         """Write out in the format that kerchunk uses for chunk entries."""
-        return [self.path, self.offset, self.length]
+        return (self.path, self.offset, self.length)
 
-    def dict(self) -> dict[str, Union[str, int]]:
-        return dict(path=self.path, offset=self.offset, length=self.length)
+    def dict(self) -> ChunkDictEntry:
+        return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length)
 
 
 class ChunkManifest:
@@ -283,12 +297,20 @@ def to_zarr_json(self, filepath: str) -> None:
             json.dump(entries, json_file, indent=4, separators=(", ", ": "))
 
     @classmethod
-    def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":
-        chunkentries = {
-            cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict()
-            for k, v in kerchunk_chunk_dict.items()
-        }
-        return ChunkManifest(entries=cast(ChunkDict, chunkentries))
+    def _from_kerchunk_chunk_dict(
+        cls,
+        # The type hint requires `Dict` instead of `dict` due to
+        # the conflicting ChunkManifest.dict method.
+        kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]],
+    ) -> "ChunkManifest":
+        chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
+        for k, v in kerchunk_chunk_dict.items():
+            if isinstance(v, (str, bytes)):
+                raise NotImplementedError("TODO: handle inlined data")
+            elif not isinstance(v, (tuple, list)):
+                raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
+            chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
+        return ChunkManifest(entries=chunk_entries)
 
     def rename_paths(
         self,
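
For reference, a sketch of the two kerchunk reference shapes this patch handles (the temporary file is only there so the `UPath(path).stat()` fallback has a real size to measure; names are illustrative):

```
import tempfile

from virtualizarr.manifests.manifest import ChunkEntry

with tempfile.NamedTemporaryFile(suffix=".nc") as f:
    f.write(b"\x00" * 128)
    f.flush()

    # Full [path, offset, length] reference: used as-is.
    full = ChunkEntry.from_kerchunk((f.name, 0, 100))
    assert (full.offset, full.length) == (0, 100)

    # Bare [path] reference: offset defaults to 0 and the length is read
    # from the size of the whole file on disk.
    bare = ChunkEntry.from_kerchunk((f.name,))
    assert (bare.offset, bare.length) == (0, 128)
```
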
From 10bd53dc3dae08303e57fe5aefe49804d9c4517d Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo
Date: Mon, 22 Jul 2024 12:44:56 -0400
Subject: [PATCH 3/3] Conformant ZarrV3 codecs and fill values (#193)

* Generate chunk manifest backed variable from HDF5 dataset.

* Transfer dataset attrs to variable.

* Get virtual variables dict from HDF5 file.

* Update virtual_vars_from_hdf to use fsspec and drop_variables arg.

* mypy fix to use ChunkKey and empty dimensions list.

* Extract attributes from hdf5 root group.

* Use hdf reader for netcdf4 files.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix ruff complaints.

* First steps for handling HDF5 filters.

* Initial step for hdf5plugin supported codecs.

* Small commit to check compression support in CI environment.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix mypy complaints for hdf_filters.

* Local pre-commit fix for hdf_filters.

* Use fsspec reader_options introduced in #37.

* Fix incorrect zarr_v3 if block position from merge commit ef0d7a8.

* Fix early return from hdf _extract_attrs.

* Test that _extract_attrs correctly handles multiple attributes.

* Initial attempt at scale and offset via numcodecs.

* Tests for cfcodec_from_dataset.

* Temporarily relax integration tests to assert_allclose.

* Add blosc_lz4 fixture parameterization to confirm libnetcdf environment.

* Check for compatibility with netcdf4 engine.

* Use separate fixtures for h5netcdf and netcdf4 compression styles.

* Print libhdf5 and libnetcdf4 versions to confirm compiled environment.

* Skip netcdf4 style compression tests when libhdf5 < 1.14.

* Include imagecodecs.numcodecs to support HDF5 lzf filters.

* Remove test that verifies call to read_kerchunk_references_from_file.

* Add additional codec support structures for imagecodecs and numcodecs.

* Add codec config test for Zstd.

* Include initial cf decoding tests.

* Revert typo for scale_factor retrieval.

* Update reader to use new numpy manifest representation.

* Temporarily skip test until blosc netcdf4 issue is solved.

* Fix Pydantic 2 migration warnings.

* Include hdf5plugin and imagecodecs-numcodecs in mamba test environment.

* Mamba attempt with imagecodecs rather than imagecodecs-numcodecs.

* Mamba attempt with latest imagecodecs release.

* Use correct iter_chunks callback function signature.

* Include pip based imagecodecs-numcodecs until conda-forge availability.

* Handle non-coordinate dims which are serialized to hdf as empty dataset.

* Use reader_options for filetype check and update failing kerchunk call.

* Fix chunkmanifest shaping for chunked datasets.

* Handle scale_factor attribute serialization for compressed files.

* Include chunked roundtrip fixture.

* Standardize xarray integration tests for hdf filters.

* Update reader selection logic for new filetype determination.

* Use decode_times for integration test.

* Standardize fixture names for hdf5 vs netcdf4 file types.

* Handle array add_offset property for compressed data.

* Include h5py shuffle filter.

* Make ScaleAndOffset codec last in filters list.

* Apply ScaleAndOffset codec to _FillValue since its value is now downstream.

* Coerce scale and add_offset values to native float for JSON serialization.

* Conformant ZarrV3 codecs

* Update docs

* Update virtualizarr/zarr.py

Co-authored-by: Tom Augspurger

* Update virtualizarr/zarr.py

Co-authored-by: Tom Augspurger

* Change default_fill to 0s

* Generate permutation

* Pythonic isinstance check

* Add return type to isconfigurable

Co-authored-by: Tom Augspurger

* Changes from pair programming for zarrv3 to kerchunk file reading

* Revert "Merge remote-tracking branch 'upstream/hdf5_reader' into codecs"

This reverts commit 7a65fbdc8eda1dfedaa59e90bd2d8fe652819085, reversing
changes made to c051f04523ae3d9a4244c1ece92ffc95a633498b.
* Fix unit tests

* PR comments

* Remove kwarg in dict default

---------

Co-authored-by: sharkinsspatial
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tom Augspurger
Co-authored-by: Tria McNeely
---
 docs/releases.rst                       |   3 +
 virtualizarr/kerchunk.py                |   2 +-
 virtualizarr/tests/__init__.py          |   4 +-
 virtualizarr/tests/test_integration.py  |   2 +-
 .../tests/test_manifests/test_array.py   |  12 +-
 virtualizarr/tests/test_xarray.py       |  10 +-
 virtualizarr/tests/test_zarr.py         |  60 ++++++-
 virtualizarr/zarr.py                    | 156 +++++++++++++++---
 8 files changed, 203 insertions(+), 46 deletions(-)

diff --git a/docs/releases.rst b/docs/releases.rst
index c6735bd1..f472db05 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -12,6 +12,9 @@ New Features
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
+  By `Gustavo Hidalgo `_.
+
 Deprecations
 ~~~~~~~~~~~~
 
diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
index 6e82067d..122b86b3 100644
--- a/virtualizarr/kerchunk.py
+++ b/virtualizarr/kerchunk.py
@@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkArrRefs:
             for chunk_key, entry in marr.manifest.dict().items()
         }
 
-        zarray = marr.zarray
+        zarray = marr.zarray.replace(zarr_format=2)
 
     else:
         try:
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index 3856a6ba..7df13d10 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -48,9 +48,9 @@ def create_manifestarray(
 
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1},
         dtype=np.dtype("float32"),
-        fill_value=0.0,  # TODO change this to NaN?
+        fill_value=0.0,
         filters=None,
         order="C",
         shape=shape,
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 2e612de9..239316a1 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):
         # regression test for GH issue #105
 
         # set up example xarray dataset containing non-dimension coordinate variables
-        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))})
+        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})
 
         # save it to disk as netCDF (in temporary directory)
         ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")
diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
index 459e60be..6d5ede79 100644
--- a/virtualizarr/tests/test_manifests/test_array.py
+++ b/virtualizarr/tests/test_manifests/test_array.py
@@ -19,7 +19,7 @@ def test_create_manifestarray(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -74,7 +74,7 @@ def test_equals(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -209,7 +209,7 @@ def test_concat(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -254,7 +254,7 @@ def test_stack(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -299,7 +299,7 @@ def test_refuse_combine():
 
     zarray_common = {
         "chunks": (5, 1, 10),
-        "compressor": "zlib",
+        "compressor": {"id": "zlib", "level": 1},
         "dtype": np.dtype("int32"),
         "fill_value": 0.0,
         "filters": None,
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index d0fe2e3b..7fb7a026 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -19,7 +19,7 @@ def test_wrapping():
     dtype = np.dtype("int32")
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "zlib", "level": 1},
         dtype=dtype,
         fill_value=0.0,
         filters=None,
@@ -49,7 +49,7 @@ def test_equals(self):
         shape = (5, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -133,7 +133,7 @@ def test_concat_along_new_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(10,), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 80d04b9c..7715d245 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -1,12 +1,17 @@ +import json + import numpy as np +import pytest import xarray as xr import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json -def test_zarr_v3_roundtrip(tmpdir): +@pytest.fixture +def vds_with_manifest_arrays() -> xr.Dataset: arr = ManifestArray( chunkmanifest=ChunkManifest( entries={"0.0": dict(path="test.nc", offset=6144, length=48)} @@ -15,18 +20,61 @@ def test_zarr_v3_roundtrip(tmpdir): shape=(2, 3), dtype=np.dtype(" bool: + """ + Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict + """ + return "name" in value and "configuration" in value - original.virtualize.to_zarr(tmpdir / "store.zarr") + +def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") roundtrip = open_virtual_dataset( tmpdir / "store.zarr", filetype="zarr_v3", indexes={} ) - xrt.assert_identical(roundtrip, original) + xrt.assert_identical(roundtrip, vds_with_manifest_arrays) + + +def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") + assert zarray == vds_with_manifest_arrays.a.data.zarray + + +def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset): + """ + Checks that the output metadata of an array variable conforms to this spec + for the required attributes: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata + """ + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + # read the a variable's metadata + with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: + metadata = json.loads(f.read()) + assert metadata["zarr_format"] == 3 + assert metadata["node_type"] == "array" + assert isinstance(metadata["shape"], list) and all( + isinstance(dim, int) for dim in metadata["shape"] + ) + assert isinstance(metadata["data_type"], str) or isconfigurable( + metadata["data_type"] + ) + assert isconfigurable(metadata["chunk_grid"]) + assert isconfigurable(metadata["chunk_key_encoding"]) + assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) + assert ( + isinstance(metadata["codecs"], list) + and len(metadata["codecs"]) > 1 + and all(isconfigurable(codec) for codec in metadata["codecs"]) + ) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 545a86fc..e5015b36 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -8,10 +8,18 @@ Optional, ) +import numcodecs import numpy as np import ujson # type: ignore import xarray as xr -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator, + model_validator, +) +from typing_extensions import Self from 
@@ -22,10 +30,25 @@
 ZAttrs = NewType(
     "ZAttrs", dict[str, Any]
 )  # just the .zattrs (for one array or for the whole store/group)
+FillValueT = bool | str | float | int | list | None
+
+ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = {
+    # NumPy's dtype hierarchy lets us avoid checking for all the widths
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html
+    np.dtype("bool"): False,
+    np.dtype("int"): 0,
+    np.dtype("float"): 0.0,
+    np.dtype("complex"): [0.0, 0.0],
+}
+"""
+The value and format of the fill_value depend on the `data_type` of the array.
+See here for spec:
+https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
+"""
 
 
 class Codec(BaseModel):
-    compressor: str | None = None
+    compressor: dict | None = None
     filters: list[dict] | None = None
 
     def __repr__(self) -> str:
@@ -42,9 +65,9 @@ class ZArray(BaseModel):
     )
 
     chunks: tuple[int, ...]
-    compressor: str | None = None
+    compressor: dict | None = None
     dtype: np.dtype
-    fill_value: float | int | None = np.nan  # float or int?
+    fill_value: FillValueT = Field(default=0.0, validate_default=True)
     filters: list[dict] | None = None
     order: Literal["C", "F"]
     shape: tuple[int, ...]
@@ -64,6 +87,12 @@ def __post_init__(self) -> None:
                 f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}"
             )
 
+    @model_validator(mode="after")
+    def _check_fill_value(self) -> Self:
+        if self.fill_value is None:
+            self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0)
+        return self
+
     @property
     def codec(self) -> Codec:
         """For comparison against other arrays."""
@@ -80,11 +109,6 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
             fill_value = np.nan
 
         compressor = decoded_arr_refs_zarray["compressor"]
-        # deal with an inconsistency in kerchunk's tiff_to_zarr function
-        # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream.
-        if compressor is not None and "id" in compressor:
-            compressor = compressor["id"]
 
         return ZArray(
             chunks=tuple(decoded_arr_refs_zarray["chunks"]),
             compressor=compressor,
@@ -98,21 +122,19 @@ def dict(self) -> dict[str, Any]:
         zarray_dict = dict(self)
-
         zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"])
-
-        if zarray_dict["fill_value"] is np.nan:
-            zarray_dict["fill_value"] = None
-
         return zarray_dict
 
     def to_kerchunk_json(self) -> str:
-        return ujson.dumps(self.dict())
+        zarray_dict = self.dict()
+        if zarray_dict["fill_value"] is np.nan:
+            zarray_dict["fill_value"] = None
+        return ujson.dumps(zarray_dict)
 
     def replace(
         self,
         chunks: Optional[tuple[int, ...]] = None,
-        compressor: Optional[str] = None,
+        compressor: Optional[dict] = None,
         dtype: Optional[np.dtype] = None,
         fill_value: Optional[float] = None,  # float or int?
         filters: Optional[list[dict]] = None,  # type: ignore[valid-type]
@@ -134,6 +156,59 @@ def replace(
             zarr_format=zarr_format if zarr_format is not None else self.zarr_format,
         )
 
+    def _v3_codec_pipeline(self) -> list:
+        """
+        VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes
+        from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects.
+        Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943
+        An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs.
+        Roughly, this is the mapping:
+        ```
+        filters: Iterable[ArrayArrayCodec] #optional
+        compressor: ArrayBytesCodec #mandatory
+        post_compressor: Iterable[BytesBytesCodec] #optional
+        ```
+        """
+        if self.filters:
+            filter_codecs_configs = [
+                numcodecs.get_codec(filter).get_config() for filter in self.filters
+            ]
+            filters = [
+                dict(name=codec.pop("id"), configuration=codec)
+                for codec in filter_codecs_configs
+            ]
+        else:
+            filters = []
+
+        # Noting here that zarr v3 has very few codecs specified in the official spec,
+        # and that there are far more codecs in `numcodecs`. We take a gamble and assume
+        # that the codec names and configuration are simply mapped into zarr v3 "configurables".
+        if self.compressor:
+            compressor = [_num_codec_config_to_configurable(self.compressor)]
+        else:
+            compressor = []
+
+        # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1
+        # Either "C" or "F", defining the layout of bytes within each chunk of the array.
+        # "C" means row-major order, i.e., the last dimension varies fastest;
+        # "F" means column-major order, i.e., the first dimension varies fastest.
+        if self.order == "C":
+            order = tuple(range(len(self.shape)))
+        elif self.order == "F":
+            order = tuple(reversed(range(len(self.shape))))
+
+        transpose = dict(name="transpose", configuration=dict(order=order))
+        # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
+        # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
+        bytes = dict(
+            name="bytes", configuration={}
+        )  # TODO need to handle endianness configuration
+
+        # The order here is significant!
+        # [ArrayArray] -> ArrayBytes -> [BytesBytes]
+        codec_pipeline = [transpose, bytes] + compressor + filters
+        return codec_pipeline
+
 
 def encode_dtype(dtype: np.dtype) -> str:
     # TODO not sure if there is a better way to get the '<i8' style representation of the dtype out
 
@@
     metadata["chunk_key_encoding"] = {
         "name": "default",
         "configuration": {"separator": "/"},
     }
-    metadata["codecs"] = metadata.pop("filters")
-    metadata.pop("compressor")  # TODO this should be entered in codecs somehow
-    metadata.pop("order")  # TODO this should be replaced by a transpose codec
+    metadata["codecs"] = zarray._v3_codec_pipeline()
+    metadata.pop("filters")
+    metadata.pop("compressor")
+    metadata.pop("order")
 
     # indicate that we're using the manifest storage transformer ZEP
     metadata["storage_transformers"] = [
@@ -277,21 +353,51 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
     dim_names = metadata.pop("dimension_names")
 
     chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]
+    shape = metadata["shape"]
+    zarr_format = metadata["zarr_format"]
 
     if metadata["fill_value"] is None:
-        fill_value = np.nan
+        raise ValueError(
+            "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value"
+        )
     else:
         fill_value = metadata["fill_value"]
 
+    all_codecs = [
+        codec
+        for codec in metadata["codecs"]
+        if codec["name"] not in ("transpose", "bytes")
+    ]
+    compressor, *filters = [
+        _configurable_to_num_codec_config(_filter) for _filter in all_codecs
+    ]
     zarray = ZArray(
-        chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"],
-        compressor=metadata["codecs"],
+        chunks=chunk_shape,
+        compressor=compressor,
         dtype=np.dtype(metadata["data_type"]),
         fill_value=fill_value,
-        filters=metadata.get("filters", None),
+        filters=filters or None,
         order="C",
-        shape=chunk_shape,
-        zarr_format=3,
+        shape=shape,
+        zarr_format=zarr_format,
     )
 
     return zarray, dim_names, attrs
+
+
+def _configurable_to_num_codec_config(configurable: dict) -> dict:
+    """
+    Convert a zarr v3 configurable into a numcodecs codec.
+    """
+    configurable_copy = configurable.copy()
+    codec_id = configurable_copy.pop("name")
+    configuration = configurable_copy.pop("configuration")
+    return numcodecs.get_codec({"id": codec_id, **configuration}).get_config()
+
+
+def _num_codec_config_to_configurable(num_codec: dict) -> dict:
+    """
+    Convert a numcodecs codec into a zarr v3 configurable.
+    """
+    num_codec_copy = num_codec.copy()
+    return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}
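
Taken together, a sketch of the v2-to-v3 codec mapping these changes implement (constructor arguments mirror the updated tests; the printed pipeline is indicative, exact reprs may differ):

```
import numpy as np

from virtualizarr.zarr import ZArray, _configurable_to_num_codec_config

zarray = ZArray(
    chunks=(5, 10),
    compressor={"id": "zlib", "level": 1},
    dtype=np.dtype("int32"),
    fill_value=0.0,
    filters=None,
    order="C",
    shape=(5, 20),
    zarr_format=2,
)

# transpose and bytes codecs lead the pipeline, then the compressor
# rewritten as a v3 {"name": ..., "configuration": ...} configurable.
print(zarray._v3_codec_pipeline())
# [{'name': 'transpose', 'configuration': {'order': (0, 1)}},
#  {'name': 'bytes', 'configuration': {}},
#  {'name': 'zlib', 'configuration': {'level': 1}}]

# The inverse helper recovers the numcodecs-style config:
assert _configurable_to_num_codec_config(
    {"name": "zlib", "configuration": {"level": 1}}
) == {"id": "zlib", "level": 1}
```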