Merge branch 'zarr-developers:main' into coords_fix

zarr-developers · Jul 30, 2024 · 2c94f19 · 2c94f19
2 parents bfb95ba + 179bb2a
commit 2c94f19
Show file tree

Hide file tree

Showing 16 changed files with 275 additions and 71 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,8 @@
 
 VirtualiZarr (pronounced like "virtualize" but more piratey) grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API.
 
+You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
+
 _Please see the [documentation](https://virtualizarr.readthedocs.io/en/latest/)_
 
 ### Development Status and Roadmap

diff --git a/docs/faq.md b/docs/faq.md
@@ -16,6 +16,8 @@ The above steps would also be performed using the `kerchunk` library alone, but
 
 ## How do VirtualiZarr and Kerchunk compare?
 
+You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
+
 Users of kerchunk may find the following comparison table useful, which shows which features of kerchunk map on to which features of VirtualiZarr.
 | Component / Feature                                                      | Kerchunk                                                                                                                            | VirtualiZarr                                                                                                                                     |
 | ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |

diff --git a/docs/index.md b/docs/index.md
@@ -4,6 +4,8 @@
 
 VirtualiZarr grew out of [discussions](https://github.com/fsspec/kerchunk/issues/377) on the [kerchunk repository](https://github.com/fsspec/kerchunk), and is an attempt to provide the game-changing power of kerchunk in a zarr-native way, and with a familiar array-like API.
 
+You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk.
+
 ## Motivation
 
 The Kerchunk idea solves an incredibly important problem: accessing big archival datasets via a cloud-optimized pattern, but without copying or modifying the original data in any way. This is a win-win-win for users, data engineers, and data providers. Users see fast-opening zarr-compliant stores that work performantly with libraries like xarray and dask, data engineers can provide this speed by adding a lightweight virtualization layer on top of existing data (without having to ask anyone's permission), and data providers don't have to change anything about their legacy files for them to be used in a cloud-optimized way.

diff --git a/docs/releases.rst b/docs/releases.rst
@@ -12,12 +12,18 @@ New Features
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
+  By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
+
 Deprecations
 ~~~~~~~~~~~~
 
 Bug fixes
 ~~~~~~~~~
 
+- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
+  By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
+
 Documentation
 ~~~~~~~~~~~~~
 

diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
@@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr
             for chunk_key, entry in marr.manifest.dict().items()
         }
 
-        zarray = marr.zarray
+        zarray = marr.zarray.replace(zarr_format=2)
 
     else:
         try:

diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
@@ -5,7 +5,7 @@
 
 from ..kerchunk import KerchunkArrRefs
 from ..zarr import ZArray
-from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS
+from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan
 from .manifest import ChunkManifest
 
 
@@ -127,6 +127,8 @@ def __array_function__(self, func, types, args, kwargs) -> Any:
 
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any:
         """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs."""
+        if ufunc == np.isnan:
+            return _isnan(self.shape)
         return NotImplemented
 
     def __array__(self) -> np.ndarray:

diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py
@@ -356,8 +356,8 @@ def isnan(x: "ManifestArray", /) -> np.ndarray:
 
     Only implemented to get past some checks deep inside xarray, see https://github.com/TomNicholas/VirtualiZarr/issues/29.
     """
-    return np.full(
-        shape=x.shape,
-        fill_value=False,
-        dtype=np.dtype(bool),
-    )
+    return _isnan(x.shape)
+
+
+def _isnan(shape: tuple):
+    return np.full(shape=shape, fill_value=False, dtype=np.dtype(bool))
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
@@ -1,10 +1,11 @@
 import json
 import re
 from collections.abc import Iterable, Iterator
-from typing import Any, Callable, NewType, Tuple, Union, cast
+from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast
 
 import numpy as np
 from pydantic import BaseModel, ConfigDict
+from upath import UPath
 
 from virtualizarr.types import ChunkKey
 
@@ -15,7 +16,13 @@
 _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$"  # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period)
 
 
-ChunkDict = NewType("ChunkDict", dict[ChunkKey, dict[str, Union[str, int]]])
+class ChunkDictEntry(TypedDict):
+    path: str
+    offset: int
+    length: int
+
+
+ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])
 
 
 class ChunkEntry(BaseModel):
@@ -35,16 +42,23 @@ def __repr__(self) -> str:
         return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})"
 
     @classmethod
-    def from_kerchunk(cls, path_and_byte_range_info: list[str | int]) -> "ChunkEntry":
-        path, offset, length = path_and_byte_range_info
+    def from_kerchunk(
+        cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
+    ) -> "ChunkEntry":
+        if len(path_and_byte_range_info) == 1:
+            path = path_and_byte_range_info[0]
+            offset = 0
+            length = UPath(path).stat().st_size
+        else:
+            path, offset, length = path_and_byte_range_info
         return ChunkEntry(path=path, offset=offset, length=length)
 
-    def to_kerchunk(self) -> list[str | int]:
+    def to_kerchunk(self) -> tuple[str, int, int]:
         """Write out in the format that kerchunk uses for chunk entries."""
-        return [self.path, self.offset, self.length]
+        return (self.path, self.offset, self.length)
 
-    def dict(self) -> dict[str, Union[str, int]]:
-        return dict(path=self.path, offset=self.offset, length=self.length)
+    def dict(self) -> ChunkDictEntry:
+        return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length)
 
 
 class ChunkManifest:
@@ -252,7 +266,7 @@ def dict(self) -> ChunkDict:
                 [*coord_vectors, self._paths, self._offsets, self._lengths],
                 flags=("refs_ok",),
             )
-            if path.item()[0] != ""  # don't include entry if path='' (i.e. empty chunk)
+            if path.item() != ""  # don't include entry if path='' (i.e. empty chunk)
         }
 
         return cast(
@@ -283,12 +297,20 @@ def to_zarr_json(self, filepath: str) -> None:
             json.dump(entries, json_file, indent=4, separators=(", ", ": "))
 
     @classmethod
-    def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":
-        chunkentries = {
-            cast(ChunkKey, k): ChunkEntry.from_kerchunk(v).dict()
-            for k, v in kerchunk_chunk_dict.items()
-        }
-        return ChunkManifest(entries=cast(ChunkDict, chunkentries))
+    def _from_kerchunk_chunk_dict(
+        cls,
+        # The type hint requires `Dict` instead of `dict` due to
+        # the conflicting ChunkManifest.dict method.
+        kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]],
+    ) -> "ChunkManifest":
+        chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
+        for k, v in kerchunk_chunk_dict.items():
+            if isinstance(v, (str, bytes)):
+                raise NotImplementedError("TODO: handle inlined data")
+            elif not isinstance(v, (tuple, list)):
+                raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
+            chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
+        return ChunkManifest(entries=chunk_entries)
 
     def rename_paths(
         self,

diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
@@ -48,9 +48,9 @@ def create_manifestarray(
 
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1},
         dtype=np.dtype("float32"),
-        fill_value=0.0,  # TODO change this to NaN?
+        fill_value=0.0,
         filters=None,
         order="C",
         shape=shape,

diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
@@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):
         # regression test for GH issue #105
 
         # set up example xarray dataset containing non-dimension coordinate variables
-        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))})
+        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})
 
         # save it to disk as netCDF (in temporary directory)
         ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")

diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
@@ -19,7 +19,7 @@ def test_create_manifestarray(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -74,7 +74,7 @@ def test_equals(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -209,7 +209,7 @@ def test_concat(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -254,7 +254,7 @@ def test_stack(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -299,7 +299,7 @@ def test_refuse_combine():
 
     zarray_common = {
         "chunks": (5, 1, 10),
-        "compressor": "zlib",
+        "compressor": {"id": "zlib", "level": 1},
         "dtype": np.dtype("int32"),
         "fill_value": 0.0,
         "filters": None,

diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py
@@ -51,6 +51,14 @@ def test_invalid_chunk_keys(self):
         with pytest.raises(ValueError, match="Inconsistent number of dimensions"):
             ChunkManifest(entries=chunks)
 
+    def test_empty_chunk_paths(self):
+        chunks = {
+            "0.0.0": {"path": "", "offset": 0, "length": 100},
+            "1.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        }
+        manifest = ChunkManifest(entries=chunks)
+        assert len(manifest.dict()) == 1
+
 
 class TestProperties:
     def test_chunk_grid_info(self):

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
@@ -19,7 +19,7 @@ def test_wrapping():
     dtype = np.dtype("int32")
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "zlib", "level": 1},
         dtype=dtype,
         fill_value=0.0,
         filters=None,
@@ -49,7 +49,7 @@ def test_equals(self):
         shape = (5, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -133,7 +133,7 @@ def test_concat_along_new_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(10,),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -376,7 +376,7 @@ def test_read_from_url(self, filetype, url):
             with pytest.raises(NotImplementedError):
                 vds = open_virtual_dataset(url, reader_options={}, indexes={})
         else:
-            vds = open_virtual_dataset(url, reader_options={}, indexes={})
+            vds = open_virtual_dataset(url, indexes={})
             assert isinstance(vds, xr.Dataset)
-Original file line number
+Diff line change
@@ Expand Up @@
                 for chunk_key, entry in marr.manifest.dict().items()
             }
-            zarray = marr.zarray
+            zarray = marr.zarray.replace(zarr_format=2)
         else:
             try:
@@ Expand Down @@