Merge branch 'main' into pre-commit-ci-update-config
TomNicholas authored Oct 17, 2024
2 parents c1b3c7e + ec8e465 commit 07b2fa2
Showing 15 changed files with 255 additions and 53 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/main.yml
@@ -46,10 +46,6 @@ jobs:
conda env list
conda list
- name: Type check
run: |
mypy virtualizarr
- name: Running Tests
run: |
python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose
38 changes: 38 additions & 0 deletions .github/workflows/typing.yml
@@ -0,0 +1,38 @@
name: Typing

on:
push:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
pull_request:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
mypy:
name: mypy
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'

- name: Install deps
run: |
# We need to install the optional test deps to pull in all the library stubs
pip install -e '.[test]'
- name: Type check
run: |
mypy virtualizarr
1 change: 1 addition & 0 deletions .gitignore
@@ -160,3 +160,4 @@ cython_debug/
#.idea/
virtualizarr/_version.py
docs/generated/
examples/
1 change: 0 additions & 1 deletion ci/doc.yml
@@ -13,4 +13,3 @@ dependencies:
- "sphinx_design"
- "sphinx_togglebutton"
- "sphinx-autodoc-typehints"
- -e "..[test]"
14 changes: 14 additions & 0 deletions conftest.py
@@ -33,6 +33,20 @@ def netcdf4_file(tmpdir):
return filepath


@pytest.fixture
def netcdf4_virtual_dataset(netcdf4_file):
from virtualizarr import open_virtual_dataset

return open_virtual_dataset(netcdf4_file, indexes={})


@pytest.fixture
def netcdf4_inlined_ref(netcdf4_file):
from kerchunk.hdf import SingleHdf5ToZarr

return SingleHdf5ToZarr(netcdf4_file, inline_threshold=1000).translate()


@pytest.fixture
def hdf5_groups_file(tmpdir):
# Set up example xarray dataset
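These fixtures build on `netcdf4_file`, so a test can request a ready-made virtual dataset or inlined reference directly. A minimal sketch of a test consuming the new fixture (hypothetical test, not part of this commit):

```python
from virtualizarr.manifests import ManifestArray


def test_netcdf4_virtual_dataset(netcdf4_virtual_dataset):
    # with indexes={} and no loadable_variables, every data variable
    # wraps a ManifestArray of byte ranges rather than in-memory values
    for var in netcdf4_virtual_dataset.data_vars.values():
        assert isinstance(var.data, ManifestArray)
```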
5 changes: 5 additions & 0 deletions docs/releases.rst
@@ -8,6 +8,11 @@ v1.0.1 (unreleased)

New Features
~~~~~~~~~~~~


- Can open `kerchunk` reference files with ``open_virtual_dataset``.
(:pull:`251`, :pull:`186`) By `Raphael Hagen <https://github.com/norlandrhagen>`_ & `Kristen Thyng <https://github.com/kthyng>`_.

- Adds defaults for `open_virtual_dataset_from_v3_store` (:pull:`234`)
By `Raphael Hagen <https://github.com/norlandrhagen>`_.

12 changes: 12 additions & 0 deletions docs/usage.md
@@ -421,6 +421,18 @@ Currently there are not yet any zarr v3 readers which understand the chunk manifest
This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`.
```

## Opening Kerchunk references as virtual datasets

You can open existing Kerchunk `json` or `parquet` references as VirtualiZarr virtual datasets. This may be useful for converting existing Kerchunk-formatted references to storage formats like [Icechunk](https://icechunk.io/).

```python
from virtualizarr import open_virtual_dataset

vds = open_virtual_dataset('combined.json', filetype='kerchunk')
# or
vds = open_virtual_dataset('combined.parquet', filetype='kerchunk')
```
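Once opened, the references can be written straight back out with the `virtualize` accessor described earlier in this guide. A minimal round-trip sketch, assuming `combined.json` exists:

```python
# open existing JSON references, then persist them in the parquet layout
vds = open_virtual_dataset('combined.json', filetype='kerchunk')
vds.virtualize.to_kerchunk('combined.parquet', format='parquet')
```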

## Rewriting existing manifests

Sometimes it can be useful to rewrite the contents of an already-generated manifest or virtual dataset.
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -87,6 +87,10 @@ ignore_missing_imports = true
module = "kerchunk.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "ujson.*"
ignore_missing_imports = true

[tool.ruff]
# Same as Black.
line-length = 88
50 changes: 44 additions & 6 deletions virtualizarr/backend.py
@@ -16,7 +16,8 @@
from xarray.core.variable import IndexVariable

from virtualizarr.manifests import ManifestArray
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.utils import _FsspecFSFromFilepath

XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore

@@ -39,6 +40,7 @@ class FileType(AutoName):
zarr = auto()
dmrpp = auto()
zarr_v3 = auto()
kerchunk = auto()


class ManifestBackendArray(ManifestArray, BackendArray):
@@ -67,13 +69,14 @@ def open_virtual_dataset(
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``.
Parameters
----------
filepath : str, default None
File path to open as a set of virtualized zarr arrays.
filetype : FileType, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk'}.
If not provided will attempt to automatically infer the correct filetype from header bytes.
group : str, default is None
Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”.
@@ -133,9 +136,44 @@ def open_virtual_dataset(
raise NotImplementedError()

# if filetype is user defined, convert to FileType

if filetype is not None:
filetype = FileType(filetype)

if filetype == FileType.kerchunk:
from virtualizarr.readers.kerchunk import dataset_from_kerchunk_refs

fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options)

# The kerchunk .parquet storage format isn't actually a single parquet file, but a directory containing named parquet files for each group/variable.
if fs.filepath.endswith("ref.parquet"):
from fsspec.implementations.reference import LazyReferenceMapper

lrm = LazyReferenceMapper(filepath, fs.fs)

# build reference dict from KV pairs in LazyReferenceMapper
# is there a better / more performant way to extract this?
array_refs = {k: lrm[k] for k in lrm.keys()}

full_reference = {"refs": array_refs}

return dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference))

# JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version':
# https://fsspec.github.io/kerchunk/spec.html
elif fs.read_bytes(9).startswith(b'{"version'):
import ujson

with fs.open_file() as of:
refs = ujson.load(of)

return dataset_from_kerchunk_refs(KerchunkStoreRefs(refs))

else:
raise ValueError(
"The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues"
)

if filetype == FileType.zarr_v3:
# TODO is there a neat way of auto-detecting this?
from virtualizarr.readers.zarr import open_virtual_dataset_from_v3_store
@@ -151,9 +189,9 @@ def open_virtual_dataset(
"Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
)

fpath = _fsspec_openfile_from_filepath(
fpath = _FsspecFSFromFilepath(
filepath=filepath, reader_options=reader_options
)
).open_file()
parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp"))
vds = parser.parse_dataset()
vds.drop_vars(drop_variables)
Expand Down Expand Up @@ -189,9 +227,9 @@ def open_virtual_dataset(
# TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
# TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
# TODO really we probably want a dedicated xarray backend that iterates over all variables only once
fpath = _fsspec_openfile_from_filepath(
fpath = _FsspecFSFromFilepath(
filepath=filepath, reader_options=reader_options
)
).open_file()

# fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
# We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through.
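The new detection logic reduces to two checks: the directory-style parquet layout is recognised by its `ref.parquet` suffix, and JSON references by their leading `{"version` bytes. A simplified standalone sketch of that order (hypothetical helper, not part of this commit):

```python
import fsspec


def guess_kerchunk_format(filepath: str) -> str:
    # mirrors the detection order in open_virtual_dataset above
    fs, path = fsspec.core.url_to_fs(filepath)
    if path.endswith("ref.parquet"):
        # kerchunk parquet refs are a directory of per-variable parquet files
        return "parquet"
    with fs.open(path, "rb") as f:
        # JSON has no magic bytes; the kerchunk v1 spec starts with '{"version'
        if f.read(9).startswith(b'{"version'):
            return "json"
    raise ValueError(f"{path} does not match the Kerchunk JSON or parquet spec")
```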
21 changes: 14 additions & 7 deletions virtualizarr/manifests/array_api.py
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Callable, Iterable
from typing import TYPE_CHECKING, Any, Callable, Iterable, cast

import numpy as np

@@ -217,9 +217,12 @@ def stack(
new_shape.insert(axis, length_along_new_stacked_axis)

# do stacking of entries in manifest
stacked_paths = np.stack(
[arr.manifest._paths for arr in arrays],
axis=axis,
stacked_paths = cast( # `np.stack` apparently is type hinted as if the output could have Any dtype
np.ndarray[Any, np.dtypes.StringDType],
np.stack(
[arr.manifest._paths for arr in arrays],
axis=axis,
),
)
stacked_offsets = np.stack(
[arr.manifest._offsets for arr in arrays],
@@ -296,10 +299,14 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArray":
)

# do broadcasting of entries in manifest
broadcasted_paths = np.broadcast_to(
x.manifest._paths,
shape=new_chunk_grid_shape,
broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype
np.ndarray[Any, np.dtypes.StringDType],
np.broadcast_to(
x.manifest._paths,
shape=new_chunk_grid_shape,
),
)

broadcasted_offsets = np.broadcast_to(
x.manifest._offsets,
shape=new_chunk_grid_shape,
13 changes: 9 additions & 4 deletions virtualizarr/manifests/manifest.py
@@ -84,7 +84,7 @@ class ChunkManifest:
so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks.
"""

_paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined]
_paths: np.ndarray[Any, np.dtypes.StringDType]
_offsets: np.ndarray[Any, np.dtype[np.uint64]]
_lengths: np.ndarray[Any, np.dtype[np.uint64]]

@@ -113,7 +113,10 @@ def __init__(self, entries: dict) -> None:
shape = get_chunk_grid_shape(entries.keys())

# Initializing to empty implies that entries with path='' are treated as missing chunks
paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined]
paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype
np.ndarray[Any, np.dtypes.StringDType],
np.empty(shape=shape, dtype=np.dtypes.StringDType()),
)
offsets = np.empty(shape=shape, dtype=np.dtype("uint64"))
lengths = np.empty(shape=shape, dtype=np.dtype("uint64"))

@@ -141,7 +144,7 @@ def __init__(self, entries: dict) -> None:
@classmethod
def from_arrays(
cls,
paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined]
paths: np.ndarray[Any, np.dtypes.StringDType],
offsets: np.ndarray[Any, np.dtype[np.uint64]],
lengths: np.ndarray[Any, np.dtype[np.uint64]],
) -> "ChunkManifest":
@@ -306,7 +309,9 @@ def _from_kerchunk_chunk_dict(
chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
for k, v in kerchunk_chunk_dict.items():
if isinstance(v, (str, bytes)):
raise NotImplementedError("TODO: handle inlined data")
raise NotImplementedError(
"Reading inlined reference data is currently not supported. [ToDo]"
)
elif not isinstance(v, (tuple, list)):
raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
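For context, `from_arrays` (whose `paths` annotation is corrected above) builds a manifest from three numpy arrays of identical shape. An illustrative sketch with made-up values (requires numpy >= 2.0 for `StringDType`):

```python
import numpy as np

from virtualizarr.manifests import ChunkManifest

# a 1x2 chunk grid: two chunks, each stored in a different file
paths = np.array(
    [["s3://bucket/a.nc", "s3://bucket/b.nc"]], dtype=np.dtypes.StringDType()
)
offsets = np.array([[0, 0]], dtype="uint64")
lengths = np.array([[100, 100]], dtype="uint64")

manifest = ChunkManifest.from_arrays(paths=paths, offsets=offsets, lengths=lengths)
```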
6 changes: 3 additions & 3 deletions virtualizarr/readers/kerchunk.py
@@ -13,7 +13,7 @@
KerchunkArrRefs,
KerchunkStoreRefs,
)
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.utils import _FsspecFSFromFilepath
from virtualizarr.zarr import ZArray, ZAttrs


@@ -28,9 +28,9 @@ def _automatically_determine_filetype(
raise NotImplementedError()

# Read magic bytes from local or remote file
fpath = _fsspec_openfile_from_filepath(
fpath = _FsspecFSFromFilepath(
filepath=filepath, reader_options=reader_options
)
).open_file()
magic_bytes = fpath.read(8)
fpath.close()
