Skip to content

Commit

Permalink
reading existing refs - wip
Browse files Browse the repository at this point in the history
  • Loading branch information
norlandrhagen committed Oct 8, 2024
1 parent 47a5e87 commit 18f0deb
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 1 deletion.
22 changes: 21 additions & 1 deletion virtualizarr/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class FileType(AutoName):
zarr = auto()
dmrpp = auto()
zarr_v3 = auto()
# Pre-generated kerchunk reference stores, serialized as JSON or parquet.
kerchunk_json = auto()
kerchunk_parquet = auto()


class ManifestBackendArray(ManifestArray, BackendArray):
Expand Down Expand Up @@ -67,13 +69,14 @@ def open_virtual_dataset(
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``.
Parameters
----------
filepath : str, default None
File path to open as a set of virtualized zarr arrays.
filetype : FileType, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk_json', 'kerchunk_parquet'}.
If not provided will attempt to automatically infer the correct filetype from header bytes.
group : str, default is None
Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”.
Expand Down Expand Up @@ -136,6 +139,23 @@ def open_virtual_dataset(
if filetype is not None:
filetype = FileType(filetype)

if filetype == FileType.kerchunk_json:
import ast

from virtualizarr.readers.kerchunk import dataset_from_kerchunk_refs

fpath = _fsspec_openfile_from_filepath(
filepath=filepath, reader_options=reader_options
)

refs = ast.literal_eval(fpath.read().decode("utf-8"))

vds = dataset_from_kerchunk_refs(refs)
return vds

if filetype == FileType.kerchunk_parquet:
raise NotImplementedError()

if filetype == FileType.zarr_v3:
# TODO is there a neat way of auto-detecting this?
from virtualizarr.readers.zarr import open_virtual_dataset_from_v3_store
Expand Down
40 changes: 40 additions & 0 deletions virtualizarr/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,43 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
vds = open_virtual_dataset(hdf5_scalar)
assert vds.scalar.dims == ()
assert vds.scalar.attrs == {"scalar": "true"}


@pytest.mark.parametrize(
    "reference_format",
    [
        "kerchunk_json",
        pytest.param("kerchunk_parquet", marks=pytest.mark.skip(reason="wip")),
    ],
)
def test_open_virtual_dataset_existing_kerchunk_refs(
    tmp_path, netcdf4_file, reference_format
):
    """Open a pre-generated kerchunk reference store as a virtual dataset."""
    from kerchunk.hdf import SingleHdf5ToZarr

    # FIXME/WARNING/TODO: `inline_threshold` set to 1, so we don't get inlining of refs.
    # Reading inlined vars as ManifestArrays is currently not implemented.
    refs = SingleHdf5ToZarr(netcdf4_file, inline_threshold=1).translate()

    if reference_format == "kerchunk_json":
        import ujson

        ref_filepath = tmp_path / "ref.json"
        with open(ref_filepath, "wb") as f:
            f.write(ujson.dumps(refs).encode())
    # WIP
    # if reference_format == 'kerchunk_parquet':
    #     ref_filepath = tmp_path / 'ref.parquet'
    #     from kerchunk.df import refs_to_dataframe
    #     refs_to_dataframe(fo=refs, url=ref_filepath.as_posix())

    # open_virtual_dataset documents `filepath` as a str, so convert the Path.
    vds = open_virtual_dataset(
        filepath=ref_filepath.as_posix(), filetype=reference_format, indexes={}
    )

    # FIXME: variable names have a trailing \
    assert list(vds) == ["time", "lat", "air", "lon"]
    # FIXME: coordinates empty
    # NOTE: was `set("lat", "lon", "time")`, a TypeError — set() takes at most
    # one (iterable) argument; a set literal is what was intended.
    assert set(vds.coords) == {"lat", "lon", "time"}
    # FIXME: coords [lat, lon, time] are in data variables
    assert list(vds.variables) == ["air"]

0 comments on commit 18f0deb

Please sign in to comment.