Skip to content

Commit

Permalink
reading existing refs - wip
Browse files Browse the repository at this point in the history
  • Loading branch information
norlandrhagen committed Oct 8, 2024
1 parent 47a5e87 commit 18f0deb
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 1 deletion.
22 changes: 21 additions & 1 deletion virtualizarr/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class FileType(AutoName):
zarr = auto()
dmrpp = auto()
zarr_v3 = auto()
# Pre-generated kerchunk reference stores, serialized as JSON or parquet.
kerchunk_json = auto()
kerchunk_parquet = auto()


class ManifestBackendArray(ManifestArray, BackendArray):
Expand Down Expand Up @@ -67,13 +69,14 @@ def open_virtual_dataset(
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``.
Parameters
----------
filepath : str, default None
File path to open as a set of virtualized zarr arrays.
filetype : FileType, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk_json', 'kerchunk_parquet'}.
If not provided will attempt to automatically infer the correct filetype from header bytes.
group : str, default is None
Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”.
Expand Down Expand Up @@ -136,6 +139,23 @@ def open_virtual_dataset(
if filetype is not None:
filetype = FileType(filetype)

if filetype == FileType.kerchunk_json:
import ast

from virtualizarr.readers.kerchunk import dataset_from_kerchunk_refs

fpath = _fsspec_openfile_from_filepath(
filepath=filepath, reader_options=reader_options
)

refs = ast.literal_eval(fpath.read().decode("utf-8"))

vds = dataset_from_kerchunk_refs(refs)
return vds

if filetype == FileType.kerchunk_parquet:
raise NotImplementedError()

if filetype == FileType.zarr_v3:
# TODO is there a neat way of auto-detecting this?
from virtualizarr.readers.zarr import open_virtual_dataset_from_v3_store
Expand Down
40 changes: 40 additions & 0 deletions virtualizarr/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,43 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
vds = open_virtual_dataset(hdf5_scalar)
assert vds.scalar.dims == ()
assert vds.scalar.attrs == {"scalar": "true"}


@pytest.mark.parametrize(
    "reference_format",
    [
        "kerchunk_json",
        pytest.param("kerchunk_parquet", marks=pytest.mark.skip(reason="wip")),
    ],
)
def test_open_virtual_dataset_existing_kerchunk_refs(
    tmp_path, netcdf4_file, reference_format
):
    """Open a pre-generated kerchunk reference store as a virtual dataset."""
    from kerchunk.hdf import SingleHdf5ToZarr

    # FIXME/WARNING/TODO: `inline_threshold` set to 1, so we don't get inlining of refs.
    # Reading inlined vars as ManifestArrays is currently not implemented.
    refs = SingleHdf5ToZarr(netcdf4_file, inline_threshold=1).translate()

    if reference_format == "kerchunk_json":
        import ujson

        ref_filepath = tmp_path / "ref.json"
        with open(ref_filepath, "wb") as f:
            f.write(ujson.dumps(refs).encode())
    # WIP
    # if reference_format == 'kerchunk_parquet':
    #     ref_filepath = tmp_path / 'ref.parquet'
    #     from kerchunk.df import refs_to_dataframe
    #     refs_to_dataframe(fo=refs, url=ref_filepath.as_posix())

    # open_virtual_dataset documents `filepath` as a str, so convert the Path.
    vds = open_virtual_dataset(
        filepath=ref_filepath.as_posix(), filetype=reference_format, indexes={}
    )

    # FIXME: variable names have a trailing \
    assert list(vds) == ["time", "lat", "air", "lon"]
    # FIXME: coordinates empty
    # NOTE: was `set("lat", "lon", "time")`, a TypeError — set() takes at most
    # one (iterable) argument; a set literal is what was intended.
    assert set(vds.coords) == {"lat", "lon", "time"}
    # FIXME: coords [lat, lon, time] are in data variables
    assert list(vds.variables) == ["air"]

0 comments on commit 18f0deb

Please sign in to comment.