Add option to explicitly specify use of an experimental hdf backend.
sharkinsspatial committed Oct 24, 2024
1 parent 1589776 commit 772c580
Showing 5 changed files with 126 additions and 58 deletions.
13 changes: 9 additions & 4 deletions virtualizarr/backend.py
@@ -13,22 +13,23 @@
from virtualizarr.readers import (
DMRPPVirtualBackend,
FITSVirtualBackend,
HDFVirtualBackend,
HDF5VirtualBackend,
KerchunkVirtualBackend,
NetCDF3VirtualBackend,
TIFFVirtualBackend,
ZarrV3VirtualBackend,
)
from virtualizarr.readers.common import VirtualBackend
from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions

# TODO add entrypoint to allow external libraries to add to this mapping
VIRTUAL_BACKENDS = {
"kerchunk": KerchunkVirtualBackend,
"zarr_v3": ZarrV3VirtualBackend,
"dmrpp": DMRPPVirtualBackend,
"hdf5": HDFVirtualBackend,
"netcdf4": HDFVirtualBackend, # note this is the same as for hdf5
# all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
"hdf5": HDF5VirtualBackend,
"netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5
"netcdf3": NetCDF3VirtualBackend,
"tiff": TIFFVirtualBackend,
"fits": FITSVirtualBackend,
@@ -113,6 +114,7 @@ def open_virtual_dataset(
indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = None,
backend: Optional[VirtualBackend] = None,
) -> Dataset:
"""
Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
@@ -182,7 +184,10 @@
filepath=filepath, reader_options=reader_options
)

backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())
if backend:
backend_cls = backend
else:
backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())

if backend_cls is None:
raise NotImplementedError(f"Unsupported file type: {filetype.name}")
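For context, a minimal usage sketch of the new `backend` keyword added above (the file path is illustrative; `indexes={}` mirrors the tests in this commit):

```python
# Minimal sketch of the new keyword; "air.nc" stands in for any local netCDF4 file.
from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend

# Default behaviour: the reader class is looked up in VIRTUAL_BACKENDS from the detected filetype.
vds_default = open_virtual_dataset("air.nc", indexes={})

# With this commit: pass a VirtualBackend class to bypass the filetype-based lookup
# and opt in to the experimental HDF reader explicitly.
vds_hdf = open_virtual_dataset("air.nc", indexes={}, backend=HDFVirtualBackend)
```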
85 changes: 59 additions & 26 deletions virtualizarr/tests/test_backend.py
@@ -11,6 +11,7 @@
from virtualizarr import open_virtual_dataset
from virtualizarr.backend import FileType, automatically_determine_filetype
from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
has_astropy,
has_tifffile,
@@ -82,14 +83,15 @@ def test_FileType():


@requires_kerchunk
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
class TestOpenVirtualDatasetIndexes:
def test_no_indexes(self, netcdf4_file):
vds = open_virtual_dataset(netcdf4_file, indexes={})
def test_no_indexes(self, netcdf4_file, hdf_backend):
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
assert vds.indexes == {}

def test_create_default_indexes(self, netcdf4_file):
def test_create_default_indexes(self, netcdf4_file, hdf_backend):
with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
vds = open_virtual_dataset(netcdf4_file, indexes=None)
vds = open_virtual_dataset(netcdf4_file, indexes=None, backend=hdf_backend)
ds = open_dataset(netcdf4_file, decode_times=True)

# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
@@ -113,7 +115,8 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I


@requires_kerchunk
def test_cftime_index(tmpdir):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_cftime_index(tmpdir, hdf_backend):
"""Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
# Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
ds = xr.Dataset(
@@ -129,7 +132,10 @@
)
ds.to_netcdf(f"{tmpdir}/tmp.nc")
vds = open_virtual_dataset(
f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={}
f"{tmpdir}/tmp.nc",
loadable_variables=["time", "lat", "lon"],
indexes={},
backend=hdf_backend,
)
# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
assert index_mappings_equal(vds.xindexes, ds.xindexes)
@@ -139,15 +145,16 @@


@requires_kerchunk
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
class TestOpenVirtualDatasetAttrs:
def test_drop_array_dimensions(self, netcdf4_file):
def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
# regression test for GH issue #150
vds = open_virtual_dataset(netcdf4_file, indexes={})
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs

def test_coordinate_variable_attrs_preserved(self, netcdf4_file):
def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
# regression test for GH issue #155
vds = open_virtual_dataset(netcdf4_file, indexes={})
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
assert vds["lat"].attrs == {
"standard_name": "latitude",
"long_name": "Latitude",
@@ -165,7 +172,8 @@ class TestReadFromS3:
@pytest.mark.parametrize(
"indexes", [None, {}], ids=["None index", "empty dict index"]
)
def test_anon_read_s3(self, filetype, indexes):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_anon_read_s3(self, filetype, indexes, hdf_backend):
"""Parameterized tests for empty vs supplied indexes and filetypes."""
# TODO: Switch away from this s3 url after minIO is implemented.
fpath = "s3://carbonplan-share/virtualizarr/local.nc"
@@ -174,6 +182,7 @@ def test_anon_read_s3(self, filetype, indexes):
filetype=filetype,
indexes=indexes,
reader_options={"storage_options": {"anon": True}},
backend=hdf_backend,
)

assert vds.dims == {"time": 2920, "lat": 25, "lon": 53}
@@ -182,6 +191,7 @@


@network
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
class TestReadFromURL:
@pytest.mark.parametrize(
"filetype, url",
@@ -228,24 +238,30 @@ class TestReadFromURL:
),
],
)
def test_read_from_url(self, filetype, url):
def test_read_from_url(self, hdf_backend, filetype, url):
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={}, indexes={})
vds = open_virtual_dataset(
url,
reader_options={},
indexes={},
backend=hdf_backend,
)
elif filetype == "hdf5":
vds = open_virtual_dataset(
url,
group="science/LSAR/GCOV/grids/frequencyA",
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
indexes={},
reader_options={},
backend=hdf_backend,
)
assert isinstance(vds, xr.Dataset)
else:
vds = open_virtual_dataset(url, indexes={})
vds = open_virtual_dataset(url, indexes={}, backend=hdf_backend)
assert isinstance(vds, xr.Dataset)

def test_virtualizarr_vs_local_nisar(self):
def test_virtualizarr_vs_local_nisar(self, hdf_backend):
import fsspec

# Open group directly from locally cached file with xarray
@@ -268,6 +284,7 @@ def test_virtualizarr_vs_local_nisar(self):
group=hdf_group,
indexes={},
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
backend=hdf_backend,
)
tmpref = "/tmp/cmip6.json"
vds.virtualize.to_kerchunk(tmpref, format="json")
@@ -279,10 +296,14 @@

@requires_kerchunk
class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_loadable_variables(self, netcdf4_file, hdf_backend):
vars_to_load = ["air", "time"]
vds = open_virtual_dataset(
netcdf4_file, loadable_variables=vars_to_load, indexes={}
netcdf4_file,
loadable_variables=vars_to_load,
indexes={},
backend=hdf_backend,
)

for name in vds.variables:
@@ -304,18 +325,28 @@ def test_explicit_filetype(self, netcdf4_file):
with pytest.raises(NotImplementedError):
open_virtual_dataset(netcdf4_file, filetype="grib")

def test_group_kwarg(self, hdf5_groups_file):
with pytest.raises(ValueError, match="Multiple HDF Groups found"):
open_virtual_dataset(hdf5_groups_file)
with pytest.raises(ValueError, match="not found in"):
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
if hdf_backend:
with pytest.raises(NotImplementedError, match="Nested groups"):
open_virtual_dataset(hdf5_groups_file, backend=hdf_backend)
with pytest.raises(KeyError, match="doesn't exist"):
open_virtual_dataset(
hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
)
else:
with pytest.raises(ValueError, match="Multiple HDF Groups found"):
open_virtual_dataset(hdf5_groups_file)
with pytest.raises(ValueError, match="not found in"):
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")

vars_to_load = ["air", "time"]
vds = open_virtual_dataset(
hdf5_groups_file,
group="test/group",
loadable_variables=vars_to_load,
indexes={},
backend=hdf_backend,
)
full_ds = xr.open_dataset(
hdf5_groups_file,
@@ -340,13 +371,15 @@ def test_open_virtual_dataset_passes_expected_args(
}
mock_read_kerchunk.assert_called_once_with(**args)

def test_open_dataset_with_empty(self, hdf5_empty, tmpdir):
vds = open_virtual_dataset(hdf5_empty)
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_dataset_with_empty(self, hdf5_empty, tmpdir, hdf_backend):
vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
assert vds.empty.dims == ()
assert vds.empty.attrs == {"empty": "true"}

def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
vds = open_virtual_dataset(hdf5_scalar)
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir, hdf_backend):
vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
assert vds.scalar.dims == ()
assert vds.scalar.attrs == {"scalar": "true"}

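The parametrization added throughout this test module follows a single pattern; a condensed sketch of what each `hdf_backend` case exercises (hypothetical test name, reusing the suite's existing `netcdf4_file` fixture, which serves an "air" variable):

```python
import pytest
from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend


@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_with_either_backend(netcdf4_file, hdf_backend):
    # None -> falls through to the filetype-based VIRTUAL_BACKENDS lookup (kerchunk-based HDF5 reader);
    # HDFVirtualBackend -> the experimental reader is selected explicitly.
    vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
    assert "air" in vds.variables
```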
30 changes: 21 additions & 9 deletions virtualizarr/tests/test_integration.py
@@ -5,6 +5,7 @@

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import requires_kerchunk
from virtualizarr.translators.kerchunk import (
dataset_from_kerchunk_refs,
@@ -63,8 +64,9 @@ def test_no_duplicates_find_var_names():
),
],
)
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_numpy_arrays_to_inlined_kerchunk_refs(
netcdf4_file, inline_threshold, vars_to_inline
netcdf4_file, inline_threshold, vars_to_inline, hdf_backend
):
from kerchunk.hdf import SingleHdf5ToZarr

@@ -75,7 +77,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(

# loading the variables should produce same result as inlining them using kerchunk
vds = open_virtual_dataset(
netcdf4_file, loadable_variables=vars_to_inline, indexes={}
netcdf4_file, loadable_variables=vars_to_inline, indexes={}, backend=hdf_backend
)
refs = vds.virtualize.to_kerchunk(format="dict")

@@ -90,15 +92,16 @@
@requires_kerchunk
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
class TestKerchunkRoundtrip:
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)

# save it to disk as netCDF (in temporary directory)
ds.to_netcdf(f"{tmpdir}/air.nc")

# use open_dataset_via_kerchunk to read it as references
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend)

if format == "dict":
# write those references to an in-memory kerchunk-formatted references dictionary
@@ -122,8 +125,11 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
for coord in ds.coords:
assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs

@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
@pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars):
def test_kerchunk_roundtrip_concat(
self, tmpdir, format, hdf_backend, decode_times, time_vars
):
# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)

@@ -139,11 +145,13 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
f"{tmpdir}/air1.nc",
indexes={},
loadable_variables=time_vars,
backend=hdf_backend,
)
vds2 = open_virtual_dataset(
f"{tmpdir}/air2.nc",
indexes={},
loadable_variables=time_vars,
backend=hdf_backend,
)

if decode_times is False:
@@ -187,7 +195,8 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]

def test_non_dimension_coordinates(self, tmpdir, format):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
# regression test for GH issue #105

# set up example xarray dataset containing non-dimension coordinate variables
@@ -196,7 +205,9 @@ def test_non_dimension_coordinates(self, tmpdir, format):
# save it to disk as netCDF (in temporary directory)
ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")

vds = open_virtual_dataset(f"{tmpdir}/non_dim_coords.nc", indexes={})
vds = open_virtual_dataset(
f"{tmpdir}/non_dim_coords.nc", indexes={}, backend=hdf_backend
)

assert "lat" in vds.coords
assert "coordinates" not in vds.attrs
@@ -269,11 +280,12 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):


@requires_kerchunk
def test_open_scalar_variable(tmpdir):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_scalar_variable(tmpdir, hdf_backend):
# regression test for GH issue #100

ds = xr.Dataset(data_vars={"a": 0})
ds.to_netcdf(f"{tmpdir}/scalar.nc")

vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={})
vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}, backend=hdf_backend)
assert vds["a"].shape == ()
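A hedged end-to-end sketch of the round-trip these integration tests cover, with the experimental backend selected explicitly. It assumes the xarray tutorial dataset can be downloaded and that kerchunk's xarray engine (`engine="kerchunk"`) is installed; file paths are illustrative:

```python
import xarray as xr
from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend

# Write a small netCDF file to round-trip (same tutorial dataset the tests use; needs network access).
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
ds.to_netcdf("/tmp/air.nc")

# Build virtual references with the experimental backend and dump them as kerchunk JSON.
vds = open_virtual_dataset("/tmp/air.nc", indexes={}, backend=HDFVirtualBackend)
vds.virtualize.to_kerchunk("/tmp/air_refs.json", format="json")

# Re-open the references; assumes kerchunk registers its "kerchunk" xarray engine.
roundtrip = xr.open_dataset("/tmp/air_refs.json", engine="kerchunk", decode_times=False)
xr.testing.assert_allclose(ds, roundtrip)
```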
