Add option to explicitly specify use of an experimental hdf backend.
sharkinsspatial committed Oct 24, 2024
1 parent 1589776 commit 772c580
Showing 5 changed files with 126 additions and 58 deletions.
13 changes: 9 additions & 4 deletions virtualizarr/backend.py
@@ -13,22 +13,23 @@
from virtualizarr.readers import (
DMRPPVirtualBackend,
FITSVirtualBackend,
HDFVirtualBackend,
HDF5VirtualBackend,
KerchunkVirtualBackend,
NetCDF3VirtualBackend,
TIFFVirtualBackend,
ZarrV3VirtualBackend,
)
from virtualizarr.readers.common import VirtualBackend
from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions

# TODO add entrypoint to allow external libraries to add to this mapping
VIRTUAL_BACKENDS = {
"kerchunk": KerchunkVirtualBackend,
"zarr_v3": ZarrV3VirtualBackend,
"dmrpp": DMRPPVirtualBackend,
"hdf5": HDFVirtualBackend,
"netcdf4": HDFVirtualBackend, # note this is the same as for hdf5
# all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
"hdf5": HDF5VirtualBackend,
"netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5
"netcdf3": NetCDF3VirtualBackend,
"tiff": TIFFVirtualBackend,
"fits": FITSVirtualBackend,
@@ -113,6 +114,7 @@ def open_virtual_dataset(
indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = None,
backend: Optional[VirtualBackend] = None,
) -> Dataset:
"""
Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
@@ -182,7 +184,10 @@
filepath=filepath, reader_options=reader_options
)

backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())
if backend:
backend_cls = backend
else:
backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())

if backend_cls is None:
raise NotImplementedError(f"Unsupported file type: {filetype.name}")
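For context, a minimal usage sketch of the new `backend` keyword added above (the file path is illustrative; `indexes={}` mirrors the tests in this commit):

```python
# Minimal sketch of the new keyword; "air.nc" stands in for any local netCDF4 file.
from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend

# Default behaviour: the reader class is looked up in VIRTUAL_BACKENDS from the detected filetype.
vds_default = open_virtual_dataset("air.nc", indexes={})

# With this commit: pass a VirtualBackend class to bypass the filetype-based lookup
# and opt in to the experimental HDF reader explicitly.
vds_hdf = open_virtual_dataset("air.nc", indexes={}, backend=HDFVirtualBackend)
```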
85 changes: 59 additions & 26 deletions virtualizarr/tests/test_backend.py
@@ -11,6 +11,7 @@
from virtualizarr import open_virtual_dataset
from virtualizarr.backend import FileType, automatically_determine_filetype
from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
has_astropy,
has_tifffile,
@@ -82,14 +83,15 @@ def test_FileType():


@requires_kerchunk
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
class TestOpenVirtualDatasetIndexes:
def test_no_indexes(self, netcdf4_file):
vds = open_virtual_dataset(netcdf4_file, indexes={})
def test_no_indexes(self, netcdf4_file, hdf_backend):
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
assert vds.indexes == {}

def test_create_default_indexes(self, netcdf4_file):
def test_create_default_indexes(self, netcdf4_file, hdf_backend):
with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
vds = open_virtual_dataset(netcdf4_file, indexes=None)
vds = open_virtual_dataset(netcdf4_file, indexes=None, backend=hdf_backend)
ds = open_dataset(netcdf4_file, decode_times=True)

# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
@@ -113,7 +115,8 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I


@requires_kerchunk
def test_cftime_index(tmpdir):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_cftime_index(tmpdir, hdf_backend):
"""Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
# Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
ds = xr.Dataset(
@@ -129,7 +132,10 @@
)
ds.to_netcdf(f"{tmpdir}/tmp.nc")
vds = open_virtual_dataset(
f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={}
f"{tmpdir}/tmp.nc",
loadable_variables=["time", "lat", "lon"],
indexes={},
backend=hdf_backend,
)
# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
assert index_mappings_equal(vds.xindexes, ds.xindexes)
@@ -139,15 +145,16 @@


@requires_kerchunk
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
class TestOpenVirtualDatasetAttrs:
def test_drop_array_dimensions(self, netcdf4_file):
def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
# regression test for GH issue #150
vds = open_virtual_dataset(netcdf4_file, indexes={})
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs

def test_coordinate_variable_attrs_preserved(self, netcdf4_file):
def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
# regression test for GH issue #155
vds = open_virtual_dataset(netcdf4_file, indexes={})
vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
assert vds["lat"].attrs == {
"standard_name": "latitude",
"long_name": "Latitude",
@@ -165,7 +172,8 @@ class TestReadFromS3:
@pytest.mark.parametrize(
"indexes", [None, {}], ids=["None index", "empty dict index"]
)
def test_anon_read_s3(self, filetype, indexes):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_anon_read_s3(self, filetype, indexes, hdf_backend):
"""Parameterized tests for empty vs supplied indexes and filetypes."""
# TODO: Switch away from this s3 url after minIO is implemented.
fpath = "s3://carbonplan-share/virtualizarr/local.nc"
@@ -174,6 +182,7 @@ def test_anon_read_s3(self, filetype, indexes):
filetype=filetype,
indexes=indexes,
reader_options={"storage_options": {"anon": True}},
backend=hdf_backend,
)

assert vds.dims == {"time": 2920, "lat": 25, "lon": 53}
@@ -182,6 +191,7 @@


@network
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
class TestReadFromURL:
@pytest.mark.parametrize(
"filetype, url",
@@ -228,24 +238,30 @@ class TestReadFromURL:
),
],
)
def test_read_from_url(self, filetype, url):
def test_read_from_url(self, hdf_backend, filetype, url):
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={}, indexes={})
vds = open_virtual_dataset(
url,
reader_options={},
indexes={},
backend=hdf_backend,
)
elif filetype == "hdf5":
vds = open_virtual_dataset(
url,
group="science/LSAR/GCOV/grids/frequencyA",
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
indexes={},
reader_options={},
backend=hdf_backend,
)
assert isinstance(vds, xr.Dataset)
else:
vds = open_virtual_dataset(url, indexes={})
vds = open_virtual_dataset(url, indexes={}, backend=hdf_backend)
assert isinstance(vds, xr.Dataset)

def test_virtualizarr_vs_local_nisar(self):
def test_virtualizarr_vs_local_nisar(self, hdf_backend):
import fsspec

# Open group directly from locally cached file with xarray
@@ -268,6 +284,7 @@ def test_virtualizarr_vs_local_nisar(self):
group=hdf_group,
indexes={},
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
backend=hdf_backend,
)
tmpref = "/tmp/cmip6.json"
vds.virtualize.to_kerchunk(tmpref, format="json")
@@ -279,10 +296,14 @@

@requires_kerchunk
class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_loadable_variables(self, netcdf4_file, hdf_backend):
vars_to_load = ["air", "time"]
vds = open_virtual_dataset(
netcdf4_file, loadable_variables=vars_to_load, indexes={}
netcdf4_file,
loadable_variables=vars_to_load,
indexes={},
backend=hdf_backend,
)

for name in vds.variables:
@@ -304,18 +325,28 @@ def test_explicit_filetype(self, netcdf4_file):
with pytest.raises(NotImplementedError):
open_virtual_dataset(netcdf4_file, filetype="grib")

def test_group_kwarg(self, hdf5_groups_file):
with pytest.raises(ValueError, match="Multiple HDF Groups found"):
open_virtual_dataset(hdf5_groups_file)
with pytest.raises(ValueError, match="not found in"):
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
if hdf_backend:
with pytest.raises(NotImplementedError, match="Nested groups"):
open_virtual_dataset(hdf5_groups_file, backend=hdf_backend)
with pytest.raises(KeyError, match="doesn't exist"):
open_virtual_dataset(
hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
)
else:
with pytest.raises(ValueError, match="Multiple HDF Groups found"):
open_virtual_dataset(hdf5_groups_file)
with pytest.raises(ValueError, match="not found in"):
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")

vars_to_load = ["air", "time"]
vds = open_virtual_dataset(
hdf5_groups_file,
group="test/group",
loadable_variables=vars_to_load,
indexes={},
backend=hdf_backend,
)
full_ds = xr.open_dataset(
hdf5_groups_file,
@@ -340,13 +371,15 @@ def test_open_virtual_dataset_passes_expected_args(
}
mock_read_kerchunk.assert_called_once_with(**args)

def test_open_dataset_with_empty(self, hdf5_empty, tmpdir):
vds = open_virtual_dataset(hdf5_empty)
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_dataset_with_empty(self, hdf5_empty, tmpdir, hdf_backend):
vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
assert vds.empty.dims == ()
assert vds.empty.attrs == {"empty": "true"}

def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
vds = open_virtual_dataset(hdf5_scalar)
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir, hdf_backend):
vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
assert vds.scalar.dims == ()
assert vds.scalar.attrs == {"scalar": "true"}

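The parametrization added throughout this test module follows a single pattern; a condensed sketch of what each `hdf_backend` case exercises (hypothetical test name, reusing the suite's existing `netcdf4_file` fixture, which serves an "air" variable):

```python
import pytest
from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend


@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_with_either_backend(netcdf4_file, hdf_backend):
    # None -> falls through to the filetype-based VIRTUAL_BACKENDS lookup (kerchunk-based HDF5 reader);
    # HDFVirtualBackend -> the experimental reader is selected explicitly.
    vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
    assert "air" in vds.variables
```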
30 changes: 21 additions & 9 deletions virtualizarr/tests/test_integration.py
@@ -5,6 +5,7 @@

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import requires_kerchunk
from virtualizarr.translators.kerchunk import (
dataset_from_kerchunk_refs,
@@ -63,8 +64,9 @@ def test_no_duplicates_find_var_names():
),
],
)
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_numpy_arrays_to_inlined_kerchunk_refs(
netcdf4_file, inline_threshold, vars_to_inline
netcdf4_file, inline_threshold, vars_to_inline, hdf_backend
):
from kerchunk.hdf import SingleHdf5ToZarr

@@ -75,7 +77,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(

# loading the variables should produce same result as inlining them using kerchunk
vds = open_virtual_dataset(
netcdf4_file, loadable_variables=vars_to_inline, indexes={}
netcdf4_file, loadable_variables=vars_to_inline, indexes={}, backend=hdf_backend
)
refs = vds.virtualize.to_kerchunk(format="dict")

@@ -90,15 +92,16 @@
@requires_kerchunk
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
class TestKerchunkRoundtrip:
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)

# save it to disk as netCDF (in temporary directory)
ds.to_netcdf(f"{tmpdir}/air.nc")

# use open_dataset_via_kerchunk to read it as references
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})
vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend)

if format == "dict":
# write those references to an in-memory kerchunk-formatted references dictionary
@@ -122,8 +125,11 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
for coord in ds.coords:
assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs

@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
@pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars):
def test_kerchunk_roundtrip_concat(
self, tmpdir, format, hdf_backend, decode_times, time_vars
):
# set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)

@@ -139,11 +145,13 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
f"{tmpdir}/air1.nc",
indexes={},
loadable_variables=time_vars,
backend=hdf_backend,
)
vds2 = open_virtual_dataset(
f"{tmpdir}/air2.nc",
indexes={},
loadable_variables=time_vars,
backend=hdf_backend,
)

if decode_times is False:
@@ -187,7 +195,8 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]

def test_non_dimension_coordinates(self, tmpdir, format):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
# regression test for GH issue #105

# set up example xarray dataset containing non-dimension coordinate variables
@@ -196,7 +205,9 @@ def test_non_dimension_coordinates(self, tmpdir, format):
# save it to disk as netCDF (in temporary directory)
ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")

vds = open_virtual_dataset(f"{tmpdir}/non_dim_coords.nc", indexes={})
vds = open_virtual_dataset(
f"{tmpdir}/non_dim_coords.nc", indexes={}, backend=hdf_backend
)

assert "lat" in vds.coords
assert "coordinates" not in vds.attrs
@@ -269,11 +280,12 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):


@requires_kerchunk
def test_open_scalar_variable(tmpdir):
@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
def test_open_scalar_variable(tmpdir, hdf_backend):
# regression test for GH issue #100

ds = xr.Dataset(data_vars={"a": 0})
ds.to_netcdf(f"{tmpdir}/scalar.nc")

vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={})
vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}, backend=hdf_backend)
assert vds["a"].shape == ()
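A hedged end-to-end sketch of the round-trip these integration tests cover, with the experimental backend selected explicitly. It assumes the xarray tutorial dataset can be downloaded and that kerchunk's xarray engine (`engine="kerchunk"`) is installed; file paths are illustrative:

```python
import xarray as xr
from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend

# Write a small netCDF file to round-trip (same tutorial dataset the tests use; needs network access).
ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
ds.to_netcdf("/tmp/air.nc")

# Build virtual references with the experimental backend and dump them as kerchunk JSON.
vds = open_virtual_dataset("/tmp/air.nc", indexes={}, backend=HDFVirtualBackend)
vds.virtualize.to_kerchunk("/tmp/air_refs.json", format="json")

# Re-open the references; assumes kerchunk registers its "kerchunk" xarray engine.
roundtrip = xr.open_dataset("/tmp/air_refs.json", engine="kerchunk", decode_times=False)
xr.testing.assert_allclose(ds, roundtrip)
```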
