diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 19aebfdd..3ab76d1f 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -13,12 +13,13 @@ from virtualizarr.readers import ( DMRPPVirtualBackend, FITSVirtualBackend, - HDFVirtualBackend, + HDF5VirtualBackend, KerchunkVirtualBackend, NetCDF3VirtualBackend, TIFFVirtualBackend, ZarrV3VirtualBackend, ) +from virtualizarr.readers.common import VirtualBackend from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions # TODO add entrypoint to allow external libraries to add to this mapping @@ -26,9 +27,9 @@ "kerchunk": KerchunkVirtualBackend, "zarr_v3": ZarrV3VirtualBackend, "dmrpp": DMRPPVirtualBackend, - "hdf5": HDFVirtualBackend, - "netcdf4": HDFVirtualBackend, # note this is the same as for hdf5 # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends) + "hdf5": HDF5VirtualBackend, + "netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5 "netcdf3": NetCDF3VirtualBackend, "tiff": TIFFVirtualBackend, "fits": FITSVirtualBackend, @@ -113,6 +114,7 @@ def open_virtual_dataset( indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, reader_options: Optional[dict] = None, + backend: Optional[VirtualBackend] = None, ) -> Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -182,7 +184,10 @@ def open_virtual_dataset( filepath=filepath, reader_options=reader_options ) - backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) + if backend: + backend_cls = backend + else: + backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) if backend_cls is None: raise NotImplementedError(f"Unsupported file type: {filetype.name}") diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 43a6bbd8..2368848a 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -11,6 +11,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.backend import FileType, automatically_determine_filetype from virtualizarr.manifests import ManifestArray +from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import ( has_astropy, has_tifffile, @@ -82,14 +83,15 @@ def test_FileType(): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestOpenVirtualDatasetIndexes: - def test_no_indexes(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_no_indexes(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) assert vds.indexes == {} - def test_create_default_indexes(self, netcdf4_file): + def test_create_default_indexes(self, netcdf4_file, hdf_backend): with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds = open_virtual_dataset(netcdf4_file, indexes=None) + vds = open_virtual_dataset(netcdf4_file, indexes=None, backend=hdf_backend) ds = open_dataset(netcdf4_file, decode_times=True) # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 @@ -113,7 +115,8 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I @requires_kerchunk -def test_cftime_index(tmpdir): +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) +def test_cftime_index(tmpdir, hdf_backend): """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 ds = xr.Dataset( @@ -129,7 +132,10 @@ def test_cftime_index(tmpdir): ) ds.to_netcdf(f"{tmpdir}/tmp.nc") vds = open_virtual_dataset( - f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={} + f"{tmpdir}/tmp.nc", + loadable_variables=["time", "lat", "lon"], + indexes={}, + backend=hdf_backend, ) # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 assert index_mappings_equal(vds.xindexes, ds.xindexes) @@ -139,15 +145,16 @@ def test_cftime_index(tmpdir): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestOpenVirtualDatasetAttrs: - def test_drop_array_dimensions(self, netcdf4_file): + def test_drop_array_dimensions(self, netcdf4_file, hdf_backend): # regression test for GH issue #150 - vds = open_virtual_dataset(netcdf4_file, indexes={}) + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs - def test_coordinate_variable_attrs_preserved(self, netcdf4_file): + def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend): # regression test for GH issue #155 - vds = open_virtual_dataset(netcdf4_file, indexes={}) + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) assert vds["lat"].attrs == { "standard_name": "latitude", "long_name": "Latitude", @@ -165,7 +172,8 @@ class TestReadFromS3: @pytest.mark.parametrize( "indexes", [None, {}], ids=["None index", "empty dict index"] ) - def test_anon_read_s3(self, filetype, indexes): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_anon_read_s3(self, filetype, indexes, hdf_backend): """Parameterized tests for empty vs supplied indexes and filetypes.""" # TODO: Switch away from this s3 url after minIO is implemented. fpath = "s3://carbonplan-share/virtualizarr/local.nc" @@ -174,6 +182,7 @@ def test_anon_read_s3(self, filetype, indexes): filetype=filetype, indexes=indexes, reader_options={"storage_options": {"anon": True}}, + backend=hdf_backend, ) assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} @@ -182,6 +191,7 @@ def test_anon_read_s3(self, filetype, indexes): @network +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestReadFromURL: @pytest.mark.parametrize( "filetype, url", @@ -228,10 +238,15 @@ class TestReadFromURL: ), ], ) - def test_read_from_url(self, filetype, url): + def test_read_from_url(self, hdf_backend, filetype, url): if filetype in ["grib", "jpg", "hdf4"]: with pytest.raises(NotImplementedError): - vds = open_virtual_dataset(url, reader_options={}, indexes={}) + vds = open_virtual_dataset( + url, + reader_options={}, + indexes={}, + backend=hdf_backend, + ) elif filetype == "hdf5": vds = open_virtual_dataset( url, @@ -239,13 +254,14 @@ def test_read_from_url(self, filetype, url): drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], indexes={}, reader_options={}, + backend=hdf_backend, ) assert isinstance(vds, xr.Dataset) else: - vds = open_virtual_dataset(url, indexes={}) + vds = open_virtual_dataset(url, indexes={}, backend=hdf_backend) assert isinstance(vds, xr.Dataset) - def test_virtualizarr_vs_local_nisar(self): + def test_virtualizarr_vs_local_nisar(self, hdf_backend): import fsspec # Open group directly from locally cached file with xarray @@ -268,6 +284,7 @@ def test_virtualizarr_vs_local_nisar(self): group=hdf_group, indexes={}, drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + backend=hdf_backend, ) tmpref = "/tmp/cmip6.json" vds.virtualize.to_kerchunk(tmpref, format="json") @@ -279,10 +296,14 @@ def test_virtualizarr_vs_local_nisar(self): @requires_kerchunk class TestLoadVirtualDataset: - def test_loadable_variables(self, netcdf4_file): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_loadable_variables(self, netcdf4_file, hdf_backend): vars_to_load = ["air", "time"] vds = open_virtual_dataset( - netcdf4_file, loadable_variables=vars_to_load, indexes={} + netcdf4_file, + loadable_variables=vars_to_load, + indexes={}, + backend=hdf_backend, ) for name in vds.variables: @@ -304,11 +325,20 @@ def test_explicit_filetype(self, netcdf4_file): with pytest.raises(NotImplementedError): open_virtual_dataset(netcdf4_file, filetype="grib") - def test_group_kwarg(self, hdf5_groups_file): - with pytest.raises(ValueError, match="Multiple HDF Groups found"): - open_virtual_dataset(hdf5_groups_file) - with pytest.raises(ValueError, match="not found in"): - open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_group_kwarg(self, hdf5_groups_file, hdf_backend): + if hdf_backend: + with pytest.raises(NotImplementedError, match="Nested groups"): + open_virtual_dataset(hdf5_groups_file, backend=hdf_backend) + with pytest.raises(KeyError, match="doesn't exist"): + open_virtual_dataset( + hdf5_groups_file, group="doesnt_exist", backend=hdf_backend + ) + else: + with pytest.raises(ValueError, match="Multiple HDF Groups found"): + open_virtual_dataset(hdf5_groups_file) + with pytest.raises(ValueError, match="not found in"): + open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") vars_to_load = ["air", "time"] vds = open_virtual_dataset( @@ -316,6 +346,7 @@ def test_group_kwarg(self, hdf5_groups_file): group="test/group", loadable_variables=vars_to_load, indexes={}, + backend=hdf_backend, ) full_ds = xr.open_dataset( hdf5_groups_file, @@ -340,13 +371,15 @@ def test_open_virtual_dataset_passes_expected_args( } mock_read_kerchunk.assert_called_once_with(**args) - def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): - vds = open_virtual_dataset(hdf5_empty) + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_open_dataset_with_empty(self, hdf5_empty, tmpdir, hdf_backend): + vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend) assert vds.empty.dims == () assert vds.empty.attrs == {"empty": "true"} - def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): - vds = open_virtual_dataset(hdf5_scalar) + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir, hdf_backend): + vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend) assert vds.scalar.dims == () assert vds.scalar.attrs == {"scalar": "true"} diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 63158777..0a39eb3d 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -5,6 +5,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import requires_kerchunk from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, @@ -63,8 +64,9 @@ def test_no_duplicates_find_var_names(): ), ], ) +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) def test_numpy_arrays_to_inlined_kerchunk_refs( - netcdf4_file, inline_threshold, vars_to_inline + netcdf4_file, inline_threshold, vars_to_inline, hdf_backend ): from kerchunk.hdf import SingleHdf5ToZarr @@ -75,7 +77,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( # loading the variables should produce same result as inlining them using kerchunk vds = open_virtual_dataset( - netcdf4_file, loadable_variables=vars_to_inline, indexes={} + netcdf4_file, loadable_variables=vars_to_inline, indexes={}, backend=hdf_backend ) refs = vds.virtualize.to_kerchunk(format="dict") @@ -90,7 +92,8 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( @requires_kerchunk @pytest.mark.parametrize("format", ["dict", "json", "parquet"]) class TestKerchunkRoundtrip: - def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) @@ -98,7 +101,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): ds.to_netcdf(f"{tmpdir}/air.nc") # use open_dataset_via_kerchunk to read it as references - vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}) + vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend) if format == "dict": # write those references to an in-memory kerchunk-formatted references dictionary @@ -122,8 +125,11 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): for coord in ds.coords: assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) - def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars): + def test_kerchunk_roundtrip_concat( + self, tmpdir, format, hdf_backend, decode_times, time_vars + ): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times) @@ -139,11 +145,13 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars f"{tmpdir}/air1.nc", indexes={}, loadable_variables=time_vars, + backend=hdf_backend, ) vds2 = open_virtual_dataset( f"{tmpdir}/air2.nc", indexes={}, loadable_variables=time_vars, + backend=hdf_backend, ) if decode_times is False: @@ -187,7 +195,8 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars assert roundtrip.time.encoding["units"] == ds.time.encoding["units"] assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"] - def test_non_dimension_coordinates(self, tmpdir, format): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): # regression test for GH issue #105 # set up example xarray dataset containing non-dimension coordinate variables @@ -196,7 +205,9 @@ def test_non_dimension_coordinates(self, tmpdir, format): # save it to disk as netCDF (in temporary directory) ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc") - vds = open_virtual_dataset(f"{tmpdir}/non_dim_coords.nc", indexes={}) + vds = open_virtual_dataset( + f"{tmpdir}/non_dim_coords.nc", indexes={}, backend=hdf_backend + ) assert "lat" in vds.coords assert "coordinates" not in vds.attrs @@ -269,11 +280,12 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format): @requires_kerchunk -def test_open_scalar_variable(tmpdir): +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) +def test_open_scalar_variable(tmpdir, hdf_backend): # regression test for GH issue #100 ds = xr.Dataset(data_vars={"a": 0}) ds.to_netcdf(f"{tmpdir}/scalar.nc") - vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}) + vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}, backend=hdf_backend) assert vds["a"].shape == () diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index f73292ee..716d1f28 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -4,18 +4,21 @@ import virtualizarr from virtualizarr.backend import FileType +from virtualizarr.readers.hdf import HDFVirtualBackend +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestIntegration: @pytest.mark.xfail(reason="0 time start is being interpreted as fillvalue") def test_filters_h5netcdf_roundtrip( - self, tmpdir, filter_encoded_roundtrip_hdf5_file + self, tmpdir, filter_encoded_roundtrip_hdf5_file, hdf_backend ): ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True) vds = virtualizarr.open_virtual_dataset( filter_encoded_roundtrip_hdf5_file, loadable_variables=["time"], cftime_variables=["time"], + backend=hdf_backend, ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") @@ -23,19 +26,26 @@ def test_filters_h5netcdf_roundtrip( xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( - self, tmpdir, filter_encoded_roundtrip_netcdf4_file + self, tmpdir, filter_encoded_roundtrip_netcdf4_file, hdf_backend ): filepath = filter_encoded_roundtrip_netcdf4_file["filepath"] ds = xr.open_dataset(filepath) - vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4")) + vds = virtualizarr.open_virtual_dataset( + filepath, filetype=FileType("netcdf4"), backend=hdf_backend + ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) - def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): + def test_filter_and_cf_roundtrip( + self, tmpdir, filter_and_cf_roundtrip_hdf5_file, hdf_backend + ): ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) - vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) + vds = virtualizarr.open_virtual_dataset( + filter_and_cf_roundtrip_hdf5_file, + backend=hdf_backend, + ) kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 062eda5f..12f6fadf 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -4,6 +4,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import requires_kerchunk from virtualizarr.zarr import ZArray @@ -224,14 +225,15 @@ def test_concat_dim_coords_along_existing_dim(self): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestCombineUsingIndexes: - def test_combine_by_coords(self, netcdf4_files): + def test_combine_by_coords(self, netcdf4_files, hdf_backend): filepath1, filepath2 = netcdf4_files with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds1 = open_virtual_dataset(filepath1) + vds1 = open_virtual_dataset(filepath1, backend=hdf_backend) with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds2 = open_virtual_dataset(filepath2) + vds2 = open_virtual_dataset(filepath2, backend=hdf_backend) combined_vds = xr.combine_by_coords( [vds2, vds1], @@ -240,13 +242,13 @@ def test_combine_by_coords(self, netcdf4_files): assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing @pytest.mark.xfail(reason="Not yet implemented, see issue #18") - def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files): + def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files, hdf_backend): filepath1, filepath2 = netcdf4_files with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds1 = open_virtual_dataset(filepath1) + vds1 = open_virtual_dataset(filepath1, backend=hdf_backend) with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds2 = open_virtual_dataset(filepath2) + vds2 = open_virtual_dataset(filepath2, backend=hdf_backend) combined_vds = xr.combine_by_coords( [vds2, vds1], @@ -258,17 +260,18 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestRenamePaths: - def test_rename_to_str(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_rename_to_str(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") assert ( renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] == "s3://bucket/air.nc" ) - def test_rename_using_function(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_rename_using_function(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) def local_to_s3_url(old_local_path: str) -> str: from pathlib import Path @@ -284,15 +287,20 @@ def local_to_s3_url(old_local_path: str) -> str: == "s3://bucket/air.nc" ) - def test_invalid_type(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_invalid_type(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) with pytest.raises(TypeError): vds.virtualize.rename_paths(["file1.nc", "file2.nc"]) - def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file): + def test_mixture_of_manifestarrays_and_numpy_arrays( + self, netcdf4_file, hdf_backend + ): vds = open_virtual_dataset( - netcdf4_file, indexes={}, loadable_variables=["lat", "lon"] + netcdf4_file, + indexes={}, + loadable_variables=["lat", "lon"], + backend=hdf_backend, ) renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") assert (