diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 0cc7b14a..076fc559 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -177,11 +177,12 @@ def open_virtual_dataset( virtual_vars = virtual_vars_from_hdf( path=filepath, + group=group, drop_variables=drop_variables + loadable_variables, reader_options=reader_options, ) ds_attrs = attrs_from_root_group( - path=filepath, reader_options=reader_options + path=filepath, reader_options=reader_options, group=group ) coord_names = ds_attrs.pop("coordinates", []) # we currently read every other filetype using kerchunks various file format backends diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 8d2c44ce..8db6d781 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -209,6 +209,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab def virtual_vars_from_hdf( path: str, + group: Optional[str] = None, drop_variables: Optional[List[str]] = None, reader_options: Optional[dict] = { "storage_options": {"key": "", "secret": "", "anon": True} @@ -220,11 +221,17 @@ def virtual_vars_from_hdf( filepath=path, reader_options=reader_options ) f = h5py.File(open_file, mode="r") + if group: + g = f[group] + if not isinstance(g, h5py.Group): + raise ValueError("The provided group is not an HDF group") + else: + g = f variables = {} - for key in f.keys(): + for key in g.keys(): if key not in drop_variables: - if isinstance(f[key], h5py.Dataset): - variable = _dataset_to_variable(path, f[key]) + if isinstance(g[key], h5py.Dataset): + variable = _dataset_to_variable(path, g[key]) if variable is not None: variables[key] = variable else: @@ -235,6 +242,7 @@ def virtual_vars_from_hdf( def attrs_from_root_group( path: str, + group: Optional[str] = None, reader_options: Optional[dict] = { "storage_options": {"key": "", "secret": "", "anon": True} }, @@ -243,5 +251,11 @@ def attrs_from_root_group( filepath=path, reader_options=reader_options ) f = h5py.File(open_file, mode="r") - attrs = _extract_attrs(f) + if group: + g = f[group] + if not isinstance(g, h5py.Group): + raise ValueError("The provided group is not an HDF group") + else: + g = f + attrs = _extract_attrs(g) return attrs diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index bb68c186..3feab262 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -293,9 +293,9 @@ def test_explicit_filetype(self, netcdf4_file): open_virtual_dataset(netcdf4_file, filetype="grib") def test_group_kwarg(self, hdf5_groups_file): - with pytest.raises(ValueError, match="Multiple HDF Groups found"): + with pytest.raises(NotImplementedError, match="Nested groups"): open_virtual_dataset(hdf5_groups_file) - with pytest.raises(ValueError, match="not found in"): + with pytest.raises(KeyError, match="doesn't exist"): open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") vars_to_load = ["air", "time"] @@ -321,6 +321,7 @@ def test_open_virtual_dataset_passes_expected_args( open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) args = { "path": netcdf4_file, + "group": None, "drop_variables": [], "reader_options": reader_options, } diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index c47c26c9..b0b7c41f 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -118,7 +118,20 @@ def root_attributes_hdf5_file(tmpdir): def group_hdf5_file(tmpdir): filepath = f"{tmpdir}/group.nc" f = h5py.File(filepath, "w") - f.create_group("group") + g = f.create_group("group") + data = np.random.random((10, 10)) + g.create_dataset("data", data=data) + return filepath + + +@pytest.fixture +def nested_group_hdf5_file(tmpdir): + filepath = f"{tmpdir}/nested_group.nc" + f = h5py.File(filepath, "w") + g = f.create_group("group") + data = np.random.random((10, 10)) + g.create_dataset("data", data=data) + g.create_group("nested_group") return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 32970a33..cc9e2dff 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -113,10 +113,20 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 - def test_groups_not_implemented(self, group_hdf5_file): + def test_nested_groups_not_implemented(self, nested_group_hdf5_file): with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_hdf5_file) + virtual_vars_from_hdf(path=nested_group_hdf5_file, group="group") def test_drop_variables(self, multiple_datasets_hdf5_file): - variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"]) + variables = virtual_vars_from_hdf( + path=multiple_datasets_hdf5_file, drop_variables=["data2"] + ) assert "data2" not in variables.keys() + + def test_dataset_in_group(self, group_hdf5_file): + variables = virtual_vars_from_hdf(path=group_hdf5_file, group="group") + assert len(variables) == 1 + + def test_non_group_error(self, group_hdf5_file): + with pytest.raises(ValueError): + virtual_vars_from_hdf(path=group_hdf5_file, group="group/data")