Translate h5py soft and hard linked datasets with an optional kwarg #463

Merged: 14 commits merged on Jul 1, 2024
45 changes: 41 additions & 4 deletions kerchunk/hdf.py
@@ -113,22 +113,39 @@ def __init__(
self.error = error
lggr.debug(f"HDF5 file URI: {self._uri}")

def translate(self):
def translate(self, preserve_linked_dsets=False):
"""Translate content of one HDF5 file into Zarr storage format.

This method is the main entry point to execute the workflow, and
returns a "reference" structure to be used with zarr/kerchunk

No data is copied out of the HDF5 file.

Parameters
----------
preserve_linked_dsets : bool (optional, default False)
If True, translate HDF5 soft and hard links for each `h5py.Dataset`
into the reference structure. Requires h5py version 3.11.0 or later.
Will not translate external links or links to `h5py.Group` objects.

Returns
-------
dict
Dictionary containing reference structure.
"""
lggr.debug("Translation begins")
self._transfer_attrs(self._h5f, self._zroot)

self._h5f.visititems(self._translator)

if preserve_linked_dsets:
if not has_visititems_links():
raise RuntimeError(
"'preserve_linked_dsets' kwarg requires h5py 3.11.0 or later "
f"to be installed, found {h5py.__version__}"
)
self._h5f.visititems_links(self._translator)

if self.spec < 1:
return self.store
elif isinstance(self.store, LazyReferenceMapper):
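
For context, a minimal usage sketch of the new kwarg. The file name and threshold value are illustrative; the open-and-read pattern mirrors the test added in test_hdf.py below.

import fsspec
import zarr
from kerchunk.hdf import SingleHdf5ToZarr

# Build references, also translating soft/hard links that point at datasets
refs = SingleHdf5ToZarr("air_linked.nc", inline_threshold=100).translate(
    preserve_linked_dsets=True
)

# The linked datasets then show up as ordinary arrays in the reference store
fs = fsspec.filesystem("reference", fo=refs)
z = zarr.open(fs.get_mapper())
print(list(z))  # e.g. ..., "lat", "lat_hard", "lat_soft", ...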
@@ -247,10 +264,26 @@ def _decode_filters(self, h5obj: Union[h5py.Dataset, h5py.Group]):
)
return filters

def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
def _translator(
self,
name: str,
h5obj: Union[
h5py.Dataset, h5py.Group, h5py.SoftLink, h5py.HardLink, h5py.ExternalLink
],
):
"""Produce Zarr metadata for all groups and datasets in the HDF5 file."""
try: # method must not raise exception
kwargs = {}

if isinstance(h5obj, (h5py.SoftLink, h5py.HardLink)):
h5obj = self._h5f[name]
if isinstance(h5obj, h5py.Group):
# continues iteration of visititems_links
lggr.debug(
f"Skipping translation of HDF5 linked group: '{h5obj.name}'"
)
return None

if isinstance(h5obj, h5py.Dataset):
lggr.debug(f"HDF5 dataset: {h5obj.name}")
lggr.debug(f"HDF5 compression: {h5obj.compression}")
@@ -432,7 +465,7 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
)

# Create a Zarr array equivalent to this HDF5 dataset...
za = self._zroot.create_dataset(
za = self._zroot.require_dataset(
h5obj.name,
shape=h5obj.shape,
dtype=dt or h5obj.dtype,
@@ -480,7 +513,7 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):

elif isinstance(h5obj, h5py.Group):
lggr.debug(f"HDF5 group: {h5obj.name}")
zgrp = self._zroot.create_group(h5obj.name)
zgrp = self._zroot.require_group(h5obj.name)
self._transfer_attrs(h5obj, zgrp)
except Exception as e:
import traceback
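
The switch from create_dataset/create_group to require_dataset/require_group looks intended to keep _translator idempotent: with both the visititems and visititems_links traversals enabled, the same HDF5 path can reach the translator more than once, and zarr's require_* variants return the existing node (when shape and dtype are compatible) rather than raising. A small zarr sketch of the difference, with made-up names:

import zarr

root = zarr.group()  # in-memory hierarchy

# create_group("air") would raise on a second call; require_group simply
# returns the group that is already there.
root.require_group("air")
root.require_group("air")

# require_dataset likewise reuses the existing array when shape/dtype match.
root.require_dataset("air/lat", shape=(25,), dtype="f4")
root.require_dataset("air/lat", shape=(25,), dtype="f4")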
@@ -639,3 +672,7 @@ def _is_netcdf_datetime(dataset: h5py.Dataset):

def _is_netcdf_variable(dataset: h5py.Dataset):
return any("_Netcdf4" in _ for _ in dataset.attrs)


def has_visititems_links():
return hasattr(h5py.Group, "visititems_links")
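
has_visititems_links is a plain feature check for the Group.visititems_links API added in h5py 3.11. A standalone sketch of what the two traversals report, using a made-up demo file: the key point is that visititems never reports soft links, while visititems_links hands the link objects themselves to the callback, which the translator above then resolves via self._h5f[name].

import h5py
import numpy as np

with h5py.File("links_demo.h5", "w") as f:
    f["lat"] = np.arange(5.0)
    f["lat_soft"] = h5py.SoftLink("/lat")  # soft link by path

with h5py.File("links_demo.h5", "r") as f:
    # Only walks objects; the soft link "lat_soft" is not reported.
    f.visititems(lambda name, obj: print("visititems:", name, type(obj).__name__))

    if hasattr(h5py.Group, "visititems_links"):  # h5py >= 3.11
        # Reports every link, passing h5py.HardLink / SoftLink / ExternalLink
        # instances; resolve them with f[name] to get the target object.
        f.visititems_links(
            lambda name, link: print("visititems_links:", name, type(link).__name__)
        )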
Binary file added kerchunk/tests/air_linked.nc
35 changes: 31 additions & 4 deletions kerchunk/tests/test_hdf.py
@@ -6,8 +6,9 @@
import pytest
import xarray as xr
import zarr
import h5py

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links
from kerchunk.combine import MultiZarrToZarr, drop

here = osp.dirname(__file__)
@@ -92,9 +93,11 @@ def test_multizarr(generate_mzz):
assert set(ds) == set(expected)
for name in ds:
exp = {
k: (v.tolist() if v.size > 1 else v[0])
if isinstance(v, np.ndarray)
else v
k: (
(v.tolist() if v.size > 1 else v[0])
if isinstance(v, np.ndarray)
else v
)
for k, v in expected[name].attrs.items()
}
assert dict(ds[name].attrs) == dict(exp)
@@ -331,3 +334,27 @@ def test_inline_threshold():
fn, inline_threshold=1e9
).translate()
assert inline_0 != inline_1_million


@pytest.mark.skipif(
not has_visititems_links(),
reason="'h5py.Group.visititems_links' requires h5py 3.11.0 or later",
)
def test_translate_links():
fn = osp.join(here, "air_linked.nc")
# choose a threshold that will give both inline and non-inline
# datasets for maximum test coverage
out = kerchunk.hdf.SingleHdf5ToZarr(fn, inline_threshold=50).translate(
preserve_linked_dsets=True
)
fs = fsspec.filesystem("reference", fo=out)
z = zarr.open(fs.get_mapper())

# 1. Test the hard linked datasets were translated correctly
# 2. Test the soft linked datasets were translated correctly
for link in ("hard", "soft"):
for dset in ("lat", "time"):
np.testing.assert_allclose(z[dset], z[f"{dset}_{link}"])
for key in z[f"{dset}_{link}"].attrs.keys():
if key not in kerchunk.hdf._HIDDEN_ATTRS and key != "_ARRAY_DIMENSIONS":
assert z[f"{dset}_{link}"].attrs[key] == z[dset].attrs[key]