Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metadata consolidation utility #278

Merged
merged 13 commits into from
Dec 1, 2023
1 change: 1 addition & 0 deletions earthaccess/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
search_datasets,
)
from .auth import Auth
from .kerchunk import consolidate_metadata
from .search import DataCollections, DataGranules
from .store import Store

Expand Down
56 changes: 56 additions & 0 deletions earthaccess/kerchunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations

import earthaccess
import fsspec
import s3fs


def _get_chunk_metadata(
granuale: earthaccess.results.DataGranule,
fs: fsspec.AbstractFileSystem | s3fs.S3FileSystem,
) -> list[dict]:
from kerchunk.hdf import SingleHdf5ToZarr

metadata = []
access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect"
for url in granuale.data_links(access=access):
with fs.open(url) as inf:
h5chunks = SingleHdf5ToZarr(inf, url)
m = h5chunks.translate()
metadata.append(m)
return metadata


def consolidate_metadata(
granuales: list[earthaccess.results.DataGranule],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we try to kerchunk and consolidate non uniform datasets kerchunk will fail, if we try to kerchunk 1 granule kerchunk succeeds but MultiZarrToZarr requires a variable mapping, if we don't pass it fails, for one granule we should use kerchunk_options={"coo_map": {}} and then kerchunks stays happy.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that the same as not doing any combination at all?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess? or maybe we can just skip the MultiZarrToZarr when the input is only 1 file. I wanted to do a quick test on a granule from https://podaac.jpl.nasa.gov/MEaSUREs-MUR and ran into this issue.

kerchunk_options: dict | None = None,
access: str = "direct",
outfile: str | None = None,
storage_options: dict | None = None,
) -> str | dict:
try:
import dask
from kerchunk.combine import MultiZarrToZarr
except ImportError as e:
raise ImportError(
"`earthaccess.consolidate_metadata` requires `dask` and `kerchunk` to be be installed"
) from e

if access == "direct":
fs = earthaccess.get_s3fs_session(provider=granuales[0]["meta"]["provider-id"])
else:
fs = earthaccess.get_fsspec_https_session()

# Get metadata for each granuale
get_chunk_metadata = dask.delayed(_get_chunk_metadata)
chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granuales])
chunks = sum(chunks, start=[])

# Get combined metadata object
mzz = MultiZarrToZarr(chunks, **(kerchunk_options or {}))
if outfile is not None:
output = fsspec.utils.stringify_path(outfile)
mzz.translate(outfile, storage_options=storage_options or {})
return output
else:
return mzz.translate()
171 changes: 171 additions & 0 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ s3fs = ">=2021.11, <2024"
fsspec = ">=2022.1"
tinynetrc = "^1.3.1"
multimethod = ">=1.8"
kerchunk = { version = ">=0.1.2", optional = true }
dask = { version = ">=2022.1.0", optional = true }

[tool.poetry.extras]
kerchunk = ["kerchunk", "dask"]

[tool.poetry.dev-dependencies]
python-magic = ">=0.4"
Expand Down
Loading
Loading