From b34a1ee47572901ed6a3623c97d340157a0fa62a Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Fri, 2 Aug 2024 15:12:45 -0700 Subject: [PATCH 01/29] Handle scalar dataset variables (#205) * Handle scalar dataset variables * Doc update * add empty and scalar hdf5 fixtures * add h5py to test requirements --- conftest.py | 19 +++++++++++++++++++ docs/releases.rst | 3 +++ pyproject.toml | 1 + virtualizarr/tests/test_xarray.py | 10 ++++++++++ virtualizarr/xarray.py | 13 ++++++++----- 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/conftest.py b/conftest.py index 8d5e351e..b558abfd 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,4 @@ +import h5py import pytest import xarray as xr @@ -50,3 +51,21 @@ def netcdf4_files(tmpdir): ds2.close() return filepath1, filepath2 + + +@pytest.fixture +def hdf5_empty(tmpdir): + filepath = f"{tmpdir}/empty.nc" + f = h5py.File(filepath, "w") + dataset = f.create_dataset("empty", shape=(), dtype="float32") + dataset.attrs["empty"] = "true" + return filepath + + +@pytest.fixture +def hdf5_scalar(tmpdir): + filepath = f"{tmpdir}/scalar.nc" + f = h5py.File(filepath, "w") + dataset = f.create_dataset("scalar", data=0.1, dtype="float32") + dataset.attrs["scalar"] = "true" + return filepath diff --git a/docs/releases.rst b/docs/releases.rst index f472db05..81a0aeac 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ +- Load scalar variables by default. (:pull:`205`) + By `Gustavo Hidalgo `_. + Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index 9fe0468a..44961165 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ test = [ "fsspec", "s3fs", "fastparquet", + "h5py" ] diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 31ef8320..00140a14 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -419,6 +419,16 @@ def test_open_virtual_dataset_passes_expected_args( } mock_read_kerchunk.assert_called_once_with(**args) + def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): + vds = open_virtual_dataset(hdf5_empty) + assert vds.empty.dims == () + assert vds.empty.attrs == {"empty": "true"} + + def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): + vds = open_virtual_dataset(hdf5_scalar) + assert vds.scalar.dims == () + assert vds.scalar.attrs == {"scalar": "true"} + class TestRenamePaths: def test_rename_to_str(self, netcdf4_file): diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 3a6dd02c..528c2521 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -333,13 +333,16 @@ def variable_from_kerchunk_refs( arr_refs = kerchunk.extract_array_refs(refs, var_name) chunk_dict, zarray, zattrs = kerchunk.parse_array_refs(arr_refs) - - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) - # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs dims = zattrs.pop("_ARRAY_DIMENSIONS") - - varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + if chunk_dict: + manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + else: + # This means we encountered a scalar variable of dimension 0, + # very likely that it actually has no numeric value and its only purpose + # is to communicate dataset attributes. 
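+        # In that case there are no chunk references at all (e.g. an HDF5
+        # dataset created with shape=(), as in the fixtures above), so instead
+        # of building a ManifestArray we fall back to representing the
+        # variable by its fill value: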
+ varr = zarray.fill_value return xr.Variable(data=varr, dims=dims, attrs=zattrs) From 04a566ca06f95719236441c6c4499db54d01faa5 Mon Sep 17 00:00:00 2001 From: Timothy Hodson <34148978+thodson-usgs@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:44:53 -0500 Subject: [PATCH 02/29] Fix default fill_value for datetime64 (#206) * Set ZArray fill_value back to nan * Set NaT as datetime64 default fill value * Fixups * Change back to 0 * Added integration test * changelog --------- Co-authored-by: Tom Augspurger --- docs/releases.rst | 3 ++ virtualizarr/tests/test_integration.py | 47 ++++++++++++++++++++++++++ virtualizarr/zarr.py | 15 ++++---- 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 81a0aeac..e7f6df23 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -26,6 +26,9 @@ Bug fixes - Exclude empty chunks during `ChunkDict` construction. (:pull:`198`) By `Gustavo Hidalgo `_. +- Fixed regression in `fill_value` handling for datetime dtypes making virtual + Zarr stores unreadable (:pr:`206`) + By `Timothy Hodson `_ Documentation ~~~~~~~~~~~~~ diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 239316a1..7210b3f3 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -4,6 +4,9 @@ import xarray.testing as xrt from virtualizarr import open_virtual_dataset +from virtualizarr.manifests.array import ManifestArray +from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.zarr import ZArray @pytest.mark.parametrize( @@ -166,6 +169,50 @@ def test_non_dimension_coordinates(self, tmpdir, format): # assert equal to original dataset xrt.assert_identical(roundtrip, ds) + def test_datetime64_dtype_fill_value(self, tmpdir, format): + chunks_dict = { + "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + } + manifest = ChunkManifest(entries=chunks_dict) + chunks = (1, 1, 1) + shape = (1, 1, 1) + zarray = ZArray( + chunks=chunks, + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype(" None: @model_validator(mode="after") def _check_fill_value(self) -> Self: if self.fill_value is None: - self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0) + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) return self @property From 376524345388da594beb4deef85c26eee84c490e Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Mon, 5 Aug 2024 14:13:33 -0600 Subject: [PATCH 03/29] Update .pre-commit-config mypy + bump ruff version (#211) * bump ruff version * downgrade ruff * test * replaced types-pkg_resources with types-setuptools --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5670c75..e1d03aac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: "v0.4.7" + rev: "v0.5.6" hooks: # Run the linter. - id: ruff @@ -28,7 +28,7 @@ repos: additional_dependencies: [ # Type stubs types-python-dateutil, - types-pkg_resources, + types-setuptools, types-PyYAML, types-pytz, # Dependencies that are typed From 14f54918a28979aded305c2635de161c2214840c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Aug 2024 07:13:04 -0500 Subject: [PATCH 04/29] Update static typing (#213) * Update packaging 1. Fixed mypy pyproject.toml config 2. Move mypy from pre-commit to pyproject.toml 3. 
Lint in CI --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/main.yml | 4 ++++ .pre-commit-config.yaml | 24 ------------------- ci/environment.yml | 2 ++ pyproject.toml | 34 ++++++++++++++++++--------- virtualizarr/kerchunk.py | 6 +++-- virtualizarr/manifests/array.py | 2 +- virtualizarr/manifests/manifest.py | 4 ++-- virtualizarr/tests/test_zarr.py | 3 ++- virtualizarr/xarray.py | 37 +++++++++++++++++++++++------- virtualizarr/zarr.py | 20 +++++++++++----- 10 files changed, 81 insertions(+), 55 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7daafb43..83222d7d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -42,6 +42,10 @@ jobs: conda env list conda list + - name: Type check + run: | + mypy virtualizarr + - name: Running Tests run: | python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e1d03aac..d58e82ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,27 +18,3 @@ repos: args: [ --fix ] # Run the formatter. - id: ruff-format - - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 - hooks: - - id: mypy - # Copied from setup.cfg - exclude: "properties|asv_bench|docs" - additional_dependencies: [ - # Type stubs - types-python-dateutil, - types-setuptools, - types-PyYAML, - types-pytz, - # Dependencies that are typed - numpy, - typing-extensions>=4.1.0, - ] - # run this occasionally, ref discussion https://github.com/pydata/xarray/pull/3194 - # - repo: https://github.com/asottile/pyupgrade - # rev: v3.15.2 - # hooks: - # - id: pyupgrade - # args: - # - "--py310-plus" diff --git a/ci/environment.yml b/ci/environment.yml index a41a99d4..5368784d 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -17,7 +17,9 @@ dependencies: # Testing - codecov - pre-commit + - mypy - ruff + - pandas-stubs - pytest-mypy - pytest-cov - pytest diff --git a/pyproject.toml b/pyproject.toml index 44961165..859dd227 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,18 +34,20 @@ dependencies = [ [project.optional-dependencies] test = [ "codecov", + "fastparquet", + "fsspec", + "h5py", + "mypy", + "netcdf4", + "pandas-stubs", + "pooch", "pre-commit", - "ruff", - "pytest-mypy", "pytest-cov", + "pytest-mypy", "pytest", - "pooch", - "scipy", - "netcdf4", - "fsspec", + "ruff", "s3fs", - "fastparquet", - "h5py" + "scipy", ] @@ -70,12 +72,22 @@ exclude = ["docs", "tests", "tests.*", "docs.*"] [tool.setuptools.package-data] datatree = ["py.typed"] - - -[mypy] +[tool.mypy] files = "virtualizarr/**/*.py" show_error_codes = true +[[tool.mypy.overrides]] +module = "fsspec.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "numcodecs.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "kerchunk.*" +ignore_missing_imports = true + [tool.ruff] # Same as Black. 
line-length = 88 diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 122b86b3..6496bf54 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -42,6 +42,7 @@ class FileType(AutoName): tiff = auto() fits = auto() zarr = auto() + zarr_v3 = auto() class NumpyEncoder(json.JSONEncoder): @@ -223,7 +224,7 @@ def dataset_to_kerchunk_refs(ds: xr.Dataset) -> KerchunkStoreRefs: all_arr_refs = {} for var_name, var in ds.variables.items(): - arr_refs = variable_to_kerchunk_arr_refs(var, var_name) + arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) prepended_with_var_name = { f"{var_name}/{key}": val for key, val in arr_refs.items() @@ -233,7 +234,7 @@ def dataset_to_kerchunk_refs(ds: xr.Dataset) -> KerchunkStoreRefs: zattrs = ds.attrs if ds.coords: - coord_names = list(ds.coords) + coord_names = [str(x) for x in ds.coords] # this weird concatenated string instead of a list of strings is inconsistent with how other features in the kerchunk references format are stored # see https://github.com/zarr-developers/VirtualiZarr/issues/105#issuecomment-2187266739 zattrs["coordinates"] = " ".join(coord_names) @@ -302,6 +303,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr shape=np_arr.shape, dtype=np_arr.dtype, order="C", + fill_value=None, ) zarray_dict = zarray.to_kerchunk_json() diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index e15cf7d7..0ec9c844 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -131,7 +131,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: return _isnan(self.shape) return NotImplemented - def __array__(self) -> np.ndarray: + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: raise NotImplementedError( "ManifestArrays can't be converted into numpy arrays or pandas Index objects" ) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index bf7c24fd..a8621ed2 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -57,7 +57,7 @@ def to_kerchunk(self) -> tuple[str, int, int]: """Write out in the format that kerchunk uses for chunk entries.""" return (self.path, self.offset, self.length) - def dict(self) -> ChunkDictEntry: + def dict(self) -> ChunkDictEntry: # type: ignore[override] return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length) @@ -238,7 +238,7 @@ def __iter__(self) -> Iterator[ChunkKey]: def __len__(self) -> int: return self._paths.size - def dict(self) -> ChunkDict: + def dict(self) -> ChunkDict: # type: ignore[override] """ Convert the entire manifest to a nested dictionary. 
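As context for these `dict()` overrides: a chunk manifest maps each chunk key to a path/offset/length entry, and `.dict()` returns that mapping as a plain nested dictionary while `ChunkEntry.to_kerchunk()` flattens one entry to a `(path, offset, length)` tuple. A minimal sketch (the S3 path is only a placeholder):

    from virtualizarr.manifests import ChunkManifest

    manifest = ChunkManifest(
        entries={"0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}}
    )
    # ChunkManifest.dict() returns the same nested mapping,
    # chunk key -> {"path": ..., "offset": ..., "length": ...}
    print(manifest.dict())
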
diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 7715d245..ca33bcb3 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -6,6 +6,7 @@ import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset +from virtualizarr.kerchunk import FileType from virtualizarr.manifests.manifest import ChunkManifest from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json @@ -40,7 +41,7 @@ def isconfigurable(value: dict) -> bool: def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") roundtrip = open_virtual_dataset( - tmpdir / "store.zarr", filetype="zarr_v3", indexes={} + tmpdir / "store.zarr", filetype=FileType.zarr_v3, indexes={} ) xrt.assert_identical(roundtrip, vds_with_manifest_arrays) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 528c2521..b612eaf5 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -1,10 +1,15 @@ +import os import warnings from collections.abc import Iterable, Mapping, MutableMapping +from io import BufferedIOBase from pathlib import Path from typing import ( + Any, Callable, + Hashable, Literal, Optional, + cast, overload, ) @@ -12,7 +17,7 @@ import xarray as xr from upath import UPath from xarray import register_dataset_accessor -from xarray.backends import BackendArray +from xarray.backends import AbstractDataStore, BackendArray from xarray.coding.times import CFDatetimeCoder from xarray.core.indexes import Index, PandasIndex from xarray.core.variable import IndexVariable @@ -27,6 +32,8 @@ metadata_from_zarr_json, ) +XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore + class ManifestBackendArray(ManifestArray, BackendArray): """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" @@ -85,6 +92,9 @@ def open_virtual_dataset( vds An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables. """ + loadable_vars: dict[str, xr.Variable] + virtual_vars: dict[str, xr.Variable] + vars: dict[str, xr.Variable] if drop_variables is None: drop_variables = [] @@ -119,7 +129,11 @@ def open_virtual_dataset( if virtual_array_class is not ManifestArray: raise NotImplementedError() - if filetype == "zarr_v3": + # if filetype is user defined, convert to FileType + if filetype is not None: + filetype = FileType(filetype) + + if filetype == FileType.zarr_v3: # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( storepath=filepath, drop_variables=drop_variables, indexes=indexes @@ -158,8 +172,13 @@ def open_virtual_dataset( filepath=filepath, reader_options=reader_options ) + # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. + # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. 
+ ds = xr.open_dataset( - fpath, drop_variables=drop_variables, decode_times=False + cast(XArrayOpenT, fpath), + drop_variables=drop_variables, + decode_times=False, ) if indexes is None: @@ -177,7 +196,7 @@ def open_virtual_dataset( indexes = dict(**indexes) # for type hinting: to allow mutation loadable_vars = { - name: var + str(name): var for name, var in ds.variables.items() if name in loadable_variables } @@ -265,7 +284,7 @@ def virtual_vars_from_kerchunk_refs( refs: KerchunkStoreRefs, drop_variables: list[str] | None = None, virtual_array_class=ManifestArray, -) -> Mapping[str, xr.Variable]: +) -> dict[str, xr.Variable]: """ Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. @@ -351,7 +370,7 @@ def separate_coords( vars: Mapping[str, xr.Variable], indexes: MutableMapping[str, Index], coord_names: Iterable[str] | None = None, -) -> tuple[Mapping[str, xr.Variable], xr.Coordinates]: +) -> tuple[dict[str, xr.Variable], xr.Coordinates]: """ Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. @@ -365,7 +384,9 @@ def separate_coords( # split data and coordinate variables (promote dimension coordinates) data_vars = {} - coord_vars = {} + coord_vars: dict[ + str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable + ] = {} for name, var in vars.items(): if name in coord_names or var.dims == (name,): # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 @@ -376,7 +397,7 @@ def separate_coords( if isinstance(var, IndexVariable): # unless variable actually already is a loaded IndexVariable, # in which case we need to keep it and add the corresponding indexes explicitly - coord_vars[name] = var + coord_vars[str(name)] = var # TODO this seems suspect - will it handle datetimes? indexes[name] = PandasIndex(var, dim1d) else: diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 43735b14..c91b37ea 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -6,6 +6,7 @@ Literal, NewType, Optional, + cast, ) import numcodecs @@ -31,6 +32,7 @@ "ZAttrs", dict[str, Any] ) # just the .zattrs (for one array or for the whole store/group) FillValueT = bool | str | float | int | list | None +ZARR_FORMAT = Literal[2, 3] ZARR_DEFAULT_FILL_VALUE: dict[str, FillValueT] = { # numpy dtypes's hierarchy lets us avoid checking for all the widths @@ -72,7 +74,7 @@ class ZArray(BaseModel): filters: list[dict] | None = None order: Literal["C", "F"] shape: tuple[int, ...] 
- zarr_format: Literal[2, 3] = 2 + zarr_format: ZARR_FORMAT = 2 @field_validator("dtype") @classmethod @@ -110,6 +112,10 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": fill_value = np.nan compressor = decoded_arr_refs_zarray["compressor"] + zarr_format = int(decoded_arr_refs_zarray["zarr_format"]) + if zarr_format not in (2, 3): + raise ValueError(f"Zarr format must be 2 or 3, but got {zarr_format}") + return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), compressor=compressor, @@ -118,10 +124,10 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": filters=decoded_arr_refs_zarray["filters"], order=decoded_arr_refs_zarray["order"], shape=tuple(decoded_arr_refs_zarray["shape"]), - zarr_format=int(decoded_arr_refs_zarray["zarr_format"]), + zarr_format=cast(ZARR_FORMAT, zarr_format), ) - def dict(self) -> dict[str, Any]: + def dict(self) -> dict[str, Any]: # type: ignore zarray_dict = dict(self) zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) return zarray_dict @@ -135,7 +141,7 @@ def to_kerchunk_json(self) -> str: def replace( self, chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[dict] = None, + compressor: Optional[dict] = None, # type: ignore[valid-type] dtype: Optional[np.dtype] = None, fill_value: Optional[float] = None, # float or int? filters: Optional[list[dict]] = None, # type: ignore[valid-type] @@ -251,7 +257,7 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: group_metadata_file.write(json_dumps(group_metadata)) for name, var in ds.variables.items(): - array_dir = _storepath / name + array_dir = _storepath / str(name) marr = var.data # TODO move this check outside the writing loop so we don't write an incomplete store on failure? @@ -287,7 +293,9 @@ def to_zarr_json(var: xr.Variable, array_dir: Path) -> None: marr.manifest.to_zarr_json(array_dir / "manifest.json") - metadata = zarr_v3_array_metadata(marr.zarray, list(var.dims), var.attrs) + metadata = zarr_v3_array_metadata( + marr.zarray, [str(x) for x in var.dims], var.attrs + ) with open(array_dir / "zarr.json", "wb") as metadata_file: metadata_file.write(json_dumps(metadata)) From a28b21052271c34ce0a99545387e69bdf9ec5029 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 7 Aug 2024 09:11:12 -0600 Subject: [PATCH 05/29] adds concurrency w/ cancel-in-progress=True (#214) --- .github/workflows/main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 83222d7d..769f59e5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,6 +12,10 @@ on: schedule: - cron: "0 0 * * *" +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: test: From fdab54cca8d73af4e44903091e5e00bbeb1bb31c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Aug 2024 08:17:06 -0500 Subject: [PATCH 06/29] Implement pydantic models as dataclasses (#210) * Implement pydantic models as dataclasses This removes our pydantic dependency by reimplementing them with dataclasses. There are a few breaking changes: 1. The classes won't automatically cast the argumetns to the declared type. IMO, that's the preferable behavior. Some backwards compatability shims have been added for np.dtype, but perhpas we want to remove that too. 2. The models won't have any of the methods they previously inherited from pydantic.BaseModel. This is probably good for user-facing objects, we now have full control over the public API. 3. 
We had to reorder some of the fields on ZArray, since dataclasses is stricter about positional arguments. I've aligned the order with `zarr.create`. --- ci/environment.yml | 1 - docs/releases.rst | 4 + pyproject.toml | 1 - virtualizarr/manifests/manifest.py | 18 +-- virtualizarr/tests/test_kerchunk.py | 4 +- .../tests/test_manifests/test_manifest.py | 10 -- virtualizarr/tests/test_zarr.py | 28 ++++- virtualizarr/zarr.py | 112 ++++++++---------- 8 files changed, 89 insertions(+), 89 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 5368784d..883463a2 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -9,7 +9,6 @@ dependencies: - netcdf4 - xarray>=2024.6.0 - kerchunk>=0.2.5 - - pydantic - numpy>=2.0.0 - ujson - packaging diff --git a/docs/releases.rst b/docs/releases.rst index e7f6df23..7283df71 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -17,6 +17,10 @@ Breaking changes - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. +- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass + `pydantic.BaseModel` (:pull:`210`) +- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) + Deprecations ~~~~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index 859dd227..6b0efe89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "xarray>=2024.06.0", "kerchunk>=0.2.5", "h5netcdf", - "pydantic", "numpy>=2.0.0", "ujson", "packaging", diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index a8621ed2..3aaebb41 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,10 +1,10 @@ +import dataclasses import json import re from collections.abc import Iterable, Iterator from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast import numpy as np -from pydantic import BaseModel, ConfigDict from upath import UPath from virtualizarr.types import ChunkKey @@ -25,22 +25,18 @@ class ChunkDictEntry(TypedDict): ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry]) -class ChunkEntry(BaseModel): +@dataclasses.dataclass(frozen=True) +class ChunkEntry: """ Information for a single chunk in the manifest. Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`. """ - model_config = ConfigDict(frozen=True) - path: str # TODO stricter typing/validation of possible local / remote paths? 
offset: int length: int - def __repr__(self) -> str: - return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})" - @classmethod def from_kerchunk( cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] @@ -57,8 +53,12 @@ def to_kerchunk(self) -> tuple[str, int, int]: """Write out in the format that kerchunk uses for chunk entries.""" return (self.path, self.offset, self.length) - def dict(self) -> ChunkDictEntry: # type: ignore[override] - return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length) + def dict(self) -> ChunkDictEntry: + return ChunkDictEntry( + path=self.path, + offset=self.offset, + length=self.length, + ) class ChunkManifest: diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 9aa934df..379c43ad 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -94,7 +94,7 @@ def test_accessor_to_kerchunk_dict(self): "refs": { ".zgroup": '{"zarr_format":2}', ".zattrs": "{}", - "a/.zarray": '{"chunks":[2,3],"compressor":null,"dtype":" 1 and all(isconfigurable(codec) for codec in metadata["codecs"]) ) + + +def test_replace_partial(): + arr = ZArray(shape=(2, 3), chunks=(1, 1), dtype=np.dtype(" str: - return f"Codec(compressor={self.compressor}, filters={self.filters})" - -class ZArray(BaseModel): +@dataclasses.dataclass +class ZArray: """Just the .zarray information""" # TODO will this work for V3? - model_config = ConfigDict( - arbitrary_types_allowed=True, # only here so pydantic doesn't complain about the numpy dtype field - ) - + shape: tuple[int, ...] chunks: tuple[int, ...] - compressor: dict | None = None dtype: np.dtype - fill_value: FillValueT = Field(None, validate_default=True) + fill_value: FillValueT = dataclasses.field(default=None) + order: Literal["C", "F"] = "C" + compressor: dict | None = None filters: list[dict] | None = None - order: Literal["C", "F"] - shape: tuple[int, ...] 
- zarr_format: ZARR_FORMAT = 2 - - @field_validator("dtype") - @classmethod - def validate_dtype(cls, dtype) -> np.dtype: - # Your custom validation logic here - # Convert numpy.dtype to a format suitable for Pydantic - return np.dtype(dtype) + zarr_format: Literal[2, 3] = 2 def __post_init__(self) -> None: if len(self.shape) != len(self.chunks): @@ -90,20 +64,18 @@ def __post_init__(self) -> None: f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" ) - @model_validator(mode="after") - def _check_fill_value(self) -> Self: + if isinstance(self.dtype, str): + # Convert dtype string to numpy.dtype + self.dtype = np.dtype(self.dtype) + if self.fill_value is None: self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) - return self @property def codec(self) -> Codec: """For comparison against other arrays.""" return Codec(compressor=self.compressor, filters=self.filters) - def __repr__(self) -> str: - return f"ZArray(shape={self.shape}, chunks={self.chunks}, dtype={self.dtype}, compressor={self.compressor}, filters={self.filters}, fill_value={self.fill_value})" - @classmethod def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": # coerce type of fill_value as kerchunk can be inconsistent with this @@ -127,8 +99,8 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": zarr_format=cast(ZARR_FORMAT, zarr_format), ) - def dict(self) -> dict[str, Any]: # type: ignore - zarray_dict = dict(self) + def dict(self) -> dict[str, Any]: + zarray_dict = dataclasses.asdict(self) zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) return zarray_dict @@ -138,30 +110,40 @@ def to_kerchunk_json(self) -> str: zarray_dict["fill_value"] = None return ujson.dumps(zarray_dict) + # ZArray.dict seems to shadow "dict", so we need the type ignore in + # the signature below. def replace( self, - chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[dict] = None, # type: ignore[valid-type] - dtype: Optional[np.dtype] = None, - fill_value: Optional[float] = None, # float or int? - filters: Optional[list[dict]] = None, # type: ignore[valid-type] - order: Optional[Literal["C"] | Literal["F"]] = None, - shape: Optional[tuple[int, ...]] = None, - zarr_format: Optional[Literal[2] | Literal[3]] = None, + shape: tuple[int, ...] | None = None, + chunks: tuple[int, ...] | None = None, + dtype: np.dtype | str | None = None, + fill_value: FillValueT = None, + order: Literal["C", "F"] | None = None, + compressor: "dict | None" = None, # type: ignore[valid-type] + filters: list[dict] | None = None, # type: ignore[valid-type] + zarr_format: Literal[2, 3] | None = None, ) -> "ZArray": """ Convenience method to create a new ZArray from an existing one by altering only certain attributes. 
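+
+        e.g. `zarray.replace(chunks=(2, 3))` returns a copy of the array metadata
+        in which only `chunks` has changed.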
""" - return ZArray( - chunks=chunks if chunks is not None else self.chunks, - compressor=compressor if compressor is not None else self.compressor, - dtype=dtype if dtype is not None else self.dtype, - fill_value=fill_value if fill_value is not None else self.fill_value, - filters=filters if filters is not None else self.filters, - shape=shape if shape is not None else self.shape, - order=order if order is not None else self.order, - zarr_format=zarr_format if zarr_format is not None else self.zarr_format, - ) + replacements: dict[str, Any] = {} + if shape is not None: + replacements["shape"] = shape + if chunks is not None: + replacements["chunks"] = chunks + if dtype is not None: + replacements["dtype"] = dtype + if fill_value is not None: + replacements["fill_value"] = fill_value + if order is not None: + replacements["order"] = order + if compressor is not None: + replacements["compressor"] = compressor + if filters is not None: + replacements["filters"] = filters + if zarr_format is not None: + replacements["zarr_format"] = zarr_format + return dataclasses.replace(self, **replacements) def _v3_codec_pipeline(self) -> list: """ @@ -361,8 +343,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: attrs = metadata.pop("attributes") dim_names = metadata.pop("dimension_names") - chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] - shape = metadata["shape"] + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) zarr_format = metadata["zarr_format"] if metadata["fill_value"] is None: From 0343d48ad32263b5cb485857a880b22cbb5eda7d Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Wed, 21 Aug 2024 19:41:16 +0200 Subject: [PATCH 07/29] use the theme options for `pydata_sphinx_theme` (#223) * use the theme options for `pydata_sphinx_theme` * try adding an icon link * use a round github icon * enable the source edit button * move several options into `html_context` --- docs/conf.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5ec5ff9d..d5312069 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,11 +55,23 @@ html_theme = "pydata_sphinx_theme" html_theme_options = { - "repository_url": "https://github.com/TomNicholas/VirtualiZarr", - "repository_branch": "main", - "path_to_docs": "docs", + "use_edit_page_button": True, + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/zarr-developers/VirtualiZarr", + "icon": "fa-brands fa-github", + "type": "fontawesome", + }, + ] } html_title = "VirtualiZarr" +html_context = { + "github_user": "zarr-developers", + "github_repo": "VirtualiZarr", + "github_version": "main", + "doc_path": "docs", +} # remove sidebar, see GH issue #82 html_css_files = [ From f7f81cca4bc47d286b824df71e512daaa4fb85fc Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Sat, 24 Aug 2024 09:40:47 -0600 Subject: [PATCH 08/29] Removes default storage options (#228) * removes default storage options * add anon into s3 test * spoof creds in s3 test * nested optts --- virtualizarr/kerchunk.py | 10 +++++----- virtualizarr/tests/test_xarray.py | 7 ++++++- virtualizarr/utils.py | 14 ++------------ virtualizarr/xarray.py | 12 ++---------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 6496bf54..abaa1cc9 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -72,19 +72,19 @@ def 
read_kerchunk_references_from_file( filetype : FileType, default: None Type of file to be opened. Used to determine which kerchunk file format backend to use. If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. - reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}} + reader_options: dict, default {} Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. """ + if reader_options is None: + reader_options = {} + if filetype is None: filetype = _automatically_determine_filetype( filepath=filepath, reader_options=reader_options ) - if reader_options is None: - reader_options = {} - # if filetype is user defined, convert to FileType filetype = FileType(filetype) @@ -129,7 +129,7 @@ def read_kerchunk_references_from_file( def _automatically_determine_filetype( *, filepath: str, - reader_options: Optional[dict[str, Any]] = None, + reader_options: Optional[dict[str, Any]] = {}, ) -> FileType: if Path(filepath).suffix == ".zarr": # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 00140a14..9133eb54 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -321,7 +321,12 @@ def test_anon_read_s3(self, filetype, indexes): """Parameterized tests for empty vs supplied indexes and filetypes.""" # TODO: Switch away from this s3 url after minIO is implemented. fpath = "s3://carbonplan-share/virtualizarr/local.nc" - vds = open_virtual_dataset(fpath, filetype=filetype, indexes=indexes) + vds = open_virtual_dataset( + fpath, + filetype=filetype, + indexes=indexes, + reader_options={"storage_options": {"anon": True}}, + ) assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} for var in vds.variables: diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 4899d41d..aa73e7f8 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -24,8 +24,8 @@ def _fsspec_openfile_from_filepath( ---------- filepath : str Input filepath - reader_options : _type_, optional - Dict containing kwargs to pass to file opener, by default {'storage_options':{'key':'', 'secret':'', 'anon':True}} + reader_options : dict, optional + Dict containing kwargs to pass to file opener, by default {} Returns ------- @@ -44,18 +44,8 @@ def _fsspec_openfile_from_filepath( universal_filepath = UPath(filepath) protocol = universal_filepath.protocol - if protocol == "s3": - protocol_defaults = {"key": "", "secret": "", "anon": True} - else: - protocol_defaults = {} - - if reader_options is None: - reader_options = {} - storage_options = reader_options.get("storage_options", {}) # type: ignore - # using dict merge operator to add in defaults if keys are not specified - storage_options = protocol_defaults | storage_options fpath = fsspec.filesystem(protocol, **storage_options).open(filepath) return fpath diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index b612eaf5..35d60b6f 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -15,7 +15,6 @@ import ujson # type: ignore import xarray as xr -from upath import UPath from xarray import register_dataset_accessor from xarray.backends import AbstractDataStore, BackendArray from xarray.coding.times import CFDatetimeCoder @@ -83,7 +82,7 @@ def open_virtual_dataset( virtual_array_class Virtual 
array class to use to represent the references to the chunks in each on-disk array. Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - reader_options: dict, default {'storage_options': {'key': '', 'secret': '', 'anon': True}} + reader_options: dict, default {} Dict passed into Kerchunk file readers, to allow reading from remote filesystems. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. @@ -140,14 +139,7 @@ def open_virtual_dataset( ) else: if reader_options is None: - universal_filepath = UPath(filepath) - protocol = universal_filepath.protocol - if protocol == "s3": - reader_options = { - "storage_options": {"key": "", "secret": "", "anon": True} - } - else: - reader_options = {} + reader_options = {} # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? From d7f0c577a03de8acc30624e0e200f8cb08a51238 Mon Sep 17 00:00:00 2001 From: Ayush Nag <35325113+ayushnag@users.noreply.github.com> Date: Mon, 26 Aug 2024 21:23:23 +0530 Subject: [PATCH 09/29] open_virtual_dataset with dmr++ (#113) * basic dmr parsing functionality * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Speedup DMR chunk key parsing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added groups, docs, and bug fixes * rework hdf5 parser and group logic * update attrs cast to python dtype * parser passing tests * match main manifest dtypes * modularize dmrpp.py * add dmrpp api docs * resolve conflict * indexes and docs fix * Fix type hint for shape * change how FileType is used * Change FileType check again * fix storage_options bug --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Alex Goodman Co-authored-by: Tom Nicholas --- docs/releases.rst | 3 + virtualizarr/kerchunk.py | 1 + virtualizarr/readers/dmrpp.py | 683 ++++++++++++++++++ virtualizarr/tests/test_readers/test_dmrpp.py | 22 + virtualizarr/utils.py | 5 +- virtualizarr/xarray.py | 15 + 6 files changed, 728 insertions(+), 1 deletion(-) create mode 100644 virtualizarr/readers/dmrpp.py create mode 100644 virtualizarr/tests/test_readers/test_dmrpp.py diff --git a/docs/releases.rst b/docs/releases.rst index 7283df71..3fff4211 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ +- Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`) + By `Ayush Nag `_. + - Load scalar variables by default. (:pull:`205`) By `Gustavo Hidalgo `_. 
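Putting the last two changes together, usage after these patches looks roughly like this (the URLs are the ones exercised in the tests; both calls need network access, and the S3 read additionally assumes `s3fs` is installed):

    from virtualizarr import open_virtual_dataset

    # Remote netCDF4 over S3: storage options are no longer injected by default,
    # so anonymous access has to be requested explicitly.
    vds = open_virtual_dataset(
        "s3://carbonplan-share/virtualizarr/local.nc",
        indexes={},
        reader_options={"storage_options": {"anon": True}},
    )

    # DMR++ metadata file: parsed by the new DMRParser rather than kerchunk.
    vds_dmrpp = open_virtual_dataset(
        "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp",
        filetype="dmrpp",
        indexes={},
    )
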
diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index abaa1cc9..a73f2cda 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -42,6 +42,7 @@ class FileType(AutoName): tiff = auto() fits = auto() zarr = auto() + dmrpp = auto() zarr_v3 = auto() diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py new file mode 100644 index 00000000..fa66205a --- /dev/null +++ b/virtualizarr/readers/dmrpp.py @@ -0,0 +1,683 @@ +import os +import warnings +from collections import defaultdict +from collections.abc import Mapping +from typing import Any, Optional +from xml.etree import ElementTree as ET + +import numpy as np +import xarray as xr +from xarray.core.indexes import Index + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.types import ChunkKey +from virtualizarr.zarr import ZArray + + +class DMRParser: + """ + Parser for the OPeNDAP DMR++ XML format. + Reads groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes. + Highly modular to allow support for older dmrpp schema versions. Includes many utility functions to extract + different information such as finding all variable tags, splitting hdf5 groups, parsing dimensions, and more. + + OPeNDAP DMR++ homepage: https://docs.opendap.org/index.php/DMR%2B%2B + """ + + # DAP and DMRPP XML namespaces + _ns = { + "dap": "http://xml.opendap.org/ns/DAP/4.0#", + "dmr": "http://xml.opendap.org/dap/dmrpp/1.0.0#", + } + # DAP data types to numpy data types + _dap_np_dtype = { + "Byte": "uint8", + "UByte": "uint8", + "Int8": "int8", + "UInt8": "uint8", + "Int16": "int16", + "UInt16": "uint16", + "Int32": "int32", + "UInt32": "uint32", + "Int64": "int64", + "UInt64": "uint64", + "Url": "object", + "Float32": "float32", + "Float64": "float64", + "String": "object", + } + # Default zlib compression value + _default_zlib_value = 6 + # Encoding keys that should be removed from attributes and placed in xarray encoding dict + _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + + def __init__(self, dmr: str, data_filepath: Optional[str] = None): + """ + Initialize the DMRParser with the given DMR data and data file path. + + Parameters + ---------- + dmr : str + The DMR file contents as a string. + + data_filepath : str, optional + The path to the actual data file that will be set in the chunk manifests. + If None, the data file path is taken from the DMR file. + """ + self.root = ET.fromstring(dmr) + self.data_filepath = ( + data_filepath if data_filepath is not None else self.root.attrib["name"] + ) + + def parse_dataset( + self, group=None, indexes: Mapping[str, Index] = {} + ) -> xr.Dataset: + """ + Parses the given file and creates a virtual xr.Dataset with ManifestArrays. + + Parameters + ---------- + group : str + The group to parse. If None, and no groups are present, the dataset is parsed. + If None and groups are present, the first group is parsed. + + indexes : Mapping[str, Index], default is {} + Indexes to use on the returned xarray Dataset. + Default is {} which will avoid creating any indexes + + Returns + ------- + An xr.Dataset wrapping virtualized zarr arrays. 
+ + Examples + -------- + Open a sample DMR++ file and parse the dataset + + >>> import requests + >>> r = requests.get("https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp") + >>> parser = DMRParser(r.text) + >>> vds = parser.parse_dataset() + >>> vds + Size: 4MB + Dimensions: (phony_dim_0: 100, phony_dim_1: 100, phony_dim_2: 100) + Dimensions without coordinates: phony_dim_0, phony_dim_1, phony_dim_2 + Data variables: + d_8_chunks (phony_dim_0, phony_dim_1, phony_dim_2) float32 4MB ManifestA... + + >>> vds2 = open_virtual_dataset("https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp", filetype="dmrpp", indexes={}) + >>> vds2 + Size: 4MB + Dimensions: (phony_dim_0: 100, phony_dim_1: 100, phony_dim_2: 100) + Dimensions without coordinates: phony_dim_0, phony_dim_1, phony_dim_2 + Data variables: + d_8_chunks (phony_dim_0, phony_dim_1, phony_dim_2) float32 4MB ManifestA... + """ + if group is not None: + # group = "/" + group.strip("/") # ensure group is in form "/a/b" + group = os.path.normpath(group).removeprefix( + "/" + ) # ensure group is in form "a/b/c" + if self._is_hdf5(self.root): + return self._parse_hdf5_dataset(self.root, group, indexes) + if self.data_filepath.endswith(".nc"): + return self._parse_netcdf4_dataset(self.root, group, indexes) + raise ValueError("DMR file must be HDF5 or netCDF4 based") + + def _parse_netcdf4_dataset( + self, + root: ET.Element, + group: Optional[str] = None, + indexes: Mapping[str, Index] = {}, + ) -> xr.Dataset: + """ + Parse the dataset from the netcdf4 based dmrpp with groups, starting at the given group. + Set root to the given group. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + group : str + The group to parse. If None, and no groups are present, the dataset is parsed. + If None and groups are present, the first group is parsed. + + Returns + ------- + xr.Dataset + """ + group_tags = root.findall("dap:Group", self._ns) + if len(group_tags) == 0: + if group is not None: + # no groups found and group specified -> warning + warnings.warn( + "No groups found in NetCDF4 DMR file; ignoring group parameter" + ) + # no groups found and no group specified -> parse dataset + return self._parse_dataset(root, indexes) + all_groups = self._split_netcdf4(root) + if group is None: + # groups found and no group specified -> parse first group + return self._parse_dataset(group_tags[0], indexes) + if group in all_groups: + # groups found and group specified -> parse specified group + return self._parse_dataset(all_groups[group], indexes) + else: + # groups found and specified group not found -> error + raise ValueError(f"Group {group} not found in NetCDF4 DMR file") + + def _split_netcdf4(self, root: ET.Element) -> dict[str, ET.Element]: + """ + Split the input element into several ET.Elements by netcdf4 group + E.g. {"left": , "right": } + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. 
+ + Returns + ------- + dict[str, ET.Element] + """ + group_tags = root.findall("dap:Group", self._ns) + all_groups: dict[str, ET.Element] = defaultdict( + lambda: ET.Element(root.tag, root.attrib) + ) + for group_tag in group_tags: + all_groups[os.path.normpath(group_tag.attrib["name"])] = group_tag + return all_groups + + def _is_hdf5(self, root: ET.Element) -> bool: + """Check if the DMR file is HDF5 based.""" + if root.find(".//dap:Attribute[@name='fullnamepath']", self._ns) is not None: + return True + if root.find("./dap:Attribute[@name='HDF5_GLOBAL']", self._ns) is not None: + return True + return False + + def _parse_hdf5_dataset( + self, + root: ET.Element, + group: Optional[str] = None, + indexes: Mapping[str, Index] = {}, + ) -> xr.Dataset: + """ + Parse the dataset from the HDF5 based dmrpp with groups, starting at the given group. + Set root to the given group. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + group : str + The group to parse. If None, and no groups are present, the dataset is parsed. + If None and groups are present, the first group is parsed. + + indexes : Mapping[str, Index], default is {} + Indexes to use on the returned xarray Dataset. + Default is {} which will avoid creating any indexes + + Returns + ------- + xr.Dataset + """ + all_groups = self._split_hdf5(root=root) + if len(all_groups) == 0: + raise ValueError("No groups found in HDF based dmrpp file") + if group is None: + # pick a random group if no group is specified + group = next(iter(all_groups)) + attrs = {} + for attr_tag in root.iterfind("dap:Attribute", self._ns): + if attr_tag.attrib["type"] != "Container": + attrs.update(self._parse_attribute(attr_tag)) + if group in all_groups: + # replace aliased variable names with original names: gt1r_heights -> heights + orignames = self._find_original_names(all_groups[group]) + vds = self._parse_dataset(all_groups[group], indexes) + # Only one group so found attrs are global attrs + if len(all_groups) == 1: + vds.attrs.update(attrs) + return vds.rename(orignames) + raise ValueError(f"Group {group} not found in HDF5 dmrpp file") + + def _find_original_names(self, root: ET.Element) -> dict[str, str]: + """ + Find the original variable names from the HDF based groups. E.g. gt1r_heights -> heights + + E.g. if the variable name is 'gt1r_heights', the original name is 'heights' from the group 'gt1r'. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + Returns + ------- + dict[str, str] + """ + + orignames: dict[str, str] = {} + vars_tags: list[ET.Element] = [] + for dap_dtype in self._dap_np_dtype: + vars_tags += root.findall(f"dap:{dap_dtype}", self._ns) + for var_tag in vars_tags: + origname_tag = var_tag.find( + "./dap:Attribute[@name='origname']/dap:Value", self._ns + ) + if origname_tag is not None and origname_tag.text is not None: + orignames[var_tag.attrib["name"]] = origname_tag.text + return orignames + + def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: + """ + Split the input element into several ET.Elements by HDF5 group + E.g. {"gtr1/heights": , "gtr1/temperatures": }. Builds up new elements + each with dimensions, variables, and attributes. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. 
+ + Returns + ------- + dict[str, ET.Element] + """ + # Add all variable, dimension, and attribute tags to their respective groups + groups_roots: dict[str, ET.Element] = defaultdict( + lambda: ET.Element(root.tag, root.attrib) + ) + group_dims: dict[str, set[str]] = defaultdict( + set + ) # {"gt1r/heights": {"dim1", "dim2", ...}} + vars_tags: list[ET.Element] = [] + for dap_dtype in self._dap_np_dtype: + vars_tags += root.findall(f"dap:{dap_dtype}", self._ns) + # Variables + for var_tag in vars_tags: + fullname_tag = var_tag.find( + "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns + ) + if fullname_tag is not None and fullname_tag.text is not None: + # '/gt1r/heights/ph_id_pulse' -> 'gt1r/heights' + group_name = os.path.dirname(fullname_tag.text).removeprefix("/") + groups_roots[group_name].append(var_tag) + dim_tags = var_tag.findall("dap:Dim", self._ns) + dims = self._parse_multi_dims(dim_tags) + group_dims[group_name].update(dims.keys()) + # Dimensions + for dim_tag in root.iterfind("dap:Dimension", self._ns): + for g, d in group_dims.items(): + if dim_tag.attrib["name"] in d: + groups_roots[g].append(dim_tag) + # Attributes + container_attr_tag = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._ns) + if container_attr_tag is None: + attrs_tags = root.findall("dap:Attribute", self._ns) + for attr_tag in attrs_tags: + fullname_tag = attr_tag.find( + "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns + ) + if fullname_tag is not None and fullname_tag.text is not None: + group_name = os.path.dirname(fullname_tag.text).removeprefix("/") + # Add all attributes to the new dataset + groups_roots[group_name].extend(attr_tag) + else: + groups_roots[next(iter(groups_roots))].extend(container_attr_tag) + return groups_roots + + def _parse_dataset( + self, root: ET.Element, indexes: Mapping[str, Index] = {} + ) -> xr.Dataset: + """ + Parse the dataset using the root element of the DMR file. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + Returns + ------- + xr.Dataset + """ + # Dimension names and sizes + dim_tags = root.findall("dap:Dimension", self._ns) + dataset_dims = self._parse_multi_dims(dim_tags) + # Data variables and coordinates + coord_names = self._find_coord_names(root) + # if no coord_names are found or coords don't include dims, dims are used as coords + if len(coord_names) == 0 or len(coord_names) < len(dataset_dims): + coord_names = set(dataset_dims.keys()) + # Seperate and parse coords + data variables + coord_vars: dict[str, xr.Variable] = {} + data_vars: dict[str, xr.Variable] = {} + for var_tag in self._find_var_tags(root): + variable = self._parse_variable(var_tag, dataset_dims) + if var_tag.attrib["name"] in coord_names: + coord_vars[var_tag.attrib["name"]] = variable + else: + data_vars[var_tag.attrib["name"]] = variable + # Attributes + attrs: dict[str, str] = {} + for attr_tag in self.root.iterfind("dap:Attribute", self._ns): + attrs.update(self._parse_attribute(attr_tag)) + return xr.Dataset( + data_vars=data_vars, + coords=xr.Coordinates(coords=coord_vars, indexes=indexes), + attrs=attrs, + ) + + def _find_var_tags(self, root: ET.Element) -> list[ET.Element]: + """ + Find all variable tags in the DMR file. Also known as array tags. + Tags are labeled with the DAP data type. E.g. , , + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. 
+ + Returns + ------- + list[ET.Element] + """ + vars_tags: list[ET.Element] = [] + for dap_dtype in self._dap_np_dtype: + vars_tags += root.findall(f"dap:{dap_dtype}", self._ns) + return vars_tags + + def _find_coord_names(self, root: ET.Element) -> set[str]: + """ + Find the name of all coordinates in root. Checks inside all variables and global attributes. + + Parameters + ---------- + root : ET.Element + The root element of the DMR file. + + Returns + ------- + set[str] : The set of unique coordinate names. + """ + # Check for coordinate names within each variable attributes + coord_names: set[str] = set() + for var_tag in self._find_var_tags(root): + coord_tag = var_tag.find( + "./dap:Attribute[@name='coordinates']/dap:Value", self._ns + ) + if coord_tag is not None and coord_tag.text is not None: + coord_names.update(coord_tag.text.split(" ")) + for map_tag in var_tag.iterfind("dap:Map", self._ns): + coord_names.add(map_tag.attrib["name"].removeprefix("/")) + # Check for coordinate names in a global attribute + coord_tag = var_tag.find("./dap:Attribute[@name='coordinates']", self._ns) + if coord_tag is not None and coord_tag.text is not None: + coord_names.update(coord_tag.text.split(" ")) + return coord_names + + def _parse_dim(self, root: ET.Element) -> dict[str, int | None]: + """ + Parse single or tag + + If the tag has no name attribute, it is a phony dimension. E.g. --> {"phony_dim": 300} + If the tag has no size attribute, it is an unlimited dimension. E.g. --> {"time": None} + If the tag has both name and size attributes, it is a regular dimension. E.g. --> {"lat": 1447} + + Parameters + ---------- + root : ET.Element + The root element Dim/Dimension tag + + Returns + ------- + dict + E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim": 300}, {"time": None, "lat": None, "lon": None} + """ + if "name" not in root.attrib and "size" in root.attrib: + return {"phony_dim": int(root.attrib["size"])} + if "name" in root.attrib and "size" not in root.attrib: + return {os.path.basename(root.attrib["name"]): None} + if "name" in root.attrib and "size" in root.attrib: + return {os.path.basename(root.attrib["name"]): int(root.attrib["size"])} + raise ValueError("Not enough information to parse Dim/Dimension tag") + + def _parse_multi_dims( + self, dim_tags: list[ET.Element], global_dims: dict[str, int] = {} + ) -> dict: + """ + Parse multiple or tags. Generally tags are found within dmrpp variable tags. + + Returns best possible matching of {dimension: shape} present in the list and global_dims. E.g tags=(Dim("lat", None), Dim("lon", None)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"lat": 100, "lon": 100} + + E.g. tags=(Dim("time", None), Dim("", 200)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"time": 5, "phony_dim0": 200} + + This function is often used to fill in missing sizes from the global_dims. E.g. Variable tags may contain only dimension names and not sizes. If the {name: size} matching is known from the global_dims, it is used to fill in the missing sizes. + + Parameters + ---------- + dim_tags : tuple[ET.Element] + A tuple of ElementTree Elements representing dimensions in the DMR file. + + global_dims : dict + A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895} + + Returns + ------- + dict + E.g. 
{"time": 1, "lat": 1447, "lon": 2895} + """ + dims: dict[str, int | None] = {} + for dim_tag in dim_tags: + dim: dict[str, int | None] = self._parse_dim(dim_tag) + if "phony_dim" in dim: + dims["phony_dim_" + str(len(dims))] = dim["phony_dim"] + else: + dims.update(dim) + for name, size in list(dims.items()): + if name in global_dims and size is None: + dims[name] = global_dims[name] + return dims + + def _parse_variable( + self, var_tag: ET.Element, dataset_dims: dict[str, int] + ) -> xr.Variable: + """ + Parse a variable from a DMR tag. + + Parameters + ---------- + var_tag : ET.Element + An ElementTree Element representing a variable in the DMR file. Will have DAP dtype as tag. + + dataset_dims : dict + A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895} + Must contain at least all the dimensions used by the variable. Necessary since the variable + metadata only contains the dimension names and not the sizes. + + Returns + ------- + xr.Variable + """ + # Dimension names + dim_tags = var_tag.findall("dap:Dim", self._ns) + dim_shapes = self._parse_multi_dims(dim_tags, dataset_dims) + # convert DAP dtype to numpy dtype + dtype = np.dtype( + self._dap_np_dtype[var_tag.tag.removeprefix("{" + self._ns["dap"] + "}")] + ) + # Chunks and Filters + filters = None + shape: tuple[int, ...] = tuple(dim_shapes.values()) + chunks_shape = shape + chunks_tag = var_tag.find("dmr:chunks", self._ns) + if chunks_tag is not None: + # Chunks + found_chunk_dims = self._parse_chunks_dimensions(chunks_tag) + chunks_shape = found_chunk_dims if found_chunk_dims is not None else shape + chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) + # Filters + filters = self._parse_filters(chunks_tag, dtype) + # Attributes + attrs: dict[str, Any] = {} + for attr_tag in var_tag.iterfind("dap:Attribute", self._ns): + attrs.update(self._parse_attribute(attr_tag)) + # Fill value is placed in encoding and thus removed from attributes + fill_value = attrs.pop("_FillValue", 0.0) + # Remove attributes only used for parsing logic + attrs.pop("fullnamepath", None) + attrs.pop("origname", None) + attrs.pop("coordinates", None) + # create ManifestArray and ZArray + zarray = ZArray( + chunks=chunks_shape, + dtype=dtype, + fill_value=fill_value, + filters=filters, + order="C", + shape=shape, + ) + marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest) + encoding = {k: attrs.get(k) for k in self._encoding_keys if k in attrs} + return xr.Variable( + dims=dim_shapes.keys(), data=marr, attrs=attrs, encoding=encoding + ) + + def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: + """ + Parse an attribute from a DMR attr tag. Converts the attribute value to a native python type. + + Parameters + ---------- + attr_tag : ET.Element + An ElementTree Element with an tag. 
+ + Returns + ------- + dict + """ + attr: dict[str, Any] = {} + values = [] + if "type" in attr_tag.attrib and attr_tag.attrib["type"] == "Container": + return attr + dtype = np.dtype(self._dap_np_dtype[attr_tag.attrib["type"]]) + # if multiple Value tags are present, store as "key": "[v1, v2, ...]" + for value_tag in attr_tag: + # cast attribute to native python type using dmr provided dtype + val = ( + dtype.type(value_tag.text).item() + if dtype != np.object_ + else value_tag.text + ) + if val == "*": + val = np.nan + values.append(val) + attr[attr_tag.attrib["name"]] = values[0] if len(values) == 1 else values + return attr + + def _parse_filters( + self, chunks_tag: ET.Element, dtype: np.dtype + ) -> list[dict] | None: + """ + Parse filters from a DMR chunks tag. + + Parameters + ---------- + chunks_tag : ET.Element + An ElementTree Element with a tag. + + dtype : np.dtype + The numpy dtype of the variable. + + Returns + ------- + list[dict] | None + E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}] + """ + if "compressionType" in chunks_tag.attrib: + filters: list[dict] = [] + # shuffle deflate --> ["shuffle", "deflate"] + compression_types = chunks_tag.attrib["compressionType"].split(" ") + for c in compression_types: + if c == "shuffle": + filters.append({"id": "shuffle", "elementsize": dtype.itemsize}) + elif c == "deflate": + filters.append( + { + "id": "zlib", + "level": int( + chunks_tag.attrib.get( + "deflateLevel", self._default_zlib_value + ) + ), + } + ) + return filters + return None + + def _parse_chunks_dimensions( + self, chunks_tag: ET.Element + ) -> tuple[int, ...] | None: + """ + Parse the chunk dimensions from a DMR chunks tag. Returns None if no chunk dimensions are found. + + Parameters + ---------- + chunks_tag : ET.Element + An ElementTree Element with a tag. + + Returns + ------- + tuple[int, ...] | None + + """ + chunk_dim_tag = chunks_tag.find("dmr:chunkDimensionSizes", self._ns) + if chunk_dim_tag is not None and chunk_dim_tag.text is not None: + # 1 1447 2895 -> (1, 1447, 2895) + return tuple(map(int, chunk_dim_tag.text.split())) + return None + + def _parse_chunks( + self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...] + ) -> ChunkManifest: + """ + Parse the chunk manifest from a DMR chunks tag. + + Parameters + ---------- + chunks_tag : ET.Element + An ElementTree Element with a tag. + + chunks_shape : tuple + Chunk sizes for each dimension. E.g. 
(1, 1447, 2895) + + Returns + ------- + ChunkManifest + """ + chunkmanifest: dict[ChunkKey, object] = {} + default_num: list[int] = ( + [0 for i in range(len(chunks_shape))] if chunks_shape else [0] + ) + chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) + for chunk_tag in chunks_tag.iterfind("dmr:chunk", self._ns): + chunk_num = default_num + if "chunkPositionInArray" in chunk_tag.attrib: + # "[0,1023,10235]" -> ["0","1023","10235"] + chunk_pos = chunk_tag.attrib["chunkPositionInArray"][1:-1].split(",") + # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5] + chunk_num = [ + int(chunk_pos[i]) // chunks_shape[i] + for i in range(len(chunks_shape)) + ] + # [0,1,5] -> "0.1.5" + chunk_key = ChunkKey(chunk_key_template.format(*chunk_num)) + chunkmanifest[chunk_key] = { + "path": self.data_filepath, + "offset": int(chunk_tag.attrib["offset"]), + "length": int(chunk_tag.attrib["nBytes"]), + } + return ChunkManifest(entries=chunkmanifest) diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_readers/test_dmrpp.py new file mode 100644 index 00000000..d2b19d60 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_dmrpp.py @@ -0,0 +1,22 @@ +import pytest +import xarray as xr + +from virtualizarr import open_virtual_dataset +from virtualizarr.tests import network + +urls = [ + ( + "netcdf4", + "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5", + "dmrpp", + "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp", + ) +] + + +@network +@pytest.mark.parametrize("data_type, data_url, dmrpp_type, dmrpp_url", urls) +def test_dmrpp_reader(data_type, data_url, dmrpp_type, dmrpp_url): + result = open_virtual_dataset(dmrpp_url, indexes={}, filetype=dmrpp_type) + expected = open_virtual_dataset(data_url, indexes={}) + xr.testing.assert_identical(result, expected) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index aa73e7f8..092ddd25 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -16,7 +16,7 @@ def _fsspec_openfile_from_filepath( *, filepath: str, - reader_options: Optional[dict] = {}, + reader_options: Optional[dict] = None, ) -> OpenFileType: """Converts input filepath to fsspec openfile object. @@ -44,6 +44,9 @@ def _fsspec_openfile_from_filepath( universal_filepath = UPath(filepath) protocol = universal_filepath.protocol + if reader_options is None: + reader_options = {} + storage_options = reader_options.get("storage_options", {}) # type: ignore fpath = fsspec.filesystem(protocol, **storage_options).open(filepath) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 35d60b6f..0fb33815 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -137,6 +137,21 @@ def open_virtual_dataset( return open_virtual_dataset_from_v3_store( storepath=filepath, drop_variables=drop_variables, indexes=indexes ) + elif filetype == FileType.dmrpp: + from virtualizarr.readers.dmrpp import DMRParser + + if loadable_variables != [] or cftime_variables != [] or indexes is None: + raise NotImplementedError( + "Specifying `loadable_variables`, `cftime_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." 
+ ) + + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) + parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp")) + vds = parser.parse_dataset() + vds.drop_vars(drop_variables) + return vds else: if reader_options is None: reader_options = {} From 515d157b41bbbf9d40898c7b9cab5486d99c66d2 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 26 Aug 2024 20:09:54 -0600 Subject: [PATCH 10/29] Internal refactor to separate reading and writing concerns (#231) * split xarray.py into backend.py and accessor.py * move the kerchunk serialization code out into a new writers submodule * separate out the zarr reading code as a separate reader * actually include new accessor.py file * actually include new kerchunk writers file * actually include new zarr writer file * update test to import from the new location of zarr code * refactor to create a kerchunk 'reader' * split test_xarray.py into two files * split up the kerchunk tests into tests of writing and reading kerchunk * absolute imports in top-level init * kerchunk.py -> types.kerchunk.py * fix some mypy issues * release notes * update module paths in API docs * separate zarr writer tests out * forgot file i moved the zarr tests to * move left behind test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/api.rst | 6 +- docs/releases.rst | 5 +- virtualizarr/__init__.py | 6 +- virtualizarr/accessor.py | 166 +++++++++ virtualizarr/{xarray.py => backend.py} | 339 ++---------------- virtualizarr/manifests/array.py | 7 +- virtualizarr/{ => readers}/kerchunk.py | 305 +++++++--------- virtualizarr/readers/zarr.py | 131 +++++++ virtualizarr/tests/test_backend.py | 255 +++++++++++++ virtualizarr/tests/test_kerchunk.py | 238 +----------- virtualizarr/tests/test_readers/__init__.py | 0 .../tests/test_readers/test_kerchunk.py | 63 ++++ virtualizarr/tests/test_writers/__init__.py | 0 .../tests/test_writers/test_kerchunk.py | 118 ++++++ virtualizarr/tests/test_writers/test_zarr.py | 82 +++++ virtualizarr/tests/test_xarray.py | 187 ---------- virtualizarr/tests/test_zarr.py | 80 +---- virtualizarr/types/__init__.py | 3 + virtualizarr/{types.py => types/general.py} | 0 virtualizarr/types/kerchunk.py | 12 + virtualizarr/writers/__init__.py | 0 virtualizarr/writers/kerchunk.py | 124 +++++++ virtualizarr/writers/zarr.py | 115 ++++++ virtualizarr/zarr.py | 178 --------- 24 files changed, 1247 insertions(+), 1173 deletions(-) create mode 100644 virtualizarr/accessor.py rename virtualizarr/{xarray.py => backend.py} (50%) rename virtualizarr/{ => readers}/kerchunk.py (51%) create mode 100644 virtualizarr/readers/zarr.py create mode 100644 virtualizarr/tests/test_backend.py create mode 100644 virtualizarr/tests/test_readers/__init__.py create mode 100644 virtualizarr/tests/test_readers/test_kerchunk.py create mode 100644 virtualizarr/tests/test_writers/__init__.py create mode 100644 virtualizarr/tests/test_writers/test_kerchunk.py create mode 100644 virtualizarr/tests/test_writers/test_zarr.py create mode 100644 virtualizarr/types/__init__.py rename virtualizarr/{types.py => types/general.py} (100%) create mode 100644 virtualizarr/types/kerchunk.py create mode 100644 virtualizarr/writers/__init__.py create mode 100644 virtualizarr/writers/kerchunk.py create mode 100644 virtualizarr/writers/zarr.py diff --git a/docs/api.rst b/docs/api.rst index 
3dc1d146..81d08a77 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -21,7 +21,7 @@ Manifests Reading ======= -.. currentmodule:: virtualizarr.xarray +.. currentmodule:: virtualizarr.backend .. autosummary:: :nosignatures: :toctree: generated/ @@ -32,7 +32,7 @@ Reading Serialization ============= -.. currentmodule:: virtualizarr.xarray +.. currentmodule:: virtualizarr.accessor .. autosummary:: :nosignatures: :toctree: generated/ @@ -44,7 +44,7 @@ Serialization Rewriting ============= -.. currentmodule:: virtualizarr.xarray +.. currentmodule:: virtualizarr.accessor .. autosummary:: :nosignatures: :toctree: generated/ diff --git a/docs/releases.rst b/docs/releases.rst index 3fff4211..5ae3bff4 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -34,7 +34,7 @@ Bug fixes - Exclude empty chunks during `ChunkDict` construction. (:pull:`198`) By `Gustavo Hidalgo `_. - Fixed regression in `fill_value` handling for datetime dtypes making virtual - Zarr stores unreadable (:pr:`206`) + Zarr stores unreadable (:pull:`206`) By `Timothy Hodson `_ Documentation @@ -43,6 +43,9 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Refactored internal structure significantly to split up everything to do with reading references from that to do with writing references. + (:issue:`229`) (:pull:`231`) By `Tom Nicholas `_. + .. _v1.0.0: v1.0.0 (9th July 2024) diff --git a/virtualizarr/__init__.py b/virtualizarr/__init__.py index 11bdae6e..bd70f834 100644 --- a/virtualizarr/__init__.py +++ b/virtualizarr/__init__.py @@ -1,6 +1,6 @@ -from .manifests import ChunkManifest, ManifestArray # type: ignore # noqa -from .xarray import VirtualiZarrDatasetAccessor # type: ignore # noqa -from .xarray import open_virtual_dataset # noqa: F401 +from virtualizarr.manifests import ChunkManifest, ManifestArray # type: ignore # noqa +from virtualizarr.accessor import VirtualiZarrDatasetAccessor # type: ignore # noqa +from virtualizarr.backend import open_virtual_dataset # noqa: F401 from importlib.metadata import version as _version diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py new file mode 100644 index 00000000..0a97237e --- /dev/null +++ b/virtualizarr/accessor.py @@ -0,0 +1,166 @@ +from pathlib import Path +from typing import ( + Callable, + Literal, + overload, +) + +import ujson # type: ignore +from xarray import Dataset, register_dataset_accessor + +from virtualizarr.manifests import ManifestArray +from virtualizarr.types.kerchunk import KerchunkStoreRefs +from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs +from virtualizarr.writers.zarr import dataset_to_zarr + + +@register_dataset_accessor("virtualize") +class VirtualiZarrDatasetAccessor: + """ + Xarray accessor for writing out virtual datasets to disk. + + Methods on this object are called via `ds.virtualize.{method}`. + """ + + def __init__(self, ds: Dataset): + self.ds: Dataset = ds + + def to_zarr(self, storepath: str) -> None: + """ + Serialize all virtualized arrays in this xarray dataset as a Zarr store. + + Currently requires all variables to be backed by ManifestArray objects. + + Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. + See https://github.com/zarr-developers/zarr-specs/issues/287 + + Parameters + ---------- + storepath : str + """ + dataset_to_zarr(self.ds, storepath) + + @overload + def to_kerchunk( + self, filepath: None, format: Literal["dict"] + ) -> KerchunkStoreRefs: ... 
+ + @overload + def to_kerchunk(self, filepath: str | Path, format: Literal["json"]) -> None: ... + + @overload + def to_kerchunk( + self, + filepath: str | Path, + format: Literal["parquet"], + record_size: int = 100_000, + categorical_threshold: int = 10, + ) -> None: ... + + def to_kerchunk( + self, + filepath: str | Path | None = None, + format: Literal["dict", "json", "parquet"] = "dict", + record_size: int = 100_000, + categorical_threshold: int = 10, + ) -> KerchunkStoreRefs | None: + """ + Serialize all virtualized arrays in this xarray dataset into the kerchunk references format. + + Parameters + ---------- + filepath : str, default: None + File path to write kerchunk references into. Not required if format is 'dict'. + format : 'dict', 'json', or 'parquet' + Format to serialize the kerchunk references as. + If 'json' or 'parquet' then the 'filepath' argument is required. + record_size (parquet only): int + Number of references to store in each reference file (default 100,000). Bigger values + mean fewer read requests but larger memory footprint. + categorical_threshold (parquet only) : int + Encode urls as pandas.Categorical to reduce memory footprint if the ratio + of the number of unique urls to total number of refs for each variable + is greater than or equal to this number. (default 10) + + References + ---------- + https://fsspec.github.io/kerchunk/spec.html + """ + refs = dataset_to_kerchunk_refs(self.ds) + + if format == "dict": + return refs + elif format == "json": + if filepath is None: + raise ValueError("Filepath must be provided when format is 'json'") + + with open(filepath, "w") as json_file: + ujson.dump(refs, json_file) + + return None + elif format == "parquet": + from kerchunk.df import refs_to_dataframe + + if isinstance(filepath, Path): + url = str(filepath) + elif isinstance(filepath, str): + url = filepath + + # refs_to_dataframe is responsible for writing to parquet. + # at no point does it create a full in-memory dataframe. + refs_to_dataframe( + refs, + url=url, + record_size=record_size, + categorical_threshold=categorical_threshold, + ) + return None + else: + raise ValueError(f"Unrecognized output format: {format}") + + def rename_paths( + self, + new: str | Callable[[str], str], + ) -> Dataset: + """ + Rename paths to chunks in every ManifestArray in this dataset. + + Accepts either a string, in which case this new path will be used for all chunks, or + a function which accepts the old path and returns the new path. + + Parameters + ---------- + new + New path to use for all chunks, either as a string, or as a function which accepts and returns strings. + + Returns + ------- + Dataset + + Examples + -------- + Rename paths to reflect moving the referenced files from local storage to an S3 bucket. + + >>> def local_to_s3_url(old_local_path: str) -> str: + ... from pathlib import Path + ... + ... new_s3_bucket_url = "http://s3.amazonaws.com/my_bucket/" + ... + ... filename = Path(old_local_path).name + ... 
return str(new_s3_bucket_url / filename) + + >>> ds.virtualize.rename_paths(local_to_s3_url) + + See Also + -------- + ManifestArray.rename_paths + ChunkManifest.rename_paths + """ + + new_ds = self.ds.copy() + for var_name in new_ds.variables: + data = new_ds[var_name].data + if isinstance(data, ManifestArray): + new_ds[var_name].data = data.rename_paths(new=new) + + return new_ds diff --git a/virtualizarr/xarray.py b/virtualizarr/backend.py similarity index 50% rename from virtualizarr/xarray.py rename to virtualizarr/backend.py index 0fb33815..87c2aa2a 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/backend.py @@ -1,39 +1,47 @@ import os import warnings from collections.abc import Iterable, Mapping, MutableMapping +from enum import Enum, auto from io import BufferedIOBase -from pathlib import Path from typing import ( Any, - Callable, Hashable, - Literal, Optional, cast, - overload, ) -import ujson # type: ignore import xarray as xr -from xarray import register_dataset_accessor from xarray.backends import AbstractDataStore, BackendArray from xarray.coding.times import CFDatetimeCoder from xarray.core.indexes import Index, PandasIndex from xarray.core.variable import IndexVariable -import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import FileType, KerchunkStoreRefs -from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests import ManifestArray from virtualizarr.utils import _fsspec_openfile_from_filepath -from virtualizarr.zarr import ( - attrs_from_zarr_group_json, - dataset_to_zarr, - metadata_from_zarr_json, -) XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore +class AutoName(Enum): + # Recommended by official Python docs for auto naming: + # https://docs.python.org/3/library/enum.html#using-automatic-values + def _generate_next_value_(name, start, count, last_values): + return name + + +class FileType(AutoName): + netcdf3 = auto() + netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 + hdf4 = auto() + hdf5 = auto() + grib = auto() + tiff = auto() + fits = auto() + zarr = auto() + dmrpp = auto() + zarr_v3 = auto() + + class ManifestBackendArray(ManifestArray, BackendArray): """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" @@ -134,6 +142,8 @@ def open_virtual_dataset( if filetype == FileType.zarr_v3: # TODO is there a neat way of auto-detecting this? + from virtualizarr.readers.zarr import open_virtual_dataset_from_v3_store + return open_virtual_dataset_from_v3_store( storepath=filepath, drop_variables=drop_variables, indexes=indexes ) @@ -153,12 +163,19 @@ def open_virtual_dataset( vds.drop_vars(drop_variables) return vds else: + # we currently read every other filetype using kerchunks various file format backends + from virtualizarr.readers.kerchunk import ( + fully_decode_arr_refs, + read_kerchunk_references_from_file, + virtual_vars_from_kerchunk_refs, + ) + if reader_options is None: reader_options = {} # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? 
- vds_refs = kerchunk.read_kerchunk_references_from_file( + vds_refs = read_kerchunk_references_from_file( filepath=filepath, filetype=filetype, reader_options=reader_options, @@ -168,7 +185,7 @@ def open_virtual_dataset( drop_variables=drop_variables + loadable_variables, virtual_array_class=virtual_array_class, ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) coord_names = ds_attrs.pop("coordinates", []) if indexes is None or len(loadable_variables) > 0: @@ -235,144 +252,6 @@ def open_virtual_dataset( return vds -def open_virtual_dataset_from_v3_store( - storepath: str, - drop_variables: list[str], - indexes: Mapping[str, Index] | None, -) -> xr.Dataset: - """ - Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays. - """ - _storepath = Path(storepath) - - ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json") - coord_names = ds_attrs.pop("coordinates", []) - - # TODO recursive glob to create a datatree - # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it - # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 - all_paths = _storepath.glob("*/") - directory_paths = [p for p in all_paths if not p.is_file()] - - vars = {} - for array_dir in directory_paths: - var_name = array_dir.name - if var_name in drop_variables: - break - - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") - manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - - marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) - var = xr.Variable(data=marr, dims=dim_names, attrs=attrs) - vars[var_name] = var - - if indexes is None: - raise NotImplementedError() - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - ds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return ds - - -def virtual_vars_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] | None = None, - virtual_array_class=ManifestArray, -) -> dict[str, xr.Variable]: - """ - Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. - - Parameters - ---------- - drop_variables: list[str], default is None - Variables in the file to drop before returning. - virtual_array_class - Virtual array class to use to represent the references to the chunks in each on-disk array. - Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. 
- """ - - var_names = kerchunk.find_var_names(refs) - if drop_variables is None: - drop_variables = [] - var_names_to_keep = [ - var_name for var_name in var_names if var_name not in drop_variables - ] - - vars = { - var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) - for var_name in var_names_to_keep - } - return vars - - -def dataset_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] = [], - virtual_array_class: type = ManifestArray, - indexes: MutableMapping[str, Index] | None = None, -) -> xr.Dataset: - """ - Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. - - drop_variables: list[str], default is None - Variables in the file to drop before returning. - virtual_array_class - Virtual array class to use to represent the references to the chunks in each on-disk array. - Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - """ - - vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) - ds_attrs = kerchunk.fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) - - if indexes is None: - indexes = {} - data_vars, coords = separate_coords(vars, indexes, coord_names) - - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return vds - - -def variable_from_kerchunk_refs( - refs: KerchunkStoreRefs, var_name: str, virtual_array_class -) -> xr.Variable: - """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" - - arr_refs = kerchunk.extract_array_refs(refs, var_name) - chunk_dict, zarray, zattrs = kerchunk.parse_array_refs(arr_refs) - # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs - dims = zattrs.pop("_ARRAY_DIMENSIONS") - if chunk_dict: - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) - varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) - else: - # This means we encountered a scalar variable of dimension 0, - # very likely that it actually has no numeric value and its only purpose - # is to communicate dataset attributes. - varr = zarray.fill_value - - return xr.Variable(data=varr, dims=dims, attrs=zattrs) - - def separate_coords( vars: Mapping[str, xr.Variable], indexes: MutableMapping[str, Index], @@ -415,155 +294,3 @@ def separate_coords( coords = xr.Coordinates(coord_vars, indexes=indexes) return data_vars, coords - - -@register_dataset_accessor("virtualize") -class VirtualiZarrDatasetAccessor: - """ - Xarray accessor for writing out virtual datasets to disk. - - Methods on this object are called via `ds.virtualize.{method}`. - """ - - def __init__(self, ds: xr.Dataset): - self.ds: xr.Dataset = ds - - def to_zarr(self, storepath: str) -> None: - """ - Serialize all virtualized arrays in this xarray dataset as a Zarr store. - - Currently requires all variables to be backed by ManifestArray objects. - - Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. - See https://github.com/zarr-developers/zarr-specs/issues/287 - - Parameters - ---------- - storepath : str - """ - dataset_to_zarr(self.ds, storepath) - - @overload - def to_kerchunk( - self, filepath: None, format: Literal["dict"] - ) -> KerchunkStoreRefs: ... - - @overload - def to_kerchunk(self, filepath: str | Path, format: Literal["json"]) -> None: ... 
- - @overload - def to_kerchunk( - self, - filepath: str | Path, - format: Literal["parquet"], - record_size: int = 100_000, - categorical_threshold: int = 10, - ) -> None: ... - - def to_kerchunk( - self, - filepath: str | Path | None = None, - format: Literal["dict", "json", "parquet"] = "dict", - record_size: int = 100_000, - categorical_threshold: int = 10, - ) -> KerchunkStoreRefs | None: - """ - Serialize all virtualized arrays in this xarray dataset into the kerchunk references format. - - Parameters - ---------- - filepath : str, default: None - File path to write kerchunk references into. Not required if format is 'dict'. - format : 'dict', 'json', or 'parquet' - Format to serialize the kerchunk references as. - If 'json' or 'parquet' then the 'filepath' argument is required. - record_size (parquet only): int - Number of references to store in each reference file (default 100,000). Bigger values - mean fewer read requests but larger memory footprint. - categorical_threshold (parquet only) : int - Encode urls as pandas.Categorical to reduce memory footprint if the ratio - of the number of unique urls to total number of refs for each variable - is greater than or equal to this number. (default 10) - - References - ---------- - https://fsspec.github.io/kerchunk/spec.html - """ - refs = kerchunk.dataset_to_kerchunk_refs(self.ds) - - if format == "dict": - return refs - elif format == "json": - if filepath is None: - raise ValueError("Filepath must be provided when format is 'json'") - - with open(filepath, "w") as json_file: - ujson.dump(refs, json_file) - - return None - elif format == "parquet": - from kerchunk.df import refs_to_dataframe - - if isinstance(filepath, Path): - url = str(filepath) - elif isinstance(filepath, str): - url = filepath - - # refs_to_dataframe is responsible for writing to parquet. - # at no point does it create a full in-memory dataframe. - refs_to_dataframe( - refs, - url=url, - record_size=record_size, - categorical_threshold=categorical_threshold, - ) - return None - else: - raise ValueError(f"Unrecognized output format: {format}") - - def rename_paths( - self, - new: str | Callable[[str], str], - ) -> xr.Dataset: - """ - Rename paths to chunks in every ManifestArray in this dataset. - - Accepts either a string, in which case this new path will be used for all chunks, or - a function which accepts the old path and returns the new path. - - Parameters - ---------- - new - New path to use for all chunks, either as a string, or as a function which accepts and returns strings. - - Returns - ------- - Dataset - - Examples - -------- - Rename paths to reflect moving the referenced files from local storage to an S3 bucket. - - >>> def local_to_s3_url(old_local_path: str) -> str: - ... from pathlib import Path - ... - ... new_s3_bucket_url = "http://s3.amazonaws.com/my_bucket/" - ... - ... filename = Path(old_local_path).name - ... 
return str(new_s3_bucket_url / filename) - - >>> ds.virtualize.rename_paths(local_to_s3_url) - - See Also - -------- - ManifestArray.rename_paths - ChunkManifest.rename_paths - """ - - new_ds = self.ds.copy() - for var_name in new_ds.variables: - data = new_ds[var_name].data - if isinstance(data, ManifestArray): - new_ds[var_name].data = data.rename_paths(new=new) - - return new_ds diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 0ec9c844..5ac0aef0 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,7 +3,7 @@ import numpy as np -from ..kerchunk import KerchunkArrRefs +from ..types.kerchunk import KerchunkArrRefs from ..zarr import ZArray from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan from .manifest import ChunkManifest @@ -61,7 +61,10 @@ def __init__( @classmethod def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray": - from virtualizarr.kerchunk import fully_decode_arr_refs, parse_array_refs + from virtualizarr.readers.kerchunk import ( + fully_decode_arr_refs, + parse_array_refs, + ) decoded_arr_refs = fully_decode_arr_refs(arr_refs) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/readers/kerchunk.py similarity index 51% rename from virtualizarr/kerchunk.py rename to virtualizarr/readers/kerchunk.py index a73f2cda..4686ce94 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -1,61 +1,57 @@ -import base64 -import json import warnings -from enum import Enum, auto from pathlib import Path -from typing import Any, NewType, Optional, cast +from typing import Any, MutableMapping, Optional, cast -import numpy as np import ujson # type: ignore -import xarray as xr -from xarray.coding.times import CFDatetimeCoder - -from virtualizarr.manifests.manifest import join +from xarray import Dataset +from xarray.core.indexes import Index +from xarray.core.variable import Variable + +from virtualizarr.backend import FileType, separate_coords +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.types.kerchunk import ( + KerchunkArrRefs, + KerchunkStoreRefs, +) from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray, ZAttrs -# Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean -# (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html) -# TODO I would prefer to be more specific about these types -KerchunkStoreRefs = NewType( - "KerchunkStoreRefs", dict -) # top-level dict with keys for 'version', 'refs' -KerchunkArrRefs = NewType( - "KerchunkArrRefs", - dict, -) # lower-level dict containing just the information for one zarr array - - -class AutoName(Enum): - # Recommended by official Python docs for auto naming: - # https://docs.python.org/3/library/enum.html#using-automatic-values - def _generate_next_value_(name, start, count, last_values): - return name - - -class FileType(AutoName): - netcdf3 = auto() - netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 - hdf4 = auto() - hdf5 = auto() - grib = auto() - tiff = auto() - fits = auto() - zarr = auto() - dmrpp = auto() - zarr_v3 = auto() - - -class NumpyEncoder(json.JSONEncoder): - # TODO I don't understand how kerchunk gets around this problem of encoding numpy types (in the zattrs) whilst only using ujson - def default(self, obj): - if isinstance(obj, np.ndarray): - return obj.tolist() # Convert NumPy array to 
Python list - elif isinstance(obj, np.generic): - return obj.item() # Convert NumPy scalar to Python scalar - elif isinstance(obj, np.dtype): - return str(obj) - return json.JSONEncoder.default(self, obj) + +# TODO shouldn't this live in backend.py? Because it's not just useful for the kerchunk-specific readers... +def _automatically_determine_filetype( + *, + filepath: str, + reader_options: Optional[dict[str, Any]] = {}, +) -> FileType: + if Path(filepath).suffix == ".zarr": + # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... + raise NotImplementedError() + + # Read magic bytes from local or remote file + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) + magic_bytes = fpath.read(8) + fpath.close() + + if magic_bytes.startswith(b"CDF"): + filetype = FileType.netcdf3 + elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): + raise NotImplementedError("HDF4 formatted files not supported") + elif magic_bytes.startswith(b"\x89HDF"): + filetype = FileType.hdf5 + elif magic_bytes.startswith(b"GRIB"): + filetype = FileType.grib + elif magic_bytes.startswith(b"II*"): + filetype = FileType.tiff + elif magic_bytes.startswith(b"SIMPLE"): + filetype = FileType.fits + else: + raise NotImplementedError( + f"Unrecognised file based on header bytes: {magic_bytes}" + ) + + return filetype def read_kerchunk_references_from_file( @@ -127,40 +123,90 @@ def read_kerchunk_references_from_file( return refs -def _automatically_determine_filetype( - *, - filepath: str, - reader_options: Optional[dict[str, Any]] = {}, -) -> FileType: - if Path(filepath).suffix == ".zarr": - # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... - raise NotImplementedError() +def virtual_vars_from_kerchunk_refs( + refs: KerchunkStoreRefs, + drop_variables: list[str] | None = None, + virtual_array_class=ManifestArray, +) -> dict[str, Variable]: + """ + Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. - # Read magic bytes from local or remote file - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options + Parameters + ---------- + drop_variables: list[str], default is None + Variables in the file to drop before returning. + virtual_array_class + Virtual array class to use to represent the references to the chunks in each on-disk array. + Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + """ + + var_names = find_var_names(refs) + if drop_variables is None: + drop_variables = [] + var_names_to_keep = [ + var_name for var_name in var_names if var_name not in drop_variables + ] + + vars = { + var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) + for var_name in var_names_to_keep + } + return vars + + +def dataset_from_kerchunk_refs( + refs: KerchunkStoreRefs, + drop_variables: list[str] = [], + virtual_array_class: type = ManifestArray, + indexes: MutableMapping[str, Index] | None = None, +) -> Dataset: + """ + Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. + + drop_variables: list[str], default is None + Variables in the file to drop before returning. + virtual_array_class + Virtual array class to use to represent the references to the chunks in each on-disk array. 
+ Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + """ + + vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) + ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) + coord_names = ds_attrs.pop("coordinates", []) + + if indexes is None: + indexes = {} + data_vars, coords = separate_coords(vars, indexes, coord_names) + + vds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, ) - magic_bytes = fpath.read(8) - fpath.close() - if magic_bytes.startswith(b"CDF"): - filetype = FileType.netcdf3 - elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): - raise NotImplementedError("HDF4 formatted files not supported") - elif magic_bytes.startswith(b"\x89HDF"): - filetype = FileType.hdf5 - elif magic_bytes.startswith(b"GRIB"): - filetype = FileType.grib - elif magic_bytes.startswith(b"II*"): - filetype = FileType.tiff - elif magic_bytes.startswith(b"SIMPLE"): - filetype = FileType.fits + return vds + + +def variable_from_kerchunk_refs( + refs: KerchunkStoreRefs, var_name: str, virtual_array_class +) -> Variable: + """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" + + arr_refs = extract_array_refs(refs, var_name) + chunk_dict, zarray, zattrs = parse_array_refs(arr_refs) + # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs + dims = zattrs.pop("_ARRAY_DIMENSIONS") + if chunk_dict: + manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) else: - raise NotImplementedError( - f"Unrecognised file based on header bytes: {magic_bytes}" - ) + # This means we encountered a scalar variable of dimension 0, + # very likely that it actually has no numeric value and its only purpose + # is to communicate dataset attributes. + varr = zarray.fill_value - return filetype + return Variable(data=varr, dims=dims, attrs=zattrs) def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: @@ -216,102 +262,3 @@ def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs: sanitized[k] = ujson.loads(v) return cast(KerchunkArrRefs, sanitized) - - -def dataset_to_kerchunk_refs(ds: xr.Dataset) -> KerchunkStoreRefs: - """ - Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). 
- """ - - all_arr_refs = {} - for var_name, var in ds.variables.items(): - arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) - - prepended_with_var_name = { - f"{var_name}/{key}": val for key, val in arr_refs.items() - } - - all_arr_refs.update(prepended_with_var_name) - - zattrs = ds.attrs - if ds.coords: - coord_names = [str(x) for x in ds.coords] - # this weird concatenated string instead of a list of strings is inconsistent with how other features in the kerchunk references format are stored - # see https://github.com/zarr-developers/VirtualiZarr/issues/105#issuecomment-2187266739 - zattrs["coordinates"] = " ".join(coord_names) - - ds_refs = { - "version": 1, - "refs": { - ".zgroup": '{"zarr_format":2}', - ".zattrs": ujson.dumps(zattrs), - **all_arr_refs, - }, - } - - return cast(KerchunkStoreRefs, ds_refs) - - -def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkArrRefs: - """ - Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). - - Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). - """ - from virtualizarr.manifests import ManifestArray - - if isinstance(var.data, ManifestArray): - marr = var.data - - arr_refs: dict[str, str | list[str | int]] = { - str(chunk_key): [entry["path"], entry["offset"], entry["length"]] - for chunk_key, entry in marr.manifest.dict().items() - } - - zarray = marr.zarray.replace(zarr_format=2) - - else: - try: - np_arr = var.to_numpy() - except AttributeError as e: - raise TypeError( - f"Can only serialize wrapped arrays of type ManifestArray or numpy.ndarray, but got type {type(var.data)}" - ) from e - - if var.encoding: - if "scale_factor" in var.encoding: - raise NotImplementedError( - f"Cannot serialize loaded variable {var_name}, as it is encoded with a scale_factor" - ) - if "offset" in var.encoding: - raise NotImplementedError( - f"Cannot serialize loaded variable {var_name}, as it is encoded with an offset" - ) - if "calendar" in var.encoding: - np_arr = CFDatetimeCoder().encode(var.copy(), name=var_name).values - - # This encoding is what kerchunk does when it "inlines" data, see https://github.com/fsspec/kerchunk/blob/a0c4f3b828d37f6d07995925b324595af68c4a19/kerchunk/hdf.py#L472 - byte_data = np_arr.tobytes() - # TODO do I really need to encode then decode like this? - inlined_data = (b"base64:" + base64.b64encode(byte_data)).decode("utf-8") - - # TODO can this be generalized to save individual chunks of a dask array? - # TODO will this fail for a scalar? 
- arr_refs = {join(0 for _ in np_arr.shape): inlined_data} - - zarray = ZArray( - chunks=np_arr.shape, - shape=np_arr.shape, - dtype=np_arr.dtype, - order="C", - fill_value=None, - ) - - zarray_dict = zarray.to_kerchunk_json() - arr_refs[".zarray"] = zarray_dict - - zattrs = {**var.attrs, **var.encoding} - zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) - arr_refs[".zattrs"] = json.dumps(zattrs, separators=(",", ":"), cls=NumpyEncoder) - - return cast(KerchunkArrRefs, arr_refs) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py new file mode 100644 index 00000000..b841d5c3 --- /dev/null +++ b/virtualizarr/readers/zarr.py @@ -0,0 +1,131 @@ +import json +from pathlib import Path +from typing import Mapping + +import numcodecs +import numpy as np +from xarray import Dataset +from xarray.core.indexes import Index +from xarray.core.variable import Variable + +from virtualizarr.backend import separate_coords +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.zarr import ZArray + + +def open_virtual_dataset_from_v3_store( + storepath: str, + drop_variables: list[str], + indexes: Mapping[str, Index] | None, +) -> Dataset: + """ + Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays. + """ + _storepath = Path(storepath) + + ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json") + coord_names = ds_attrs.pop("coordinates", []) + + # TODO recursive glob to create a datatree + # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it + # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 + all_paths = _storepath.glob("*/") + directory_paths = [p for p in all_paths if not p.is_file()] + + vars = {} + for array_dir in directory_paths: + var_name = array_dir.name + if var_name in drop_variables: + break + + zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") + manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) + + marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) + var = Variable(data=marr, dims=dim_names, attrs=attrs) + vars[var_name] = var + + if indexes is None: + raise NotImplementedError() + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + data_vars, coords = separate_coords(vars, indexes, coord_names) + + ds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) + + return ds + + +def attrs_from_zarr_group_json(filepath: Path) -> dict: + with open(filepath) as metadata_file: + attrs = json.load(metadata_file) + return attrs["attributes"] + + +def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: + with open(filepath) as metadata_file: + metadata = json.load(metadata_file) + + if { + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + }, + } not in metadata.get("storage_transformers", []): + raise ValueError( + "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." 
+ ) + + attrs = metadata.pop("attributes") + dim_names = metadata.pop("dimension_names") + + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) + zarr_format = metadata["zarr_format"] + + if metadata["fill_value"] is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + else: + fill_value = metadata["fill_value"] + + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] + zarray = ZArray( + chunks=chunk_shape, + compressor=compressor, + dtype=np.dtype(metadata["data_type"]), + fill_value=fill_value, + filters=filters or None, + order="C", + shape=shape, + zarr_format=zarr_format, + ) + + return zarray, dim_names, attrs + + +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. + """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py new file mode 100644 index 00000000..3b0c0315 --- /dev/null +++ b/virtualizarr/tests/test_backend.py @@ -0,0 +1,255 @@ +from collections.abc import Mapping +from unittest.mock import patch + +import numpy as np +import pytest +import xarray as xr +import xarray.testing as xrt +from xarray import open_dataset +from xarray.core.indexes import Index + +from virtualizarr import open_virtual_dataset +from virtualizarr.backend import FileType +from virtualizarr.manifests import ManifestArray +from virtualizarr.readers.kerchunk import _automatically_determine_filetype +from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs + + +def test_automatically_determine_filetype_netcdf3_netcdf4(): + # test the NetCDF3 vs NetCDF4 automatic file type selection + + ds = xr.Dataset({"a": (["x"], [0, 1])}) + netcdf3_file_path = "/tmp/netcdf3.nc" + netcdf4_file_path = "/tmp/netcdf4.nc" + + # write two version of NetCDF + ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC") + ds.to_netcdf(netcdf4_file_path, engine="h5netcdf") + + assert FileType("netcdf3") == _automatically_determine_filetype( + filepath=netcdf3_file_path + ) + assert FileType("hdf5") == _automatically_determine_filetype( + filepath=netcdf4_file_path + ) + + +@pytest.mark.parametrize( + "filetype,headerbytes", + [ + ("netcdf3", b"CDF"), + ("hdf5", b"\x89HDF"), + ("grib", b"GRIB"), + ("tiff", b"II*"), + ("fits", b"SIMPLE"), + ], +) +def test_valid_filetype_bytes(tmp_path, filetype, headerbytes): + filepath = tmp_path / "file.abc" + with open(filepath, "wb") as f: + f.write(headerbytes) + assert FileType(filetype) == _automatically_determine_filetype(filepath=filepath) + + +def test_notimplemented_filetype(tmp_path): + for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]: + filepath = tmp_path / "file.abc" + with open(filepath, "wb") as f: + f.write(headerbytes) + with pytest.raises(NotImplementedError): + _automatically_determine_filetype(filepath=filepath) + + +def test_FileType(): + # tests if FileType converts user supplied strings to correct filetype + assert "netcdf3" == FileType("netcdf3").name + assert "netcdf4" == FileType("netcdf4").name + assert 
"hdf4" == FileType("hdf4").name + assert "hdf5" == FileType("hdf5").name + assert "grib" == FileType("grib").name + assert "tiff" == FileType("tiff").name + assert "fits" == FileType("fits").name + assert "zarr" == FileType("zarr").name + with pytest.raises(ValueError): + FileType(None) + + +class TestOpenVirtualDatasetIndexes: + def test_no_indexes(self, netcdf4_file): + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert vds.indexes == {} + + def test_create_default_indexes(self, netcdf4_file): + with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): + vds = open_virtual_dataset(netcdf4_file, indexes=None) + ds = open_dataset(netcdf4_file, decode_times=False) + + # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 + assert index_mappings_equal(vds.xindexes, ds.xindexes) + + +def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]): + # Check if the mappings have the same keys + if set(indexes1.keys()) != set(indexes2.keys()): + return False + + # Check if the values for each key are identical + for key in indexes1.keys(): + index1 = indexes1[key] + index2 = indexes2[key] + + if not index1.equals(index2): + return False + + return True + + +class TestOpenVirtualDatasetAttrs: + def test_drop_array_dimensions(self, netcdf4_file): + # regression test for GH issue #150 + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs + + def test_coordinate_variable_attrs_preserved(self, netcdf4_file): + # regression test for GH issue #155 + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert vds["lat"].attrs == { + "standard_name": "latitude", + "long_name": "Latitude", + "units": "degrees_north", + "axis": "Y", + } + + +@network +@requires_s3fs +class TestReadFromS3: + @pytest.mark.parametrize( + "filetype", ["netcdf4", None], ids=["netcdf4 filetype", "None filetype"] + ) + @pytest.mark.parametrize( + "indexes", [None, {}], ids=["None index", "empty dict index"] + ) + def test_anon_read_s3(self, filetype, indexes): + """Parameterized tests for empty vs supplied indexes and filetypes.""" + # TODO: Switch away from this s3 url after minIO is implemented. 
+ fpath = "s3://carbonplan-share/virtualizarr/local.nc" + vds = open_virtual_dataset( + fpath, + filetype=filetype, + indexes=indexes, + reader_options={"storage_options": {"anon": True}}, + ) + + assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} + for var in vds.variables: + assert isinstance(vds[var].data, ManifestArray), var + + +@network +class TestReadFromURL: + @pytest.mark.parametrize( + "filetype, url", + [ + ( + "grib", + "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib", + ), + ( + "netcdf3", + "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", + ), + ( + "netcdf4", + "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", + ), + ( + "hdf4", + "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", + ), + # https://github.com/zarr-developers/VirtualiZarr/issues/159 + # ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"), + pytest.param( + "tiff", + "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", + marks=pytest.mark.skipif( + not has_tifffile, reason="package tifffile is not available" + ), + ), + pytest.param( + "fits", + "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", + marks=pytest.mark.skipif( + not has_astropy, reason="package astropy is not available" + ), + ), + ( + "jpg", + "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", + ), + ], + ) + def test_read_from_url(self, filetype, url): + if filetype in ["grib", "jpg", "hdf4"]: + with pytest.raises(NotImplementedError): + vds = open_virtual_dataset(url, reader_options={}, indexes={}) + else: + vds = open_virtual_dataset(url, indexes={}) + assert isinstance(vds, xr.Dataset) + + +class TestLoadVirtualDataset: + def test_loadable_variables(self, netcdf4_file): + vars_to_load = ["air", "time"] + vds = open_virtual_dataset( + netcdf4_file, loadable_variables=vars_to_load, indexes={} + ) + + for name in vds.variables: + if name in vars_to_load: + assert isinstance(vds[name].data, np.ndarray), name + else: + assert isinstance(vds[name].data, ManifestArray), name + + full_ds = xr.open_dataset(netcdf4_file, decode_times=False) + + for name in full_ds.variables: + if name in vars_to_load: + xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + + def test_explicit_filetype(self, netcdf4_file): + with pytest.raises(ValueError): + open_virtual_dataset(netcdf4_file, filetype="unknown") + + with pytest.raises(NotImplementedError): + open_virtual_dataset(netcdf4_file, filetype="grib") + + @patch("virtualizarr.readers.kerchunk.read_kerchunk_references_from_file") + def test_open_virtual_dataset_passes_expected_args( + self, mock_read_kerchunk, netcdf4_file + ): + reader_options = {"option1": "value1", "option2": "value2"} + open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) + args = { + "filepath": netcdf4_file, + "filetype": None, + "reader_options": reader_options, + } + mock_read_kerchunk.assert_called_once_with(**args) + + def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): + vds = open_virtual_dataset(hdf5_empty) + assert vds.empty.dims == () + assert vds.empty.attrs == {"empty": "true"} + + def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): + vds = open_virtual_dataset(hdf5_scalar) + assert vds.scalar.dims == () + assert vds.scalar.attrs == {"scalar": "true"} + + +def 
test_cftime_variables_must_be_in_loadable_variables(tmpdir): + ds = xr.Dataset(data_vars={"time": ["2024-06-21"]}) + ds.to_netcdf(f"{tmpdir}/scalar.nc") + with pytest.raises(ValueError, match="'time' not in"): + open_virtual_dataset(f"{tmpdir}/scalar.nc", cftime_variables=["time"]) diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 379c43ad..2442ec8d 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -1,185 +1,12 @@ import numpy as np -import pandas as pd -import pytest -import ujson # type: ignore import xarray as xr import xarray.testing as xrt -from virtualizarr.kerchunk import ( - FileType, - _automatically_determine_filetype, +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.kerchunk import ( + dataset_from_kerchunk_refs, find_var_names, ) -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.xarray import dataset_from_kerchunk_refs - - -def gen_ds_refs( - zgroup: str = '{"zarr_format":2}', - zarray: str = '{"chunks":[2,3],"compressor":null,"dtype":" Dataset: + arr = ManifestArray( + chunkmanifest=ChunkManifest( + entries={"0.0": dict(path="test.nc", offset=6144, length=48)} + ), + zarray=dict( + shape=(2, 3), + dtype=np.dtype(" bool: + """ + Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict + """ + return "name" in value and "configuration" in value + + +def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset): + """ + Checks that the output metadata of an array variable conforms to this spec + for the required attributes: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata + """ + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + # read the a variable's metadata + with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: + metadata = json.loads(f.read()) + assert metadata["zarr_format"] == 3 + assert metadata["node_type"] == "array" + assert isinstance(metadata["shape"], list) and all( + isinstance(dim, int) for dim in metadata["shape"] + ) + assert isinstance(metadata["data_type"], str) or isconfigurable( + metadata["data_type"] + ) + assert isconfigurable(metadata["chunk_grid"]) + assert isconfigurable(metadata["chunk_key_encoding"]) + assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) + assert ( + isinstance(metadata["codecs"], list) + and len(metadata["codecs"]) > 1 + and all(isconfigurable(codec) for codec in metadata["codecs"]) + ) + + +def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): + vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") + roundtrip = open_virtual_dataset( + tmpdir / "store.zarr", filetype=FileType.zarr_v3, indexes={} + ) + + xrt.assert_identical(roundtrip, vds_with_manifest_arrays) + + +def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") + assert zarray == vds_with_manifest_arrays.a.data.zarray diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 9133eb54..9db6e3a2 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1,15 +1,9 @@ -from collections.abc import Mapping -from unittest.mock import patch - import numpy as np import pytest import xarray as xr -import xarray.testing as xrt -from 
xarray.core.indexes import Index from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs from virtualizarr.zarr import ZArray @@ -228,53 +222,6 @@ def test_concat_dim_coords_along_existing_dim(self): assert result.data.zarray.zarr_format == zarray.zarr_format -class TestOpenVirtualDatasetAttrs: - def test_drop_array_dimensions(self, netcdf4_file): - # regression test for GH issue #150 - vds = open_virtual_dataset(netcdf4_file, indexes={}) - assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs - - def test_coordinate_variable_attrs_preserved(self, netcdf4_file): - # regression test for GH issue #155 - vds = open_virtual_dataset(netcdf4_file, indexes={}) - assert vds["lat"].attrs == { - "standard_name": "latitude", - "long_name": "Latitude", - "units": "degrees_north", - "axis": "Y", - } - - -class TestOpenVirtualDatasetIndexes: - def test_no_indexes(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) - assert vds.indexes == {} - - def test_create_default_indexes(self, netcdf4_file): - with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds = open_virtual_dataset(netcdf4_file, indexes=None) - ds = xr.open_dataset(netcdf4_file, decode_times=False) - - # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 - assert index_mappings_equal(vds.xindexes, ds.xindexes) - - -def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]): - # Check if the mappings have the same keys - if set(indexes1.keys()) != set(indexes2.keys()): - return False - - # Check if the values for each key are identical - for key in indexes1.keys(): - index1 = indexes1[key] - index2 = indexes2[key] - - if not index1.equals(index2): - return False - - return True - - class TestCombineUsingIndexes: def test_combine_by_coords(self, netcdf4_files): filepath1, filepath2 = netcdf4_files @@ -308,133 +255,6 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files): assert isinstance(combined_vds["lon"].data, ManifestArray) -@network -@requires_s3fs -class TestReadFromS3: - @pytest.mark.parametrize( - "filetype", ["netcdf4", None], ids=["netcdf4 filetype", "None filetype"] - ) - @pytest.mark.parametrize( - "indexes", [None, {}], ids=["None index", "empty dict index"] - ) - def test_anon_read_s3(self, filetype, indexes): - """Parameterized tests for empty vs supplied indexes and filetypes.""" - # TODO: Switch away from this s3 url after minIO is implemented. 
- fpath = "s3://carbonplan-share/virtualizarr/local.nc" - vds = open_virtual_dataset( - fpath, - filetype=filetype, - indexes=indexes, - reader_options={"storage_options": {"anon": True}}, - ) - - assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} - for var in vds.variables: - assert isinstance(vds[var].data, ManifestArray), var - - -@network -class TestReadFromURL: - @pytest.mark.parametrize( - "filetype, url", - [ - ( - "grib", - "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib", - ), - ( - "netcdf3", - "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", - ), - ( - "netcdf4", - "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", - ), - ( - "hdf4", - "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", - ), - # https://github.com/zarr-developers/VirtualiZarr/issues/159 - # ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"), - pytest.param( - "tiff", - "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", - marks=pytest.mark.skipif( - not has_tifffile, reason="package tifffile is not available" - ), - ), - pytest.param( - "fits", - "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", - marks=pytest.mark.skipif( - not has_astropy, reason="package astropy is not available" - ), - ), - ( - "jpg", - "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", - ), - ], - ) - def test_read_from_url(self, filetype, url): - if filetype in ["grib", "jpg", "hdf4"]: - with pytest.raises(NotImplementedError): - vds = open_virtual_dataset(url, reader_options={}, indexes={}) - else: - vds = open_virtual_dataset(url, indexes={}) - assert isinstance(vds, xr.Dataset) - - -class TestLoadVirtualDataset: - def test_loadable_variables(self, netcdf4_file): - vars_to_load = ["air", "time"] - vds = open_virtual_dataset( - netcdf4_file, loadable_variables=vars_to_load, indexes={} - ) - - for name in vds.variables: - if name in vars_to_load: - assert isinstance(vds[name].data, np.ndarray), name - else: - assert isinstance(vds[name].data, ManifestArray), name - - full_ds = xr.open_dataset(netcdf4_file, decode_times=False) - - for name in full_ds.variables: - if name in vars_to_load: - xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - - def test_explicit_filetype(self, netcdf4_file): - with pytest.raises(ValueError): - open_virtual_dataset(netcdf4_file, filetype="unknown") - - with pytest.raises(NotImplementedError): - open_virtual_dataset(netcdf4_file, filetype="grib") - - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") - def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file - ): - reader_options = {"option1": "value1", "option2": "value2"} - open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { - "filepath": netcdf4_file, - "filetype": None, - "reader_options": reader_options, - } - mock_read_kerchunk.assert_called_once_with(**args) - - def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): - vds = open_virtual_dataset(hdf5_empty) - assert vds.empty.dims == () - assert vds.empty.attrs == {"empty": "true"} - - def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): - vds = open_virtual_dataset(hdf5_scalar) - assert vds.scalar.dims == () - assert vds.scalar.attrs == {"scalar": "true"} - - class TestRenamePaths: def test_rename_to_str(self, 
netcdf4_file): vds = open_virtual_dataset(netcdf4_file, indexes={}) @@ -477,10 +297,3 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file): == "s3://bucket/air.nc" ) assert isinstance(renamed_vds["lat"].data, np.ndarray) - - -def test_cftime_variables_must_be_in_loadable_variables(tmpdir): - ds = xr.Dataset(data_vars={"time": ["2024-06-21"]}) - ds.to_netcdf(f"{tmpdir}/scalar.nc") - with pytest.raises(ValueError, match="'time' not in"): - open_virtual_dataset(f"{tmpdir}/scalar.nc", cftime_variables=["time"]) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 3433030f..95dbf55f 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -1,84 +1,6 @@ -import json - import numpy as np -import pytest -import xarray as xr -import xarray.testing as xrt - -from virtualizarr import ManifestArray, open_virtual_dataset -from virtualizarr.kerchunk import FileType -from virtualizarr.manifests.manifest import ChunkManifest -from virtualizarr.zarr import ZArray, dataset_to_zarr, metadata_from_zarr_json - - -@pytest.fixture -def vds_with_manifest_arrays() -> xr.Dataset: - arr = ManifestArray( - chunkmanifest=ChunkManifest( - entries={"0.0": dict(path="test.nc", offset=6144, length=48)} - ), - zarray=dict( - shape=(2, 3), - dtype=np.dtype(" bool: - """ - Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict - """ - return "name" in value and "configuration" in value - - -def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): - vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") - roundtrip = open_virtual_dataset( - tmpdir / "store.zarr", filetype=FileType.zarr_v3, indexes={} - ) - - xrt.assert_identical(roundtrip, vds_with_manifest_arrays) - - -def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): - dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") - zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") - assert zarray == vds_with_manifest_arrays.a.data.zarray - - -def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset): - """ - Checks that the output metadata of an array variable conforms to this spec - for the required attributes: - https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata - """ - dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") - # read the a variable's metadata - with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: - metadata = json.loads(f.read()) - assert metadata["zarr_format"] == 3 - assert metadata["node_type"] == "array" - assert isinstance(metadata["shape"], list) and all( - isinstance(dim, int) for dim in metadata["shape"] - ) - assert isinstance(metadata["data_type"], str) or isconfigurable( - metadata["data_type"] - ) - assert isconfigurable(metadata["chunk_grid"]) - assert isconfigurable(metadata["chunk_key_encoding"]) - assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) - assert ( - isinstance(metadata["codecs"], list) - and len(metadata["codecs"]) > 1 - and all(isconfigurable(codec) for codec in metadata["codecs"]) - ) +from virtualizarr.zarr import ZArray def test_replace_partial(): diff --git a/virtualizarr/types/__init__.py b/virtualizarr/types/__init__.py new file mode 100644 index 00000000..34cd4bde --- /dev/null +++ b/virtualizarr/types/__init__.py @@ -0,0 +1,3 @@ +from virtualizarr.types.general import ChunkKey # type: ignore[F401] + +__all__ = ["ChunkKey"] 
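The new `virtualizarr.types` package above leans on `typing.NewType` so the type checker can tell the different flavours of kerchunk reference dicts apart (the kerchunk-specific aliases are defined just below in `virtualizarr/types/kerchunk.py`). A minimal, self-contained sketch of the pattern — the `count_refs` helper and the sample dict are hypothetical, not part of the codebase:

```python
from typing import NewType, cast

# A NewType is erased at runtime but distinct to the type checker, so a
# function can declare exactly which flavour of reference dict it expects.
KerchunkStoreRefs = NewType("KerchunkStoreRefs", dict)


def count_refs(store_refs: KerchunkStoreRefs) -> int:
    # mypy flags callers that pass an untagged dict here
    return len(store_refs["refs"])


raw = {"version": 1, "refs": {".zgroup": '{"zarr_format":2}'}}
# cast() only informs the type checker; no runtime conversion happens
print(count_refs(cast(KerchunkStoreRefs, raw)))  # -> 1
```
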
diff --git a/virtualizarr/types.py b/virtualizarr/types/general.py similarity index 100% rename from virtualizarr/types.py rename to virtualizarr/types/general.py diff --git a/virtualizarr/types/kerchunk.py b/virtualizarr/types/kerchunk.py new file mode 100644 index 00000000..e8dada20 --- /dev/null +++ b/virtualizarr/types/kerchunk.py @@ -0,0 +1,12 @@ +from typing import NewType + +# Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean +# (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html) +# TODO I would prefer to be more specific about these types +KerchunkStoreRefs = NewType( + "KerchunkStoreRefs", dict +) # top-level dict with keys for 'version', 'refs' +KerchunkArrRefs = NewType( + "KerchunkArrRefs", + dict, +) # lower-level dict containing just the information for one zarr array diff --git a/virtualizarr/writers/__init__.py b/virtualizarr/writers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/virtualizarr/writers/kerchunk.py b/virtualizarr/writers/kerchunk.py new file mode 100644 index 00000000..6b4b55f8 --- /dev/null +++ b/virtualizarr/writers/kerchunk.py @@ -0,0 +1,124 @@ +import base64 +import json +from typing import cast + +import numpy as np +import ujson # type: ignore +from xarray import Dataset +from xarray.coding.times import CFDatetimeCoder +from xarray.core.variable import Variable + +from virtualizarr.manifests.manifest import join +from virtualizarr.types.kerchunk import KerchunkArrRefs, KerchunkStoreRefs +from virtualizarr.zarr import ZArray + + +class NumpyEncoder(json.JSONEncoder): + # TODO I don't understand how kerchunk gets around this problem of encoding numpy types (in the zattrs) whilst only using ujson + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() # Convert NumPy array to Python list + elif isinstance(obj, np.generic): + return obj.item() # Convert NumPy scalar to Python scalar + elif isinstance(obj, np.dtype): + return str(obj) + return json.JSONEncoder.default(self, obj) + + +def dataset_to_kerchunk_refs(ds: Dataset) -> KerchunkStoreRefs: + """ + Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). + """ + + all_arr_refs = {} + for var_name, var in ds.variables.items(): + arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) + + prepended_with_var_name = { + f"{var_name}/{key}": val for key, val in arr_refs.items() + } + + all_arr_refs.update(prepended_with_var_name) + + zattrs = ds.attrs + if ds.coords: + coord_names = [str(x) for x in ds.coords] + # this weird concatenated string instead of a list of strings is inconsistent with how other features in the kerchunk references format are stored + # see https://github.com/zarr-developers/VirtualiZarr/issues/105#issuecomment-2187266739 + zattrs["coordinates"] = " ".join(coord_names) + + ds_refs = { + "version": 1, + "refs": { + ".zgroup": '{"zarr_format":2}', + ".zattrs": ujson.dumps(zattrs), + **all_arr_refs, + }, + } + + return cast(KerchunkStoreRefs, ds_refs) + + +def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRefs: + """ + Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). + + Partially encodes the inner dicts to json to match kerchunk behaviour (see https://github.com/fsspec/kerchunk/issues/415). 
+ """ + from virtualizarr.manifests import ManifestArray + + if isinstance(var.data, ManifestArray): + marr = var.data + + arr_refs: dict[str, str | list[str | int]] = { + str(chunk_key): [entry["path"], entry["offset"], entry["length"]] + for chunk_key, entry in marr.manifest.dict().items() + } + + zarray = marr.zarray.replace(zarr_format=2) + + else: + try: + np_arr = var.to_numpy() + except AttributeError as e: + raise TypeError( + f"Can only serialize wrapped arrays of type ManifestArray or numpy.ndarray, but got type {type(var.data)}" + ) from e + + if var.encoding: + if "scale_factor" in var.encoding: + raise NotImplementedError( + f"Cannot serialize loaded variable {var_name}, as it is encoded with a scale_factor" + ) + if "offset" in var.encoding: + raise NotImplementedError( + f"Cannot serialize loaded variable {var_name}, as it is encoded with an offset" + ) + if "calendar" in var.encoding: + np_arr = CFDatetimeCoder().encode(var.copy(), name=var_name).values + + # This encoding is what kerchunk does when it "inlines" data, see https://github.com/fsspec/kerchunk/blob/a0c4f3b828d37f6d07995925b324595af68c4a19/kerchunk/hdf.py#L472 + byte_data = np_arr.tobytes() + # TODO do I really need to encode then decode like this? + inlined_data = (b"base64:" + base64.b64encode(byte_data)).decode("utf-8") + + # TODO can this be generalized to save individual chunks of a dask array? + # TODO will this fail for a scalar? + arr_refs = {join(0 for _ in np_arr.shape): inlined_data} + + zarray = ZArray( + chunks=np_arr.shape, + shape=np_arr.shape, + dtype=np_arr.dtype, + order="C", + fill_value=None, + ) + + zarray_dict = zarray.to_kerchunk_json() + arr_refs[".zarray"] = zarray_dict + + zattrs = {**var.attrs, **var.encoding} + zattrs["_ARRAY_DIMENSIONS"] = list(var.dims) + arr_refs[".zattrs"] = json.dumps(zattrs, separators=(",", ":"), cls=NumpyEncoder) + + return cast(KerchunkArrRefs, arr_refs) diff --git a/virtualizarr/writers/zarr.py b/virtualizarr/writers/zarr.py new file mode 100644 index 00000000..b3dc8f1a --- /dev/null +++ b/virtualizarr/writers/zarr.py @@ -0,0 +1,115 @@ +from pathlib import Path + +import numpy as np +from xarray import Dataset +from xarray.core.variable import Variable + +from virtualizarr.vendor.zarr.utils import json_dumps +from virtualizarr.zarr import ZArray + + +def dataset_to_zarr(ds: Dataset, storepath: str) -> None: + """ + Write an xarray dataset whose variables wrap ManifestArrays to a v3 Zarr store, writing chunk references into manifest.json files. + + Currently requires all variables to be backed by ManifestArray objects. + + Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. + See https://github.com/zarr-developers/zarr-specs/issues/287 + + Parameters + ---------- + ds: xr.Dataset + storepath: str + """ + + from virtualizarr.manifests import ManifestArray + + _storepath = Path(storepath) + Path.mkdir(_storepath, exist_ok=False) + + # should techically loop over groups in a tree but a dataset corresponds to only one group + group_metadata = {"zarr_format": 3, "node_type": "group", "attributes": ds.attrs} + with open(_storepath / "zarr.json", "wb") as group_metadata_file: + group_metadata_file.write(json_dumps(group_metadata)) + + for name, var in ds.variables.items(): + array_dir = _storepath / str(name) + marr = var.data + + # TODO move this check outside the writing loop so we don't write an incomplete store on failure? 
+ # TODO at some point this should be generalized to also write in-memory arrays as normal zarr chunks, see GH isse #62. + if not isinstance(marr, ManifestArray): + raise TypeError( + "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, " + f"but variable {name} wraps an array of type {type(marr)}" + ) + + Path.mkdir(array_dir, exist_ok=False) + + # write the chunk references into a manifest.json file + # and the array metadata into a zarr.json file + to_zarr_json(var, array_dir) + + +def to_zarr_json(var: Variable, array_dir: Path) -> None: + """ + Write out both the zarr.json and manifest.json file into the given zarr array directory. + + Follows the Zarr v3 manifest storage transformer ZEP (see https://github.com/zarr-developers/zarr-specs/issues/287). + + Parameters + ---------- + var : xr.Variable + Must be wrapping a ManifestArray + dirpath : str + Zarr store array directory into which to write files. + """ + + marr = var.data + + marr.manifest.to_zarr_json(array_dir / "manifest.json") + + metadata = zarr_v3_array_metadata( + marr.zarray, [str(x) for x in var.dims], var.attrs + ) + with open(array_dir / "zarr.json", "wb") as metadata_file: + metadata_file.write(json_dumps(metadata)) + + +def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> dict: + """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" + # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us + + metadata = zarray.dict() + + # adjust to match v3 spec + metadata["zarr_format"] = 3 + metadata["node_type"] = "array" + metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) + metadata["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": metadata.pop("chunks")}, + } + metadata["chunk_key_encoding"] = { + "name": "default", + "configuration": {"separator": "/"}, + } + metadata["codecs"] = zarray._v3_codec_pipeline() + metadata.pop("filters") + metadata.pop("compressor") + metadata.pop("order") + + # indicate that we're using the manifest storage transformer ZEP + metadata["storage_transformers"] = [ + { + "name": "chunk-manifest-json", + "configuration": {"manifest": "./manifest.json"}, + } + ] + + # add information from xarray object + metadata["dimension_names"] = dim_names + metadata["attributes"] = attrs + + return metadata diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 824892cc..f62b1269 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,14 +1,9 @@ import dataclasses -import json -from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, NewType, cast import numcodecs import numpy as np import ujson # type: ignore -import xarray as xr - -from virtualizarr.vendor.zarr.utils import json_dumps if TYPE_CHECKING: pass @@ -213,179 +208,6 @@ def ceildiv(a: int, b: int) -> int: return -(a // -b) -def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: - """ - Write an xarray dataset whose variables wrap ManifestArrays to a v3 Zarr store, writing chunk references into manifest.json files. - - Currently requires all variables to be backed by ManifestArray objects. - - Not very useful until some implementation of a Zarr reader can actually read these manifest.json files. 
- See https://github.com/zarr-developers/zarr-specs/issues/287 - - Parameters - ---------- - ds: xr.Dataset - storepath: str - """ - - from virtualizarr.manifests import ManifestArray - - _storepath = Path(storepath) - Path.mkdir(_storepath, exist_ok=False) - - # should techically loop over groups in a tree but a dataset corresponds to only one group - group_metadata = {"zarr_format": 3, "node_type": "group", "attributes": ds.attrs} - with open(_storepath / "zarr.json", "wb") as group_metadata_file: - group_metadata_file.write(json_dumps(group_metadata)) - - for name, var in ds.variables.items(): - array_dir = _storepath / str(name) - marr = var.data - - # TODO move this check outside the writing loop so we don't write an incomplete store on failure? - # TODO at some point this should be generalized to also write in-memory arrays as normal zarr chunks, see GH isse #62. - if not isinstance(marr, ManifestArray): - raise TypeError( - "Only xarray objects wrapping ManifestArrays can be written to zarr using this method, " - f"but variable {name} wraps an array of type {type(marr)}" - ) - - Path.mkdir(array_dir, exist_ok=False) - - # write the chunk references into a manifest.json file - # and the array metadata into a zarr.json file - to_zarr_json(var, array_dir) - - -def to_zarr_json(var: xr.Variable, array_dir: Path) -> None: - """ - Write out both the zarr.json and manifest.json file into the given zarr array directory. - - Follows the Zarr v3 manifest storage transformer ZEP (see https://github.com/zarr-developers/zarr-specs/issues/287). - - Parameters - ---------- - var : xr.Variable - Must be wrapping a ManifestArray - dirpath : str - Zarr store array directory into which to write files. - """ - - marr = var.data - - marr.manifest.to_zarr_json(array_dir / "manifest.json") - - metadata = zarr_v3_array_metadata( - marr.zarray, [str(x) for x in var.dims], var.attrs - ) - with open(array_dir / "zarr.json", "wb") as metadata_file: - metadata_file.write(json_dumps(metadata)) - - -def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> dict: - """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" - # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us - - metadata = zarray.dict() - - # adjust to match v3 spec - metadata["zarr_format"] = 3 - metadata["node_type"] = "array" - metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) - metadata["chunk_grid"] = { - "name": "regular", - "configuration": {"chunk_shape": metadata.pop("chunks")}, - } - metadata["chunk_key_encoding"] = { - "name": "default", - "configuration": {"separator": "/"}, - } - metadata["codecs"] = zarray._v3_codec_pipeline() - metadata.pop("filters") - metadata.pop("compressor") - metadata.pop("order") - - # indicate that we're using the manifest storage transformer ZEP - metadata["storage_transformers"] = [ - { - "name": "chunk-manifest-json", - "configuration": {"manifest": "./manifest.json"}, - } - ] - - # add information from xarray object - metadata["dimension_names"] = dim_names - metadata["attributes"] = attrs - - return metadata - - -def attrs_from_zarr_group_json(filepath: Path) -> dict: - with open(filepath) as metadata_file: - attrs = json.load(metadata_file) - return attrs["attributes"] - - -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: - with open(filepath) as metadata_file: - metadata = json.load(metadata_file) - - if { - "name": 
"chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - }, - } not in metadata.get("storage_transformers", []): - raise ValueError( - "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." - ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) - shape = tuple(metadata["shape"]) - zarr_format = metadata["zarr_format"] - - if metadata["fill_value"] is None: - raise ValueError( - "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" - ) - else: - fill_value = metadata["fill_value"] - - all_codecs = [ - codec - for codec in metadata["codecs"] - if codec["name"] not in ("transpose", "bytes") - ] - compressor, *filters = [ - _configurable_to_num_codec_config(_filter) for _filter in all_codecs - ] - zarray = ZArray( - chunks=chunk_shape, - compressor=compressor, - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=filters or None, - order="C", - shape=shape, - zarr_format=zarr_format, - ) - - return zarray, dim_names, attrs - - -def _configurable_to_num_codec_config(configurable: dict) -> dict: - """ - Convert a zarr v3 configurable into a numcodecs codec. - """ - configurable_copy = configurable.copy() - codec_id = configurable_copy.pop("name") - configuration = configurable_copy.pop("configuration") - return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() - - def _num_codec_config_to_configurable(num_codec: dict) -> dict: """ Convert a numcodecs codec into a zarr v3 configurable. From d2f0d06e248e62ed90deee5e2c87f81614b2f35d Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Tue, 27 Aug 2024 10:38:29 -0600 Subject: [PATCH 11/29] Let Xarray handle `decode_times` (#232) * replaces cftime_variables w/ decode_times * adds note in test * keeps cftime_variables arg, but raises a DepreciationWarning * adds some testing for coords, dims and attrs between ds and vds * updated decode_times default * removed test_cftime_variables_must_be_in_loadable_variables test & updated xr.open_dataset decode behavior * adds decode_times to releases.rst * Remove cftime_variables from docstring --------- Co-authored-by: TomNicholas --- docs/releases.rst | 8 +++-- docs/usage.md | 12 ++++---- virtualizarr/backend.py | 41 ++++++++++---------------- virtualizarr/tests/test_backend.py | 36 ++++++++++++++++------ virtualizarr/tests/test_integration.py | 2 -- 5 files changed, 55 insertions(+), 44 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 5ae3bff4..39d517be 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ +- Adds `decode_times` to open_virtual_dataset (:pull:`232`) + By `Raphael Hagen `_. + - Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`) By `Ayush Nag `_. @@ -17,17 +20,18 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ - - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) - Deprecations ~~~~~~~~~~~~ +- Depreciates cftime_variables in open_virtual_dataset in favor of decode_times. (:pull:`232`) + By `Raphael Hagen `_. 
+ Bug fixes ~~~~~~~~~ diff --git a/docs/usage.md b/docs/usage.md index b0935286..40071b8a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -306,8 +306,8 @@ Dimensions: (time: 2920, lat: 25, lon: 53) Coordinates: lat (lat) float32 100B ManifestArray Date: Tue, 27 Aug 2024 13:09:49 -0700 Subject: [PATCH 12/29] Support specifying single HDF Group in open_virtual_dataset (#165) * first pass at single hdf group * Update virtualizarr/tests/test_xarray.py Co-authored-by: Tom Nicholas * Update virtualizarr/xarray.py Co-authored-by: Tom Nicholas * refactor, basic test * add test * fsspec doesnt like pytest tmp_path * document * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Tom Nicholas Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- conftest.py | 13 ++++++ docs/releases.rst | 2 + virtualizarr/backend.py | 5 +++ virtualizarr/readers/kerchunk.py | 47 ++++++++++++++++++++- virtualizarr/tests/test_backend.py | 65 ++++++++++++++++++++++++++++++ virtualizarr/types/kerchunk.py | 9 +++-- 6 files changed, 136 insertions(+), 5 deletions(-) diff --git a/conftest.py b/conftest.py index b558abfd..32b3581f 100644 --- a/conftest.py +++ b/conftest.py @@ -33,6 +33,19 @@ def netcdf4_file(tmpdir): return filepath +@pytest.fixture +def hdf5_groups_file(tmpdir): + # Set up example xarray dataset + ds = xr.tutorial.open_dataset("air_temperature") + + # Save it to disk as netCDF (in temporary directory) + filepath = f"{tmpdir}/air.nc" + ds.to_netcdf(filepath, format="NETCDF4", group="test/group") + ds.close() + + return filepath + + @pytest.fixture def netcdf4_files(tmpdir): # Set up example xarray dataset diff --git a/docs/releases.rst b/docs/releases.rst index 39d517be..216b3b80 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -8,6 +8,8 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ +- New ``group`` option on ``open_virtual_dataset`` enables extracting specific HDF Groups. + (:pull:`165`) By `Scott Henderson `_. - Adds `decode_times` to open_virtual_dataset (:pull:`232`) By `Raphael Hagen `_. diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index b155c2ce..73fabe74 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -51,6 +51,7 @@ def open_virtual_dataset( filepath: str, *, filetype: FileType | None = None, + group: str | None = None, drop_variables: Iterable[str] | None = None, loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, @@ -74,6 +75,8 @@ def open_virtual_dataset( Type of file to be opened. Used to determine which kerchunk file format backend to use. Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. If not provided will attempt to automatically infer the correct filetype from header bytes. + group : str, default is None + Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”. drop_variables: list[str], default is None Variables in the file to drop before returning. 
loadable_variables: list[str], default is None @@ -171,6 +174,7 @@ def open_virtual_dataset( vds_refs = read_kerchunk_references_from_file( filepath=filepath, filetype=filetype, + group=group, reader_options=reader_options, ) virtual_vars = virtual_vars_from_kerchunk_refs( @@ -195,6 +199,7 @@ def open_virtual_dataset( ds = xr.open_dataset( cast(XArrayOpenT, fpath), drop_variables=drop_variables, + group=group, decode_times=decode_times, ) diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 4686ce94..19a8c28d 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -57,6 +57,7 @@ def _automatically_determine_filetype( def read_kerchunk_references_from_file( filepath: str, filetype: FileType | None, + group: str | None, reader_options: Optional[dict[str, Any]] = None, ) -> KerchunkStoreRefs: """ @@ -69,7 +70,8 @@ def read_kerchunk_references_from_file( filetype : FileType, default: None Type of file to be opened. Used to determine which kerchunk file format backend to use. If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. - reader_options: dict, default {} + group : str, default is None + Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”. Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. """ @@ -96,6 +98,9 @@ def read_kerchunk_references_from_file( refs = SingleHdf5ToZarr( filepath, inline_threshold=0, **reader_options ).translate() + + refs = extract_group(refs, group) + elif filetype.name.lower() == "grib": # TODO Grib files should be handled as a DataTree object # see https://github.com/TomNicholas/VirtualiZarr/issues/11 @@ -123,6 +128,44 @@ def read_kerchunk_references_from_file( return refs +def extract_group(vds_refs: KerchunkStoreRefs, group: str | None) -> KerchunkStoreRefs: + """Extract only the part of the kerchunk reference dict that is relevant to a single HDF group""" + hdf_groups = [ + k.removesuffix(".zgroup") for k in vds_refs["refs"].keys() if ".zgroup" in k + ] + if len(hdf_groups) == 1: + return vds_refs + else: + if group is None: + raise ValueError( + f"Multiple HDF Groups found. 
Must specify group= keyword to select one of {hdf_groups}" + ) + else: + # Ensure supplied group kwarg is consistent with kerchunk keys + if not group.endswith("/"): + group += "/" + if group.startswith("/"): + group = group.removeprefix("/") + + if group not in hdf_groups: + raise ValueError(f'Group "{group}" not found in {hdf_groups}') + + # Filter by group prefix and remove prefix from all keys + groupdict = { + k.removeprefix(group): v + for k, v in vds_refs["refs"].items() + if k.startswith(group) + } + # Also remove group prefix from _ARRAY_DIMENSIONS + for k, v in groupdict.items(): + if isinstance(v, str): + groupdict[k] = v.replace("\\/", "/").replace(group, "") + + vds_refs["refs"] = groupdict + + return KerchunkStoreRefs(vds_refs) + + def virtual_vars_from_kerchunk_refs( refs: KerchunkStoreRefs, drop_variables: list[str] | None = None, @@ -214,6 +257,7 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: refs = ds_reference_dict["refs"] found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key} + return list(found_var_names) @@ -235,6 +279,7 @@ def extract_array_refs( } return fully_decode_arr_refs(arr_refs) + else: raise KeyError( f"Could not find zarr array variable name {var_name}, only {found_var_names}" diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index c3001587..e42ad9ac 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -1,6 +1,7 @@ from collections.abc import Mapping from unittest.mock import patch +import fsspec import numpy as np import pytest import xarray as xr @@ -192,6 +193,10 @@ class TestReadFromURL: "hdf4", "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", ), + ( + "hdf5", + "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5", + ), # https://github.com/zarr-developers/VirtualiZarr/issues/159 # ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"), pytest.param( @@ -218,10 +223,48 @@ def test_read_from_url(self, filetype, url): if filetype in ["grib", "jpg", "hdf4"]: with pytest.raises(NotImplementedError): vds = open_virtual_dataset(url, reader_options={}, indexes={}) + elif filetype == "hdf5": + vds = open_virtual_dataset( + url, + group="science/LSAR/GCOV/grids/frequencyA", + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + indexes={}, + reader_options={}, + ) + assert isinstance(vds, xr.Dataset) else: vds = open_virtual_dataset(url, indexes={}) assert isinstance(vds, xr.Dataset) + def test_virtualizarr_vs_local_nisar(self): + # Open group directly from locally cached file with xarray + url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5" + tmpfile = fsspec.open_local( + f"filecache::{url}", filecache=dict(cache_storage="/tmp", same_names=True) + ) + hdf_group = "science/LSAR/GCOV/grids/frequencyA" + dsXR = xr.open_dataset( + tmpfile, + engine="h5netcdf", + group=hdf_group, + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + phony_dims="access", + ) + + # save group reference file via virtualizarr, then open with engine="kerchunk" + vds = open_virtual_dataset( + tmpfile, + group=hdf_group, + indexes={}, + 
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + ) + tmpref = "/tmp/cmip6.json" + vds.virtualize.to_kerchunk(tmpref, format="json") + dsV = xr.open_dataset(tmpref, engine="kerchunk") + + # xrt.assert_identical(dsXR, dsV) #Attribute order changes + xrt.assert_equal(dsXR, dsV) + class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): @@ -249,6 +292,27 @@ def test_explicit_filetype(self, netcdf4_file): with pytest.raises(NotImplementedError): open_virtual_dataset(netcdf4_file, filetype="grib") + def test_group_kwarg(self, hdf5_groups_file): + with pytest.raises(ValueError, match="Multiple HDF Groups found"): + open_virtual_dataset(hdf5_groups_file) + with pytest.raises(ValueError, match="not found in"): + open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") + + vars_to_load = ["air", "time"] + vds = open_virtual_dataset( + hdf5_groups_file, + group="test/group", + loadable_variables=vars_to_load, + indexes={}, + ) + full_ds = xr.open_dataset( + hdf5_groups_file, + group="test/group", + ) + for name in full_ds.variables: + if name in vars_to_load: + xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + @patch("virtualizarr.readers.kerchunk.read_kerchunk_references_from_file") def test_open_virtual_dataset_passes_expected_args( self, mock_read_kerchunk, netcdf4_file @@ -258,6 +322,7 @@ def test_open_virtual_dataset_passes_expected_args( args = { "filepath": netcdf4_file, "filetype": None, + "group": None, "reader_options": reader_options, } mock_read_kerchunk.assert_called_once_with(**args) diff --git a/virtualizarr/types/kerchunk.py b/virtualizarr/types/kerchunk.py index e8dada20..d124cca3 100644 --- a/virtualizarr/types/kerchunk.py +++ b/virtualizarr/types/kerchunk.py @@ -4,9 +4,10 @@ # (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html) # TODO I would prefer to be more specific about these types KerchunkStoreRefs = NewType( - "KerchunkStoreRefs", dict -) # top-level dict with keys for 'version', 'refs' + "KerchunkStoreRefs", + dict, # dict_keys(['version', 'refs']) +) # top-level dict containing kerchunk version and 'refs' dictionary which assumes single '.zgroup' key and multiple KerchunkArrRefs KerchunkArrRefs = NewType( "KerchunkArrRefs", - dict, -) # lower-level dict containing just the information for one zarr array + dict, # dict_keys(['.zarray', '.zattrs', '0.0', '0.1', ...) +) # lower-level dict defining a single Zarr Array, with keys for '.zarray', '.zattrs', and every chunk From 18c5a10b4434b96c73c3942b036b68f0bb973be5 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 28 Aug 2024 17:23:23 -0600 Subject: [PATCH 13/29] Adds defaults in `open_virtual_dataset_from_v3_store` (#234) * adds defaults * update releases.rst * default empty list instead of None --- docs/releases.rst | 3 +++ virtualizarr/readers/zarr.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 216b3b80..dfe5a31d 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -8,6 +8,9 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ +- Adds defaults for `open_virtual_dataset_from_v3_store` in (:pull:`234`) + By `Raphael Hagen `_. + - New ``group`` option on ``open_virtual_dataset`` enables extracting specific HDF Groups. (:pull:`165`) By `Scott Henderson `_. 
diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py index b841d5c3..168faa2b 100644 --- a/virtualizarr/readers/zarr.py +++ b/virtualizarr/readers/zarr.py @@ -15,8 +15,8 @@ def open_virtual_dataset_from_v3_store( storepath: str, - drop_variables: list[str], - indexes: Mapping[str, Index] | None, + drop_variables: list[str] = [], + indexes: Mapping[str, Index] | None = None, ) -> Dataset: """ Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays. From 708d168ae3b78f97adbd55168dd76dc64c9cac05 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 29 Aug 2024 09:45:24 -0600 Subject: [PATCH 14/29] Virtualizarr + Coiled Serverless Example Notebook (#233) * terraclimate_coiled ex * adds example to releases.rst * adds fastparquet to ex deps for writing refs * removed FileType --- docs/releases.rst | 5 + examples/coiled/terraclimate.ipynb | 248 +++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+) create mode 100644 examples/coiled/terraclimate.ipynb diff --git a/docs/releases.rst b/docs/releases.rst index dfe5a31d..ec057807 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -25,6 +25,7 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ + - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass @@ -49,6 +50,10 @@ Bug fixes Documentation ~~~~~~~~~~~~~ +- Adds virtualizarr + coiled serverless example notebook (:pull`223`) + By `Raphael Hagen `_. + + Internal Changes ~~~~~~~~~~~~~~~~ diff --git a/examples/coiled/terraclimate.ipynb b/examples/coiled/terraclimate.ipynb new file mode 100644 index 00000000..205f094b --- /dev/null +++ b/examples/coiled/terraclimate.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Virtualizarr and Coiled - Building a virtual dataset of Terraclimate\n", + "\n", + "This notebook is an example of using Virtualizarr together with the Python distributed processing framework [Coiled](https://www.coiled.io/) to generate references using [serverless functions](https://docs.coiled.io/user_guide/functions.html). \n", + "- **Note:** running this notebook requires a coiled account.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The dataset\n", + "For this example, we are going to create a virtual zarr store from the [Terraclimate](https://www.climatologylab.org/terraclimate.html) dataset. Terraclimate is a monthly dataset spanning 66 years and containing 14 climate and water balance variables. It is made up of 924 individual NetCDF4 files. When represented as an Xarray dataset, it is over 1TB in size." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Parallelizing `virtualizarr` reference generation with coiled serverless functions\n", + "Coiled serverless functions allow us to easily spin up hundreds of small compute instances, which are great for individual file reference generation. We were able to process 924 netCDF files into a 1TB virtual xarray dataset in 9 minutes for ~$0.24." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installation and environment\n", + "\n", + "You should install the Python requirements in a clean virtual environment of your choice. 
Each coiled serverless function will re-use this environment, so it's best to start with a clean slate.\n", + "\n", + "```bash\n", + "pip install virtualizarr coiled xarray fastparquet ipykernel\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import coiled\n", + "import numpy as np\n", + "import xarray as xr\n", + "\n", + "from virtualizarr import open_virtual_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the Terraclimate variable and year url combinations \n", + "`14 variables * 66 years = 924 NetCDF files`\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "tvars = [\n", + " \"aet\",\n", + " \"def\",\n", + " \"pet\",\n", + " \"ppt\",\n", + " \"q\",\n", + " \"soil\",\n", + " \"srad\",\n", + " \"swe\",\n", + " \"tmax\",\n", + " \"tmin\",\n", + " \"vap\",\n", + " \"ws\",\n", + " \"vpd\",\n", + " \"PDSI\",\n", + "]\n", + "min_year = 1958\n", + "max_year = 2023\n", + "time_list = np.arange(min_year, max_year + 1, 1)\n", + "\n", + "combinations = [\n", + " f\"https://climate.northwestknowledge.net/TERRACLIMATE-DATA/TerraClimate_{var}_{year}.nc\"\n", + " for year in time_list\n", + " for var in tvars\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the coiled serverless function\n", + "\n", + "### Serverless function setup notes:\n", + "- This coiled function is tailored to AWS\n", + "- `vm_type=[\"t4g.small\"]` - This is a small instance, you shouldn't need large machines for reference generation\n", + "- `spot_policy=\"spot_with_fallback\"` is cheaper, but might have unintended consequences\n", + "- `arm=True` uses VMs with ARM architecture, which is cheaper\n", + "- `idle_timeout=\"10 minutes\"` workers will shut down after 10 minutes of inactivity \n", + "- `n_workers=[100, 300]` adaptive scaling between 100 & 300 workers\n", + "- `name` [optional] if you want to keep track of your cluster in the coiled dashboard\n", + "\n", + "More details can be found in the [serverless function API](https://docs.coiled.io/user_guide/functions.html#api)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@coiled.function(\n", + " region=\"us-west-2\",\n", + " vm_type=[\"t4g.small\"],\n", + " spot_policy=\"spot_with_fallback\",\n", + " arm=True,\n", + " idle_timeout=\"10 minutes\",\n", + " n_workers=[100, 300],\n", + " name=\"parallel_reference_generation\",\n", + ")\n", + "def process(filename):\n", + " vds = open_virtual_dataset(\n", + " filename,\n", + " decode_times=True,\n", + " loadable_variables=[\"time\", \"lat\", \"lon\", \"crs\"],\n", + " filetype=\"netcdf4\",\n", + " indexes={},\n", + " )\n", + " return vds\n", + "\n", + "\n", + "# process.map distributes out the input file urls to coiled functions\n", + "# retires=10 allows for individual task retires, which can be useful for inconsistent server behavior\n", + "results = process.map(combinations, retries=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Combine references into virtual dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# extract generator values into a list\n", + "vds_list = [result for result in results]\n", + "\n", + "# combine individual refs into a virtual Xarray dataset\n", + "mds = xr.combine_by_coords(\n", + " vds_list, coords=\"minimal\", compat=\"override\", combine_attrs=\"drop_conflicts\"\n", + ")\n", + "mds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(str(\"{0:.2f}\".format(mds.nbytes / 1e12)), \" TB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save the reference to disk\n", + "\n", + "Now that we have this virtual dataset, we can save the combined reference file for future use. The resulting reference parquet file is only 2.6MB!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mds.virtualize.to_kerchunk(\"terraclimate.parquet\", format=\"parquet\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Open the reference file and load into Xarray\n", + "You can now open up the reference file with Xarray and Kerchunk. This will now behave similarly to a normal Xarray dataset. 
\n", + "\n", + "**Warning:** Calling `to_zarr` on this dataset will try to write out 1TB of data.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_ds = xr.open_dataset(\"terraclimate.parquet\", engine=\"kerchunk\", chunks={})\n", + "combined_ds" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 7d47dccd62a2736062914427c70c9979954da102 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:01:11 -0700 Subject: [PATCH 15/29] [pre-commit.ci] pre-commit autoupdate (#236) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.6 → v0.6.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.6...v0.6.3) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d58e82ac..803b7a78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: "v0.5.6" + rev: "v0.6.3" hooks: # Run the linter. - id: ruff From 53a609f75d4fee3b6b111bdfcf0df01f824480db Mon Sep 17 00:00:00 2001 From: Timothy Hodson <34148978+thodson-usgs@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:39:28 -0500 Subject: [PATCH 16/29] Add example to create a virtual dataset using lithops (#203) * Set ZArray fill_value back to nan * Set NaT as datetime64 default fill value * Add example to create a virtual dataset using lithops * Rename file * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update examples/virtualizarr-with-lithops/README.md Co-authored-by: Tom Nicholas * Update examples/virtualizarr-with-lithops/README.md Co-authored-by: Tom Nicholas * Update examples/virtualizarr-with-lithops/README.md Co-authored-by: Tom Nicholas --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tom Nicholas --- .../Dockerfile_virtualizarr | 59 +++++++++++++++++++ examples/virtualizarr-with-lithops/README.md | 41 +++++++++++++ .../virtualizarr-with-lithops/lithops.yaml | 14 +++++ .../requirements.txt | 8 +++ .../virtualizarr-with-lithops.py | 59 +++++++++++++++++++ 5 files changed, 181 insertions(+) create mode 100644 examples/virtualizarr-with-lithops/Dockerfile_virtualizarr create mode 100644 examples/virtualizarr-with-lithops/README.md create mode 100644 examples/virtualizarr-with-lithops/lithops.yaml create mode 100644 examples/virtualizarr-with-lithops/requirements.txt create mode 100644 examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py diff --git a/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr b/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr new file mode 100644 index 00000000..d1793c6a --- /dev/null +++ b/examples/virtualizarr-with-lithops/Dockerfile_virtualizarr @@ -0,0 +1,59 @@ +# 
Python 3.11 +FROM python:3.11-slim-buster + + +RUN apt-get update \ + # Install aws-lambda-cpp build dependencies + && apt-get install -y \ + g++ \ + make \ + cmake \ + unzip \ + # cleanup package lists, they are not used anymore in this image + && rm -rf /var/lib/apt/lists/* \ + && apt-cache search linux-headers-generic + +ARG FUNCTION_DIR="/function" + +# Copy function code +RUN mkdir -p ${FUNCTION_DIR} + +# Update pip +# NB botocore/boto3 are pinned due to https://github.com/boto/boto3/issues/3648 +# using versions from https://github.com/aio-libs/aiobotocore/blob/72b8dd5d7d4ef2f1a49a0ae0c37b47e5280e2070/setup.py +# due to s3fs dependency +RUN pip install --upgrade --ignore-installed pip wheel six setuptools \ + && pip install --upgrade --no-cache-dir --ignore-installed \ + awslambdaric \ + botocore==1.29.76 \ + boto3==1.26.76 \ + redis \ + httplib2 \ + requests \ + numpy \ + scipy \ + pandas \ + pika \ + kafka-python \ + cloudpickle \ + ps-mem \ + tblib + +# Set working directory to function root directory +WORKDIR ${FUNCTION_DIR} + +# Add Lithops +COPY lithops_lambda.zip ${FUNCTION_DIR} +RUN unzip lithops_lambda.zip \ + && rm lithops_lambda.zip \ + && mkdir handler \ + && touch handler/__init__.py \ + && mv entry_point.py handler/ + +# Put your dependencies here, using RUN pip install... or RUN apt install... + +COPY requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] +CMD [ "handler.entry_point.lambda_handler" ] diff --git a/examples/virtualizarr-with-lithops/README.md b/examples/virtualizarr-with-lithops/README.md new file mode 100644 index 00000000..d3c02037 --- /dev/null +++ b/examples/virtualizarr-with-lithops/README.md @@ -0,0 +1,41 @@ +# Generate a virtual zarr dataset using lithops + +This example walks through how to create a virtual dataset from a collection of +netCDF files on s3 using lithops to open each file in parallel then concatenate +them into a single virtual dataset. + +## Credits +Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook +by norlandrhagen. + +Please, contribute improvements. + + + +1. Set up a Python environment +```bash +conda create --name virtualizarr-lithops -y python=3.11 +conda activate virtualizarr-lithops +pip install -r requirements.txt +``` + +2. Configure compute and storage backends for [lithops](https://lithops-cloud.github.io/docs/source/configuration.html). +The configuration in `lithops.yaml` uses AWS Lambda for [compute](https://lithops-cloud.github.io/docs/source/compute_config/aws_lambda.html) and AWS S3 for [storage](https://lithops-cloud.github.io/docs/source/storage_config/aws_s3.html). +To use those backends, simply edit `lithops.yaml` with your `bucket` and `execution_role`. + +1. Build a runtime image for Cubed +```bash +export LITHOPS_CONFIG_FILE=$(pwd)/lithops.yaml +lithops runtime build -b aws_lambda -f Dockerfile_virtualizarr virtualizarr-runtime +``` + +1. 
Run the script +```bash +python virtualizarr-with-lithops.py +``` + +## Cleaning up +To rebuild the Lithops image, delete the existing one by running +```bash +lithops runtime delete -b aws_lambda -d virtualizarr-runtime +``` diff --git a/examples/virtualizarr-with-lithops/lithops.yaml b/examples/virtualizarr-with-lithops/lithops.yaml new file mode 100644 index 00000000..b142b480 --- /dev/null +++ b/examples/virtualizarr-with-lithops/lithops.yaml @@ -0,0 +1,14 @@ +lithops: + backend: aws_lambda + storage: aws_s3 + +aws: + region: us-west-2 + +aws_lambda: + execution_role: arn:aws:iam::807615458658:role/lambdaLithopsExecutionRole + runtime: virtualizarr-runtime + runtime_memory: 2000 + +aws_s3: + bucket: arn:aws:s3:::cubed-thodson-temp diff --git a/examples/virtualizarr-with-lithops/requirements.txt b/examples/virtualizarr-with-lithops/requirements.txt new file mode 100644 index 00000000..ba6938f8 --- /dev/null +++ b/examples/virtualizarr-with-lithops/requirements.txt @@ -0,0 +1,8 @@ +boto +cftime +h5py +kerchunk +lithops +s3fs +virtualizarr +xarray diff --git a/examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py b/examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py new file mode 100644 index 00000000..5d16bb9c --- /dev/null +++ b/examples/virtualizarr-with-lithops/virtualizarr-with-lithops.py @@ -0,0 +1,59 @@ +# Use lithops to create a virtual dataset from a collection of necdf files on s3. +# +# Inspired by Pythia's cookbook: https://projectpythia.org/kerchunk-cookbook +# by norlandrhagen. +# +# Please, contribute improvements. + +import fsspec +import lithops +import xarray as xr + +from virtualizarr import open_virtual_dataset + +# to demonstrate this workflow, we will use a collection of netcdf files from the WRF-SE-AK-AR5 project. +fs_read = fsspec.filesystem("s3", anon=True, skip_instance_cache=True) +files_paths = fs_read.glob("s3://wrf-se-ak-ar5/ccsm/rcp85/daily/2060/*") +file_pattern = sorted(["s3://" + f for f in files_paths]) + +# optionally, truncate file_pattern while debugging +# file_pattern = file_pattern[:4] + +print(f"{len(file_pattern)} file paths were retrieved.") + + +def map_references(fil): + """Map function to open virtual datasets.""" + vds = open_virtual_dataset( + fil, + indexes={}, + loadable_variables=["Time"], + cftime_variables=["Time"], + ) + return vds + + +def reduce_references(results): + """Reduce to concat virtual datasets.""" + combined_vds = xr.combine_nested( + results, + concat_dim=["Time"], + coords="minimal", + compat="override", + ) + return combined_vds + + +fexec = lithops.FunctionExecutor(config_file="lithops.yaml") + +futures = fexec.map_reduce( + map_references, + file_pattern, + reduce_references, + spawn_reducer=100, +) + +ds = futures.get_result() + +# write out the virtual dataset to a kerchunk json +ds.virtualize.to_kerchunk("combined.json", format="json") From 47a5e8702e44f71bb355bcba0ff6214fe6d09d83 Mon Sep 17 00:00:00 2001 From: Michael Sumner Date: Wed, 25 Sep 2024 00:15:23 +1000 Subject: [PATCH 17/29] Update backend.py (tiny typo) (#240) --- virtualizarr/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 73fabe74..904cad77 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -104,7 +104,7 @@ def open_virtual_dataset( if cftime_variables is not None: # It seems like stacklevel=2 is req to surface this warning. warnings.warn( - "cftime_variables is depreciated and will be ignored. 
Pass decode_times=True and loadable_variables=['time'] to decode time values to datetime objects.", + "cftime_variables is deprecated and will be ignored. Pass decode_times=True and loadable_variables=['time'] to decode time values to datetime objects.", DeprecationWarning, stacklevel=2, ) From fadeeba99bdbfe7a0393a2a6d34d08522add60f5 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Fri, 11 Oct 2024 12:54:44 -0600 Subject: [PATCH 18/29] pulled mypy into seperate workflow (#254) --- .github/workflows/main.yml | 4 ---- .github/workflows/typing.yml | 38 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/typing.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 769f59e5..0550236f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -46,10 +46,6 @@ jobs: conda env list conda list - - name: Type check - run: | - mypy virtualizarr - - name: Running Tests run: | python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose diff --git a/.github/workflows/typing.yml b/.github/workflows/typing.yml new file mode 100644 index 00000000..0540801b --- /dev/null +++ b/.github/workflows/typing.yml @@ -0,0 +1,38 @@ +name: Typing + +on: + push: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + pull_request: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + mypy: + name: mypy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install deps + run: | + # We need to test optional dep to add all the library stubs + pip install -e '.[test]' + + - name: Type check + run: | + mypy virtualizarr From 2d66e88697486890b0af55a33d64b9328d3f730c Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 11 Oct 2024 13:39:36 -0600 Subject: [PATCH 19/29] fix mypy errors around numpy functions not being strictly type hinted (#252) --- virtualizarr/manifests/array_api.py | 21 ++++++++++++++------- virtualizarr/manifests/manifest.py | 9 ++++++--- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 09606978..18f15933 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Callable, Iterable +from typing import TYPE_CHECKING, Any, Callable, Iterable, cast import numpy as np @@ -217,9 +217,12 @@ def stack( new_shape.insert(axis, length_along_new_stacked_axis) # do stacking of entries in manifest - stacked_paths = np.stack( - [arr.manifest._paths for arr in arrays], - axis=axis, + stacked_paths = cast( # `np.stack` apparently is type hinted as if the output could have Any dtype + np.ndarray[Any, np.dtypes.StringDType], + np.stack( + [arr.manifest._paths for arr in arrays], + axis=axis, + ), ) stacked_offsets = np.stack( [arr.manifest._offsets for arr in arrays], @@ -296,10 +299,14 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra ) # do broadcasting of entries in manifest - broadcasted_paths = np.broadcast_to( - x.manifest._paths, - shape=new_chunk_grid_shape, + broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype + np.ndarray[Any, np.dtypes.StringDType], + 
np.broadcast_to( + x.manifest._paths, + shape=new_chunk_grid_shape, + ), ) + broadcasted_offsets = np.broadcast_to( x.manifest._offsets, shape=new_chunk_grid_shape, diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 3aaebb41..767722b0 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -84,7 +84,7 @@ class ChunkManifest: so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks. """ - _paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined] + _paths: np.ndarray[Any, np.dtypes.StringDType] _offsets: np.ndarray[Any, np.dtype[np.uint64]] _lengths: np.ndarray[Any, np.dtype[np.uint64]] @@ -113,7 +113,10 @@ def __init__(self, entries: dict) -> None: shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks - paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined] + paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype + np.ndarray[Any, np.dtypes.StringDType], + np.empty(shape=shape, dtype=np.dtypes.StringDType()), + ) offsets = np.empty(shape=shape, dtype=np.dtype("uint64")) lengths = np.empty(shape=shape, dtype=np.dtype("uint64")) @@ -141,7 +144,7 @@ def __init__(self, entries: dict) -> None: @classmethod def from_arrays( cls, - paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined] + paths: np.ndarray[Any, np.dtypes.StringDType], offsets: np.ndarray[Any, np.dtype[np.uint64]], lengths: np.ndarray[Any, np.dtype[np.uint64]], ) -> "ChunkManifest": From ec8e4657ceb91eb5d25df214f1ac0c7a2c66815c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 16 Oct 2024 14:39:12 -0600 Subject: [PATCH 20/29] Allow `open_virtual_dataset` to read existing Kerchunk references (#251) * reading existing refs - wip * ujson stub to mypy overrides in pyproject.toml * added xfail to kerchunk json * updated reference writing to remove trailing // * MYPY TEMP DISABLED * added section to usage docs + updated releases.rst * test * remove test deps from doc.yaml build * tests passing for reading parquet references to virtual dataset, refactored _fsspec_open... to class * Update pyproject.toml Co-authored-by: Justus Magin * remove ast and replace with ujson * Update .github/workflows/main.yml Co-authored-by: Tom Nicholas * Dict -> dict, -> engine option. 
very flaky autodetection * removed version from parquet refs * adds path for invalid kerchunk format + test * updates existing references docs --------- Co-authored-by: Justus Magin Co-authored-by: Tom Nicholas --- .gitignore | 1 + ci/doc.yml | 1 - conftest.py | 14 ++++++ docs/releases.rst | 5 ++ docs/usage.md | 12 +++++ pyproject.toml | 4 ++ virtualizarr/backend.py | 50 +++++++++++++++++--- virtualizarr/manifests/manifest.py | 4 +- virtualizarr/readers/kerchunk.py | 6 +-- virtualizarr/tests/test_backend.py | 74 ++++++++++++++++++++++++++++++ virtualizarr/tests/test_utils.py | 6 +-- virtualizarr/utils.py | 59 ++++++++++++++---------- 12 files changed, 197 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index d360cfa4..d6720a7a 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,4 @@ cython_debug/ #.idea/ virtualizarr/_version.py docs/generated/ +examples/ diff --git a/ci/doc.yml b/ci/doc.yml index 7d7e9224..ccc3ded6 100644 --- a/ci/doc.yml +++ b/ci/doc.yml @@ -13,4 +13,3 @@ dependencies: - "sphinx_design" - "sphinx_togglebutton" - "sphinx-autodoc-typehints" - - -e "..[test]" diff --git a/conftest.py b/conftest.py index 32b3581f..3af4bf06 100644 --- a/conftest.py +++ b/conftest.py @@ -33,6 +33,20 @@ def netcdf4_file(tmpdir): return filepath +@pytest.fixture +def netcdf4_virtual_dataset(netcdf4_file): + from virtualizarr import open_virtual_dataset + + return open_virtual_dataset(netcdf4_file, indexes={}) + + +@pytest.fixture +def netcdf4_inlined_ref(netcdf4_file): + from kerchunk.hdf import SingleHdf5ToZarr + + return SingleHdf5ToZarr(netcdf4_file, inline_threshold=1000).translate() + + @pytest.fixture def hdf5_groups_file(tmpdir): # Set up example xarray dataset diff --git a/docs/releases.rst b/docs/releases.rst index ec057807..622f01e0 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -8,6 +8,11 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ + + +- Can open `kerchunk` reference files with ``open_virtual_dataset``. + (:pull:`251`, :pull:`186`) By `Raphael Hagen `_ & `Kristen Thyng `_. + - Adds defaults for `open_virtual_dataset_from_v3_store` in (:pull:`234`) By `Raphael Hagen `_. diff --git a/docs/usage.md b/docs/usage.md index 40071b8a..a0f9d058 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -421,6 +421,18 @@ Currently there are not yet any zarr v3 readers which understand the chunk manif This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`. ``` +## Opening Kerchunk references as virtual datasets + +You can open existing Kerchunk `json` or `parquet` references as Virtualizarr virtual datasets. This may be useful for converting existing Kerchunk formatted references to storage formats like [Icechunk](https://icechunk.io/). + +```python + +vds = open_virtual_dataset('combined.json', format='kerchunk') +# or +vds = open_virtual_dataset('combined.parquet', format='kerchunk') + +``` + ## Rewriting existing manifests Sometimes it can be useful to rewrite the contents of an already-generated manifest or virtual dataset. diff --git a/pyproject.toml b/pyproject.toml index 6b0efe89..5af632ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,10 @@ ignore_missing_imports = true module = "kerchunk.*" ignore_missing_imports = true +[[tool.mypy.overrides]] +module = "ujson.*" +ignore_missing_imports = true + [tool.ruff] # Same as Black. 
line-length = 88 diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 904cad77..4da9e896 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -16,7 +16,8 @@ from xarray.core.variable import IndexVariable from virtualizarr.manifests import ManifestArray -from virtualizarr.utils import _fsspec_openfile_from_filepath +from virtualizarr.types.kerchunk import KerchunkStoreRefs +from virtualizarr.utils import _FsspecFSFromFilepath XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore @@ -39,6 +40,7 @@ class FileType(AutoName): zarr = auto() dmrpp = auto() zarr_v3 = auto() + kerchunk = auto() class ManifestBackendArray(ManifestArray, BackendArray): @@ -67,13 +69,14 @@ def open_virtual_dataset( Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``. + Parameters ---------- filepath : str, default None File path to open as a set of virtualized zarr arrays. filetype : FileType, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk'}. If not provided will attempt to automatically infer the correct filetype from header bytes. group : str, default is None Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”. @@ -133,9 +136,44 @@ def open_virtual_dataset( raise NotImplementedError() # if filetype is user defined, convert to FileType + if filetype is not None: filetype = FileType(filetype) + if filetype == FileType.kerchunk: + from virtualizarr.readers.kerchunk import dataset_from_kerchunk_refs + + fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) + + # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. + if fs.filepath.endswith("ref.parquet"): + from fsspec.implementations.reference import LazyReferenceMapper + + lrm = LazyReferenceMapper(filepath, fs.fs) + + # build reference dict from KV pairs in LazyReferenceMapper + # is there a better / more preformant way to extract this? + array_refs = {k: lrm[k] for k in lrm.keys()} + + full_reference = {"refs": array_refs} + + return dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference)) + + # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': + # https://fsspec.github.io/kerchunk/spec.html + elif fs.read_bytes(9).startswith(b'{"version'): + import ujson + + with fs.open_file() as of: + refs = ujson.load(of) + + return dataset_from_kerchunk_refs(KerchunkStoreRefs(refs)) + + else: + raise ValueError( + "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" + ) + if filetype == FileType.zarr_v3: # TODO is there a neat way of auto-detecting this? from virtualizarr.readers.zarr import open_virtual_dataset_from_v3_store @@ -151,9 +189,9 @@ def open_virtual_dataset( "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." 
) - fpath = _fsspec_openfile_from_filepath( + fpath = _FsspecFSFromFilepath( filepath=filepath, reader_options=reader_options - ) + ).open_file() parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp")) vds = parser.parse_dataset() vds.drop_vars(drop_variables) @@ -189,9 +227,9 @@ def open_virtual_dataset( # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _fsspec_openfile_from_filepath( + fpath = _FsspecFSFromFilepath( filepath=filepath, reader_options=reader_options - ) + ).open_file() # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 767722b0..a6d160ed 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -309,7 +309,9 @@ def _from_kerchunk_chunk_dict( chunk_entries: dict[ChunkKey, ChunkDictEntry] = {} for k, v in kerchunk_chunk_dict.items(): if isinstance(v, (str, bytes)): - raise NotImplementedError("TODO: handle inlined data") + raise NotImplementedError( + "Reading inlined reference data is currently not supported. [ToDo]" + ) elif not isinstance(v, (tuple, list)): raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}") chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict() diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 19a8c28d..c274ee5a 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -13,7 +13,7 @@ KerchunkArrRefs, KerchunkStoreRefs, ) -from virtualizarr.utils import _fsspec_openfile_from_filepath +from virtualizarr.utils import _FsspecFSFromFilepath from virtualizarr.zarr import ZArray, ZAttrs @@ -28,9 +28,9 @@ def _automatically_determine_filetype( raise NotImplementedError() # Read magic bytes from local or remote file - fpath = _fsspec_openfile_from_filepath( + fpath = _FsspecFSFromFilepath( filepath=filepath, reader_options=reader_options - ) + ).open_file() magic_bytes = fpath.read(8) fpath.close() diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index e42ad9ac..731c4acc 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -336,3 +336,77 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): vds = open_virtual_dataset(hdf5_scalar) assert vds.scalar.dims == () assert vds.scalar.attrs == {"scalar": "true"} + + +@pytest.mark.parametrize( + "reference_format", + ["json", "parquet", "invalid"], +) +def test_open_virtual_dataset_existing_kerchunk_refs( + tmp_path, netcdf4_virtual_dataset, reference_format +): + example_reference_dict = netcdf4_virtual_dataset.virtualize.to_kerchunk( + format="dict" + ) + + if reference_format == "invalid": + # Test invalid file format leads to ValueError + ref_filepath = tmp_path / "ref.csv" + with open(ref_filepath.as_posix(), mode="w") as of: + of.write("tmp") + + with pytest.raises(ValueError): + open_virtual_dataset( + filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={} + ) + + else: + # Test valid json and parquet reference formats + + if reference_format == "json": + ref_filepath = tmp_path / "ref.json" + + import ujson + + 
with open(ref_filepath, "w") as json_file: + ujson.dump(example_reference_dict, json_file) + + if reference_format == "parquet": + from kerchunk.df import refs_to_dataframe + + ref_filepath = tmp_path / "ref.parquet" + refs_to_dataframe(fo=example_reference_dict, url=ref_filepath.as_posix()) + + vds = open_virtual_dataset( + filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={} + ) + + # Inconsistent results! https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202 + # assert vds.virtualize.to_kerchunk(format='dict') == example_reference_dict + refs = vds.virtualize.to_kerchunk(format="dict") + expected_refs = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict") + assert refs["refs"]["air/0.0.0"] == expected_refs["refs"]["air/0.0.0"] + assert refs["refs"]["lon/0"] == expected_refs["refs"]["lon/0"] + assert refs["refs"]["lat/0"] == expected_refs["refs"]["lat/0"] + assert refs["refs"]["time/0"] == expected_refs["refs"]["time/0"] + + assert list(vds) == list(netcdf4_virtual_dataset) + assert set(vds.coords) == set(netcdf4_virtual_dataset.coords) + assert set(vds.variables) == set(netcdf4_virtual_dataset.variables) + + +def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref): + # For now, we raise a NotImplementedError if we read existing references that have inlined data + # https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932 + + ref_filepath = tmp_path / "ref.json" + + import ujson + + with open(ref_filepath, "w") as json_file: + ujson.dump(netcdf4_inlined_ref, json_file) + + with pytest.raises(NotImplementedError): + open_virtual_dataset( + filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={} + ) diff --git a/virtualizarr/tests/test_utils.py b/virtualizarr/tests/test_utils.py index ed204c16..d42c2288 100644 --- a/virtualizarr/tests/test_utils.py +++ b/virtualizarr/tests/test_utils.py @@ -7,7 +7,7 @@ import pytest import xarray as xr -from virtualizarr.utils import _fsspec_openfile_from_filepath +from virtualizarr.utils import _FsspecFSFromFilepath @pytest.fixture @@ -21,7 +21,7 @@ def test_fsspec_openfile_from_path(tmp_path: pathlib.Path, dataset: xr.Dataset) f = tmp_path / "dataset.nc" dataset.to_netcdf(f) - result = _fsspec_openfile_from_filepath(filepath=f.as_posix()) + result = _FsspecFSFromFilepath(filepath=f.as_posix()).open_file() assert isinstance(result, fsspec.implementations.local.LocalFileOpener) @@ -32,6 +32,6 @@ def test_fsspec_openfile_memory(dataset: xr.Dataset): with fs.open("dataset.nc", mode="wb") as f: dataset.to_netcdf(f, engine="h5netcdf") - result = _fsspec_openfile_from_filepath(filepath="memory://dataset.nc") + result = _FsspecFSFromFilepath(filepath="memory://dataset.nc").open_file() with result: assert isinstance(result, fsspec.implementations.memory.MemoryFile) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 092ddd25..1721a3e7 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -13,42 +13,51 @@ ] -def _fsspec_openfile_from_filepath( - *, - filepath: str, - reader_options: Optional[dict] = None, -) -> OpenFileType: - """Converts input filepath to fsspec openfile object. +from dataclasses import dataclass, field + + +@dataclass +class _FsspecFSFromFilepath: + """Class to create fsspec Filesystem from input filepath. 
Parameters ---------- filepath : str Input filepath reader_options : dict, optional - Dict containing kwargs to pass to file opener, by default {} - - Returns - ------- - OpenFileType - An open file-like object, specific to the protocol supplied in filepath. + dict containing kwargs to pass to file opener, by default {} + fs : Option | None + The fsspec filesystem object, created in __post_init__ - Raises - ------ - NotImplementedError - Raises a Not Implemented Error if filepath protocol is not supported. """ - import fsspec - from upath import UPath + filepath: str + reader_options: Optional[dict] = field(default_factory=dict) + fs: fsspec.AbstractFileSystem = field(init=False) + + def open_file(self) -> OpenFileType: + """Calls `.open` on fsspec.Filesystem instantiation using self.filepath as an input. + + Returns + ------- + OpenFileType + file opened with fsspec + """ + return self.fs.open(self.filepath) - universal_filepath = UPath(filepath) - protocol = universal_filepath.protocol + def read_bytes(self, bytes: int) -> bytes: + with self.open_file() as of: + return of.read(bytes) - if reader_options is None: - reader_options = {} + def __post_init__(self) -> None: + """Initialize the fsspec filesystem object""" + import fsspec + from upath import UPath - storage_options = reader_options.get("storage_options", {}) # type: ignore + universal_filepath = UPath(self.filepath) + protocol = universal_filepath.protocol - fpath = fsspec.filesystem(protocol, **storage_options).open(filepath) + self.reader_options = self.reader_options or {} + storage_options = self.reader_options.get("storage_options", {}) # type: ignore - return fpath + self.fs = fsspec.filesystem(protocol, **storage_options) From e6407e01e4dc2536a64c3a418a59ead30bb353bc Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 17 Oct 2024 14:28:58 -0600 Subject: [PATCH 21/29] Skip tests that require kerchunk (#259) * try making ujson an optional dep * skip all tests which require kerchunk * add new CI job * rename git workflow * move numcodecs import inside * add numcodecs to CI env * make ujson required * unskip tests for parsing in-memory kerchunk dicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add ujson to CI environment * in-memory roundtrip doesn't require kerchunk * move in-memory kerchunk roundtrip test to test_integration.py * remove now-empty test_kerchunk.py file --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/min-deps.yml | 60 +++++++++++++++++++ ci/min-deps.yml | 26 ++++++++ pyproject.toml | 7 ++- virtualizarr/accessor.py | 3 +- virtualizarr/manifests/manifest.py | 3 +- virtualizarr/readers/kerchunk.py | 3 +- virtualizarr/tests/__init__.py | 2 + virtualizarr/tests/test_backend.py | 19 +++++- virtualizarr/tests/test_integration.py | 48 ++++++++++++++- virtualizarr/tests/test_kerchunk.py | 46 -------------- .../tests/test_manifests/test_array.py | 4 +- .../tests/test_readers/test_kerchunk.py | 3 +- virtualizarr/tests/test_utils.py | 2 + .../tests/test_writers/test_kerchunk.py | 7 ++- virtualizarr/tests/test_xarray.py | 3 + virtualizarr/writers/kerchunk.py | 3 +- virtualizarr/zarr.py | 6 +- 17 files changed, 183 insertions(+), 62 deletions(-) create mode 100644 .github/workflows/min-deps.yml create mode 100644 ci/min-deps.yml delete mode 100644 virtualizarr/tests/test_kerchunk.py diff --git a/.github/workflows/min-deps.yml b/.github/workflows/min-deps.yml new file mode 100644 index 
00000000..066e1ba3 --- /dev/null +++ b/.github/workflows/min-deps.yml @@ -0,0 +1,60 @@ +name: min-deps + +on: + push: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + pull_request: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + + test: + name: ${{ matrix.python-version }}-build + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Setup micromamba + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: ci/min-deps.yml + cache-environment: true + create-args: >- + python=${{matrix.python-version}} + + - name: Install virtualizarr + run: | + python -m pip install -e . --no-deps + - name: Conda list information + run: | + conda env list + conda list + + - name: Running Tests + run: | + python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose + + - name: Upload code coverage to Codecov + uses: codecov/codecov-action@v3.1.4 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false diff --git a/ci/min-deps.yml b/ci/min-deps.yml new file mode 100644 index 00000000..7ca8c0b3 --- /dev/null +++ b/ci/min-deps.yml @@ -0,0 +1,26 @@ +name: virtualizarr-min-deps +channels: + - conda-forge + - nodefaults +dependencies: + - h5netcdf + - h5py + - hdf5 + - netcdf4 + - xarray>=2024.6.0 + - numpy>=2.0.0 + - numcodecs + - packaging + - ujson + - universal_pathlib + # Testing + - codecov + - pre-commit + - mypy + - ruff + - pandas-stubs + - pytest-mypy + - pytest-cov + - pytest + - pooch + - fsspec diff --git a/pyproject.toml b/pyproject.toml index 5af632ce..d216b269 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,12 +22,11 @@ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ "xarray>=2024.06.0", - "kerchunk>=0.2.5", - "h5netcdf", "numpy>=2.0.0", - "ujson", "packaging", "universal-pathlib", + "numcodecs", + "ujson", ] [project.optional-dependencies] @@ -35,7 +34,9 @@ test = [ "codecov", "fastparquet", "fsspec", + "h5netcdf", "h5py", + "kerchunk>=0.2.5", "mypy", "netcdf4", "pandas-stubs", diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py index 0a97237e..cc251e63 100644 --- a/virtualizarr/accessor.py +++ b/virtualizarr/accessor.py @@ -5,7 +5,6 @@ overload, ) -import ujson # type: ignore from xarray import Dataset, register_dataset_accessor from virtualizarr.manifests import ManifestArray @@ -91,6 +90,8 @@ def to_kerchunk( if format == "dict": return refs elif format == "json": + import ujson + if filepath is None: raise ValueError("Filepath must be provided when format is 'json'") diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index a6d160ed..88ac9a91 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast import numpy as np -from upath import UPath from virtualizarr.types import ChunkKey @@ -41,6 +40,8 @@ class ChunkEntry: def from_kerchunk( cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] ) -> "ChunkEntry": + from upath import UPath + if len(path_and_byte_range_info) == 1: path = path_and_byte_range_info[0] offset = 0 diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index c274ee5a..d3632b68 100644 --- 
a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import Any, MutableMapping, Optional, cast -import ujson # type: ignore from xarray import Dataset from xarray.core.indexes import Index from xarray.core.variable import Variable @@ -300,6 +299,8 @@ def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs: """ Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 . """ + import ujson + sanitized = d.copy() for k, v in d.items(): if k.startswith("."): diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 7df13d10..70f613ce 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -33,7 +33,9 @@ def _importorskip( has_astropy, requires_astropy = _importorskip("astropy") +has_kerchunk, requires_kerchunk = _importorskip("kerchunk") has_s3fs, requires_s3fs = _importorskip("s3fs") +has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 731c4acc..81a23e0c 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -1,7 +1,6 @@ from collections.abc import Mapping from unittest.mock import patch -import fsspec import numpy as np import pytest import xarray as xr @@ -13,9 +12,17 @@ from virtualizarr.backend import FileType from virtualizarr.manifests import ManifestArray from virtualizarr.readers.kerchunk import _automatically_determine_filetype -from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs +from virtualizarr.tests import ( + has_astropy, + has_tifffile, + network, + requires_kerchunk, + requires_s3fs, + requires_scipy, +) +@requires_scipy def test_automatically_determine_filetype_netcdf3_netcdf4(): # test the NetCDF3 vs NetCDF4 automatic file type selection @@ -75,6 +82,7 @@ def test_FileType(): FileType(None) +@requires_kerchunk class TestOpenVirtualDatasetIndexes: def test_no_indexes(self, netcdf4_file): vds = open_virtual_dataset(netcdf4_file, indexes={}) @@ -105,6 +113,7 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I return True +@requires_kerchunk def test_cftime_index(tmpdir): """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 @@ -130,6 +139,7 @@ def test_cftime_index(tmpdir): assert vds.attrs == ds.attrs +@requires_kerchunk class TestOpenVirtualDatasetAttrs: def test_drop_array_dimensions(self, netcdf4_file): # regression test for GH issue #150 @@ -237,6 +247,8 @@ def test_read_from_url(self, filetype, url): assert isinstance(vds, xr.Dataset) def test_virtualizarr_vs_local_nisar(self): + import fsspec + # Open group directly from locally cached file with xarray url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5" tmpfile = fsspec.open_local( @@ -266,6 +278,7 @@ def test_virtualizarr_vs_local_nisar(self): xrt.assert_equal(dsXR, dsV) +@requires_kerchunk class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): vars_to_load = ["air", "time"] @@ -338,6 +351,7 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): 
assert vds.scalar.attrs == {"scalar": "true"} +@requires_kerchunk @pytest.mark.parametrize( "reference_format", ["json", "parquet", "invalid"], @@ -395,6 +409,7 @@ def test_open_virtual_dataset_existing_kerchunk_refs( assert set(vds.variables) == set(netcdf4_virtual_dataset.variables) +@requires_kerchunk def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref): # For now, we raise a NotImplementedError if we read existing references that have inlined data # https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932 diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 5894f643..434d12d7 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -4,11 +4,53 @@ import xarray.testing as xrt from virtualizarr import open_virtual_dataset -from virtualizarr.manifests.array import ManifestArray -from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.kerchunk import ( + dataset_from_kerchunk_refs, + find_var_names, +) +from virtualizarr.tests import requires_kerchunk from virtualizarr.zarr import ZArray +def test_kerchunk_roundtrip_in_memory_no_concat(): + # Set up example xarray dataset + chunks_dict = { + "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, + } + manifest = ChunkManifest(entries=chunks_dict) + marr = ManifestArray( + zarray=dict( + shape=(2, 4), + dtype=np.dtype(" KerchunkStoreRefs: Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). """ + import ujson + all_arr_refs = {} for var_name, var in ds.variables.items(): arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index f62b1269..cd83a67d 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,9 +1,7 @@ import dataclasses from typing import TYPE_CHECKING, Any, Literal, NewType, cast -import numcodecs import numpy as np -import ujson # type: ignore if TYPE_CHECKING: pass @@ -100,6 +98,8 @@ def dict(self) -> dict[str, Any]: return zarray_dict def to_kerchunk_json(self) -> str: + import ujson + zarray_dict = self.dict() if zarray_dict["fill_value"] is np.nan: zarray_dict["fill_value"] = None @@ -153,6 +153,8 @@ def _v3_codec_pipeline(self) -> list: post_compressor: Iterable[BytesBytesCodec] #optional ``` """ + import numcodecs + if self.filters: filter_codecs_configs = [ numcodecs.get_codec(filter).get_config() for filter in self.filters From 7053bc02677b31177c76904d6c90a594cff30598 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Fri, 18 Oct 2024 17:27:53 +0200 Subject: [PATCH 22/29] allow creating references for empty archival datasets (#260) * raise a more user-friendly error for empty variables * add a test to make sure the error is raised * create a empty manifest array instead * also allow writing empty chunk manifests * try using an annotated variable * explicitly cast instead * switch the order of `cast` arguments * use the main constructor instead * forgotten call of `ChunkManifest.empty` * explanatory comment * rename the reader test * check that concatenating works * check that stacking works with missing chunk definitions * check that broadcasting works * use empty arrays instead of 0-sized if shape given * pass the chunk grid shape for all empty chunk manifests * don't allow empty 
chunks if no chunk grid shape given * move `ujson` to top-level * replace the manual floor division * release note * fix a couple of changelog entries --- docs/releases.rst | 7 +- virtualizarr/manifests/manifest.py | 12 +- virtualizarr/readers/kerchunk.py | 15 ++- .../tests/test_manifests/test_array.py | 108 ++++++++++++++++++ .../tests/test_manifests/test_manifest.py | 8 ++ .../tests/test_readers/test_kerchunk.py | 24 +++- .../tests/test_writers/test_kerchunk.py | 29 +++++ 7 files changed, 194 insertions(+), 9 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 622f01e0..ae28fbbe 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -28,6 +28,9 @@ New Features - Load scalar variables by default. (:pull:`205`) By `Gustavo Hidalgo `_. +- Support empty files (:pull:`260`) + By `Justus Magin `_. + Breaking changes ~~~~~~~~~~~~~~~~ @@ -35,7 +38,7 @@ Breaking changes By `Gustavo Hidalgo `_. - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) -- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) +- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`) Deprecations ~~~~~~~~~~~~ @@ -55,7 +58,7 @@ Bug fixes Documentation ~~~~~~~~~~~~~ -- Adds virtualizarr + coiled serverless example notebook (:pull`223`) +- Adds virtualizarr + coiled serverless example notebook (:pull:`223`) By `Raphael Hagen `_. diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 88ac9a91..1933844a 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -89,7 +89,7 @@ class ChunkManifest: _offsets: np.ndarray[Any, np.dtype[np.uint64]] _lengths: np.ndarray[Any, np.dtype[np.uint64]] - def __init__(self, entries: dict) -> None: + def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None: """ Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges. @@ -105,13 +105,14 @@ def __init__(self, entries: dict) -> None: "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } """ + if shape is None and not entries: + raise ValueError("need a chunk grid shape if no chunks given") # TODO do some input validation here first? validate_chunk_keys(entries.keys()) - # TODO should we actually optionally pass chunk grid shape in, - # in case there are not enough chunks to give correct idea of full shape? - shape = get_chunk_grid_shape(entries.keys()) + if shape is None: + shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype @@ -386,6 +387,9 @@ def get_ndim_from_key(key: str) -> int: def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]): + if not chunk_keys: + return + # Check if all keys have the correct form for key in chunk_keys: if not re.match(_CHUNK_KEY, key): diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index d3632b68..a8740b19 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -13,7 +13,7 @@ KerchunkStoreRefs, ) from virtualizarr.utils import _FsspecFSFromFilepath -from virtualizarr.zarr import ZArray, ZAttrs +from virtualizarr.zarr import ZArray, ZAttrs, ceildiv # TODO shouldn't this live in backend.py? Because it's not just useful for the kerchunk-specific readers... 
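To illustrate the empty-manifest support introduced in this patch: a `ChunkManifest` built from no entries now only needs an explicit chunk grid `shape`, and the resulting `ManifestArray` still participates in array operations while its manifest stays empty. The sketch below mirrors the tests added in this patch; the `zlib` compressor settings, dtype, and shapes are illustrative values only, not part of the change itself.

```python
import numpy as np

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.zarr import ZArray

# Metadata for a 1D int32 variable of length 3, chunked into lengths of 2,
# i.e. a chunk grid of shape (2,) -- all values here are illustrative.
zarray = ZArray(
    chunks=(2,),
    compressor={"id": "zlib", "level": 1},
    dtype=np.dtype("int32"),
    fill_value=0.0,
    filters=None,
    order="C",
    shape=(3,),
    zarr_format=2,
)

# No physical chunks exist, so the chunk grid shape must be passed explicitly.
manifest = ChunkManifest(entries={}, shape=(2,))
marr = ManifestArray(zarray=zarray, chunkmanifest=manifest)

# Array operations still work; the broadcast result keeps an empty manifest.
expanded = np.broadcast_to(marr, shape=(5, 3))
assert expanded.shape == (5, 3)
assert expanded.manifest.dict() == {}
```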
@@ -230,6 +230,13 @@ def dataset_from_kerchunk_refs( return vds +def determine_chunk_grid_shape(zarray): + return tuple( + ceildiv(length, chunksize) + for length, chunksize in zip(zarray.shape, zarray.chunks) + ) + + def variable_from_kerchunk_refs( refs: KerchunkStoreRefs, var_name: str, virtual_array_class ) -> Variable: @@ -242,6 +249,12 @@ def variable_from_kerchunk_refs( if chunk_dict: manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + elif len(zarray.shape) != 0: + # empty variables don't have physical chunks, but zarray shows that the variable + # is at least 1D + shape = determine_chunk_grid_shape(zarray) + manifest = ChunkManifest(entries={}, shape=shape) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) else: # This means we encountered a scalar variable of dimension 0, # very likely that it actually has no numeric value and its only purpose diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 9031195f..f3a9ee9f 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -203,6 +203,38 @@ def test_broadcast_any_shape(self, shape, chunks, target_shape): for len_arr, len_chunk in zip(broadcasted_marr.shape, broadcasted_chunk_shape): assert len_chunk <= len_arr + @pytest.mark.parametrize( + "shape, chunks, grid_shape, target_shape", + [ + ((1,), (1,), (1,), (3,)), + ((2,), (1,), (2,), (2,)), + ((3,), (2,), (2,), (5, 4, 3)), + ((3, 1), (2, 1), (2, 1), (2, 3, 4)), + ], + ) + def test_broadcast_empty(self, shape, chunks, grid_shape, target_shape): + zarray = ZArray( + chunks=chunks, + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=shape, + zarr_format=2, + ) + manifest = ChunkManifest(entries={}, shape=grid_shape) + marr = ManifestArray(zarray, manifest) + + expanded = np.broadcast_to(marr, shape=target_shape) + assert expanded.shape == target_shape + assert len(expanded.chunks) == expanded.ndim + assert all( + len_chunk <= len_arr + for len_arr, len_chunk in zip(expanded.shape, expanded.chunks) + ) + assert expanded.manifest.dict() == {} + # TODO we really need some kind of fixtures to generate useful example data # The hard part is having an alternative way to get to the expected result of concatenation @@ -250,6 +282,44 @@ def test_concat(self): assert result.zarray.order == zarray.order assert result.zarray.zarr_format == zarray.zarr_format + def test_concat_empty(self): + # both manifest arrays in this example have the same zarray properties + zarray = ZArray( + chunks=(5, 1, 10), + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=(5, 1, 20), + zarr_format=2, + ) + + chunks_dict1 = {} + manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 1, 2)) + marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) + + chunks_dict2 = { + "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + manifest2 = ChunkManifest(entries=chunks_dict2) + marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) + + result = np.concatenate([marr1, marr2], axis=1) + + assert result.shape == (5, 2, 20) + assert result.chunks == (5, 1, 10) + assert result.manifest.dict() == { + "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "foo.nc", 
"offset": 400, "length": 100}, + } + assert result.zarray.compressor == zarray.compressor + assert result.zarray.filters == zarray.filters + assert result.zarray.fill_value == zarray.fill_value + assert result.zarray.order == zarray.order + assert result.zarray.zarr_format == zarray.zarr_format + class TestStack: def test_stack(self): @@ -295,6 +365,44 @@ def test_stack(self): assert result.zarray.order == zarray.order assert result.zarray.zarr_format == zarray.zarr_format + def test_stack_empty(self): + # both manifest arrays in this example have the same zarray properties + zarray = ZArray( + chunks=(5, 10), + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=(5, 20), + zarr_format=2, + ) + + chunks_dict1 = {} + manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 2)) + marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) + + chunks_dict2 = { + "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + manifest2 = ChunkManifest(entries=chunks_dict2) + marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) + + result = np.stack([marr1, marr2], axis=1) + + assert result.shape == (5, 2, 20) + assert result.chunks == (5, 1, 10) + assert result.manifest.dict() == { + "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + assert result.zarray.compressor == zarray.compressor + assert result.zarray.filters == zarray.filters + assert result.zarray.fill_value == zarray.fill_value + assert result.zarray.order == zarray.order + assert result.zarray.zarr_format == zarray.zarr_format + def test_refuse_combine(): # TODO test refusing to concatenate arrays that have conflicting shapes / chunk sizes diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index fb099413..3e084e64 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -20,6 +20,14 @@ def test_create_manifest(self): manifest = ChunkManifest(entries=chunks) assert manifest.dict() == chunks + chunks = {} + manifest = ChunkManifest(entries=chunks, shape=(2, 2)) + assert manifest.dict() == chunks + + def test_create_manifest_empty_missing_shape(self): + with pytest.raises(ValueError, match="chunk grid shape if no chunks"): + ChunkManifest(entries={}) + def test_invalid_chunk_entries(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc"}, diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py index 0faa1ff2..50d4b19b 100644 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ b/virtualizarr/tests/test_readers/test_kerchunk.py @@ -1,4 +1,5 @@ import numpy as np +import ujson from virtualizarr.manifests import ManifestArray from virtualizarr.readers.kerchunk import ( @@ -45,8 +46,6 @@ def test_dataset_from_df_refs(): def test_dataset_from_df_refs_with_filters(): - import ujson - filters = [{"elementsize": 4, "id": "shuffle"}, {"id": "zlib", "level": 4}] zarray = { "chunks": [2, 3], @@ -62,3 +61,24 @@ def test_dataset_from_df_refs_with_filters(): ds = dataset_from_kerchunk_refs(ds_refs) da = ds["a"] assert da.data.zarray.filters == filters + + +def test_dataset_from_kerchunk_refs_empty_chunk_manifest(): + zarray = { + "chunks": [50, 100], + "compressor": None, + "dtype": " Date: Fri, 18 Oct 2024 18:42:08 -0600 Subject: [PATCH 23/29] 
Split kerchunk reader up (#261) * standardize zarr v3 and dmrpp readers behind dedicated open_virtual_dataset functions * refactor hdf5 reader behind open_virtual_dataset function * refactor netcdf3 * refactor tiff * refactor fits * refactored so create VirtualBackends * restore backend.py, but keep readers/common.py * oops I deleted a file * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * standardize open_virtual_dataset method signature, and raise NotImplemented * fix bug with zarr reader * remove todo * make open_virtual_dataset a staticmethod * try to fix mypy error about importing DataTree from versions of xarray where it doesn't exist * mypy * sanitize drop_variables and loadable_variables * implement drop_variables for kerchunk reader * sanitize drmpp args * pass all arguments to kerchunk reader * coerce kerchunk refs to our types * make sure all readers are passed the same set of args * fix bad merge, and refactor determine_chunk_grid_shape a bit * ensure decode_times is passed to each reader * remove match case statement in favour of mapping * ensure optional dependencies aren't imported * release note --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/releases.rst | 2 + virtualizarr/backend.py | 311 +++++----------- virtualizarr/manifests/array.py | 13 +- virtualizarr/manifests/array_api.py | 7 +- virtualizarr/readers/__init__.py | 17 + virtualizarr/readers/common.py | 195 +++++++++++ virtualizarr/readers/dmrpp.py | 63 +++- virtualizarr/readers/fits.py | 59 ++++ virtualizarr/readers/hdf5.py | 64 ++++ virtualizarr/readers/kerchunk.py | 350 +++---------------- virtualizarr/readers/netcdf3.py | 62 ++++ virtualizarr/readers/tiff.py | 73 ++++ virtualizarr/readers/zarr.py | 131 ------- virtualizarr/readers/zarr_v3.py | 154 ++++++++ virtualizarr/tests/test_backend.py | 14 +- virtualizarr/tests/test_integration.py | 4 +- virtualizarr/tests/test_writers/test_zarr.py | 2 +- virtualizarr/translators/__init__.py | 0 virtualizarr/translators/kerchunk.py | 223 ++++++++++++ virtualizarr/utils.py | 27 +- virtualizarr/zarr.py | 6 + 21 files changed, 1090 insertions(+), 687 deletions(-) create mode 100644 virtualizarr/readers/__init__.py create mode 100644 virtualizarr/readers/common.py create mode 100644 virtualizarr/readers/fits.py create mode 100644 virtualizarr/readers/hdf5.py create mode 100644 virtualizarr/readers/netcdf3.py create mode 100644 virtualizarr/readers/tiff.py delete mode 100644 virtualizarr/readers/zarr.py create mode 100644 virtualizarr/readers/zarr_v3.py create mode 100644 virtualizarr/translators/__init__.py create mode 100644 virtualizarr/translators/kerchunk.py diff --git a/docs/releases.rst b/docs/releases.rst index ae28fbbe..ee1ae402 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -67,6 +67,8 @@ Internal Changes - Refactored internal structure significantly to split up everything to do with reading references from that to do with writing references. (:issue:`229`) (:pull:`231`) By `Tom Nicholas `_. +- Refactored readers to consider every filetype as a separate reader, all standardized to present the same `open_virtual_dataset` interface internally. + (:pull:`261`) By `Tom Nicholas `_. .. 
_v1.0.0: diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 4da9e896..0322f604 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -1,25 +1,39 @@ -import os import warnings -from collections.abc import Iterable, Mapping, MutableMapping +from collections.abc import Iterable, Mapping from enum import Enum, auto -from io import BufferedIOBase +from pathlib import Path from typing import ( Any, - Hashable, Optional, - cast, ) -import xarray as xr -from xarray.backends import AbstractDataStore, BackendArray -from xarray.core.indexes import Index, PandasIndex -from xarray.core.variable import IndexVariable +from xarray import Dataset +from xarray.core.indexes import Index from virtualizarr.manifests import ManifestArray -from virtualizarr.types.kerchunk import KerchunkStoreRefs -from virtualizarr.utils import _FsspecFSFromFilepath - -XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore +from virtualizarr.readers import ( + DMRPPVirtualBackend, + FITSVirtualBackend, + HDF5VirtualBackend, + KerchunkVirtualBackend, + NetCDF3VirtualBackend, + TIFFVirtualBackend, + ZarrV3VirtualBackend, +) +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions + +# TODO add entrypoint to allow external libraries to add to this mapping +VIRTUAL_BACKENDS = { + "kerchunk": KerchunkVirtualBackend, + "zarr_v3": ZarrV3VirtualBackend, + "dmrpp": DMRPPVirtualBackend, + # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends) + "netcdf3": NetCDF3VirtualBackend, + "hdf5": HDF5VirtualBackend, + "netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5 + "tiff": TIFFVirtualBackend, + "fits": FITSVirtualBackend, +} class AutoName(Enum): @@ -43,10 +57,49 @@ class FileType(AutoName): kerchunk = auto() -class ManifestBackendArray(ManifestArray, BackendArray): - """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" +def automatically_determine_filetype( + *, + filepath: str, + reader_options: Optional[dict[str, Any]] = {}, +) -> FileType: + """ + Attempt to automatically infer the correct reader for this filetype. + + Uses magic bytes and file / directory suffixes. + """ - ... + # TODO this should ideally handle every filetype that we have a reader for, not just kerchunk + + # TODO how do we handle kerchunk json / parquet here? + if Path(filepath).suffix == ".zarr": + # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... 
+ raise NotImplementedError() + + # Read magic bytes from local or remote file + fpath = _FsspecFSFromFilepath( + filepath=filepath, reader_options=reader_options + ).open_file() + magic_bytes = fpath.read(8) + fpath.close() + + if magic_bytes.startswith(b"CDF"): + filetype = FileType.netcdf3 + elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): + raise NotImplementedError("HDF4 formatted files not supported") + elif magic_bytes.startswith(b"\x89HDF"): + filetype = FileType.hdf5 + elif magic_bytes.startswith(b"GRIB"): + filetype = FileType.grib + elif magic_bytes.startswith(b"II*"): + filetype = FileType.tiff + elif magic_bytes.startswith(b"SIMPLE"): + filetype = FileType.fits + else: + raise NotImplementedError( + f"Unrecognised file based on header bytes: {magic_bytes}" + ) + + return filetype def open_virtual_dataset( @@ -61,7 +114,7 @@ def open_virtual_dataset( indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, reader_options: Optional[dict] = None, -) -> xr.Dataset: +) -> Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -69,7 +122,6 @@ def open_virtual_dataset( Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``. - Parameters ---------- filepath : str, default None @@ -112,217 +164,38 @@ def open_virtual_dataset( stacklevel=2, ) - loadable_vars: dict[str, xr.Variable] - virtual_vars: dict[str, xr.Variable] - vars: dict[str, xr.Variable] - - if drop_variables is None: - drop_variables = [] - elif isinstance(drop_variables, str): - drop_variables = [drop_variables] - else: - drop_variables = list(drop_variables) - if loadable_variables is None: - loadable_variables = [] - elif isinstance(loadable_variables, str): - loadable_variables = [loadable_variables] - else: - loadable_variables = list(loadable_variables) - common = set(drop_variables).intersection(set(loadable_variables)) - if common: - raise ValueError(f"Cannot both load and drop variables {common}") + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) if virtual_array_class is not ManifestArray: raise NotImplementedError() - # if filetype is user defined, convert to FileType + if reader_options is None: + reader_options = {} if filetype is not None: + # if filetype is user defined, convert to FileType filetype = FileType(filetype) - - if filetype == FileType.kerchunk: - from virtualizarr.readers.kerchunk import dataset_from_kerchunk_refs - - fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) - - # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. - if fs.filepath.endswith("ref.parquet"): - from fsspec.implementations.reference import LazyReferenceMapper - - lrm = LazyReferenceMapper(filepath, fs.fs) - - # build reference dict from KV pairs in LazyReferenceMapper - # is there a better / more preformant way to extract this? 
- array_refs = {k: lrm[k] for k in lrm.keys()} - - full_reference = {"refs": array_refs} - - return dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference)) - - # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': - # https://fsspec.github.io/kerchunk/spec.html - elif fs.read_bytes(9).startswith(b'{"version'): - import ujson - - with fs.open_file() as of: - refs = ujson.load(of) - - return dataset_from_kerchunk_refs(KerchunkStoreRefs(refs)) - - else: - raise ValueError( - "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" - ) - - if filetype == FileType.zarr_v3: - # TODO is there a neat way of auto-detecting this? - from virtualizarr.readers.zarr import open_virtual_dataset_from_v3_store - - return open_virtual_dataset_from_v3_store( - storepath=filepath, drop_variables=drop_variables, indexes=indexes - ) - elif filetype == FileType.dmrpp: - from virtualizarr.readers.dmrpp import DMRParser - - if loadable_variables != [] or indexes is None: - raise NotImplementedError( - "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." - ) - - fpath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).open_file() - parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp")) - vds = parser.parse_dataset() - vds.drop_vars(drop_variables) - return vds else: - # we currently read every other filetype using kerchunks various file format backends - from virtualizarr.readers.kerchunk import ( - fully_decode_arr_refs, - read_kerchunk_references_from_file, - virtual_vars_from_kerchunk_refs, - ) - - if reader_options is None: - reader_options = {} - - # this is the only place we actually always need to use kerchunk directly - # TODO avoid even reading byte ranges for variables that will be dropped later anyway? - vds_refs = read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - group=group, - reader_options=reader_options, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) - - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).open_file() - - # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. - # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. 
- - ds = xr.open_dataset( - cast(XArrayOpenT, fpath), - drop_variables=drop_variables, - group=group, - decode_times=decode_times, - ) - - if indexes is None: - warnings.warn( - "Specifying `indexes=None` will create in-memory pandas indexes for each 1D coordinate, but concatenation of ManifestArrays backed by pandas indexes is not yet supported (see issue #18)." - "You almost certainly want to pass `indexes={}` to `open_virtual_dataset` instead." - ) - - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - loadable_vars = { - str(name): var - for name, var in ds.variables.items() - if name in loadable_variables - } - - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() - else: - loadable_vars = {} - indexes = {} - - vars = {**virtual_vars, **loadable_vars} - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, + filetype = automatically_determine_filetype( + filepath=filepath, reader_options=reader_options ) - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened - - return vds - - -def separate_coords( - vars: Mapping[str, xr.Variable], - indexes: MutableMapping[str, Index], - coord_names: Iterable[str] | None = None, -) -> tuple[dict[str, xr.Variable], xr.Coordinates]: - """ - Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. - - Currently requires this function as a workaround unless xarray PR #8124 is merged. - - Will also preserve any loaded variables and indexes it is passed. - """ - - if coord_names is None: - coord_names = [] - - # split data and coordinate variables (promote dimension coordinates) - data_vars = {} - coord_vars: dict[ - str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable - ] = {} - for name, var in vars.items(): - if name in coord_names or var.dims == (name,): - # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 - if len(var.dims) == 1: - dim1d, *_ = var.dims - coord_vars[name] = (dim1d, var.data, var.attrs, var.encoding) + backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) - if isinstance(var, IndexVariable): - # unless variable actually already is a loaded IndexVariable, - # in which case we need to keep it and add the corresponding indexes explicitly - coord_vars[str(name)] = var - # TODO this seems suspect - will it handle datetimes? 
- indexes[name] = PandasIndex(var, dim1d) - else: - coord_vars[name] = var - else: - data_vars[name] = var + if backend_cls is None: + raise NotImplementedError(f"Unsupported file type: {filetype.name}") - coords = xr.Coordinates(coord_vars, indexes=indexes) + vds = backend_cls.open_virtual_dataset( + filepath, + group=group, + drop_variables=drop_variables, + loadable_variables=loadable_variables, + decode_times=decode_times, + indexes=indexes, + reader_options=reader_options, + ) - return data_vars, coords + return vds diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 5ac0aef0..179bcf1c 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,10 +3,13 @@ import numpy as np -from ..types.kerchunk import KerchunkArrRefs -from ..zarr import ZArray -from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan -from .manifest import ChunkManifest +from virtualizarr.manifests.array_api import ( + MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, + _isnan, +) +from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.types.kerchunk import KerchunkArrRefs +from virtualizarr.zarr import ZArray class ManifestArray: @@ -61,7 +64,7 @@ def __init__( @classmethod def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray": - from virtualizarr.readers.kerchunk import ( + from virtualizarr.translators.kerchunk import ( fully_decode_arr_refs, parse_array_refs, ) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 18f15933..f5cf220b 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -2,7 +2,7 @@ import numpy as np -from virtualizarr.zarr import Codec, ceildiv +from virtualizarr.zarr import Codec, determine_chunk_grid_shape from .manifest import ChunkManifest @@ -293,10 +293,7 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra ) # find new chunk grid shape by dividing new array shape by new chunk shape - new_chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in zip(new_shape, new_chunk_shape) - ) + new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape) # do broadcasting of entries in manifest broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype diff --git a/virtualizarr/readers/__init__.py b/virtualizarr/readers/__init__.py new file mode 100644 index 00000000..0f83ba39 --- /dev/null +++ b/virtualizarr/readers/__init__.py @@ -0,0 +1,17 @@ +from virtualizarr.readers.dmrpp import DMRPPVirtualBackend +from virtualizarr.readers.fits import FITSVirtualBackend +from virtualizarr.readers.hdf5 import HDF5VirtualBackend +from virtualizarr.readers.kerchunk import KerchunkVirtualBackend +from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend +from virtualizarr.readers.tiff import TIFFVirtualBackend +from virtualizarr.readers.zarr_v3 import ZarrV3VirtualBackend + +__all__ = [ + "DMRPPVirtualBackend", + "FITSVirtualBackend", + "HDF5VirtualBackend", + "KerchunkVirtualBackend", + "NetCDF3VirtualBackend", + "TIFFVirtualBackend", + "ZarrV3VirtualBackend", +] diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py new file mode 100644 index 00000000..54aedfe2 --- /dev/null +++ b/virtualizarr/readers/common.py @@ -0,0 +1,195 @@ +import os +import warnings +from abc import ABC +from collections.abc import Iterable, Mapping, MutableMapping +from io import 
BufferedIOBase +from typing import ( + TYPE_CHECKING, + Any, + Hashable, + Optional, + cast, +) + +import xarray as xr +from xarray import Dataset +from xarray.backends import AbstractDataStore, BackendArray +from xarray.core.indexes import Index, PandasIndex +from xarray.core.variable import IndexVariable, Variable + +from virtualizarr.manifests import ManifestArray +from virtualizarr.utils import _FsspecFSFromFilepath + +XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore + +if TYPE_CHECKING: + try: + from xarray import DataTree # type: ignore[attr-defined] + except ImportError: + DataTree = Any + + +class ManifestBackendArray(ManifestArray, BackendArray): + """Using this prevents xarray from wrapping the KerchunkArray in ExplicitIndexingAdapter etc.""" + + ... + + +def open_loadable_vars_and_indexes( + filepath: str, + loadable_variables, + reader_options, + drop_variables, + indexes, + group, + decode_times, +) -> tuple[Mapping[str, Variable], Mapping[str, Index]]: + """ + Open selected variables and indexes using xarray. + + Relies on xr.open_dataset and its auto-detection of filetypes to find the correct installed backend. + """ + + # TODO get rid of this if? + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + fpath = _FsspecFSFromFilepath( + filepath=filepath, reader_options=reader_options + ).open_file() + + # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. + # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. + + ds = xr.open_dataset( + cast(XArrayOpenT, fpath), + drop_variables=drop_variables, + group=group, + decode_times=decode_times, + ) + + if indexes is None: + warnings.warn( + "Specifying `indexes=None` will create in-memory pandas indexes for each 1D coordinate, but concatenation of ManifestArrays backed by pandas indexes is not yet supported (see issue #18)." + "You almost certainly want to pass `indexes={}` to `open_virtual_dataset` instead." 
+ ) + + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + # TODO we should drop these earlier by using drop_variables + loadable_vars = { + str(name): var + for name, var in ds.variables.items() + if name in loadable_variables + } + + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} + + return loadable_vars, indexes + + +def construct_virtual_dataset( + virtual_vars, + loadable_vars, + indexes, + coord_names, + attrs, +) -> Dataset: + """Construct a virtual Datset from consistuent parts.""" + + vars = {**virtual_vars, **loadable_vars} + + data_vars, coords = separate_coords(vars, indexes, coord_names) + + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=attrs, + ) + + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + + return vds + + +def separate_coords( + vars: Mapping[str, xr.Variable], + indexes: MutableMapping[str, Index], + coord_names: Iterable[str] | None = None, +) -> tuple[dict[str, xr.Variable], xr.Coordinates]: + """ + Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. + + Currently requires this function as a workaround unless xarray PR #8124 is merged. + + Will also preserve any loaded variables and indexes it is passed. + """ + + if coord_names is None: + coord_names = [] + + # split data and coordinate variables (promote dimension coordinates) + data_vars = {} + coord_vars: dict[ + str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable + ] = {} + for name, var in vars.items(): + if name in coord_names or var.dims == (name,): + # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 + if len(var.dims) == 1: + dim1d, *_ = var.dims + coord_vars[name] = (dim1d, var.data, var.attrs, var.encoding) + + if isinstance(var, IndexVariable): + # unless variable actually already is a loaded IndexVariable, + # in which case we need to keep it and add the corresponding indexes explicitly + coord_vars[str(name)] = var + # TODO this seems suspect - will it handle datetimes? 
+ indexes[name] = PandasIndex(var, dim1d) + else: + coord_vars[name] = var + else: + data_vars[name] = var + + coords = xr.Coordinates(coord_vars, indexes=indexes) + + return data_vars, coords + + +class VirtualBackend(ABC): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + raise NotImplementedError() + + @staticmethod + def open_virtual_datatree( + path: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> "DataTree": + raise NotImplementedError() diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py index fa66205a..766b1c62 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/readers/dmrpp.py @@ -2,18 +2,55 @@ import warnings from collections import defaultdict from collections.abc import Mapping -from typing import Any, Optional +from typing import Any, Iterable, Optional from xml.etree import ElementTree as ET import numpy as np -import xarray as xr +from xarray import Coordinates, Dataset from xarray.core.indexes import Index +from xarray.core.variable import Variable from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import VirtualBackend from virtualizarr.types import ChunkKey +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions from virtualizarr.zarr import ZArray +class DMRPPVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + loadable_variables, drop_variables = check_for_collisions( + drop_variables=drop_variables, + loadable_variables=loadable_variables, + ) + + if loadable_variables != [] or decode_times or indexes is None: + raise NotImplementedError( + "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." + ) + + if group: + raise NotImplementedError() + + fpath = _FsspecFSFromFilepath( + filepath=filepath, reader_options=reader_options + ).open_file() + + parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp")) + vds = parser.parse_dataset() + + return vds.drop_vars(drop_variables) + + class DMRParser: """ Parser for the OPeNDAP DMR++ XML format. @@ -69,9 +106,7 @@ def __init__(self, dmr: str, data_filepath: Optional[str] = None): data_filepath if data_filepath is not None else self.root.attrib["name"] ) - def parse_dataset( - self, group=None, indexes: Mapping[str, Index] = {} - ) -> xr.Dataset: + def parse_dataset(self, group=None, indexes: Mapping[str, Index] = {}) -> Dataset: """ Parses the given file and creates a virtual xr.Dataset with ManifestArrays. @@ -128,7 +163,7 @@ def _parse_netcdf4_dataset( root: ET.Element, group: Optional[str] = None, indexes: Mapping[str, Index] = {}, - ) -> xr.Dataset: + ) -> Dataset: """ Parse the dataset from the netcdf4 based dmrpp with groups, starting at the given group. 
Set root to the given group. @@ -201,7 +236,7 @@ def _parse_hdf5_dataset( root: ET.Element, group: Optional[str] = None, indexes: Mapping[str, Index] = {}, - ) -> xr.Dataset: + ) -> Dataset: """ Parse the dataset from the HDF5 based dmrpp with groups, starting at the given group. Set root to the given group. @@ -331,7 +366,7 @@ def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: def _parse_dataset( self, root: ET.Element, indexes: Mapping[str, Index] = {} - ) -> xr.Dataset: + ) -> Dataset: """ Parse the dataset using the root element of the DMR file. @@ -353,8 +388,8 @@ def _parse_dataset( if len(coord_names) == 0 or len(coord_names) < len(dataset_dims): coord_names = set(dataset_dims.keys()) # Seperate and parse coords + data variables - coord_vars: dict[str, xr.Variable] = {} - data_vars: dict[str, xr.Variable] = {} + coord_vars: dict[str, Variable] = {} + data_vars: dict[str, Variable] = {} for var_tag in self._find_var_tags(root): variable = self._parse_variable(var_tag, dataset_dims) if var_tag.attrib["name"] in coord_names: @@ -365,9 +400,9 @@ def _parse_dataset( attrs: dict[str, str] = {} for attr_tag in self.root.iterfind("dap:Attribute", self._ns): attrs.update(self._parse_attribute(attr_tag)) - return xr.Dataset( + return Dataset( data_vars=data_vars, - coords=xr.Coordinates(coords=coord_vars, indexes=indexes), + coords=Coordinates(coords=coord_vars, indexes=indexes), attrs=attrs, ) @@ -484,7 +519,7 @@ def _parse_multi_dims( def _parse_variable( self, var_tag: ET.Element, dataset_dims: dict[str, int] - ) -> xr.Variable: + ) -> Variable: """ Parse a variable from a DMR tag. @@ -542,7 +577,7 @@ def _parse_variable( ) marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest) encoding = {k: attrs.get(k) for k in self._encoding_keys if k in attrs} - return xr.Variable( + return Variable( dims=dim_shapes.keys(), data=marr, attrs=attrs, encoding=encoding ) diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py new file mode 100644 index 00000000..618d81cd --- /dev/null +++ b/virtualizarr/readers/fits.py @@ -0,0 +1,59 @@ +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.types.kerchunk import KerchunkStoreRefs + + +class FITSVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.fits import process_file + + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 + refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + # TODO this wouldn't work until you had an xarray backend for FITS installed + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + 
drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py new file mode 100644 index 00000000..c0d38e20 --- /dev/null +++ b/virtualizarr/readers/hdf5.py @@ -0,0 +1,64 @@ +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.utils import check_for_collisions + + +class HDF5VirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.hdf import SingleHdf5ToZarr + + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + refs = SingleHdf5ToZarr( + filepath, inline_threshold=0, **reader_options + ).translate() + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index a8740b19..35fa4932 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -1,323 +1,69 @@ -import warnings -from pathlib import Path -from typing import Any, MutableMapping, Optional, cast +from typing import Iterable, Mapping, Optional +import ujson from xarray import Dataset from xarray.core.indexes import Index -from xarray.core.variable import Variable -from virtualizarr.backend import FileType, separate_coords -from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import VirtualBackend +from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs from virtualizarr.types.kerchunk import ( - KerchunkArrRefs, KerchunkStoreRefs, ) -from virtualizarr.utils import _FsspecFSFromFilepath -from virtualizarr.zarr import ZArray, ZAttrs, ceildiv - - -# TODO shouldn't this live in backend.py? Because it's not just useful for the kerchunk-specific readers... -def _automatically_determine_filetype( - *, - filepath: str, - reader_options: Optional[dict[str, Any]] = {}, -) -> FileType: - if Path(filepath).suffix == ".zarr": - # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... 
- raise NotImplementedError() - - # Read magic bytes from local or remote file - fpath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).open_file() - magic_bytes = fpath.read(8) - fpath.close() - - if magic_bytes.startswith(b"CDF"): - filetype = FileType.netcdf3 - elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): - raise NotImplementedError("HDF4 formatted files not supported") - elif magic_bytes.startswith(b"\x89HDF"): - filetype = FileType.hdf5 - elif magic_bytes.startswith(b"GRIB"): - filetype = FileType.grib - elif magic_bytes.startswith(b"II*"): - filetype = FileType.tiff - elif magic_bytes.startswith(b"SIMPLE"): - filetype = FileType.fits - else: - raise NotImplementedError( - f"Unrecognised file based on header bytes: {magic_bytes}" +from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions + + +class KerchunkVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + """Reads existing kerchunk references (in JSON or parquet) format.""" + + if group: + raise NotImplementedError() + + loadable_variables, drop_variables = check_for_collisions( + drop_variables=drop_variables, + loadable_variables=loadable_variables, ) - return filetype + if loadable_variables or indexes or decode_times: + raise NotImplementedError() + fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) -def read_kerchunk_references_from_file( - filepath: str, - filetype: FileType | None, - group: str | None, - reader_options: Optional[dict[str, Any]] = None, -) -> KerchunkStoreRefs: - """ - Read a single legacy file and return kerchunk references to its contents. - - Parameters - ---------- - filepath : str, default: None - File path to open as a set of virtualized zarr arrays. - filetype : FileType, default: None - Type of file to be opened. Used to determine which kerchunk file format backend to use. - If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. - group : str, default is None - Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”. - Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, - so ensure reader_options match selected Kerchunk reader arguments. - """ - - if reader_options is None: - reader_options = {} - - if filetype is None: - filetype = _automatically_determine_filetype( - filepath=filepath, reader_options=reader_options - ) + # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable. 
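+        # Note: the check below is purely name-based: a path ending in "ref.parquet" is read
+        # through fsspec's LazyReferenceMapper, while anything else is sniffed for kerchunk
+        # JSON further down by peeking at the first bytes of the file.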
+ if fs.filepath.endswith("ref.parquet"): + from fsspec.implementations.reference import LazyReferenceMapper - # if filetype is user defined, convert to FileType - filetype = FileType(filetype) + lrm = LazyReferenceMapper(filepath, fs.fs) - if filetype.name.lower() == "netcdf3": - from kerchunk.netCDF3 import NetCDF3ToZarr - - refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() - - elif filetype.name.lower() == "hdf5" or filetype.name.lower() == "netcdf4": - from kerchunk.hdf import SingleHdf5ToZarr - - refs = SingleHdf5ToZarr( - filepath, inline_threshold=0, **reader_options - ).translate() - - refs = extract_group(refs, group) - - elif filetype.name.lower() == "grib": - # TODO Grib files should be handled as a DataTree object - # see https://github.com/TomNicholas/VirtualiZarr/issues/11 - raise NotImplementedError(f"Unsupported file type: {filetype}") - elif filetype.name.lower() == "tiff": - from kerchunk.tiff import tiff_to_zarr - - reader_options.pop("storage_options", {}) - warnings.warn( - "storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr", - UserWarning, - ) + # build reference dict from KV pairs in LazyReferenceMapper + # is there a better / more preformant way to extract this? + array_refs = {k: lrm[k] for k in lrm.keys()} - # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 - refs = {"refs": tiff_to_zarr(filepath, **reader_options)} - elif filetype.name.lower() == "fits": - from kerchunk.fits import process_file + full_reference = {"refs": array_refs} - # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 - refs = {"refs": process_file(filepath, **reader_options)} - else: - raise NotImplementedError(f"Unsupported file type: {filetype.name}") + vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference)) - # TODO validate the references that were read before returning? - return refs + # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': + # https://fsspec.github.io/kerchunk/spec.html + elif fs.read_bytes(9).startswith(b'{"version'): + with fs.open_file() as of: + refs = ujson.load(of) + vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs)) -def extract_group(vds_refs: KerchunkStoreRefs, group: str | None) -> KerchunkStoreRefs: - """Extract only the part of the kerchunk reference dict that is relevant to a single HDF group""" - hdf_groups = [ - k.removesuffix(".zgroup") for k in vds_refs["refs"].keys() if ".zgroup" in k - ] - if len(hdf_groups) == 1: - return vds_refs - else: - if group is None: + else: raise ValueError( - f"Multiple HDF Groups found. Must specify group= keyword to select one of {hdf_groups}" + "The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. 
The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues" ) - else: - # Ensure supplied group kwarg is consistent with kerchunk keys - if not group.endswith("/"): - group += "/" - if group.startswith("/"): - group = group.removeprefix("/") - - if group not in hdf_groups: - raise ValueError(f'Group "{group}" not found in {hdf_groups}') - - # Filter by group prefix and remove prefix from all keys - groupdict = { - k.removeprefix(group): v - for k, v in vds_refs["refs"].items() - if k.startswith(group) - } - # Also remove group prefix from _ARRAY_DIMENSIONS - for k, v in groupdict.items(): - if isinstance(v, str): - groupdict[k] = v.replace("\\/", "/").replace(group, "") - - vds_refs["refs"] = groupdict - - return KerchunkStoreRefs(vds_refs) - - -def virtual_vars_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] | None = None, - virtual_array_class=ManifestArray, -) -> dict[str, Variable]: - """ - Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. - - Parameters - ---------- - drop_variables: list[str], default is None - Variables in the file to drop before returning. - virtual_array_class - Virtual array class to use to represent the references to the chunks in each on-disk array. - Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. - """ - - var_names = find_var_names(refs) - if drop_variables is None: - drop_variables = [] - var_names_to_keep = [ - var_name for var_name in var_names if var_name not in drop_variables - ] - - vars = { - var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) - for var_name in var_names_to_keep - } - return vars - - -def dataset_from_kerchunk_refs( - refs: KerchunkStoreRefs, - drop_variables: list[str] = [], - virtual_array_class: type = ManifestArray, - indexes: MutableMapping[str, Index] | None = None, -) -> Dataset: - """ - Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. - - drop_variables: list[str], default is None - Variables in the file to drop before returning. - virtual_array_class - Virtual array class to use to represent the references to the chunks in each on-disk array. - Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. 
- """ - - vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) - ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) - coord_names = ds_attrs.pop("coordinates", []) - - if indexes is None: - indexes = {} - data_vars, coords = separate_coords(vars, indexes, coord_names) - - vds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return vds - - -def determine_chunk_grid_shape(zarray): - return tuple( - ceildiv(length, chunksize) - for length, chunksize in zip(zarray.shape, zarray.chunks) - ) - - -def variable_from_kerchunk_refs( - refs: KerchunkStoreRefs, var_name: str, virtual_array_class -) -> Variable: - """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" - - arr_refs = extract_array_refs(refs, var_name) - chunk_dict, zarray, zattrs = parse_array_refs(arr_refs) - # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs - dims = zattrs.pop("_ARRAY_DIMENSIONS") - if chunk_dict: - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) - varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) - elif len(zarray.shape) != 0: - # empty variables don't have physical chunks, but zarray shows that the variable - # is at least 1D - shape = determine_chunk_grid_shape(zarray) - manifest = ChunkManifest(entries={}, shape=shape) - varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) - else: - # This means we encountered a scalar variable of dimension 0, - # very likely that it actually has no numeric value and its only purpose - # is to communicate dataset attributes. - varr = zarray.fill_value - - return Variable(data=varr, dims=dims, attrs=zattrs) - - -def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: - """Find the names of zarr variables in this store/group.""" - - refs = ds_reference_dict["refs"] - found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key} - - return list(found_var_names) - - -def extract_array_refs( - ds_reference_dict: KerchunkStoreRefs, var_name: str -) -> KerchunkArrRefs: - """Extract only the part of the kerchunk reference dict that is relevant to this one zarr array""" - - found_var_names = find_var_names(ds_reference_dict) - - refs = ds_reference_dict["refs"] - if var_name in found_var_names: - # TODO these function probably have more loops in them than they need to... - - arr_refs = { - key.split("/")[1]: refs[key] - for key in refs.keys() - if var_name == key.split("/")[0] - } - - return fully_decode_arr_refs(arr_refs) - - else: - raise KeyError( - f"Could not find zarr array variable name {var_name}, only {found_var_names}" - ) - - -def parse_array_refs( - arr_refs: KerchunkArrRefs, -) -> tuple[dict, ZArray, ZAttrs]: - zarray = ZArray.from_kerchunk_refs(arr_refs.pop(".zarray")) - zattrs = arr_refs.pop(".zattrs", {}) - chunk_dict = arr_refs - - return chunk_dict, zarray, zattrs - - -def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs: - """ - Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 . 
- """ - import ujson - - sanitized = d.copy() - for k, v in d.items(): - if k.startswith("."): - # ensure contents of .zattrs and .zarray are python dictionaries - sanitized[k] = ujson.loads(v) - return cast(KerchunkArrRefs, sanitized) + # TODO would be more efficient to drop these before converting them into ManifestArrays, i.e. drop them from the kerchunk refs dict + return vds.drop_vars(drop_variables) diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py new file mode 100644 index 00000000..30c6746e --- /dev/null +++ b/virtualizarr/readers/netcdf3.py @@ -0,0 +1,62 @@ +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.utils import check_for_collisions + + +class NetCDF3VirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.netCDF3 import NetCDF3ToZarr + + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py new file mode 100644 index 00000000..bb32e647 --- /dev/null +++ b/virtualizarr/readers/tiff.py @@ -0,0 +1,73 @@ +import warnings +from typing import Iterable, Mapping, Optional + +from xarray import Dataset +from xarray.core.indexes import Index + +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + open_loadable_vars_and_indexes, +) +from virtualizarr.translators.kerchunk import ( + extract_group, + virtual_vars_and_metadata_from_kerchunk_refs, +) +from virtualizarr.types.kerchunk import KerchunkStoreRefs +from virtualizarr.utils import check_for_collisions + + +class TIFFVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + from kerchunk.tiff import tiff_to_zarr + + drop_variables, loadable_variables = check_for_collisions( + drop_variables=drop_variables, loadable_variables=loadable_variables + ) + + if reader_options is None: + reader_options = {} + + reader_options.pop("storage_options", {}) + 
warnings.warn( + "storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr", + UserWarning, + ) + + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 + refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)}) + + refs = extract_group(refs, group) + + virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs( + refs, + loadable_variables, + drop_variables, + ) + + loadable_vars, indexes = open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=drop_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_vars, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=attrs, + ) diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py deleted file mode 100644 index 168faa2b..00000000 --- a/virtualizarr/readers/zarr.py +++ /dev/null @@ -1,131 +0,0 @@ -import json -from pathlib import Path -from typing import Mapping - -import numcodecs -import numpy as np -from xarray import Dataset -from xarray.core.indexes import Index -from xarray.core.variable import Variable - -from virtualizarr.backend import separate_coords -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.zarr import ZArray - - -def open_virtual_dataset_from_v3_store( - storepath: str, - drop_variables: list[str] = [], - indexes: Mapping[str, Index] | None = None, -) -> Dataset: - """ - Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays. - """ - _storepath = Path(storepath) - - ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json") - coord_names = ds_attrs.pop("coordinates", []) - - # TODO recursive glob to create a datatree - # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it - # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 - all_paths = _storepath.glob("*/") - directory_paths = [p for p in all_paths if not p.is_file()] - - vars = {} - for array_dir in directory_paths: - var_name = array_dir.name - if var_name in drop_variables: - break - - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") - manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - - marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) - var = Variable(data=marr, dims=dim_names, attrs=attrs) - vars[var_name] = var - - if indexes is None: - raise NotImplementedError() - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - ds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return ds - - -def attrs_from_zarr_group_json(filepath: Path) -> dict: - with open(filepath) as metadata_file: - attrs = json.load(metadata_file) - return attrs["attributes"] - - -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: - with open(filepath) as metadata_file: - metadata = json.load(metadata_file) - - if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - }, - } not in 
metadata.get("storage_transformers", []): - raise ValueError( - "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." - ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) - shape = tuple(metadata["shape"]) - zarr_format = metadata["zarr_format"] - - if metadata["fill_value"] is None: - raise ValueError( - "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" - ) - else: - fill_value = metadata["fill_value"] - - all_codecs = [ - codec - for codec in metadata["codecs"] - if codec["name"] not in ("transpose", "bytes") - ] - compressor, *filters = [ - _configurable_to_num_codec_config(_filter) for _filter in all_codecs - ] - zarray = ZArray( - chunks=chunk_shape, - compressor=compressor, - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=filters or None, - order="C", - shape=shape, - zarr_format=zarr_format, - ) - - return zarray, dim_names, attrs - - -def _configurable_to_num_codec_config(configurable: dict) -> dict: - """ - Convert a zarr v3 configurable into a numcodecs codec. - """ - configurable_copy = configurable.copy() - codec_id = configurable_copy.pop("name") - configuration = configurable_copy.pop("configuration") - return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py new file mode 100644 index 00000000..6da81581 --- /dev/null +++ b/virtualizarr/readers/zarr_v3.py @@ -0,0 +1,154 @@ +import json +from pathlib import Path +from typing import Iterable, Mapping, Optional + +import numcodecs +import numpy as np +from xarray import Dataset +from xarray.core.indexes import Index +from xarray.core.variable import Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import VirtualBackend, separate_coords +from virtualizarr.zarr import ZArray + + +class ZarrV3VirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + """ + Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. + + This is experimental - chunk manifests are not part of the Zarr v3 Spec. 
+ """ + storepath = Path(filepath) + + if group: + raise NotImplementedError() + + if loadable_variables or decode_times: + raise NotImplementedError() + + if reader_options: + raise NotImplementedError() + + drop_vars: list[str] + if drop_variables is None: + drop_vars = [] + else: + drop_vars = list(drop_variables) + + ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") + coord_names = ds_attrs.pop("coordinates", []) + + # TODO recursive glob to create a datatree + # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it + # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 + all_paths = storepath.glob("*/") + directory_paths = [p for p in all_paths if not p.is_file()] + + vars = {} + for array_dir in directory_paths: + var_name = array_dir.name + if var_name in drop_vars: + break + + zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") + manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) + + marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) + var = Variable(data=marr, dims=dim_names, attrs=attrs) + vars[var_name] = var + + if indexes is None: + raise NotImplementedError() + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation + + data_vars, coords = separate_coords(vars, indexes, coord_names) + + ds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) + + return ds + + +def attrs_from_zarr_group_json(filepath: Path) -> dict: + with open(filepath) as metadata_file: + attrs = json.load(metadata_file) + return attrs["attributes"] + + +def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: + with open(filepath) as metadata_file: + metadata = json.load(metadata_file) + + if { + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + }, + } not in metadata.get("storage_transformers", []): + raise ValueError( + "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." + ) + + attrs = metadata.pop("attributes") + dim_names = metadata.pop("dimension_names") + + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) + zarr_format = metadata["zarr_format"] + + if metadata["fill_value"] is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + else: + fill_value = metadata["fill_value"] + + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] + zarray = ZArray( + chunks=chunk_shape, + compressor=compressor, + dtype=np.dtype(metadata["data_type"]), + fill_value=fill_value, + filters=filters or None, + order="C", + shape=shape, + zarr_format=zarr_format, + ) + + return zarray, dim_names, attrs + + +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. 
+ """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 81a23e0c..43a6bbd8 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -9,9 +9,8 @@ from xarray.core.indexes import Index from virtualizarr import open_virtual_dataset -from virtualizarr.backend import FileType +from virtualizarr.backend import FileType, automatically_determine_filetype from virtualizarr.manifests import ManifestArray -from virtualizarr.readers.kerchunk import _automatically_determine_filetype from virtualizarr.tests import ( has_astropy, has_tifffile, @@ -34,10 +33,10 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC") ds.to_netcdf(netcdf4_file_path, engine="h5netcdf") - assert FileType("netcdf3") == _automatically_determine_filetype( + assert FileType("netcdf3") == automatically_determine_filetype( filepath=netcdf3_file_path ) - assert FileType("hdf5") == _automatically_determine_filetype( + assert FileType("hdf5") == automatically_determine_filetype( filepath=netcdf4_file_path ) @@ -56,7 +55,7 @@ def test_valid_filetype_bytes(tmp_path, filetype, headerbytes): filepath = tmp_path / "file.abc" with open(filepath, "wb") as f: f.write(headerbytes) - assert FileType(filetype) == _automatically_determine_filetype(filepath=filepath) + assert FileType(filetype) == automatically_determine_filetype(filepath=filepath) def test_notimplemented_filetype(tmp_path): @@ -65,7 +64,7 @@ def test_notimplemented_filetype(tmp_path): with open(filepath, "wb") as f: f.write(headerbytes) with pytest.raises(NotImplementedError): - _automatically_determine_filetype(filepath=filepath) + automatically_determine_filetype(filepath=filepath) def test_FileType(): @@ -326,7 +325,8 @@ def test_group_kwarg(self, hdf5_groups_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - @patch("virtualizarr.readers.kerchunk.read_kerchunk_references_from_file") + @pytest.mark.xfail(reason="patches a function which no longer exists") + @patch("virtualizarr.translators.kerchunk.read_kerchunk_references_from_file") def test_open_virtual_dataset_passes_expected_args( self, mock_read_kerchunk, netcdf4_file ): diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 434d12d7..c9e3e302 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -5,11 +5,11 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.readers.kerchunk import ( +from virtualizarr.tests import requires_kerchunk +from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, find_var_names, ) -from virtualizarr.tests import requires_kerchunk from virtualizarr.zarr import ZArray diff --git a/virtualizarr/tests/test_writers/test_zarr.py b/virtualizarr/tests/test_writers/test_zarr.py index 278b2d78..67af401a 100644 --- a/virtualizarr/tests/test_writers/test_zarr.py +++ b/virtualizarr/tests/test_writers/test_zarr.py @@ -8,7 +8,7 @@ from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.backend import FileType from virtualizarr.manifests.manifest import ChunkManifest -from 
virtualizarr.readers.zarr import metadata_from_zarr_json +from virtualizarr.readers.zarr_v3 import metadata_from_zarr_json from virtualizarr.writers.zarr import dataset_to_zarr diff --git a/virtualizarr/translators/__init__.py b/virtualizarr/translators/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/virtualizarr/translators/kerchunk.py b/virtualizarr/translators/kerchunk.py new file mode 100644 index 00000000..f2d2f5df --- /dev/null +++ b/virtualizarr/translators/kerchunk.py @@ -0,0 +1,223 @@ +from typing import Any, Mapping, MutableMapping, cast + +from xarray import Dataset +from xarray.core.indexes import Index +from xarray.core.variable import Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.common import separate_coords +from virtualizarr.types.kerchunk import ( + KerchunkArrRefs, + KerchunkStoreRefs, +) +from virtualizarr.zarr import ZArray, ZAttrs, determine_chunk_grid_shape + + +def virtual_vars_and_metadata_from_kerchunk_refs( + vds_refs: KerchunkStoreRefs, + loadable_variables, + drop_variables, + virtual_array_class=ManifestArray, +) -> tuple[Mapping[str, Variable], dict[str, Any], list[str]]: + """ + Parses all useful information from a set kerchunk references (for a single group). + """ + + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + coord_names = ds_attrs.pop("coordinates", []) + + return virtual_vars, ds_attrs, coord_names + + +def extract_group(vds_refs: KerchunkStoreRefs, group: str | None) -> KerchunkStoreRefs: + """Extract only the part of the kerchunk reference dict that is relevant to a single HDF group""" + hdf_groups = [ + k.removesuffix(".zgroup") for k in vds_refs["refs"].keys() if ".zgroup" in k + ] + if len(hdf_groups) == 1: + return vds_refs + else: + if group is None: + raise ValueError( + f"Multiple HDF Groups found. Must specify group= keyword to select one of {hdf_groups}" + ) + else: + # Ensure supplied group kwarg is consistent with kerchunk keys + if not group.endswith("/"): + group += "/" + if group.startswith("/"): + group = group.removeprefix("/") + + if group not in hdf_groups: + raise ValueError(f'Group "{group}" not found in {hdf_groups}') + + # Filter by group prefix and remove prefix from all keys + groupdict = { + k.removeprefix(group): v + for k, v in vds_refs["refs"].items() + if k.startswith(group) + } + # Also remove group prefix from _ARRAY_DIMENSIONS + for k, v in groupdict.items(): + if isinstance(v, str): + groupdict[k] = v.replace("\\/", "/").replace(group, "") + + vds_refs["refs"] = groupdict + + return KerchunkStoreRefs(vds_refs) + + +def virtual_vars_from_kerchunk_refs( + refs: KerchunkStoreRefs, + drop_variables: list[str] | None = None, + virtual_array_class=ManifestArray, +) -> dict[str, Variable]: + """ + Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. + + Parameters + ---------- + drop_variables: list[str], default is None + Variables in the file to drop before returning. + virtual_array_class + Virtual array class to use to represent the references to the chunks in each on-disk array. + Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. 
+ """ + + var_names = find_var_names(refs) + if drop_variables is None: + drop_variables = [] + var_names_to_keep = [ + var_name for var_name in var_names if var_name not in drop_variables + ] + + vars = { + var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) + for var_name in var_names_to_keep + } + return vars + + +def dataset_from_kerchunk_refs( + refs: KerchunkStoreRefs, + drop_variables: list[str] = [], + virtual_array_class: type = ManifestArray, + indexes: MutableMapping[str, Index] | None = None, +) -> Dataset: + """ + Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. + + drop_variables: list[str], default is None + Variables in the file to drop before returning. + virtual_array_class + Virtual array class to use to represent the references to the chunks in each on-disk array. + Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + """ + + vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) + ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) + coord_names = ds_attrs.pop("coordinates", []) + + if indexes is None: + indexes = {} + data_vars, coords = separate_coords(vars, indexes, coord_names) + + vds = Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) + + return vds + + +def variable_from_kerchunk_refs( + refs: KerchunkStoreRefs, var_name: str, virtual_array_class +) -> Variable: + """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" + + arr_refs = extract_array_refs(refs, var_name) + chunk_dict, zarray, zattrs = parse_array_refs(arr_refs) + # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs + dims = zattrs.pop("_ARRAY_DIMENSIONS") + if chunk_dict: + manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + elif len(zarray.shape) != 0: + # empty variables don't have physical chunks, but zarray shows that the variable + # is at least 1D + shape = determine_chunk_grid_shape(zarray.shape, zarray.chunks) + manifest = ChunkManifest(entries={}, shape=shape) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + else: + # This means we encountered a scalar variable of dimension 0, + # very likely that it actually has no numeric value and its only purpose + # is to communicate dataset attributes. + varr = zarray.fill_value + + return Variable(data=varr, dims=dims, attrs=zattrs) + + +def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: + """Find the names of zarr variables in this store/group.""" + + refs = ds_reference_dict["refs"] + found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key} + + return list(found_var_names) + + +def extract_array_refs( + ds_reference_dict: KerchunkStoreRefs, var_name: str +) -> KerchunkArrRefs: + """Extract only the part of the kerchunk reference dict that is relevant to this one zarr array""" + + found_var_names = find_var_names(ds_reference_dict) + + refs = ds_reference_dict["refs"] + if var_name in found_var_names: + # TODO these function probably have more loops in them than they need to... 
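+        # store-level keys look like "<var>/.zarray", "<var>/.zattrs" or "<var>/0.0";
+        # the comprehension below keeps only this variable's keys and strips the
+        # "<var>/" prefix so the result holds plain array-level keys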
+ + arr_refs = { + key.split("/")[1]: refs[key] + for key in refs.keys() + if var_name == key.split("/")[0] + } + + return fully_decode_arr_refs(arr_refs) + + else: + raise KeyError( + f"Could not find zarr array variable name {var_name}, only {found_var_names}" + ) + + +def parse_array_refs( + arr_refs: KerchunkArrRefs, +) -> tuple[dict, ZArray, ZAttrs]: + zarray = ZArray.from_kerchunk_refs(arr_refs.pop(".zarray")) + zattrs = arr_refs.pop(".zattrs", {}) + chunk_dict = arr_refs + + return chunk_dict, zarray, zattrs + + +def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs: + """ + Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 . + """ + import ujson + + sanitized = d.copy() + for k, v in d.items(): + if k.startswith("."): + # ensure contents of .zattrs and .zarray are python dictionaries + sanitized[k] = ujson.loads(v) + + return cast(KerchunkArrRefs, sanitized) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 1721a3e7..c9260aa6 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -1,7 +1,7 @@ from __future__ import annotations import io -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Iterable, Optional, Union if TYPE_CHECKING: import fsspec.core @@ -61,3 +61,28 @@ def __post_init__(self) -> None: storage_options = self.reader_options.get("storage_options", {}) # type: ignore self.fs = fsspec.filesystem(protocol, **storage_options) + + +def check_for_collisions( + drop_variables: Iterable[str] | None, + loadable_variables: Iterable[str] | None, +) -> tuple[list[str], list[str]]: + if drop_variables is None: + drop_variables = [] + elif isinstance(drop_variables, str): + drop_variables = [drop_variables] + else: + drop_variables = list(drop_variables) + + if loadable_variables is None: + loadable_variables = [] + elif isinstance(loadable_variables, str): + loadable_variables = [loadable_variables] + else: + loadable_variables = list(loadable_variables) + + common = set(drop_variables).intersection(set(loadable_variables)) + if common: + raise ValueError(f"Cannot both load and drop variables {common}") + + return drop_variables, loadable_variables diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index cd83a67d..4b3fdd53 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -210,6 +210,12 @@ def ceildiv(a: int, b: int) -> int: return -(a // -b) +def determine_chunk_grid_shape( + shape: tuple[int, ...], chunks: tuple[int, ...] +) -> tuple[int, ...]: + return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) + + def _num_codec_config_to_configurable(num_codec: dict) -> dict: """ Convert a numcodecs codec into a zarr v3 configurable. 
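
The helpers added above (`check_for_collisions` in `virtualizarr/utils.py` and `determine_chunk_grid_shape` in `virtualizarr/zarr.py`) are small pure functions, so their behaviour is easy to illustrate. A minimal sketch, illustrative only, assuming the modules shown in the diff above are importable:

```python
from virtualizarr.utils import check_for_collisions
from virtualizarr.zarr import ceildiv, determine_chunk_grid_shape

# ceildiv is ceiling division: 10 elements split into chunks of 3 need 4 chunks
assert ceildiv(10, 3) == 4

# the chunk grid shape is ceildiv applied per axis:
# a (10, 25) array stored as (3, 10) chunks occupies a 4 x 3 grid of chunks
assert determine_chunk_grid_shape(shape=(10, 25), chunks=(3, 10)) == (4, 3)

# check_for_collisions normalises None/str inputs to lists and returns
# (drop_variables, loadable_variables); overlapping names raise a ValueError
drop, load = check_for_collisions(drop_variables=None, loadable_variables="time")
assert (drop, load) == ([], ["time"])
```
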
From b1ae3fae1d05eabdf44a4dc6893ffb81e6f00316 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 20 Oct 2024 08:47:48 -0600 Subject: [PATCH 24/29] [pre-commit.ci] pre-commit autoupdate (#250) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0) - [github.com/astral-sh/ruff-pre-commit: v0.6.3 → v0.6.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.3...v0.6.9) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tom Nicholas --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 803b7a78..3bae6a6c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ ci: autoupdate_schedule: monthly repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -11,7 +11,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: "v0.6.3" + rev: "v0.6.9" hooks: # Run the linter. - id: ruff From 4b7612e5649058475223cdfbb1bf50910b70ff5b Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sun, 20 Oct 2024 14:29:06 -0600 Subject: [PATCH 25/29] Add CI job for testing upstream versions of dependencies (#264) * add new CI workflow * add environment with various bleeding edge branches * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bad yaml formatting * install pip via conda and correct indentation again * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * formatting again * Note about kerchunk * Add comments about zarr-python v3 compatibility branches * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/upstream.yml | 60 ++++++++++++++++++++++++++++++++++ ci/upstream.yml | 30 +++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 .github/workflows/upstream.yml create mode 100644 ci/upstream.yml diff --git a/.github/workflows/upstream.yml b/.github/workflows/upstream.yml new file mode 100644 index 00000000..9140896b --- /dev/null +++ b/.github/workflows/upstream.yml @@ -0,0 +1,60 @@ +name: upstream + +on: + push: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + pull_request: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + + test: + name: ${{ matrix.python-version }}-build + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Setup micromamba + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: ci/upstream.yml + cache-environment: true + create-args: >- + python=${{matrix.python-version}} + + - name: Install virtualizarr + run: | + python -m pip install -e . 
--no-deps + - name: Conda list information + run: | + conda env list + conda list + + - name: Running Tests + run: | + python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose + + - name: Upload code coverage to Codecov + uses: codecov/codecov-action@v3.1.4 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false diff --git a/ci/upstream.yml b/ci/upstream.yml new file mode 100644 index 00000000..184c6710 --- /dev/null +++ b/ci/upstream.yml @@ -0,0 +1,30 @@ +name: virtualizarr-min-deps +channels: + - conda-forge + - nodefaults +dependencies: + - h5netcdf + - h5py + - hdf5 + - netcdf4 + - numpy>=2.0.0 + - packaging + - ujson + - universal_pathlib + # Testing + - codecov + - pre-commit + - mypy + - ruff + - pandas-stubs + - pytest-mypy + - pytest-cov + - pytest + - pooch + - fsspec + - pip + - pip: + - zarr==3.0.0b1 # beta release of zarr-python v3 + - git+https://github.com/pydata/xarray@zarr-v3 # zarr-v3 compatibility branch + - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs # zarr-v3 compatibility branch + # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) From 775c2c834274526c7a2f93e711bcdf5d70262166 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 22 Oct 2024 12:15:23 -0400 Subject: [PATCH 26/29] Add Icechunk Support (#256) * move vds_with_manifest_arrays fixture up * sketch implementation * test that we can create an icechunk store * fixture to create icechunk filestore in temporary directory * get the async fixture working properly * split into more functions * change mode * try creating zarr group and arrays explicitly * create root group from store * todos * do away with the async pytest fixtures/functions * successfully writes root group attrs * check array metadata is correct * try to write array attributes * sketch test for checking virtual references have been set correctly * test setting single virtual ref * use async properly * better separation of handling of loadable variables * fix chunk key format * use require_array * check that store supports writes * removed outdated note about awaiting * fix incorrect chunk key in test * absolute path * convert to file URI before handing to icechunk * test that without encoding we can definitely read one chunk * Work on encoding test * Update test to match * Quick comment * more comprehensive * add attrtirbute encoding * Fix array dimensions * Fix v3 codec pipeline * Put xarray dep back * Handle codecs, but get bad results * Gzip an d zlib are not directly working * Get up working with numcodecs zarr 3 codecs * Update codec pipeline * oUdpate to latest icechunk using sync api * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Some type stuff * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update zarr and icechunk tests, fix zarr v3 metadata * Update import we dont need * Update kerhcunk tests * Check for v3 metadata import in zarr test * More tests * type checker * types * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * More types * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ooops * One left * [pre-commit.ci] auto 
fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Finally done being dumb * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Support loadables without tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add test for multiple chunks to check order * Add loadable varaible test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add accessor, simple docs * Update icechunk.py Co-authored-by: Tom Nicholas * Update accessor.py Co-authored-by: Tom Nicholas * Fix attributes when loadables are available * Protect zarr import * Fix import errors in icechunk writer * More protection * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * i am bad at this * Add xarray roundtrip asserts * Add icechunk to api.rst * Update virtualizarr/tests/test_writers/test_icechunk.py Co-authored-by: Tom Nicholas * More test improvements, update realeses.rst * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tmore testing * Figure out tests for real this time * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: TomNicholas Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- ci/upstream.yml | 2 +- conftest.py | 15 + docs/api.rst | 1 + docs/releases.rst | 3 + docs/usage.md | 17 + virtualizarr/accessor.py | 18 ++ virtualizarr/readers/zarr_v3.py | 2 + virtualizarr/tests/test_integration.py | 2 +- .../tests/test_manifests/test_array.py | 2 +- .../tests/test_readers/test_kerchunk.py | 2 +- virtualizarr/tests/test_writers/conftest.py | 27 ++ .../tests/test_writers/test_icechunk.py | 290 ++++++++++++++++++ virtualizarr/tests/test_writers/test_zarr.py | 26 +- virtualizarr/writers/icechunk.py | 204 ++++++++++++ virtualizarr/writers/zarr.py | 3 +- virtualizarr/zarr.py | 67 ++-- 16 files changed, 622 insertions(+), 59 deletions(-) create mode 100644 virtualizarr/tests/test_writers/conftest.py create mode 100644 virtualizarr/tests/test_writers/test_icechunk.py create mode 100644 virtualizarr/writers/icechunk.py diff --git a/ci/upstream.yml b/ci/upstream.yml index 184c6710..2c2680bc 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -24,7 +24,7 @@ dependencies: - fsspec - pip - pip: - - zarr==3.0.0b1 # beta release of zarr-python v3 + - icechunk # Installs zarr v3 as dependency - git+https://github.com/pydata/xarray@zarr-v3 # zarr-v3 compatibility branch - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs # zarr-v3 compatibility branch # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) diff --git a/conftest.py b/conftest.py index 3af4bf06..810fd833 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,8 @@ import h5py +import numpy as np import pytest import xarray as xr +from xarray.core.variable import Variable def pytest_addoption(parser): @@ -96,3 +98,16 @@ def hdf5_scalar(tmpdir): dataset = f.create_dataset("scalar", data=0.1, dtype="float32") dataset.attrs["scalar"] = "true" return filepath + + +@pytest.fixture +def simple_netcdf4(tmpdir): + filepath = f"{tmpdir}/simple.nc" + + arr = np.arange(12, dtype=np.dtype("int32")).reshape(3, 4) + var = Variable(data=arr, dims=["x", "y"]) + ds = 
xr.Dataset({"foo": var}) + + ds.to_netcdf(filepath) + + return filepath diff --git a/docs/api.rst b/docs/api.rst index 81d08a77..755713d0 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -39,6 +39,7 @@ Serialization VirtualiZarrDatasetAccessor.to_kerchunk VirtualiZarrDatasetAccessor.to_zarr + VirtualiZarrDatasetAccessor.to_icechunk Rewriting diff --git a/docs/releases.rst b/docs/releases.rst index ee1ae402..93a5fec9 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -31,6 +31,9 @@ New Features - Support empty files (:pull:`260`) By `Justus Magin `_. +- Can write virtual datasets to Icechunk stores using `vitualize.to_icechunk` (:pull:`256`) + By `Matt Iannucci `_. + Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/docs/usage.md b/docs/usage.md index a0f9d058..30eab144 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -396,6 +396,23 @@ combined_ds = xr.open_dataset('combined.parq', engine="kerchunk") By default references are placed in separate parquet file when the total number of references exceeds `record_size`. If there are fewer than `categorical_threshold` unique urls referenced by a particular variable, url will be stored as a categorical variable. +### Writing to an Icechunk Store + +We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is a Open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`ds.virtualize.to_icechunk ` accessor method. + +```python +# create an icechunk store +from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig +storage = StorageConfig.filesystem(str('combined')) +store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig( + virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'), +)) + +combined_vds.virtualize.to_icechunk(store) +``` + +See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details. + ### Writing as Zarr Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr ` accessor method. diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py index cc251e63..336838f9 100644 --- a/virtualizarr/accessor.py +++ b/virtualizarr/accessor.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import ( + TYPE_CHECKING, Callable, Literal, overload, @@ -12,6 +13,9 @@ from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs from virtualizarr.writers.zarr import dataset_to_zarr +if TYPE_CHECKING: + from icechunk import IcechunkStore # type: ignore[import-not-found] + @register_dataset_accessor("virtualize") class VirtualiZarrDatasetAccessor: @@ -39,6 +43,20 @@ def to_zarr(self, storepath: str) -> None: """ dataset_to_zarr(self.ds, storepath) + def to_icechunk(self, store: "IcechunkStore") -> None: + """ + Write an xarray dataset to an Icechunk store. + + Any variables backed by ManifestArray objects will be be written as virtual references, any other variables will be loaded into memory before their binary chunk data is written into the store. 
+ + Parameters + ---------- + store: IcechunkStore + """ + from virtualizarr.writers.icechunk import dataset_to_icechunk + + dataset_to_icechunk(self.ds, store) + @overload def to_kerchunk( self, filepath: None, format: Literal["dict"] diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py index 6da81581..a1f4ab7d 100644 --- a/virtualizarr/readers/zarr_v3.py +++ b/virtualizarr/readers/zarr_v3.py @@ -150,5 +150,7 @@ def _configurable_to_num_codec_config(configurable: dict) -> dict: """ configurable_copy = configurable.copy() codec_id = configurable_copy.pop("name") + if codec_id.startswith("numcodecs."): + codec_id = codec_id[len("numcodecs.") :] configuration = configurable_copy.pop("configuration") return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index c9e3e302..09d0c0a8 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -27,7 +27,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): chunks=(2, 2), compressor=None, filters=None, - fill_value=np.nan, + fill_value=None, order="C", ), chunkmanifest=manifest, diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index f3a9ee9f..06e54d95 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -47,7 +47,7 @@ def test_create_manifestarray_from_kerchunk_refs(self): assert marr.chunks == (2, 3) assert marr.dtype == np.dtype("int64") assert marr.zarray.compressor is None - assert marr.zarray.fill_value is np.nan + assert marr.zarray.fill_value == 0 assert marr.zarray.filters is None assert marr.zarray.order == "C" diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py index 50d4b19b..f693b370 100644 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ b/virtualizarr/tests/test_readers/test_kerchunk.py @@ -37,7 +37,7 @@ def test_dataset_from_df_refs(): assert da.data.zarray.compressor is None assert da.data.zarray.filters is None - assert da.data.zarray.fill_value is np.nan + assert da.data.zarray.fill_value == 0 assert da.data.zarray.order == "C" assert da.data.manifest.dict() == { diff --git a/virtualizarr/tests/test_writers/conftest.py b/virtualizarr/tests/test_writers/conftest.py new file mode 100644 index 00000000..28c5b3db --- /dev/null +++ b/virtualizarr/tests/test_writers/conftest.py @@ -0,0 +1,27 @@ +import numpy as np +import pytest +from xarray import Dataset +from xarray.core.variable import Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray + + +@pytest.fixture +def vds_with_manifest_arrays() -> Dataset: + arr = ManifestArray( + chunkmanifest=ChunkManifest( + entries={"0.0": dict(path="/test.nc", offset=6144, length=48)} + ), + zarray=dict( + shape=(2, 3), + dtype=np.dtype(" "IcechunkStore": + from icechunk import IcechunkStore, StorageConfig + + storage = StorageConfig.filesystem(str(tmpdir)) + + # TODO if icechunk exposed a synchronous version of .open then we wouldn't need to use asyncio.run here + # TODO is this the correct mode to use? + store = IcechunkStore.create(storage=storage, mode="w") + + # TODO instead yield store then store.close() ?? 
+ return store + + +def test_write_new_virtual_variable( + icechunk_filestore: "IcechunkStore", vds_with_manifest_arrays: Dataset +): + vds = vds_with_manifest_arrays + + dataset_to_icechunk(vds, icechunk_filestore) + + # check attrs + root_group = group(store=icechunk_filestore) + assert isinstance(root_group, Group) + assert root_group.attrs == {"something": 0} + + # TODO check against vds, then perhaps parametrize? + + # check array exists + assert "a" in root_group + arr = root_group["a"] + assert isinstance(arr, Array) + + # check array metadata + # TODO why doesn't a .zarr_format or .version attribute exist on zarr.Array? + # assert arr.zarr_format == 3 + assert arr.shape == (2, 3) + assert arr.chunks == (2, 3) + assert arr.dtype == np.dtype(" Dataset: - arr = ManifestArray( - chunkmanifest=ChunkManifest( - entries={"0.0": dict(path="test.nc", offset=6144, length=48)} - ), - zarray=dict( - shape=(2, 3), - dtype=np.dtype(" bool: """ Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py new file mode 100644 index 00000000..6dadbc08 --- /dev/null +++ b/virtualizarr/writers/icechunk.py @@ -0,0 +1,204 @@ +from typing import TYPE_CHECKING, cast + +import numpy as np +from xarray import Dataset +from xarray.backends.zarr import encode_zarr_attr_value +from xarray.core.variable import Variable + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.zarr import encode_dtype + +if TYPE_CHECKING: + from icechunk import IcechunkStore # type: ignore[import-not-found] + from zarr import Group # type: ignore + + +VALID_URI_PREFIXES = { + "s3://", + # "gs://", # https://github.com/earth-mover/icechunk/issues/265 + # "azure://", # https://github.com/earth-mover/icechunk/issues/266 + # "r2://", + # "cos://", + # "minio://", + "file:///", +} + + +def dataset_to_icechunk(ds: Dataset, store: "IcechunkStore") -> None: + """ + Write an xarray dataset whose variables wrap ManifestArrays to an Icechunk store. + + Currently requires all variables to be backed by ManifestArray objects. + + Parameters + ---------- + ds: xr.Dataset + store: IcechunkStore + """ + try: + from icechunk import IcechunkStore # type: ignore[import-not-found] + from zarr import Group # type: ignore[import-untyped] + except ImportError: + raise ImportError( + "The 'icechunk' and 'zarr' version 3 libraries are required to use this function" + ) + + if not isinstance(store, IcechunkStore): + raise TypeError(f"expected type IcechunkStore, but got type {type(store)}") + + if not store.supports_writes: + raise ValueError("supplied store does not support writes") + + # TODO only supports writing to the root group currently + # TODO pass zarr_format kwarg? 
+ root_group = Group.from_store(store=store) + + # TODO this is Frozen, the API for setting attributes must be something else + # root_group.attrs = ds.attrs + # for k, v in ds.attrs.items(): + # root_group.attrs[k] = encode_zarr_attr_value(v) + + return write_variables_to_icechunk_group( + ds.variables, + ds.attrs, + store=store, + group=root_group, + ) + + +def write_variables_to_icechunk_group( + variables, + attrs, + store, + group, +): + virtual_variables = { + name: var + for name, var in variables.items() + if isinstance(var.data, ManifestArray) + } + + loadable_variables = { + name: var for name, var in variables.items() if name not in virtual_variables + } + + # First write all the non-virtual variables + # NOTE: We set the attributes of the group before writing the dataset because the dataset + # will overwrite the root group's attributes with the dataset's attributes. We take advantage + # of xarrays zarr integration to ignore having to format the attributes ourselves. + ds = Dataset(loadable_variables, attrs=attrs) + ds.to_zarr(store, zarr_format=3, consolidated=False, mode="a") + + # Then finish by writing the virtual variables to the same group + for name, var in virtual_variables.items(): + write_virtual_variable_to_icechunk( + store=store, + group=group, + name=name, + var=var, + ) + + +def write_variable_to_icechunk( + store: "IcechunkStore", + group: "Group", + name: str, + var: Variable, +) -> None: + """Write a single (possibly virtual) variable into an icechunk store""" + if isinstance(var.data, ManifestArray): + write_virtual_variable_to_icechunk( + store=store, + group=group, + name=name, + var=var, + ) + else: + raise ValueError( + "Cannot write non-virtual variables as virtual variables to Icechunk stores" + ) + + +def write_virtual_variable_to_icechunk( + store: "IcechunkStore", + group: "Group", + name: str, + var: Variable, +) -> None: + """Write a single virtual variable into an icechunk store""" + ma = cast(ManifestArray, var.data) + zarray = ma.zarray + + # creates array if it doesn't already exist + arr = group.require_array( + name=name, + shape=zarray.shape, + chunk_shape=zarray.chunks, + dtype=encode_dtype(zarray.dtype), + codecs=zarray._v3_codec_pipeline(), + dimension_names=var.dims, + fill_value=zarray.fill_value, + # TODO fill_value? 
+ ) + + # TODO it would be nice if we could assign directly to the .attrs property + for k, v in var.attrs.items(): + arr.attrs[k] = encode_zarr_attr_value(v) + arr.attrs["_ARRAY_DIMENSIONS"] = encode_zarr_attr_value(var.dims) + + _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + for k, v in var.encoding.items(): + if k in _encoding_keys: + arr.attrs[k] = encode_zarr_attr_value(v) + + write_manifest_virtual_refs( + store=store, + group=group, + arr_name=name, + manifest=ma.manifest, + ) + + +def write_manifest_virtual_refs( + store: "IcechunkStore", + group: "Group", + arr_name: str, + manifest: ChunkManifest, +) -> None: + """Write all the virtual references for one array manifest at once.""" + + key_prefix = f"{group.name}{arr_name}" + + # loop over every reference in the ChunkManifest for that array + # TODO inefficient: this should be replaced with something that sets all (new) references for the array at once + # but Icechunk need to expose a suitable API first + it = np.nditer( + [manifest._paths, manifest._offsets, manifest._lengths], # type: ignore[arg-type] + flags=[ + "refs_ok", + "multi_index", + "c_index", + ], + op_flags=[["readonly"]] * 3, # type: ignore + ) + for path, offset, length in it: + index = it.multi_index + chunk_key = "/".join(str(i) for i in index) + + # set each reference individually + store.set_virtual_ref( + # TODO it would be marginally neater if I could pass the group and name as separate args + key=f"{key_prefix}/c/{chunk_key}", # should be of form 'group/arr_name/c/0/1/2', where c stands for chunks + location=as_file_uri(path.item()), + offset=offset.item(), + length=length.item(), + ) + + +def as_file_uri(path): + # TODO a more robust solution to this requirement exists in https://github.com/zarr-developers/VirtualiZarr/pull/243 + if not any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES) and path != "": + # assume path is local + return f"file://{path}" + else: + return path diff --git a/virtualizarr/writers/zarr.py b/virtualizarr/writers/zarr.py index b3dc8f1a..b9529ad5 100644 --- a/virtualizarr/writers/zarr.py +++ b/virtualizarr/writers/zarr.py @@ -80,7 +80,6 @@ def to_zarr_json(var: Variable, array_dir: Path) -> None: def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> dict: """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us - metadata = zarray.dict() # adjust to match v3 spec @@ -95,7 +94,7 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> "name": "default", "configuration": {"separator": "/"}, } - metadata["codecs"] = zarray._v3_codec_pipeline() + metadata["codecs"] = tuple(c.to_dict() for c in zarray._v3_codec_pipeline()) metadata.pop("filters") metadata.pop("compressor") metadata.pop("order") diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 4b3fdd53..e339a3f4 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -72,8 +72,11 @@ def codec(self) -> Codec: @classmethod def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": # coerce type of fill_value as kerchunk can be inconsistent with this + dtype = np.dtype(decoded_arr_refs_zarray["dtype"]) fill_value = decoded_arr_refs_zarray["fill_value"] - if fill_value is None or fill_value == "NaN" or fill_value == "nan": + if np.issubdtype(dtype, np.floating) and ( + fill_value is None or fill_value == 
"NaN" or fill_value == "nan" + ): fill_value = np.nan compressor = decoded_arr_refs_zarray["compressor"] @@ -84,7 +87,7 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), compressor=compressor, - dtype=np.dtype(decoded_arr_refs_zarray["dtype"]), + dtype=dtype, fill_value=fill_value, filters=decoded_arr_refs_zarray["filters"], order=decoded_arr_refs_zarray["order"], @@ -140,7 +143,7 @@ def replace( replacements["zarr_format"] = zarr_format return dataclasses.replace(self, **replacements) - def _v3_codec_pipeline(self) -> list: + def _v3_codec_pipeline(self) -> Any: """ VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects. @@ -153,46 +156,46 @@ def _v3_codec_pipeline(self) -> list: post_compressor: Iterable[BytesBytesCodec] #optional ``` """ - import numcodecs + try: + from zarr.core.metadata.v3 import ( # type: ignore[import-untyped] + parse_codecs, + ) + except ImportError: + raise ImportError("zarr v3 is required to generate v3 codec pipelines") - if self.filters: - filter_codecs_configs = [ - numcodecs.get_codec(filter).get_config() for filter in self.filters - ] - filters = [ - dict(name=codec.pop("id"), configuration=codec) - for codec in filter_codecs_configs - ] - else: - filters = [] - - # Noting here that zarr v3 has very few codecs specificed in the official spec, - # and that there are far more codecs in `numcodecs`. We take a gamble and assume - # that the codec names and configuration are simply mapped into zarrv3 "configurables". - if self.compressor: - compressor = [_num_codec_config_to_configurable(self.compressor)] - else: - compressor = [] + codec_configs = [] # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1 # Either "C" or "F", defining the layout of bytes within each chunk of the array. # "C" means row-major order, i.e., the last dimension varies fastest; # "F" means column-major order, i.e., the first dimension varies fastest. - if self.order == "C": - order = tuple(range(len(self.shape))) - elif self.order == "F": + # For now, we only need transpose if the order is not "C" + if self.order == "F": order = tuple(reversed(range(len(self.shape)))) + transpose = dict(name="transpose", configuration=dict(order=order)) + codec_configs.append(transpose) - transpose = dict(name="transpose", configuration=dict(order=order)) # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" bytes = dict( name="bytes", configuration={} ) # TODO need to handle endianess configuration + codec_configs.append(bytes) + + # Noting here that zarr v3 has very few codecs specificed in the official spec, + # and that there are far more codecs in `numcodecs`. We take a gamble and assume + # that the codec names and configuration are simply mapped into zarrv3 "configurables". + if self.filters: + codec_configs.extend( + [_num_codec_config_to_configurable(filter) for filter in self.filters] + ) + + if self.compressor: + codec_configs.append(_num_codec_config_to_configurable(self.compressor)) + + # convert the pipeline repr into actual v3 codec objects + codec_pipeline = parse_codecs(codec_configs) - # The order here is significant! 
- # [ArrayArray] -> ArrayBytes -> [BytesBytes] - codec_pipeline = [transpose, bytes] + compressor + filters return codec_pipeline @@ -220,5 +223,9 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict: """ Convert a numcodecs codec into a zarr v3 configurable. """ + if num_codec["id"].startswith("numcodecs."): + return num_codec + num_codec_copy = num_codec.copy() - return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy} + name = "numcodecs." + num_codec_copy.pop("id") + return {"name": name, "configuration": num_codec_copy} From 534ae0174474b2948b802e2411ab9e2456ecdbb4 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 22 Oct 2024 14:39:49 -0600 Subject: [PATCH 27/29] FAQ updates (#266) * faq question about already having kerchunked data * note compatibility with icechunk * move more basic usage questions to the bottom * q about custom readers * split API into User API and Developer API * note about manifest classes --- docs/api.rst | 41 +++++++++++++++++++++++++---------------- docs/faq.md | 28 ++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 755713d0..fef8f2f0 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -5,21 +5,13 @@ API Reference .. currentmodule:: virtualizarr VirtualiZarr has a small API surface, because most of the complexity is handled by xarray functions like ``xarray.concat`` and ``xarray.merge``. +Users can use xarray for every step apart from reading and serializing virtual references. -Manifests -========= - -.. currentmodule:: virtualizarr.manifests -.. autosummary:: - :nosignatures: - :toctree: generated/ - - ChunkManifest - ManifestArray - +User API +======== Reading -======= +------- .. currentmodule:: virtualizarr.backend .. autosummary:: @@ -30,7 +22,7 @@ Reading Serialization -============= +------------- .. currentmodule:: virtualizarr.accessor .. autosummary:: @@ -41,9 +33,8 @@ Serialization VirtualiZarrDatasetAccessor.to_zarr VirtualiZarrDatasetAccessor.to_icechunk - Rewriting -============= +--------- .. currentmodule:: virtualizarr.accessor .. autosummary:: @@ -52,9 +43,27 @@ Rewriting VirtualiZarrDatasetAccessor.rename_paths +Developer API +============= + +If you want to write a new reader to create virtual references pointing to a custom file format, you will need to use VirtualiZarr's internal classes. + +Manifests +--------- + +VirtualiZarr uses these classes to store virtual references internally. + +.. currentmodule:: virtualizarr.manifests +.. autosummary:: + :nosignatures: + :toctree: generated/ + + ChunkManifest + ManifestArray + Array API -========= +--------- VirtualiZarr's :py:class:`~virtualizarr.ManifestArray` objects support a limited subset of the Python Array API standard in :py:mod:`virtualizarr.manifests.array_api`. diff --git a/docs/faq.md b/docs/faq.md index d273a529..81f55aa3 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -68,3 +68,31 @@ We have a lot of ideas, including: - [Generating references without kerchunk](https://github.com/zarr-developers/VirtualiZarr/issues/78) If you see other opportunities then we would love to hear your ideas! + +## Is this compatible with Icechunk? + +Yes! VirtualiZarr allows you to ingest data as virtual references and write those references into an Icechunk Store. See the [Icechunk documentation on creating virtaul datasets.](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) + +## I already have Kerchunked data, do I have to redo that work? 
+ +No - you can simply open the Kerchunk-formatted references you already have into VirtualiZarr directly. Then you can re-save them into a new format, e.g. [Icechunk](https://icechunk.io/) like so: + +```python +from virtualizarr import open_virtual_dataset + +vds = open_virtual_dataset('refs.json') +# vds = open_virtual_dataset('refs.parq') # kerchunk parquet files are supported too + +vds.virtualize.to_icechunk(icechunkstore) +``` + +## Can I add a new reader for my custom file format? + +There are a lot of legacy file formats which could potentially be represented as virtual zarr references (see [this issue](https://github.com/zarr-developers/VirtualiZarr/issues/218) for some examples). VirtualiZarr ships with some readers for common formats (e.g. netCDF/HDF5), but you may want to write your own reader for some other file format. + +VirtualiZarr is designed in a way to make this as straightforward as possible. If you want to do this then [this comment](https://github.com/zarr-developers/VirtualiZarr/issues/262#issuecomment-2429968244 +) will be helpful. + +You can also use this approach to write a reader that starts from a kerchunk-formatted virtual references dict. + +Currently if you want to call your new reader from `virtualizarr.open_virtual_dataset` you would need to open a PR to this repository, but we plan to generalize this system to allow 3rd party libraries to plug in via an entrypoint (see [issue #245](https://github.com/zarr-developers/VirtualiZarr/issues/245)). From dacdd67dac05241d6e8a894e510d8b90c9dad560 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 22 Oct 2024 17:32:44 -0600 Subject: [PATCH 28/29] Link to stable version of docs instead of latest --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f415b356..dc581297 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ VirtualiZarr (pronounced like "virtualize" but more piratey) grew out of [discus You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. 
-_Please see the [documentation](https://virtualizarr.readthedocs.io/en/latest/)_ +_Please see the [documentation](https://virtualizarr.readthedocs.io/en/stable/api.html)_ ### Development Status and Roadmap From fffdc2d831526d4bb2926d253a7915262d0ee827 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 22 Oct 2024 17:52:29 -0600 Subject: [PATCH 29/29] import top-level version of xarray classes (#267) --- virtualizarr/backend.py | 3 +-- virtualizarr/readers/common.py | 25 +++++++++++++++---------- virtualizarr/readers/dmrpp.py | 4 +--- virtualizarr/readers/fits.py | 3 +-- virtualizarr/readers/hdf5.py | 3 +-- virtualizarr/readers/kerchunk.py | 3 +-- virtualizarr/readers/netcdf3.py | 3 +-- virtualizarr/readers/tiff.py | 3 +-- virtualizarr/readers/zarr_v3.py | 4 +--- 9 files changed, 23 insertions(+), 28 deletions(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 0322f604..32403d04 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -7,8 +7,7 @@ Optional, ) -from xarray import Dataset -from xarray.core.indexes import Index +from xarray import Dataset, Index from virtualizarr.manifests import ManifestArray from virtualizarr.readers import ( diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 54aedfe2..9be2b45f 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -11,11 +11,16 @@ cast, ) -import xarray as xr -from xarray import Dataset +from xarray import ( + Coordinates, + Dataset, + Index, + IndexVariable, + Variable, + open_dataset, +) from xarray.backends import AbstractDataStore, BackendArray -from xarray.core.indexes import Index, PandasIndex -from xarray.core.variable import IndexVariable, Variable +from xarray.core.indexes import PandasIndex from virtualizarr.manifests import ManifestArray from virtualizarr.utils import _FsspecFSFromFilepath @@ -62,7 +67,7 @@ def open_loadable_vars_and_indexes( # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. - ds = xr.open_dataset( + ds = open_dataset( cast(XArrayOpenT, fpath), drop_variables=drop_variables, group=group, @@ -113,7 +118,7 @@ def construct_virtual_dataset( data_vars, coords = separate_coords(vars, indexes, coord_names) - vds = xr.Dataset( + vds = Dataset( data_vars, coords=coords, # indexes={}, # TODO should be added in a later version of xarray @@ -126,10 +131,10 @@ def construct_virtual_dataset( def separate_coords( - vars: Mapping[str, xr.Variable], + vars: Mapping[str, Variable], indexes: MutableMapping[str, Index], coord_names: Iterable[str] | None = None, -) -> tuple[dict[str, xr.Variable], xr.Coordinates]: +) -> tuple[dict[str, Variable], Coordinates]: """ Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates. 
@@ -144,7 +149,7 @@ def separate_coords( # split data and coordinate variables (promote dimension coordinates) data_vars = {} coord_vars: dict[ - str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable + str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | Variable ] = {} for name, var in vars.items(): if name in coord_names or var.dims == (name,): @@ -164,7 +169,7 @@ def separate_coords( else: data_vars[name] = var - coords = xr.Coordinates(coord_vars, indexes=indexes) + coords = Coordinates(coord_vars, indexes=indexes) return data_vars, coords diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py index 766b1c62..c9095c7e 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/readers/dmrpp.py @@ -6,9 +6,7 @@ from xml.etree import ElementTree as ET import numpy as np -from xarray import Coordinates, Dataset -from xarray.core.indexes import Index -from xarray.core.variable import Variable +from xarray import Coordinates, Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.readers.common import VirtualBackend diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py index 618d81cd..de93bc1f 100644 --- a/virtualizarr/readers/fits.py +++ b/virtualizarr/readers/fits.py @@ -1,7 +1,6 @@ from typing import Iterable, Mapping, Optional -from xarray import Dataset -from xarray.core.indexes import Index +from xarray import Dataset, Index from virtualizarr.readers.common import ( VirtualBackend, diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py index c0d38e20..91e5b6f9 100644 --- a/virtualizarr/readers/hdf5.py +++ b/virtualizarr/readers/hdf5.py @@ -1,7 +1,6 @@ from typing import Iterable, Mapping, Optional -from xarray import Dataset -from xarray.core.indexes import Index +from xarray import Dataset, Index from virtualizarr.readers.common import ( VirtualBackend, diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 35fa4932..2f1ff4b2 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -1,8 +1,7 @@ from typing import Iterable, Mapping, Optional import ujson -from xarray import Dataset -from xarray.core.indexes import Index +from xarray import Dataset, Index from virtualizarr.readers.common import VirtualBackend from virtualizarr.translators.kerchunk import dataset_from_kerchunk_refs diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py index 30c6746e..25f212ca 100644 --- a/virtualizarr/readers/netcdf3.py +++ b/virtualizarr/readers/netcdf3.py @@ -1,7 +1,6 @@ from typing import Iterable, Mapping, Optional -from xarray import Dataset -from xarray.core.indexes import Index +from xarray import Dataset, Index from virtualizarr.readers.common import ( VirtualBackend, diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py index bb32e647..d9c440ba 100644 --- a/virtualizarr/readers/tiff.py +++ b/virtualizarr/readers/tiff.py @@ -1,8 +1,7 @@ import warnings from typing import Iterable, Mapping, Optional -from xarray import Dataset -from xarray.core.indexes import Index +from xarray import Dataset, Index from virtualizarr.readers.common import ( VirtualBackend, diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py index a1f4ab7d..4a867ffb 100644 --- a/virtualizarr/readers/zarr_v3.py +++ b/virtualizarr/readers/zarr_v3.py @@ -4,9 +4,7 @@ import numcodecs import numpy as np -from xarray import Dataset -from xarray.core.indexes import Index 
-from xarray.core.variable import Variable
+from xarray import Dataset, Index, Variable

 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.readers.common import VirtualBackend, separate_coords
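
With the Icechunk writer from PATCH 26/29 in place, the pieces above can be strung together end to end: the `open_virtual_dataset` entry point, the `StorageConfig`/`IcechunkStore` setup shown in `docs/usage.md` and the test fixtures, and the `virtualize.to_icechunk` accessor. The sketch below is illustrative only — the netCDF filename and store directory are hypothetical, and it assumes a `virtualizarr` install with these patches applied plus the `icechunk`/zarr-v3 stack from `ci/upstream.yml`.

```python
from icechunk import IcechunkStore, StorageConfig

from virtualizarr import open_virtual_dataset

# Build a virtual dataset: chunk references into the original file, no data copied.
# "combined.nc" is a placeholder for a real netCDF/HDF5 file on disk.
vds = open_virtual_dataset("combined.nc")

# Create a writable, filesystem-backed Icechunk store
# ("./combined-icechunk" is likewise a placeholder path).
storage = StorageConfig.filesystem("./combined-icechunk")
store = IcechunkStore.create(storage=storage, mode="w")

# Write the virtual references; local manifest paths are converted to
# file:// URIs by the writer before being handed to Icechunk.
vds.virtualize.to_icechunk(store)
```

PATCH 26/29 also reworks `ZArray._v3_codec_pipeline()` to map numcodecs v2-style codec configs onto zarr v3 "configurables" via `_num_codec_config_to_configurable`. A minimal illustration of that mapping, calling the private helper exactly as defined in `virtualizarr/zarr.py` above (the zlib config is an arbitrary example):

```python
from virtualizarr.zarr import _num_codec_config_to_configurable

# v2-style numcodecs config, e.g. a ZArray.compressor value
v2_config = {"id": "zlib", "level": 1}

# -> {"name": "numcodecs.zlib", "configuration": {"level": 1}}
print(_num_codec_config_to_configurable(v2_config))
```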