From fd05b7d932b1d40b3481c55b0c123a5cb381c9f6 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 23 Nov 2024 11:26:40 -0700 Subject: [PATCH 01/10] Expand xarray openable type hint with ReadBuffer (#316) * expand xarray openable type hint with ReadBuffer * try making type hint version dependent * fix import * remove ReadBuffer from older type * just give up and use type ignore --- virtualizarr/readers/common.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 1ad24629..40f61497 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -1,15 +1,13 @@ -import os import warnings from abc import ABC from collections.abc import Iterable, Mapping, MutableMapping -from io import BufferedIOBase from typing import ( Any, Hashable, Optional, - cast, ) +import xarray # noqa from xarray import ( Coordinates, Dataset, @@ -19,13 +17,10 @@ Variable, open_dataset, ) -from xarray.backends import AbstractDataStore from xarray.core.indexes import PandasIndex from virtualizarr.utils import _FsspecFSFromFilepath -XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore - def open_loadable_vars_and_indexes( filepath: str, @@ -52,10 +47,8 @@ def open_loadable_vars_and_indexes( ).open_file() # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. - # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through. - ds = open_dataset( - cast(XArrayOpenT, fpath), + fpath, # type: ignore[arg-type] drop_variables=drop_variables, group=group, decode_times=decode_times, From 2d2459b60bf4d4257ed54c6e65d6dca865343881 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 23 Nov 2024 11:54:49 -0700 Subject: [PATCH 02/10] Consolidate hdf reader tests into their own tests module (#314) --- virtualizarr/tests/test_readers/{ => test_hdf}/test_hdf.py | 0 .../tests/test_readers/{ => test_hdf}/test_hdf_filters.py | 0 .../tests/test_readers/{ => test_hdf}/test_hdf_integration.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename virtualizarr/tests/test_readers/{ => test_hdf}/test_hdf.py (100%) rename virtualizarr/tests/test_readers/{ => test_hdf}/test_hdf_filters.py (100%) rename virtualizarr/tests/test_readers/{ => test_hdf}/test_hdf_integration.py (100%) diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py similarity index 100% rename from virtualizarr/tests/test_readers/test_hdf.py rename to virtualizarr/tests/test_readers/test_hdf/test_hdf.py diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_filters.py similarity index 100% rename from virtualizarr/tests/test_readers/test_hdf_filters.py rename to virtualizarr/tests/test_readers/test_hdf/test_hdf_filters.py diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py similarity index 100% rename from virtualizarr/tests/test_readers/test_hdf_integration.py rename to virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py From 3d7a4be71803967fa8b65b311123eff6f629bd54 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 25 Nov 2024 13:51:52 -0700 Subject: [PATCH 03/10] Refactor kerchunk reader tests to call open_virtual_dataset (#317) * use a fixture which generates json files * refactor last test to use open_virtual_dataset --- .../tests/test_readers/test_kerchunk.py 
| 113 ++++++++++++------ 1 file changed, 76 insertions(+), 37 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py index f693b370..dab049cb 100644 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ b/virtualizarr/tests/test_readers/test_kerchunk.py @@ -1,51 +1,88 @@ +from pathlib import Path +from typing import Any, Callable, Generator, Optional + import numpy as np +import pytest import ujson +from virtualizarr.backend import open_virtual_dataset from virtualizarr.manifests import ManifestArray -from virtualizarr.readers.kerchunk import ( - dataset_from_kerchunk_refs, -) def gen_ds_refs( - zgroup: str = '{"zarr_format":2}', - zarray: str = '{"chunks":[2,3],"compressor":null,"dtype":" Generator[ + Callable[[Optional[Any], Optional[Any], Optional[Any], Optional[Any]], str], + None, + None, +]: + """ + Fixture which defers creation of the references file until the parameters zgroup etc. are known. + """ + + def _refs_file(zgroup=None, zarray=None, zattrs=None, chunks=None) -> str: + refs = gen_ds_refs(zgroup=zgroup, zarray=zarray, zattrs=zattrs, chunks=chunks) + filepath = tmp_path / "refs.json" + + with open(filepath, "w") as json_file: + ujson.dump(refs, json_file) + + return str(filepath) + + yield _refs_file + - assert da.data.zarray.compressor is None - assert da.data.zarray.filters is None - assert da.data.zarray.fill_value == 0 - assert da.data.zarray.order == "C" +def test_dataset_from_df_refs(refs_file_factory): + refs_file = refs_file_factory() - assert da.data.manifest.dict() == { + vds = open_virtual_dataset(refs_file, filetype="kerchunk") + + assert "a" in vds + vda = vds["a"] + assert isinstance(vda.data, ManifestArray) + assert vda.dims == ("x", "y") + assert vda.shape == (2, 3) + assert vda.chunks == (2, 3) + assert vda.dtype == np.dtype(" Date: Wed, 27 Nov 2024 11:00:10 -0700 Subject: [PATCH 04/10] Add reader_kwargs argument to open_virtual_dataset (#315) * add reader_kwargs argument to open_virtual_dataset, and pass it down to every reader * rename reader_kwargs -> virtual_backend_kwargs * release note --- docs/releases.rst | 3 +++ virtualizarr/backend.py | 4 ++++ virtualizarr/readers/common.py | 2 ++ virtualizarr/readers/dmrpp.py | 6 ++++++ virtualizarr/readers/fits.py | 6 ++++++ virtualizarr/readers/hdf/hdf.py | 6 ++++++ virtualizarr/readers/hdf5.py | 6 ++++++ virtualizarr/readers/kerchunk.py | 6 ++++++ virtualizarr/readers/netcdf3.py | 6 ++++++ virtualizarr/readers/tiff.py | 6 ++++++ virtualizarr/readers/zarr_v3.py | 6 ++++++ 11 files changed, 57 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index bde41778..ed29ab13 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.1.1 (unreleased) New Features ~~~~~~~~~~~~ +- Add a ``virtual_backend_kwargs`` keyword argument to file readers and to ``open_virtual_dataset``, to allow reader-specific options to be passed down. + (:pull:`315`) By `Tom Nicholas `_. 
+ Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index fed2b756..b9df1923 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -113,6 +113,7 @@ def open_virtual_dataset( cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, backend: Optional[VirtualBackend] = None, ) -> Dataset: @@ -147,6 +148,8 @@ def open_virtual_dataset( virtual_array_class Virtual array class to use to represent the references to the chunks in each on-disk array. Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. + virtual_backend_kwargs: dict, default is None + Dictionary of keyword arguments passed down to this reader. Allows passing arguments specific to certain readers. reader_options: dict, default {} Dict passed into Kerchunk file readers, to allow reading from remote filesystems. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments. @@ -201,6 +204,7 @@ def open_virtual_dataset( loadable_variables=loadable_variables, decode_times=decode_times, indexes=indexes, + virtual_backend_kwargs=virtual_backend_kwargs, reader_options=reader_options, ) diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 40f61497..d4b65872 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -168,6 +168,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: raise NotImplementedError() @@ -180,6 +181,7 @@ def open_virtual_datatree( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> DataTree: raise NotImplementedError() diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py index 5859ca92..5d74c412 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/readers/dmrpp.py @@ -23,6 +23,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: loadable_variables, drop_variables = check_for_collisions( @@ -30,6 +31,11 @@ def open_virtual_dataset( loadable_variables=loadable_variables, ) + if virtual_backend_kwargs: + raise NotImplementedError( + "DMR++ reader does not understand any virtual_backend_kwargs" + ) + if loadable_variables != [] or decode_times or indexes is None: raise NotImplementedError( "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." 
diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py index de93bc1f..7b6221ff 100644 --- a/virtualizarr/readers/fits.py +++ b/virtualizarr/readers/fits.py @@ -23,10 +23,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.fits import process_file + if virtual_backend_kwargs: + raise NotImplementedError( + "FITS reader does not understand any virtual_backend_kwargs" + ) + # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160 refs = KerchunkStoreRefs({"refs": process_file(filepath, **reader_options)}) diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py index 0eee63d5..ca921bb0 100644 --- a/virtualizarr/readers/hdf/hdf.py +++ b/virtualizarr/readers/hdf/hdf.py @@ -38,8 +38,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> xr.Dataset: + if virtual_backend_kwargs: + raise NotImplementedError( + "HDF reader does not understand any virtual_backend_kwargs" + ) + drop_variables, loadable_variables = check_for_collisions( drop_variables, loadable_variables, diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py index 91e5b6f9..ca910ab4 100644 --- a/virtualizarr/readers/hdf5.py +++ b/virtualizarr/readers/hdf5.py @@ -23,10 +23,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.hdf import SingleHdf5ToZarr + if virtual_backend_kwargs: + raise NotImplementedError( + "HDF5 reader does not understand any virtual_backend_kwargs" + ) + drop_variables, loadable_variables = check_for_collisions( drop_variables, loadable_variables, diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 4a41548c..43ad5341 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -20,10 +20,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: """Reads existing kerchunk references (in JSON or parquet) format.""" + if virtual_backend_kwargs: + raise NotImplementedError( + "Kerchunk reader does not understand any virtual_backend_kwargs" + ) + if group: raise NotImplementedError() diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py index 25f212ca..f93dd2a6 100644 --- a/virtualizarr/readers/netcdf3.py +++ b/virtualizarr/readers/netcdf3.py @@ -23,10 +23,16 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: from kerchunk.netCDF3 import NetCDF3ToZarr + if virtual_backend_kwargs: + raise NotImplementedError( + "netcdf3 reader does not understand any virtual_backend_kwargs" + ) + 
drop_variables, loadable_variables = check_for_collisions( drop_variables, loadable_variables, diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py index d9c440ba..0383e899 100644 --- a/virtualizarr/readers/tiff.py +++ b/virtualizarr/readers/tiff.py @@ -25,8 +25,14 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: + if virtual_backend_kwargs: + raise NotImplementedError( + "TIFF reader does not understand any virtual_backend_kwargs" + ) + from kerchunk.tiff import tiff_to_zarr drop_variables, loadable_variables = check_for_collisions( diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py index 4a867ffb..1491d2d2 100644 --- a/virtualizarr/readers/zarr_v3.py +++ b/virtualizarr/readers/zarr_v3.py @@ -20,6 +20,7 @@ def open_virtual_dataset( loadable_variables: Iterable[str] | None = None, decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, reader_options: Optional[dict] = None, ) -> Dataset: """ @@ -27,6 +28,11 @@ def open_virtual_dataset( This is experimental - chunk manifests are not part of the Zarr v3 Spec. """ + if virtual_backend_kwargs: + raise NotImplementedError( + "Zarr_v3 reader does not understand any virtual_backend_kwargs" + ) + storepath = Path(filepath) if group: From 9dea14eb14fc000daeb1fde10f2bb978ac4834b9 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 27 Nov 2024 13:10:58 -0700 Subject: [PATCH 05/10] Fix documentation link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 22d0a948..672d12e0 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ VirtualiZarr (pronounced like "virtualizer" but more piratey) grew out of [discu You now have a choice between using VirtualiZarr and Kerchunk: VirtualiZarr provides [almost all the same features](https://virtualizarr.readthedocs.io/en/latest/faq.html#how-do-virtualizarr-and-kerchunk-compare) as Kerchunk. -_Please see the [documentation](https://virtualizarr.readthedocs.io/en/stable/api.html)_ +_Please see the [documentation](https://virtualizarr.readthedocs.io/en/stable/index.html)_ ### Development Status and Roadmap From 39ca8362975cb0247850f1f34728e124835c471c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 14:14:03 -0700 Subject: [PATCH 06/10] [pre-commit.ci] pre-commit autoupdate (#322) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.7.2 → v0.8.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.7.2...v0.8.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9990375b..98122388 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: "v0.7.2" + rev: "v0.8.1" hooks: # Run the linter. 
- id: ruff From 2bfeee39497620121a63a3a4a25b317aa6057653 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 3 Dec 2024 13:26:47 -0700 Subject: [PATCH 07/10] Refactor dmrpp tests to expose data file path (#323) * rewrite tests to use a new dmrparser_factory * rewrite using global dict of XML strings * fix final test by explicitly passing in tmp_path instead of using a fixture which requests tmp_path * fix bug with not converting Path objects to strings --- virtualizarr/tests/test_readers/test_dmrpp.py | 346 ++++++++++-------- 1 file changed, 191 insertions(+), 155 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_readers/test_dmrpp.py index cbafc40f..d6238ca5 100644 --- a/virtualizarr/tests/test_readers/test_dmrpp.py +++ b/virtualizarr/tests/test_readers/test_dmrpp.py @@ -20,153 +20,159 @@ # TODO: later add MUR, SWOT, TEMPO and others by using kerchunk JSON to read refs (rather than reading the whole netcdf file) ] - -@pytest.fixture -def basic_dmrpp() -> DMRParser: - xml_str = """\ - - - - - - - - - grid x-axis - - - - - - - - - grid y-axis - - - - - - - - - grid z-axis - - - - - - - - - - analysed sea surface temperature - - - 1 - 2 - 3 - - - -32768 - - - 298.14999999999998 - - - 0.001 - - - x y z - - - 360 720 - - - - - - - - - - - - - - - - - mask - - - - - - - CF-1.6 - - - Sample Dataset - - - """ - return DMRParser(root=ET.fromstring(textwrap.dedent(xml_str))) - - -@pytest.fixture -def nested_groups_dmrpp() -> DMRParser: - xml_str = """\ - - - - - - - - - - - - - - - - - +DMRPP_XML_STRINGS = { + "basic": textwrap.dedent( + """\ + + + - - - test + + + grid x-axis - - - test + + + grid y-axis - + + + + + + + grid z-axis + + + - - - - + + + + + analysed sea surface temperature + + + 1 + 2 + 3 + + + -32768 + + + 298.14999999999998 + + + 0.001 + + + x y z + + + 360 720 + + + + + + + + + + + + + + + + + mask + + + + + + + CF-1.6 + + + Sample Dataset + + + """ + ), + "nested_groups": textwrap.dedent( + """\ + + + + + + - - - - """ - return DMRParser(root=ET.fromstring(textwrap.dedent(xml_str))) + + + + + + + + + + + + + test + + + + + + + + + test + + + + + + + + + + + + + + + + + """ + ), +} + + +def dmrparser(dmrpp_xml_str: str, tmp_path: Path, filename="test.nc") -> DMRParser: + # TODO we should actually create a dmrpp file in a temporary directory + # this would avoid the need to pass tmp_path separately + + return DMRParser( + root=ET.fromstring(dmrpp_xml_str), data_filepath=str(tmp_path / filename) + ) @network @@ -179,36 +185,37 @@ def test_NASA_dmrpp(data_url, dmrpp_url): @pytest.mark.parametrize( - "dmrpp_fixture, fqn_path, expected_xpath", + "dmrpp_xml_str_key, fqn_path, expected_xpath", [ - ("basic_dmrpp", "/", "."), - ("basic_dmrpp", "/data", "./*[@name='data']"), - ("basic_dmrpp", "/data/items", "./*[@name='data']/*[@name='items']"), + ("basic", "/", "."), + ("basic", "/data", "./*[@name='data']"), + ("basic", "/data/items", "./*[@name='data']/*[@name='items']"), ( - "nested_groups_dmrpp", + "nested_groups", "/group1/group2/area", "./*[@name='group1']/*[@name='group2']/*[@name='area']", ), ], ) -def test_find_node_fqn(request, dmrpp_fixture, fqn_path, expected_xpath): - parser_instance = request.getfixturevalue(dmrpp_fixture) +def test_find_node_fqn(tmp_path, dmrpp_xml_str_key, fqn_path, expected_xpath): + parser_instance = dmrparser(DMRPP_XML_STRINGS[dmrpp_xml_str_key], tmp_path=tmp_path) result = parser_instance.find_node_fqn(fqn_path) expected = parser_instance.root.find(expected_xpath, parser_instance._NS) assert result == 
expected @pytest.mark.parametrize( - "dmrpp_fixture, group_path", + "dmrpp_xml_str_key, group_path", [ - ("basic_dmrpp", "/"), - ("nested_groups_dmrpp", "/"), - ("nested_groups_dmrpp", "/group1"), - ("nested_groups_dmrpp", "/group1/group2"), + ("basic", "/"), + ("nested_groups", "/"), + ("nested_groups", "/group1"), + ("nested_groups", "/group1/group2"), ], ) -def test_split_groups(request, dmrpp_fixture, group_path): - dmrpp_instance = request.getfixturevalue(dmrpp_fixture) +def test_split_groups(tmp_path, dmrpp_xml_str_key, group_path): + dmrpp_instance = dmrparser(DMRPP_XML_STRINGS[dmrpp_xml_str_key], tmp_path=tmp_path) + # get all tags in a dataset (so all tags excluding nested groups) dataset_tags = lambda x: [ d for d in x if d.tag != "{" + dmrpp_instance._NS["dap"] + "}" + "Group" @@ -221,21 +228,30 @@ def test_split_groups(request, dmrpp_fixture, group_path): assert result_tags == expected_tags -def test_parse_dataset(basic_dmrpp, nested_groups_dmrpp): +def test_parse_dataset(tmp_path): + basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) + vds = basic_dmrpp.parse_dataset() assert vds.sizes == {"x": 720, "y": 1440, "z": 3} assert vds.data_vars.keys() == {"data", "mask"} assert vds.data_vars["data"].dims == ("x", "y") assert vds.attrs == {"Conventions": "CF-1.6", "title": "Sample Dataset"} assert vds.coords.keys() == {"x", "y", "z"} + + nested_groups_dmrpp = dmrparser( + DMRPP_XML_STRINGS["nested_groups"], tmp_path=tmp_path + ) + vds_root_implicit = nested_groups_dmrpp.parse_dataset() vds_root = nested_groups_dmrpp.parse_dataset(group="/") xrt.assert_identical(vds_root_implicit, vds_root) assert vds_root.sizes == {"a": 10, "b": 10} assert vds_root.coords.keys() == {"a", "b"} + vds_g1 = nested_groups_dmrpp.parse_dataset(group="/group1") assert vds_g1.sizes == {"x": 720, "y": 1440} assert vds_g1.coords.keys() == {"x", "y"} + vds_g2 = nested_groups_dmrpp.parse_dataset(group="/group1/group2") assert vds_g2.sizes == {"x": 720, "y": 1440} assert vds_g2.data_vars.keys() == {"area"} @@ -249,13 +265,19 @@ def test_parse_dataset(basic_dmrpp, nested_groups_dmrpp): ("/group1/x", {"x": 720}), ], ) -def test_parse_dim(nested_groups_dmrpp, dim_path, expected): +def test_parse_dim(tmp_path, dim_path, expected): + nested_groups_dmrpp = dmrparser( + DMRPP_XML_STRINGS["nested_groups"], tmp_path=tmp_path + ) + result = nested_groups_dmrpp._parse_dim(nested_groups_dmrpp.find_node_fqn(dim_path)) assert result == expected @pytest.mark.parametrize("dim_path", ["/", "/mask"]) -def test_find_dimension_tags(basic_dmrpp, dim_path): +def test_find_dimension_tags(tmp_path, dim_path): + basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) + # Check that Dimension tags match Dimension tags from the root # Check that Dim tags reference the same Dimension tags from the root assert basic_dmrpp._find_dimension_tags( @@ -263,7 +285,9 @@ def test_find_dimension_tags(basic_dmrpp, dim_path): ) == basic_dmrpp.root.findall("dap:Dimension", basic_dmrpp._NS) -def test_parse_variable(basic_dmrpp): +def test_parse_variable(tmp_path): + basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) + var = basic_dmrpp._parse_variable(basic_dmrpp.find_node_fqn("/data")) assert var.dtype == "float32" assert var.dims == ("x", "y") @@ -288,7 +312,9 @@ def test_parse_variable(basic_dmrpp): ("data/_FillValue", {"_FillValue": -32768}), ], ) -def test_parse_attribute(basic_dmrpp, attr_path, expected): +def test_parse_attribute(tmp_path, attr_path, expected): + basic_dmrpp = 
dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) + result = basic_dmrpp._parse_attribute(basic_dmrpp.find_node_fqn(attr_path)) assert result == expected @@ -311,7 +337,9 @@ def test_parse_attribute(basic_dmrpp, attr_path, expected): ), ], ) -def test_parse_filters(basic_dmrpp, var_path, dtype, expected_filters): +def test_parse_filters(tmp_path, var_path, dtype, expected_filters): + basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) + chunks_tag = basic_dmrpp.find_node_fqn(var_path).find( "dmrpp:chunks", basic_dmrpp._NS ) @@ -320,36 +348,44 @@ def test_parse_filters(basic_dmrpp, var_path, dtype, expected_filters): @pytest.mark.parametrize( - "var_path, chunk_shape, expected_lengths, expected_offsets, expected_paths", + "var_path, chunk_shape, chunk_grid_shape, expected_lengths, expected_offsets", [ ( "/data", (360, 720), + (3, 3), np.full((3, 3), 4083, dtype=np.uint64), (np.arange(9, dtype=np.uint64) * 4083 + 40762).reshape(3, 3), - np.full((3, 3), "test.dmrpp", dtype=np.dtypes.StringDType), ), ( "/mask", (720, 1440), + (1,), np.array([4], dtype=np.uint64), np.array([41276], dtype=np.uint64), - np.array(["test.dmrpp"], dtype=np.dtypes.StringDType), ), ], ) def test_parse_chunks( - basic_dmrpp, + tmp_path, var_path, chunk_shape, + chunk_grid_shape, expected_lengths, expected_offsets, - expected_paths, ): + basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path) + chunks_tag = basic_dmrpp.find_node_fqn(var_path).find( "dmrpp:chunks", basic_dmrpp._NS ) result = basic_dmrpp._parse_chunks(chunks_tag, chunk_shape) + + expected_paths = np.full( + shape=chunk_grid_shape, + fill_value=str(tmp_path / "test.nc"), + dtype=np.dtypes.StringDType, + ) expected = ChunkManifest.from_arrays( lengths=expected_lengths, offsets=expected_offsets, paths=expected_paths ) From a08b4958afca1a37ad53388fe1e48ddb47ab5265 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 3 Dec 2024 15:22:46 -0700 Subject: [PATCH 08/10] Paths as URIs (#243) * validate that paths can be coerced to valid URIs * add a test that paths are converted to URIs * added test and better error if local path is not absolute * raise more informative error if path is not absolute * test that empty paths are allowed * add failing test for raising on malformed paths * fix paths in tests * fix more tests * remove the file:/// prefix when writing to kerchunk format * absolute paths in recently-added tests * absolute paths in recently-added tests * fix one more test * stop wrapping specific error in less useful one * moved remaining kerchunk parsing logic out into translator file * add fs_root parameter to validation fn * demote ChunkEntry to a TypedDict to separate validation fn * actually instead add new constructor method to TypedDict * test kerchunk writer with absolute paths * make kerchunk reader tests pass * try to implement the fs_root concatenation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implementation using cloudpathlib working * check that fs_root doesn't have a filetype suffix * add cloudpathlib to dependencies * allow http paths, and require file suffixes * unit tests for path validation * test whether we can catch malformed paths * test fs_root * add (unimplemented) validate_entries kwarg to .from_arrays * add .keys(), .values(), .items() * test validate_paths during .from_arrays * ensure validation actually normalizes paths to uris * test .rename_paths correctly validates paths * some release notes * remove now-superfluous 
coercion to URI in icechunk writer * added link to icechunk writer performance benchmark * add reader_kwargs argument to open_virtual_dataset, and pass it down to every reader * ensure relative paths containing .. can be normalized * ensure HDF5 reader always returns absolute URIs * ensure HDF reader always returns absolute URIs * add relative path handling to other kerchunk-based readers * add dmrpp relative path integration test * fix kerchunk relative paths test by plugging through fs_root kwarg * fix dmrpp tests by using absolute filepaths in DMR++ contents * clarify new dmrpp test * test handling of relative filepaths to dmrpp files * group related tests * removed cloudpathlib from validation code * fix bug but restrict fs_root to only handle filesystem paths, not bucket url prefixes * global list of recognized URI prefixes * cleanup * remove cloudpathlib from dependencies * fix/ignore some typing errors * rewrite tests to use a new dmrparser_factory * rewrite using global dict of XML strings * fix final test by explicitly passing in tmp_path instead of using a fixture which requests tmp_path * fix bug with not converting Path objects to strings * dmrpp relative paths tests passing * fix type hint for filetype kwarg * user documentation on fs_root * change example manifests to use URIs * reminder that rename_paths exists * update release notes * remove note about .rename_paths
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ayushnag <35325113+ayushnag@users.noreply.github.com>
---
docs/releases.rst | 4 + docs/usage.md | 37 ++- pyproject.toml | 1 - virtualizarr/backend.py | 2 +- virtualizarr/manifests/array.py | 19 -- virtualizarr/manifests/manifest.py | 252 +++++++++++++----- virtualizarr/readers/dmrpp.py | 12 +- virtualizarr/readers/fits.py | 2 + virtualizarr/readers/hdf/hdf.py | 19 +- virtualizarr/readers/hdf5.py | 2 + virtualizarr/readers/kerchunk.py | 13 +- virtualizarr/readers/netcdf3.py | 2 + virtualizarr/readers/tiff.py | 2 + virtualizarr/readers/zarr_v3.py | 1 + virtualizarr/tests/__init__.py | 2 +- virtualizarr/tests/test_integration.py | 34 ++- .../tests/test_manifests/test_array.py | 105 +++----- .../tests/test_manifests/test_manifest.py | 174 +++++++++++- virtualizarr/tests/test_readers/test_dmrpp.py | 52 ++++ .../tests/test_readers/test_kerchunk.py | 54 +++- .../tests/test_writers/test_kerchunk.py | 16 +- virtualizarr/tests/test_xarray.py | 60 ++--- virtualizarr/translators/kerchunk.py | 62 ++++- virtualizarr/writers/icechunk.py | 23 +- virtualizarr/writers/kerchunk.py | 13 +- 25 files changed, 711 insertions(+), 252 deletions(-)
diff --git a/docs/releases.rst b/docs/releases.rst index ed29ab13..18d5920b 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -17,6 +17,8 @@ Breaking changes - Minimum required version of Xarray is now v2024.10.0. (:pull:`284`) By `Tom Nicholas `_. +- Opening kerchunk-formatted references from disk which contain relative paths now requires passing the ``fs_root`` keyword argument via ``virtual_backend_kwargs``. + (:pull:`243`) By `Tom Nicholas `_. Deprecations ~~~~~~~~~~~~ @@ -50,6 +52,8 @@ Internal Changes (:pull:`87`) By `Sean Harkins `_. - Support downstream type checking by adding py.typed marker file. (:pull:`306`) By `Max Jones `_. +- File paths in chunk manifests are now always stored as absolute URIs. + (:pull:`243`) By `Tom Nicholas `_. ..
_v1.1.0: diff --git a/docs/usage.md b/docs/usage.md index d9b292e0..cad2f96f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -106,10 +106,10 @@ manifest = marr.manifest manifest.dict() ``` ```python -{'0.0.0': {'path': 'air.nc', 'offset': 15419, 'length': 7738000}} +{'0.0.0': {'path': 'file:///work/data/air.nc', 'offset': 15419, 'length': 7738000}} ``` -In this case we can see that the `"air"` variable contains only one chunk, the bytes for which live in the `air.nc` file at the location given by the `'offset'` and `'length'` attributes. +In this case we can see that the `"air"` variable contains only one chunk, the bytes for which live in the `file:///work/data/air.nc` file, at the location given by the `'offset'` and `'length'` attributes. The {py:class}`ChunkManifest ` class is virtualizarr's internal in-memory representation of this manifest. @@ -153,8 +153,8 @@ ManifestArray concatenated.manifest.dict() ``` ``` -{'0.0.0': {'path': 'air.nc', 'offset': 15419, 'length': 7738000}, - '1.0.0': {'path': 'air.nc', 'offset': 15419, 'length': 7738000}} +{'0.0.0': {'path': 'file:///work/data/air.nc', 'offset': 15419, 'length': 7738000}, + '1.0.0': {'path': 'file:///work/data/air.nc', 'offset': 15419, 'length': 7738000}} ``` This concatenation property is what will allow us to combine the data from multiple netCDF files on disk into a single Zarr store containing arrays of many chunks. @@ -254,8 +254,8 @@ We can see that the resulting combined manifest has two chunks, as expected. combined_vds['air'].data.manifest.dict() ``` ``` -{'0.0.0': {'path': 'air1.nc', 'offset': 15419, 'length': 3869000}, - '1.0.0': {'path': 'air2.nc', 'offset': 15419, 'length': 3869000}} +{'0.0.0': {'path': 'file:///work/data/air1.nc', 'offset': 15419, 'length': 3869000}, + '1.0.0': {'path': 'file:///work/data/air2.nc', 'offset': 15419, 'length': 3869000}} ``` ```{note} @@ -327,8 +327,6 @@ Loading variables can be useful in a few scenarios: To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True (`decode_times` defaults to None). - - ```python vds = open_virtual_dataset( 'air.nc', @@ -354,8 +352,6 @@ Attributes: title: 4x daily NMC reanalysis (1948) ``` - - ## Writing virtual stores to disk Once we've combined references to all the chunks of all our legacy files into one virtual xarray dataset, we still need to write these references out to disk so that they can be read by our analysis code later. @@ -443,20 +439,37 @@ This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_da You can open existing Kerchunk `json` or `parquet` references as Virtualizarr virtual datasets. This may be useful for converting existing Kerchunk formatted references to storage formats like [Icechunk](https://icechunk.io/). ```python - vds = open_virtual_dataset('combined.json', filetype='kerchunk', indexes={}) # or vds = open_virtual_dataset('combined.parquet', filetype='kerchunk', indexes={}) +``` + +One difference between the kerchunk references format and virtualizarr's internal manifest representation (as well as icechunk's format) is that paths in kerchunk references can be relative paths. Opening kerchunk references that contain relative local filepaths therefore requires supplying another piece of information: the directory of the ``fsspec`` filesystem which the filepath was defined relative to. 
+ +You can disambiguate kerchunk references containing relative paths by passing the ``fs_root`` kwarg to ``virtual_backend_kwargs``.
+```python +# file `relative_refs.json` contains a path like './file.nc' + +vds = open_virtual_dataset( + 'relative_refs.json', + filetype='kerchunk', + indexes={}, + virtual_backend_kwargs={'fs_root': 'file:///some_directory/'} +) + +# the path in the virtual dataset will now be 'file:///some_directory/file.nc' ```
+Note that as the virtualizarr {py:meth}`vds.virtualize.to_kerchunk ` method only writes absolute paths, the only scenario in which you might come across references containing relative paths is if you are opening references that were previously created using the ``kerchunk`` library alone.
+ ## Rewriting existing manifests Sometimes it can be useful to rewrite the contents of an already-generated manifest or virtual dataset. ### Rewriting file paths -You can rewrite the file paths stored in a manifest or virtual dataset without changing the byte range information using the {py:meth}`ds.virtualize.rename_paths ` accessor method. +You can rewrite the file paths stored in a manifest or virtual dataset without changing the byte range information using the {py:meth}`vds.virtualize.rename_paths ` accessor method. For example, you may want to rename file paths according to a function to reflect having moved the location of the referenced files from local storage to an S3 bucket.
diff --git a/pyproject.toml b/pyproject.toml index 77998076..aaaade02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,6 @@ test = [ "virtualizarr[hdf_reader]" ] - [project.urls] Home = "https://github.com/TomNicholas/VirtualiZarr" Documentation = "https://github.com/TomNicholas/VirtualiZarr/blob/main/README.md"
diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index b9df1923..a8e3b66a 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -105,7 +105,7 @@ def automatically_determine_filetype( def open_virtual_dataset( filepath: str, *, - filetype: FileType | None = None, + filetype: FileType | str | None = None, group: str | None = None, drop_variables: Iterable[str] | None = None, loadable_variables: Iterable[str] | None = None,
diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 179bcf1c..44c0546c 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -8,7 +8,6 @@ _isnan, ) from virtualizarr.manifests.manifest import ChunkManifest -from virtualizarr.types.kerchunk import KerchunkArrRefs from virtualizarr.zarr import ZArray @@ -62,24 +61,6 @@ def __init__( self._zarray = _zarray self._manifest = _chunkmanifest - @classmethod - def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray": - from virtualizarr.translators.kerchunk import ( - fully_decode_arr_refs, - parse_array_refs, - ) - - decoded_arr_refs = fully_decode_arr_refs(arr_refs) - - chunk_dict, zarray, _zattrs = parse_array_refs(decoded_arr_refs) - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) - - obj = object.__new__(cls) - obj._manifest = manifest - obj._zarray = zarray - - return obj - @property def manifest(self) -> ChunkManifest: return self._manifest
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 38743f9b..cc970fb2 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,8 +1,9 @@ -import dataclasses import json import re -from collections.abc import Iterable, Iterator -from typing import Any, 
Callable, Dict, NewType, Tuple, TypedDict, cast +from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView +from pathlib import PosixPath +from typing import Any, Callable, NewType, Tuple, TypedDict, cast +from urllib.parse import urlparse, urlunparse import numpy as np @@ -15,52 +16,169 @@ _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) -class ChunkDictEntry(TypedDict): +# doesn't guarantee that writers actually handle these +VALID_URI_PREFIXES = { + "s3://", + "gs://", + "azure://", + "r2://", + "cos://", + "minio://", + "file:///", +} + + +class ChunkEntry(TypedDict): path: str offset: int length: int + @classmethod # type: ignore[misc] + def with_validation( + cls, *, path: str, offset: int, length: int, fs_root: str | None = None + ) -> "ChunkEntry": + """ + Constructor which validates each part of the chunk entry. + + Parameters + ---------- + fs_root + The root of the filesystem on which these references were generated. + Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires). + """ + + # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg -ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry]) + path = validate_and_normalize_path_to_uri(path, fs_root=fs_root) + validate_byte_range(offset=offset, length=length) + return ChunkEntry(path=path, offset=offset, length=length) -@dataclasses.dataclass(frozen=True) -class ChunkEntry: +def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) -> str: """ - Information for a single chunk in the manifest. + Makes all paths into fully-qualified absolute URIs, or raises. - Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`. + See https://en.wikipedia.org/wiki/File_URI_scheme + + Parameters + ---------- + fs_root + The root of the filesystem on which these references were generated. + Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires). """ + if path == "": + # (empty paths are allowed through as they represent missing chunks) + return path - path: str # TODO stricter typing/validation of possible local / remote paths? 
- offset: int - length: int + # TODO ideally we would just use cloudpathlib.AnyPath to handle all types of paths but that would require extra depedencies, see https://github.com/drivendataorg/cloudpathlib/issues/489#issuecomment-2504725280 - @classmethod - def from_kerchunk( - cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] - ) -> "ChunkEntry": - from upath import UPath + if path.startswith("http://") or path.startswith("https://"): + # hopefully would raise if given a malformed URL + components = urlparse(path) - if len(path_and_byte_range_info) == 1: - path = path_and_byte_range_info[0] - offset = 0 - length = UPath(path).stat().st_size - else: - path, offset, length = path_and_byte_range_info - return ChunkEntry(path=path, offset=offset, length=length) + if not PosixPath(components.path).suffix: + raise ValueError( + f"entries in the manifest must be paths to files, but this path has no file suffix: {path}" + ) + + return urlunparse(components) + + elif any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES): + if not PosixPath(path).suffix: + raise ValueError( + f"entries in the manifest must be paths to files, but this path has no file suffix: {path}" + ) + + return path # path is already in URI form - def to_kerchunk(self) -> tuple[str, int, int]: - """Write out in the format that kerchunk uses for chunk entries.""" - return (self.path, self.offset, self.length) + else: + # must be a posix filesystem path (absolute or relative) + # using PosixPath here ensures a clear error would be thrown on windows (whose paths and platform are not officially supported) + _path = PosixPath(path) - def dict(self) -> ChunkDictEntry: - return ChunkDictEntry( - path=self.path, - offset=self.offset, - length=self.length, + if not _path.suffix: + raise ValueError( + f"entries in the manifest must be paths to files, but this path has no file suffix: {path}" + ) + + # only posix paths can possibly not be absolute + if not _path.is_absolute(): + if fs_root is None: + raise ValueError( + f"paths in the manifest must be absolute posix paths or URIs, but got {path}, and fs_root was not specified" + ) + else: + _path = convert_relative_path_to_absolute(_path, fs_root) + + return _path.as_uri() + + +def posixpath_maybe_from_uri(path: str) -> PosixPath: + """ + Handles file URIs like cloudpathlib.AnyPath does, i.e. + + In[1]: pathlib.PosixPath('file:///dir/file.nc') + Out[1]: pathlib.PosixPath('file:/dir/file.nc') + + In [2]: cloudpathlib.AnyPath('file:///dir/file.nc') + Out[2]: pathlib.PosixPath('/dir/file.nc') + + In [3]: posixpath_maybe_from_uri('file:///dir/file.nc') + Out[3]: pathlib.PosixPath('/dir/file.nc') + + This is needed otherwise pathlib thinks the URI is a relative path. + """ + if path.startswith("file:///"): + # TODO in python 3.13 we could probably use Path.from_uri() instead + return PosixPath(path.removeprefix("file://")) + else: + return PosixPath(path) + + +def convert_relative_path_to_absolute(path: PosixPath, fs_root: str) -> PosixPath: + _fs_root = posixpath_maybe_from_uri(fs_root) + + if not _fs_root.is_absolute() or _fs_root.suffix: + # TODO handle http url roots and bucket prefix roots? 
(ideally through cloudpathlib) + raise ValueError( + f"fs_root must be an absolute path to a filesystem directory, but got {fs_root}" ) + return (_fs_root / path).resolve() + + +def validate_byte_range(*, offset: Any, length: Any) -> None: + """Raise if byte offset or length has invalid type or value""" + + if isinstance(offset, np.integer): + _offset = int(offset) + elif isinstance(offset, int): + _offset = offset + else: + raise TypeError( + f"chunk entry byte offset must of type int, but got type {type(offset)}" + ) + if _offset < 0: + raise ValueError( + f"chunk entry byte offset must be a positive integer, but got offset={_offset}" + ) + + if isinstance(length, np.integer): + _length = int(length) + elif isinstance(length, int): + _length = length + else: + raise TypeError( + f"chunk entry byte offset must of type int, but got type {type(length)}" + ) + if _length < 0: + raise ValueError( + f"chunk entry byte offset must be a positive integer, but got offset={_length}" + ) + + +ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkEntry]) + class ChunkManifest: """ @@ -124,20 +242,20 @@ def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None: # populate the arrays for key, entry in entries.items(): - try: - path, offset, length = entry.values() - entry = ChunkEntry(path=path, offset=offset, length=length) - except (ValueError, TypeError) as e: + if not isinstance(entry, dict) or len(entry) != 3: msg = ( "Each chunk entry must be of the form dict(path=, offset=, length=), " f"but got {entry}" ) - raise ValueError(msg) from e + raise ValueError(msg) + + path, offset, length = entry.values() + entry = ChunkEntry.with_validation(path=path, offset=offset, length=length) # type: ignore[attr-defined] split_key = split(key) - paths[split_key] = entry.path - offsets[split_key] = entry.offset - lengths[split_key] = entry.length + paths[split_key] = entry["path"] + offsets[split_key] = entry["offset"] + lengths[split_key] = entry["length"] self._paths = paths self._offsets = offsets @@ -146,9 +264,11 @@ def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None: @classmethod def from_arrays( cls, + *, paths: np.ndarray[Any, np.dtypes.StringDType], offsets: np.ndarray[Any, np.dtype[np.uint64]], lengths: np.ndarray[Any, np.dtype[np.uint64]], + validate_paths: bool = True, ) -> "ChunkManifest": """ Create manifest directly from numpy arrays containing the path and byte range information. @@ -161,6 +281,9 @@ def from_arrays( paths: np.ndarray offsets: np.ndarray lengths: np.ndarray + validate_paths: bool + Check that entries in the manifest are valid paths (e.g. that local paths are absolute not relative). + Set to False to skip validation for performance reasons. 
""" # check types @@ -200,6 +323,12 @@ def from_arrays( f"Shapes of the arrays must be consistent, but shapes of paths array and lengths array do not match: {paths.shape} vs {lengths.shape}" ) + if validate_paths: + vectorized_validation_fn = np.vectorize( + validate_and_normalize_path_to_uri, otypes=[np.dtypes.StringDType()] + ) # type: ignore[attr-defined] + paths = vectorized_validation_fn(paths) + obj = object.__new__(cls) obj._paths = paths obj._offsets = offsets @@ -233,6 +362,7 @@ def __getitem__(self, key: ChunkKey) -> ChunkEntry: path = self._paths[indices] offset = self._offsets[indices] length = self._lengths[indices] + # TODO fix bug here - types of path, offset, length shoudl be coerced return ChunkEntry(path=path, offset=offset, length=length) def __iter__(self) -> Iterator[ChunkKey]: @@ -243,26 +373,35 @@ def __iter__(self) -> Iterator[ChunkKey]: def __len__(self) -> int: return self._paths.size + def keys(self) -> KeysView: + return self.dict().keys() + + def values(self) -> ValuesView: + return self.dict().values() + + def items(self) -> ItemsView: + return self.dict().items() + def dict(self) -> ChunkDict: # type: ignore[override] """ Convert the entire manifest to a nested dictionary. The returned dict will be of the form - { - "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, - "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, - "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, - "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, - } + | { + | "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + | "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100}, + | "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100}, + | "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, + | } Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary. """ - coord_vectors = np.mgrid[ tuple(slice(None, length) for length in self.shape_chunk_grid) ] + # TODO consolidate each occurrence of this np.nditer pattern d = { join(inds): dict( path=path.item(), offset=offset.item(), length=length.item() @@ -301,24 +440,6 @@ def to_zarr_json(self, filepath: str) -> None: with open(filepath, "w") as json_file: json.dump(entries, json_file, indent=4, separators=(", ", ": ")) - @classmethod - def _from_kerchunk_chunk_dict( - cls, - # The type hint requires `Dict` instead of `dict` due to - # the conflicting ChunkManifest.dict method. - kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]], - ) -> "ChunkManifest": - chunk_entries: dict[ChunkKey, ChunkDictEntry] = {} - for k, v in kerchunk_chunk_dict.items(): - if isinstance(v, (str, bytes)): - raise NotImplementedError( - "Reading inlined reference data is currently not supported. 
[ToDo]" - ) - elif not isinstance(v, (tuple, list)): - raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}") - chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict() - return ChunkManifest(entries=chunk_entries) - def rename_paths( self, new: str | Callable[[str], str], @@ -370,6 +491,7 @@ def rename_paths( paths=renamed_paths, offsets=self._offsets, lengths=self._lengths, + validate_paths=True, ) diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py index 5d74c412..451ecdc7 100644 --- a/virtualizarr/readers/dmrpp.py +++ b/virtualizarr/readers/dmrpp.py @@ -8,6 +8,7 @@ from xarray import Coordinates, Dataset, Index, Variable from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.readers.common import VirtualBackend from virtualizarr.types import ChunkKey from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions @@ -41,6 +42,10 @@ def open_virtual_dataset( "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files." ) + filepath = validate_and_normalize_path_to_uri( + filepath, fs_root=Path.cwd().as_uri() + ) + fpath = _FsspecFSFromFilepath( filepath=filepath, reader_options=reader_options ).open_file() @@ -90,6 +95,8 @@ class DMRParser: _DEFAULT_ZLIB_VALUE = 6 # Encoding keys that should be removed from attributes and placed in xarray encoding dict _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"} + root: ET.Element + data_filepath: str def __init__(self, root: ET.Element, data_filepath: Optional[str] = None): """ @@ -97,9 +104,8 @@ def __init__(self, root: ET.Element, data_filepath: Optional[str] = None): Parameters ---------- - dmrpp_str : str - The dmrpp file contents as a string. - + root: xml.ElementTree.Element + Root of the xml tree struture of a DMR++ file. data_filepath : str, optional The path to the actual data file that will be set in the chunk manifests. If None, the data file path is taken from the DMR++ file. 
diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py index 7b6221ff..1a218773 100644 --- a/virtualizarr/readers/fits.py +++ b/virtualizarr/readers/fits.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Iterable, Mapping, Optional from xarray import Dataset, Index @@ -42,6 +43,7 @@ def open_virtual_dataset( refs, loadable_variables, drop_variables, + fs_root=Path.cwd().as_uri(), ) # TODO this wouldn't work until you had an xarray backend for FITS installed diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py index ca921bb0..5ff054a5 100644 --- a/virtualizarr/readers/hdf/hdf.py +++ b/virtualizarr/readers/hdf/hdf.py @@ -1,11 +1,17 @@ import math +from pathlib import Path from typing import TYPE_CHECKING, Dict, Iterable, List, Mapping, Optional, Union import numpy as np import xarray as xr -from xarray import Index, Variable +from xarray import Dataset, Index, Variable -from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.manifests import ( + ChunkEntry, + ChunkManifest, + ManifestArray, +) +from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.readers.common import ( VirtualBackend, construct_virtual_dataset, @@ -51,6 +57,10 @@ def open_virtual_dataset( loadable_variables, ) + filepath = validate_and_normalize_path_to_uri( + filepath, fs_root=Path.cwd().as_uri() + ) + virtual_vars = HDFVirtualBackend._virtual_vars_from_hdf( path=filepath, group=group, @@ -105,11 +115,12 @@ def _dataset_chunk_manifest(path: str, dataset: Dataset) -> Optional[ChunkManife else: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) - chunk_entry = ChunkEntry( + + chunk_entry = ChunkEntry.with_validation( path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) - chunk_entries = {chunk_key: chunk_entry.dict()} + chunk_entries = {chunk_key: chunk_entry} chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py index ca910ab4..0226a5e9 100644 --- a/virtualizarr/readers/hdf5.py +++ b/virtualizarr/readers/hdf5.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Iterable, Mapping, Optional from xarray import Dataset, Index @@ -48,6 +49,7 @@ def open_virtual_dataset( refs, loadable_variables, drop_variables, + fs_root=Path.cwd().as_uri(), ) loadable_vars, indexes = open_loadable_vars_and_indexes( diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index 43ad5341..6f688d3c 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -25,9 +25,14 @@ def open_virtual_dataset( ) -> Dataset: """Reads existing kerchunk references (in JSON or parquet) format.""" + if virtual_backend_kwargs is None: + virtual_backend_kwargs = {} + + fs_root = virtual_backend_kwargs.pop("fs_root", None) + if virtual_backend_kwargs: raise NotImplementedError( - "Kerchunk reader does not understand any virtual_backend_kwargs" + f"Kerchunk reader does not understand any of the virtual_backend_kwargs {virtual_backend_kwargs}" ) if group: @@ -57,7 +62,9 @@ def open_virtual_dataset( full_reference = {"refs": array_refs} - vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference)) + vds = dataset_from_kerchunk_refs( + KerchunkStoreRefs(full_reference), fs_root=fs_root + ) # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version': # 
https://fsspec.github.io/kerchunk/spec.html @@ -65,7 +72,7 @@ def open_virtual_dataset( with fs.open_file() as of: refs = ujson.load(of) - vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs)) + vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root) else: raise ValueError( diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py index f93dd2a6..b90e67d9 100644 --- a/virtualizarr/readers/netcdf3.py +++ b/virtualizarr/readers/netcdf3.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Iterable, Mapping, Optional from xarray import Dataset, Index @@ -46,6 +47,7 @@ def open_virtual_dataset( refs, loadable_variables, drop_variables, + fs_root=Path.cwd().as_uri(), ) loadable_vars, indexes = open_loadable_vars_and_indexes( diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py index 0383e899..c4755583 100644 --- a/virtualizarr/readers/tiff.py +++ b/virtualizarr/readers/tiff.py @@ -1,4 +1,5 @@ import warnings +from pathlib import Path from typing import Iterable, Mapping, Optional from xarray import Dataset, Index @@ -57,6 +58,7 @@ def open_virtual_dataset( refs, loadable_variables, drop_variables, + fs_root=Path.cwd().as_uri(), ) loadable_vars, indexes = open_loadable_vars_and_indexes( diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py index 1491d2d2..70bf66e8 100644 --- a/virtualizarr/readers/zarr_v3.py +++ b/virtualizarr/readers/zarr_v3.py @@ -28,6 +28,7 @@ def open_virtual_dataset( This is experimental - chunk manifests are not part of the Zarr v3 Spec. """ + if virtual_backend_kwargs: raise NotImplementedError( "Zarr_v3 reader does not understand any virtual_backend_kwargs" diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index aee82542..b6884782 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -83,7 +83,7 @@ def create_manifestarray( def entry_from_chunk_key(ind: tuple[int, ...]) -> dict[str, str | int]: """Generate a (somewhat) unique manifest entry from a given chunk key""" entry = { - "path": f"file.{str(join(ind))}.nc", + "path": f"/foo.{str(join(ind))}.nc", "offset": offset_from_chunk_key(ind), "length": length_from_chunk_key(ind), } diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 11b5cc20..09a9fd83 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -1,3 +1,6 @@ +from os.path import relpath +from pathlib import Path + import numpy as np import pytest import xarray as xr @@ -18,8 +21,8 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): # Set up example xarray dataset chunks_dict = { - "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, - "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, + "0.0": {"path": "/foo.nc", "offset": 100, "length": 100}, + "0.1": {"path": "/foo.nc", "offset": 200, "length": 100}, } manifest = ChunkManifest(entries=chunks_dict) marr = ManifestArray( @@ -239,7 +242,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): def test_datetime64_dtype_fill_value(self, tmpdir, format): chunks_dict = { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100}, } manifest = ChunkManifest(entries=chunks_dict) chunks = (1, 1, 1) @@ -292,3 +295,28 @@ def test_open_scalar_variable(tmpdir, hdf_backend): vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}, backend=hdf_backend) assert vds["a"].shape == () + + 
+class TestPathsToURIs: + @requires_kerchunk + @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) + def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) + + expected_path = Path(netcdf4_file).as_uri() + + manifest = vds["air"].data.manifest.dict() + path = manifest["0.0.0"]["path"] + assert path == expected_path + + @requires_kerchunk + @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend]) + def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend): + relative_path = relpath(netcdf4_file) + vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend) + + expected_path = Path(netcdf4_file).as_uri() + + manifest = vds["air"].data.manifest.dict() + path = manifest["0.0.0"]["path"] + assert path == expected_path diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 06e54d95..2444e4ea 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -2,7 +2,7 @@ import pytest from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.tests import create_manifestarray, requires_kerchunk +from virtualizarr.tests import create_manifestarray from virtualizarr.zarr import ZArray @@ -35,33 +35,6 @@ def test_create_manifestarray(self): assert marr.size == 5 * 2 * 20 assert marr.ndim == 3 - @requires_kerchunk - def test_create_manifestarray_from_kerchunk_refs(self): - arr_refs = { - ".zarray": '{"chunks":[2,3],"compressor":null,"dtype":" str: def test_invalid_type(self): chunks = { - "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100}, } manifest = ChunkManifest(entries=chunks) with pytest.raises(TypeError): + # list is an invalid arg type manifest.rename_paths(["file1.nc", "file2.nc"]) + + def test_normalize_paths_to_uris(self): + chunks = { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + } + manifest = ChunkManifest(entries=chunks) + + renamed = manifest.rename_paths("/home/directory/bar.nc") + assert renamed.dict() == { + "0.0.0": { + "path": "file:///home/directory/bar.nc", + "offset": 100, + "length": 100, + }, + } + + def test_catch_malformed_paths(self): + chunks = { + "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}, + } + manifest = ChunkManifest(entries=chunks) + + with pytest.raises(ValueError): + # list is an invalid arg type + manifest.rename_paths("./foo.nc") diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_readers/test_dmrpp.py index d6238ca5..4ef6b143 100644 --- a/virtualizarr/tests/test_readers/test_dmrpp.py +++ b/virtualizarr/tests/test_readers/test_dmrpp.py @@ -1,3 +1,4 @@ +import os import textwrap from pathlib import Path from xml.etree import ElementTree as ET @@ -20,6 +21,7 @@ # TODO: later add MUR, SWOT, TEMPO and others by using kerchunk JSON to read refs (rather than reading the whole netcdf file) ] + DMRPP_XML_STRINGS = { "basic": textwrap.dedent( """\ @@ -390,3 +392,53 @@ def test_parse_chunks( lengths=expected_lengths, offsets=expected_offsets, paths=expected_paths ) assert result == expected + + +@pytest.fixture +def basic_dmrpp_temp_filepath(tmp_path: Path) -> Path: + # TODO generalize key here? 
Would require the factory pattern + # (https://docs.pytest.org/en/stable/how-to/fixtures.html#factories-as-fixtures) + drmpp_xml_str = DMRPP_XML_STRINGS["basic"] + + # TODO generalize filename here? + filepath = tmp_path / "test.nc.dmrpp" + + with open(filepath, "w") as f: + f.write(drmpp_xml_str) + + return filepath + + +class TestRelativePaths: + def test_absolute_path_to_dmrpp_file_containing_relative_path( + self, + basic_dmrpp_temp_filepath: Path, + ): + vds = open_virtual_dataset( + str(basic_dmrpp_temp_filepath), indexes={}, filetype="dmrpp" + ) + path = vds["x"].data.manifest["0"]["path"] + + # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc + # and the manifest should only contain absolute file URIs + expected_datafile_path_uri = basic_dmrpp_temp_filepath.as_uri().removesuffix( + ".dmrpp" + ) + assert path == expected_datafile_path_uri + + def test_relative_path_to_dmrpp_file(self, basic_dmrpp_temp_filepath: Path): + # test that if a user supplies a relative path to a DMR++ file we still get an absolute path in the manifest + relative_dmrpp_filepath = os.path.relpath( + str(basic_dmrpp_temp_filepath), start=os.getcwd() + ) + + vds = open_virtual_dataset( + relative_dmrpp_filepath, indexes={}, filetype="dmrpp" + ) + path = vds["x"].data.manifest["0"]["path"] + + # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc + expected_datafile_path_uri = basic_dmrpp_temp_filepath.as_uri().removesuffix( + ".dmrpp" + ) + assert path == expected_datafile_path_uri diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py index dab049cb..c89d0f53 100644 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ b/virtualizarr/tests/test_readers/test_kerchunk.py @@ -22,7 +22,7 @@ def gen_ds_refs( if zattrs is None: zattrs = '{"_ARRAY_DIMENSIONS":["x","y"]}' if chunks is None: - chunks = {"a/0.0": ["test1.nc", 6144, 48]} + chunks = {"a/0.0": ["/test1.nc", 6144, 48]} return { "version": 1, @@ -78,7 +78,7 @@ def test_dataset_from_df_refs(refs_file_factory): assert vda.data.zarray.order == "C" assert vda.data.manifest.dict() == { - "0.0": {"path": "test1.nc", "offset": 6144, "length": 48} + "0.0": {"path": "file:///test1.nc", "offset": 6144, "length": 48} } @@ -121,3 +121,53 @@ def test_empty_chunk_manifest(refs_file_factory): assert isinstance(vds["a"].data, ManifestArray) assert vds["a"].sizes == {"x": 100, "y": 200} assert vds["a"].chunksizes == {"x": 50, "y": 100} + + +def test_handle_relative_paths(refs_file_factory): + # deliberately use relative path here, see https://github.com/zarr-developers/VirtualiZarr/pull/243#issuecomment-2492341326 + refs_file = refs_file_factory(chunks={"a/0.0": ["test1.nc", 6144, 48]}) + + with pytest.raises(ValueError, match="must be absolute posix paths"): + open_virtual_dataset(refs_file, filetype="kerchunk") + + refs_file = refs_file_factory(chunks={"a/0.0": ["./test1.nc", 6144, 48]}) + with pytest.raises(ValueError, match="must be absolute posix paths"): + open_virtual_dataset(refs_file, filetype="kerchunk") + + with pytest.raises( + ValueError, match="fs_root must be an absolute path to a filesystem directory" + ): + open_virtual_dataset( + refs_file, + filetype="kerchunk", + virtual_backend_kwargs={"fs_root": "some_directory/"}, + ) + + with pytest.raises( + ValueError, match="fs_root must be an absolute path to a filesystem directory" + ): + open_virtual_dataset( + refs_file, + filetype="kerchunk", + 
virtual_backend_kwargs={"fs_root": "/some_directory/file.nc"}, + ) + + vds = open_virtual_dataset( + refs_file, + filetype="kerchunk", + virtual_backend_kwargs={"fs_root": "/some_directory/"}, + ) + vda = vds["a"] + assert vda.data.manifest.dict() == { + "0.0": {"path": "file:///some_directory/test1.nc", "offset": 6144, "length": 48} + } + + vds = open_virtual_dataset( + refs_file, + filetype="kerchunk", + virtual_backend_kwargs={"fs_root": "file:///some_directory/"}, + ) + vda = vds["a"] + assert vda.data.manifest.dict() == { + "0.0": {"path": "file:///some_directory/test1.nc", "offset": 6144, "length": 48} + } diff --git a/virtualizarr/tests/test_writers/test_kerchunk.py b/virtualizarr/tests/test_writers/test_kerchunk.py index a8c75fb4..8cc7f825 100644 --- a/virtualizarr/tests/test_writers/test_kerchunk.py +++ b/virtualizarr/tests/test_writers/test_kerchunk.py @@ -10,7 +10,7 @@ class TestAccessor: def test_accessor_to_kerchunk_dict(self): manifest = ChunkManifest( - entries={"0.0": dict(path="test.nc", offset=6144, length=48)} + entries={"0.0": dict(path="file:///test.nc", offset=6144, length=48)} ) arr = ManifestArray( chunkmanifest=manifest, @@ -33,7 +33,7 @@ def test_accessor_to_kerchunk_dict(self): ".zattrs": "{}", "a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":" tuple[Mapping[str, Variable], dict[str, Any], list[str]]: """ Parses all useful information from a set kerchunk references (for a single group). + + Parameters + ---------- + fs_root + The root of the fsspec filesystem on which these references were generated. + Required if any paths are relative in order to turn them into absolute paths (which virtualizarr requires). """ virtual_vars = virtual_vars_from_kerchunk_refs( vds_refs, drop_variables=drop_variables + loadable_variables, virtual_array_class=virtual_array_class, + fs_root=fs_root, ) ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) coord_names = ds_attrs.pop("coordinates", []) @@ -76,6 +85,7 @@ def virtual_vars_from_kerchunk_refs( refs: KerchunkStoreRefs, drop_variables: list[str] | None = None, virtual_array_class=ManifestArray, + fs_root: str | None = None, ) -> dict[str, Variable]: """ Translate a store-level kerchunk reference dict into aaset of xarray Variables containing virtualized arrays. @@ -97,7 +107,9 @@ def virtual_vars_from_kerchunk_refs( ] vars = { - var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) + var_name: variable_from_kerchunk_refs( + refs, var_name, virtual_array_class, fs_root=fs_root + ) for var_name in var_names_to_keep } return vars @@ -108,6 +120,7 @@ def dataset_from_kerchunk_refs( drop_variables: list[str] = [], virtual_array_class: type = ManifestArray, indexes: MutableMapping[str, Index] | None = None, + fs_root: str | None = None, ) -> Dataset: """ Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays. @@ -119,7 +132,9 @@ def dataset_from_kerchunk_refs( Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that. 
""" - vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) + vars = virtual_vars_from_kerchunk_refs( + refs, drop_variables, virtual_array_class, fs_root=fs_root + ) ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) coord_names = ds_attrs.pop("coordinates", []) @@ -138,7 +153,10 @@ def dataset_from_kerchunk_refs( def variable_from_kerchunk_refs( - refs: KerchunkStoreRefs, var_name: str, virtual_array_class + refs: KerchunkStoreRefs, + var_name: str, + virtual_array_class, + fs_root: str | None = None, ) -> Variable: """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" @@ -147,7 +165,7 @@ def variable_from_kerchunk_refs( # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs dims = zattrs.pop("_ARRAY_DIMENSIONS") if chunk_dict: - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + manifest = manifest_from_kerchunk_chunk_dict(chunk_dict, fs_root=fs_root) varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) elif len(zarray.shape) != 0: # empty variables don't have physical chunks, but zarray shows that the variable @@ -164,6 +182,42 @@ def variable_from_kerchunk_refs( return Variable(data=varr, dims=dims, attrs=zattrs) +def manifest_from_kerchunk_chunk_dict( + kerchunk_chunk_dict: dict[ChunkKey, str | tuple[str] | tuple[str, int, int]], + fs_root: str | None = None, +) -> ChunkManifest: + """Create a single ChunkManifest from the mapping of keys to chunk information stored inside kerchunk array refs.""" + + chunk_entries: dict[ChunkKey, ChunkEntry] = {} + for k, v in kerchunk_chunk_dict.items(): + if isinstance(v, (str, bytes)): + raise NotImplementedError( + "Reading inlined reference data is currently not supported. [ToDo]" + ) + elif not isinstance(v, (tuple, list)): + raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}") + chunk_entries[k] = chunkentry_from_kerchunk(v, fs_root=fs_root) + return ChunkManifest(entries=chunk_entries) + + +def chunkentry_from_kerchunk( + path_and_byte_range_info: tuple[str] | tuple[str, int, int], + fs_root: str | None = None, +) -> ChunkEntry: + """Create a single validated ChunkEntry object from whatever kerchunk contains under that chunk key.""" + from upath import UPath + + if len(path_and_byte_range_info) == 1: + path = path_and_byte_range_info[0] + offset = 0 + length = UPath(path).stat().st_size + else: + path, offset, length = path_and_byte_range_info + return ChunkEntry.with_validation( # type: ignore[attr-defined] + path=path, offset=offset, length=length, fs_root=fs_root + ) + + def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: """Find the names of zarr variables in this store/group.""" diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 0ba95a36..ae86eed4 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -13,17 +13,6 @@ from zarr import Group # type: ignore -VALID_URI_PREFIXES = { - "s3://", - # "gs://", # https://github.com/earth-mover/icechunk/issues/265 - # "azure://", # https://github.com/earth-mover/icechunk/issues/266 - # "r2://", - # "cos://", - # "minio://", - "file:///", -} - - def dataset_to_icechunk(ds: Dataset, store: "IcechunkStore") -> None: """ Write an xarray dataset whose variables wrap ManifestArrays to an Icechunk store. 
@@ -171,6 +160,7 @@ def write_manifest_virtual_refs( # loop over every reference in the ChunkManifest for that array # TODO inefficient: this should be replaced with something that sets all (new) references for the array at once # but Icechunk need to expose a suitable API first + # See https://github.com/earth-mover/icechunk/issues/401 for performance benchmark it = np.nditer( [manifest._paths, manifest._offsets, manifest._lengths], # type: ignore[arg-type] flags=[ @@ -188,16 +178,7 @@ def write_manifest_virtual_refs( store.set_virtual_ref( # TODO it would be marginally neater if I could pass the group and name as separate args key=f"{key_prefix}/c/{chunk_key}", # should be of form 'group/arr_name/c/0/1/2', where c stands for chunks - location=as_file_uri(path.item()), + location=path.item(), offset=offset.item(), length=length.item(), ) - - -def as_file_uri(path): - # TODO a more robust solution to this requirement exists in https://github.com/zarr-developers/VirtualiZarr/pull/243 - if not any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES) and path != "": - # assume path is local - return f"file://{path}" - else: - return path diff --git a/virtualizarr/writers/kerchunk.py b/virtualizarr/writers/kerchunk.py index 3a0bd27b..b6d302b2 100644 --- a/virtualizarr/writers/kerchunk.py +++ b/virtualizarr/writers/kerchunk.py @@ -60,6 +60,13 @@ def dataset_to_kerchunk_refs(ds: Dataset) -> KerchunkStoreRefs: return cast(KerchunkStoreRefs, ds_refs) +def remove_file_uri_prefix(path: str): + if path.startswith("file:///"): + return path.removeprefix("file://") + else: + return path + + def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRefs: """ Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). @@ -72,7 +79,11 @@ def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRe marr = var.data arr_refs: dict[str, str | list[str | int]] = { - str(chunk_key): [entry["path"], entry["offset"], entry["length"]] + str(chunk_key): [ + remove_file_uri_prefix(entry["path"]), + entry["offset"], + entry["length"], + ] for chunk_key, entry in marr.manifest.dict().items() } From f0127010f7e9225f63c251edf9091a5541e39c8c Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 3 Dec 2024 15:39:15 -0700 Subject: [PATCH 09/10] Add list of previous talks to readme (#313) * add previous presentations * Add Sean's talk at ESIP * release note --- README.md | 7 +++++++ docs/releases.rst | 2 ++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index 672d12e0..471b446a 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,13 @@ We have a lot of ideas, including: If you see other opportunities then we would love to hear your ideas! +### Presentations + +- 2024/11/21 - MET Office Architecture Guild - Tom Nicholas - [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-talk-at-met-office) +- 2024/?/? 
- Cloud-Native Geospatial conference - Raphael Hagen - [Slides](https://decks.carbonplan.org/cloud-native-geo/11-13-24) +- 2024/07/24 - ESIP Meeting - Sean Harkins - [Event](https://2024julyesipmeeting.sched.com/event/1eVP6) / [Recording](https://youtu.be/T6QAwJIwI3Q?t=3689) +- 2024/05/15 - Pangeo showcase - Tom Nicholas - [Event](https://discourse.pangeo.io/t/pangeo-showcase-virtualizarr-create-virtual-zarr-stores-using-xarray-syntax/4127/2) / [Recording](https://youtu.be/ioxgzhDaYiE) / [Slides](https://speakerdeck.com/tomnicholas/virtualizarr-create-virtual-zarr-stores-using-xarray-syntax) + ### Credits This package was originally developed by [Tom Nicholas](https://github.com/TomNicholas) whilst working at [[C]Worthy](cworthy.org), who deserve credit for allowing him to prioritise a generalizable open-source solution to the dataset virtualization problem. VirtualiZarr is now a community-owned multi-stakeholder project. diff --git a/docs/releases.rst b/docs/releases.rst index 18d5920b..fe1d9835 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -44,6 +44,8 @@ Documentation (:pull:`298`) By `Tom Nicholas `_. - More minor improvements to the Contributing Guide. (:pull:`304`) By `Doug Latornell `_. +- Added links to recorded presentations on VirtualiZarr. + (:pull:`313`) By `Tom Nicholas `_. Internal Changes ~~~~~~~~~~~~~~~~ From 13e90779603be4038fa23ecc4fa2129255007c9d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 3 Dec 2024 15:51:41 -0700 Subject: [PATCH 10/10] Correct some documentation links to function the public API (#325) * correct some documentation links to function the public API * releases * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/releases.rst | 2 ++ docs/usage.md | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index fe1d9835..35b7c488 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -44,6 +44,8 @@ Documentation (:pull:`298`) By `Tom Nicholas `_. - More minor improvements to the Contributing Guide. (:pull:`304`) By `Doug Latornell `_. +- Correct some links to the API. + (:pull:`325`) By `Tom Nicholas `_. - Added links to recorded presentations on VirtualiZarr. (:pull:`313`) By `Tom Nicholas `_. diff --git a/docs/usage.md b/docs/usage.md index cad2f96f..17d4672e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -17,7 +17,7 @@ ds = xr.tutorial.open_dataset('air_temperature') ds.to_netcdf('air.nc') ``` -We can open a virtual representation of this file using {py:func}`open_virtual_dataset `. +We can open a virtual representation of this file using {py:func}`open_virtual_dataset `. ```python from virtualizarr import open_virtual_dataset @@ -25,7 +25,7 @@ from virtualizarr import open_virtual_dataset vds = open_virtual_dataset('air.nc') ``` -(Notice we did not have to explicitly indicate the file format, as {py:func}`open_virtual_dataset ` will attempt to automatically infer it.) +(Notice we did not have to explicitly indicate the file format, as {py:func}`open_virtual_dataset ` will attempt to automatically infer it.) 
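If automatic detection fails, or you simply want to be explicit, the format can also be stated up front. The following is only a sketch: it assumes that `"netcdf4"` is the appropriate `filetype` string for this file, in the same way that the kerchunk and DMR++ readers shown later in this guide are selected with `filetype="kerchunk"` and `filetype="dmrpp"`.

```python
from virtualizarr import open_virtual_dataset

# Explicitly name the reader instead of relying on filetype inference.
# Assumption: "netcdf4" is the filetype string for this file; the kerchunk and
# DMR++ readers later in this guide are selected the same way, via
# filetype="kerchunk" and filetype="dmrpp".
vds = open_virtual_dataset('air.nc', filetype="netcdf4")
```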
```{note} @@ -360,7 +360,7 @@ Once we've combined references to all the chunks of all our legacy files into on The [kerchunk library](https://github.com/fsspec/kerchunk) has its own [specification](https://fsspec.github.io/kerchunk/spec.html) for how byte range references should be serialized (either as a JSON or parquet file). -To write out all the references in the virtual dataset as a single kerchunk-compliant JSON or parquet file, you can use the {py:meth}`ds.virtualize.to_kerchunk ` accessor method. +To write out all the references in the virtual dataset as a single kerchunk-compliant JSON or parquet file, you can use the {py:meth}`vds.virtualize.to_kerchunk ` accessor method. ```python combined_vds.virtualize.to_kerchunk('combined.json', format='json') @@ -394,7 +394,7 @@ By default references are placed in separate parquet file when the total number ### Writing to an Icechunk Store -We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is a Open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`ds.virtualize.to_icechunk ` accessor method. +We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is a Open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`vds.virtualize.to_icechunk ` accessor method. ```python # create an icechunk store @@ -411,7 +411,7 @@ See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#cr ### Writing as Zarr -Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr ` accessor method. +Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`vds.virtualize.to_zarr ` accessor method. ```python combined_vds.virtualize.to_zarr('combined.zarr') @@ -431,7 +431,7 @@ The advantage of this format is that any zarr v3 reader that understands the chu ```{note} Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this feature cannot be used for data processing. -This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`. +This store can however be read by {py:func}`~virtualizarr.open_virtual_dataset`, by passing `filetype="zarr_v3"`. ``` ## Opening Kerchunk references as virtual datasets @@ -461,7 +461,7 @@ vds = open_virtual_dataset( # the path in the virtual dataset will now be 'file:///some_directory/file.nc' ``` -Note that as the virtualizarr {py:meth}`vds.virtualize.to_kerchunk ` method only writes absolute paths, the only scenario in which you might come across references containing relative paths is if you are opening references that were previously created using the ``kerchunk`` library alone. 
+Note that as the virtualizarr {py:meth}`vds.virtualize.to_kerchunk ` method only writes absolute paths, the only scenario in which you might come across references containing relative paths is if you are opening references that were previously created using the ``kerchunk`` library alone. ## Rewriting existing manifests @@ -469,7 +469,7 @@ Sometimes it can be useful to rewrite the contents of an already-generated manif ### Rewriting file paths -You can rewrite the file paths stored in a manifest or virtual dataset without changing the byte range information using the {py:meth}`vds.virtualize.rename_paths ` accessor method. +You can rewrite the file paths stored in a manifest or virtual dataset without changing the byte range information using the {py:meth}`vds.virtualize.rename_paths ` accessor method. For example, you may want to rename file paths according to a function to reflect having moved the location of the referenced files from local storage to an S3 bucket.
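To make this concrete, here is a minimal sketch of that workflow. It assumes, as the sentence above suggests, that `rename_paths` accepts either a single replacement path or a callable mapping each old path to a new one; the bucket name and filenames are purely illustrative.

```python
from pathlib import Path

def local_to_s3_url(old_local_path: str) -> str:
    # Illustrative only: keep each filename but point it at a hypothetical bucket.
    filename = Path(old_local_path).name
    return f"s3://my-bucket/{filename}"

# Returns a new virtual dataset; offsets and lengths in the chunk manifests are
# untouched, only the "path" entries change.
renamed_vds = vds.virtualize.rename_paths(local_to_s3_url)
```

Note that `rename_paths` returns a new object rather than renaming in place, which is how the tests in this PR series use it (`renamed = manifest.rename_paths(...)`).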