diff --git a/docs/releases.rst b/docs/releases.rst
index ed29ab13..18d5920b 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -17,6 +17,8 @@ Breaking changes
 - Minimum required version of Xarray is now v2024.10.0.
   (:pull:`284`) By `Tom Nicholas `_.
+- Opening kerchunk-formatted references from disk which contain relative paths now requires passing the ``fs_root`` keyword argument via ``virtual_backend_kwargs``.
+  (:pull:`243`) By `Tom Nicholas `_.

 Deprecations
 ~~~~~~~~~~~~
@@ -50,6 +52,8 @@ Internal Changes
   (:pull:`87`) By `Sean Harkins `_.
 - Support downstream type checking by adding py.typed marker file.
   (:pull:`306`) By `Max Jones `_.
+- File paths in chunk manifests are now always stored as absolute URIs.
+  (:pull:`243`) By `Tom Nicholas `_.

 .. _v1.1.0:

diff --git a/docs/usage.md b/docs/usage.md
index d9b292e0..cad2f96f 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -106,10 +106,10 @@ manifest = marr.manifest
 manifest.dict()
 ```
 ```python
-{'0.0.0': {'path': 'air.nc', 'offset': 15419, 'length': 7738000}}
+{'0.0.0': {'path': 'file:///work/data/air.nc', 'offset': 15419, 'length': 7738000}}
 ```

-In this case we can see that the `"air"` variable contains only one chunk, the bytes for which live in the `air.nc` file at the location given by the `'offset'` and `'length'` attributes.
+In this case we can see that the `"air"` variable contains only one chunk, the bytes for which live in the `file:///work/data/air.nc` file, at the location given by the `'offset'` and `'length'` attributes.

 The {py:class}`ChunkManifest ` class is virtualizarr's internal in-memory representation of this manifest.

@@ -153,8 +153,8 @@ ManifestArray
 concatenated.manifest.dict()
 ```
 ```
-{'0.0.0': {'path': 'air.nc', 'offset': 15419, 'length': 7738000},
- '1.0.0': {'path': 'air.nc', 'offset': 15419, 'length': 7738000}}
+{'0.0.0': {'path': 'file:///work/data/air.nc', 'offset': 15419, 'length': 7738000},
+ '1.0.0': {'path': 'file:///work/data/air.nc', 'offset': 15419, 'length': 7738000}}
 ```

 This concatenation property is what will allow us to combine the data from multiple netCDF files on disk into a single Zarr store containing arrays of many chunks.

@@ -254,8 +254,8 @@ We can see that the resulting combined manifest has two chunks, as expected.
 combined_vds['air'].data.manifest.dict()
 ```
 ```
-{'0.0.0': {'path': 'air1.nc', 'offset': 15419, 'length': 3869000},
- '1.0.0': {'path': 'air2.nc', 'offset': 15419, 'length': 3869000}}
+{'0.0.0': {'path': 'file:///work/data/air1.nc', 'offset': 15419, 'length': 3869000},
+ '1.0.0': {'path': 'file:///work/data/air2.nc', 'offset': 15419, 'length': 3869000}}
 ```

 ```{note}

@@ -327,8 +327,6 @@ Loading variables can be useful in a few scenarios:
 To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True (`decode_times` defaults to None).

-
-
 ```python
 vds = open_virtual_dataset(
     'air.nc',

@@ -354,8 +352,6 @@ Attributes:
     title: 4x daily NMC reanalysis (1948)
 ```

-
-
 ## Writing virtual stores to disk

 Once we've combined references to all the chunks of all our legacy files into one virtual xarray dataset, we still need to write these references out to disk so that they can be read by our analysis code later.

@@ -443,20 +439,37 @@ This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_da

 You can open existing Kerchunk `json` or `parquet` references as Virtualizarr virtual datasets.
 This may be useful for converting existing Kerchunk formatted references to storage formats like [Icechunk](https://icechunk.io/).

 ```python
-
 vds = open_virtual_dataset('combined.json', filetype='kerchunk', indexes={})

 # or

 vds = open_virtual_dataset('combined.parquet', filetype='kerchunk', indexes={})
+```
+
+One difference between the kerchunk references format and virtualizarr's internal manifest representation (as well as icechunk's format) is that paths in kerchunk references can be relative paths.
+Opening kerchunk references that contain relative local filepaths therefore requires supplying another piece of information: the directory of the ``fsspec`` filesystem which the filepath was defined relative to.
+
+You can disambiguate kerchunk references containing relative paths by passing the ``fs_root`` kwarg via ``virtual_backend_kwargs``.
+
+```python
+# file `relative_refs.json` contains a path like './file.nc'
+
+vds = open_virtual_dataset(
+    'relative_refs.json',
+    filetype='kerchunk',
+    indexes={},
+    virtual_backend_kwargs={'fs_root': 'file:///some_directory/'}
+)
+
+# the path in the virtual dataset will now be 'file:///some_directory/file.nc'
 ```

+Note that as the virtualizarr {py:meth}`vds.virtualize.to_kerchunk ` method only writes absolute paths, the only scenario in which you might come across references containing relative paths is if you are opening references that were previously created using the ``kerchunk`` library alone.
+
 ## Rewriting existing manifests

 Sometimes it can be useful to rewrite the contents of an already-generated manifest or virtual dataset.

 ### Rewriting file paths

-You can rewrite the file paths stored in a manifest or virtual dataset without changing the byte range information using the {py:meth}`ds.virtualize.rename_paths ` accessor method.
+You can rewrite the file paths stored in a manifest or virtual dataset without changing the byte range information using the {py:meth}`vds.virtualize.rename_paths ` accessor method.

 For example, you may want to rename file paths according to a function to reflect having moved the location of the referenced files from local storage to an S3 bucket.
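
A minimal sketch of such a renaming function, assuming the referenced files keep their filenames and using a hypothetical bucket name `my-bucket` (`vds` is assumed to be a virtual dataset whose manifest currently points at local files):

```python
from pathlib import Path

def local_to_s3(old_path: str) -> str:
    # keep the original filename, but point at a (hypothetical) S3 bucket
    filename = Path(old_path).name
    return f"s3://my-bucket/{filename}"

renamed_vds = vds.virtualize.rename_paths(local_to_s3)
```

The new paths are validated as they are written, so returning a relative path (e.g. `'./foo.nc'`) from the renaming function will raise a `ValueError`.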
diff --git a/pyproject.toml b/pyproject.toml
index 77998076..aaaade02 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,7 +59,6 @@ test = [
     "virtualizarr[hdf_reader]"
 ]

-
 [project.urls]
 Home = "https://github.com/TomNicholas/VirtualiZarr"
 Documentation = "https://github.com/TomNicholas/VirtualiZarr/blob/main/README.md"
diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index b9df1923..a8e3b66a 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -105,7 +105,7 @@ def automatically_determine_filetype(
 def open_virtual_dataset(
     filepath: str,
     *,
-    filetype: FileType | None = None,
+    filetype: FileType | str | None = None,
     group: str | None = None,
     drop_variables: Iterable[str] | None = None,
     loadable_variables: Iterable[str] | None = None,
diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
index 179bcf1c..44c0546c 100644
--- a/virtualizarr/manifests/array.py
+++ b/virtualizarr/manifests/array.py
@@ -8,7 +8,6 @@
     _isnan,
 )
 from virtualizarr.manifests.manifest import ChunkManifest
-from virtualizarr.types.kerchunk import KerchunkArrRefs
 from virtualizarr.zarr import ZArray


@@ -62,24 +61,6 @@ def __init__(
         self._zarray = _zarray
         self._manifest = _chunkmanifest

-    @classmethod
-    def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray":
-        from virtualizarr.translators.kerchunk import (
-            fully_decode_arr_refs,
-            parse_array_refs,
-        )
-
-        decoded_arr_refs = fully_decode_arr_refs(arr_refs)
-
-        chunk_dict, zarray, _zattrs = parse_array_refs(decoded_arr_refs)
-        manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict)
-
-        obj = object.__new__(cls)
-        obj._manifest = manifest
-        obj._zarray = zarray
-
-        return obj
-
     @property
     def manifest(self) -> ChunkManifest:
         return self._manifest
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index 38743f9b..cc970fb2 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -1,8 +1,9 @@
-import dataclasses
 import json
 import re
-from collections.abc import Iterable, Iterator
-from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast
+from collections.abc import ItemsView, Iterable, Iterator, KeysView, ValuesView
+from pathlib import PosixPath
+from typing import Any, Callable, NewType, Tuple, TypedDict, cast
+from urllib.parse import urlparse, urlunparse

 import numpy as np

@@ -15,52 +16,169 @@
 _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$"  # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period)


-class ChunkDictEntry(TypedDict):
+# doesn't guarantee that writers actually handle these
+VALID_URI_PREFIXES = {
+    "s3://",
+    "gs://",
+    "azure://",
+    "r2://",
+    "cos://",
+    "minio://",
+    "file:///",
+}
+
+
+class ChunkEntry(TypedDict):
     path: str
     offset: int
     length: int

+    @classmethod  # type: ignore[misc]
+    def with_validation(
+        cls, *, path: str, offset: int, length: int, fs_root: str | None = None
+    ) -> "ChunkEntry":
+        """
+        Constructor which validates each part of the chunk entry.
+
+        Parameters
+        ----------
+        fs_root
+            The root of the filesystem on which these references were generated.
+            Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires).
+        """
+
+        # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg

-ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])
+        path = validate_and_normalize_path_to_uri(path, fs_root=fs_root)
+        validate_byte_range(offset=offset, length=length)
+        return ChunkEntry(path=path, offset=offset, length=length)


-@dataclasses.dataclass(frozen=True)
-class ChunkEntry:
+def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) -> str:
     """
-    Information for a single chunk in the manifest.
+    Makes all paths into fully-qualified absolute URIs, or raises.

-    Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`.
+    See https://en.wikipedia.org/wiki/File_URI_scheme
+
+    Parameters
+    ----------
+    fs_root
+        The root of the filesystem on which these references were generated.
+        Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires).
     """
+    if path == "":
+        # (empty paths are allowed through as they represent missing chunks)
+        return path

-    path: str  # TODO stricter typing/validation of possible local / remote paths?
-    offset: int
-    length: int
+    # TODO ideally we would just use cloudpathlib.AnyPath to handle all types of paths but that would require extra dependencies, see https://github.com/drivendataorg/cloudpathlib/issues/489#issuecomment-2504725280

-    @classmethod
-    def from_kerchunk(
-        cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
-    ) -> "ChunkEntry":
-        from upath import UPath
+    if path.startswith("http://") or path.startswith("https://"):
+        # hopefully would raise if given a malformed URL
+        components = urlparse(path)

-        if len(path_and_byte_range_info) == 1:
-            path = path_and_byte_range_info[0]
-            offset = 0
-            length = UPath(path).stat().st_size
-        else:
-            path, offset, length = path_and_byte_range_info
-        return ChunkEntry(path=path, offset=offset, length=length)
+        if not PosixPath(components.path).suffix:
+            raise ValueError(
+                f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"
+            )
+
+        return urlunparse(components)
+
+    elif any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES):
+        if not PosixPath(path).suffix:
+            raise ValueError(
+                f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"
+            )
+
+        return path  # path is already in URI form

-    def to_kerchunk(self) -> tuple[str, int, int]:
-        """Write out in the format that kerchunk uses for chunk entries."""
-        return (self.path, self.offset, self.length)
+    else:
+        # must be a posix filesystem path (absolute or relative)
+        # using PosixPath here ensures a clear error would be thrown on windows (whose paths and platform are not officially supported)
+        _path = PosixPath(path)

-    def dict(self) -> ChunkDictEntry:
-        return ChunkDictEntry(
-            path=self.path,
-            offset=self.offset,
-            length=self.length,
+        if not _path.suffix:
+            raise ValueError(
+                f"entries in the manifest must be paths to files, but this path has no file suffix: {path}"
+            )
+
+        # only posix paths can possibly not be absolute
+        if not _path.is_absolute():
+            if fs_root is None:
+                raise ValueError(
+                    f"paths in the manifest must be absolute posix paths or URIs, but got {path}, and fs_root was not specified"
+                )
+            else:
+                _path = convert_relative_path_to_absolute(_path, fs_root)
+
+        return _path.as_uri()
+
+
+def posixpath_maybe_from_uri(path: str) -> PosixPath:
+    """
+    Handles file URIs like cloudpathlib.AnyPath does, i.e.
+
+    In [1]: pathlib.PosixPath('file:///dir/file.nc')
+    Out[1]: pathlib.PosixPath('file:/dir/file.nc')
+
+    In [2]: cloudpathlib.AnyPath('file:///dir/file.nc')
+    Out[2]: pathlib.PosixPath('/dir/file.nc')
+
+    In [3]: posixpath_maybe_from_uri('file:///dir/file.nc')
+    Out[3]: pathlib.PosixPath('/dir/file.nc')
+
+    This is needed otherwise pathlib thinks the URI is a relative path.
+    """
+    if path.startswith("file:///"):
+        # TODO in python 3.13 we could probably use Path.from_uri() instead
+        return PosixPath(path.removeprefix("file://"))
+    else:
+        return PosixPath(path)
+
+
+def convert_relative_path_to_absolute(path: PosixPath, fs_root: str) -> PosixPath:
+    _fs_root = posixpath_maybe_from_uri(fs_root)
+
+    if not _fs_root.is_absolute() or _fs_root.suffix:
+        # TODO handle http url roots and bucket prefix roots? (ideally through cloudpathlib)
+        raise ValueError(
+            f"fs_root must be an absolute path to a filesystem directory, but got {fs_root}"
         )
+
+    return (_fs_root / path).resolve()
+
+
+def validate_byte_range(*, offset: Any, length: Any) -> None:
+    """Raise if byte offset or length has invalid type or value"""
+
+    if isinstance(offset, np.integer):
+        _offset = int(offset)
+    elif isinstance(offset, int):
+        _offset = offset
+    else:
+        raise TypeError(
+            f"chunk entry byte offset must be of type int, but got type {type(offset)}"
+        )
+    if _offset < 0:
+        raise ValueError(
+            f"chunk entry byte offset must be a non-negative integer, but got offset={_offset}"
+        )
+
+    if isinstance(length, np.integer):
+        _length = int(length)
+    elif isinstance(length, int):
+        _length = length
+    else:
+        raise TypeError(
+            f"chunk entry byte length must be of type int, but got type {type(length)}"
+        )
+    if _length < 0:
+        raise ValueError(
+            f"chunk entry byte length must be a non-negative integer, but got length={_length}"
+        )
+
+
+ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkEntry])
+

 class ChunkManifest:
     """
@@ -124,20 +242,20 @@ def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None:

         # populate the arrays
         for key, entry in entries.items():
-            try:
-                path, offset, length = entry.values()
-                entry = ChunkEntry(path=path, offset=offset, length=length)
-            except (ValueError, TypeError) as e:
+            if not isinstance(entry, dict) or len(entry) != 3:
                 msg = (
                     "Each chunk entry must be of the form dict(path=<path>, offset=<offset>, length=<length>), "
                     f"but got {entry}"
                 )
-                raise ValueError(msg) from e
+                raise ValueError(msg)
+
+            path, offset, length = entry.values()
+            entry = ChunkEntry.with_validation(path=path, offset=offset, length=length)  # type: ignore[attr-defined]

             split_key = split(key)
-            paths[split_key] = entry.path
-            offsets[split_key] = entry.offset
-            lengths[split_key] = entry.length
+            paths[split_key] = entry["path"]
+            offsets[split_key] = entry["offset"]
+            lengths[split_key] = entry["length"]

         self._paths = paths
         self._offsets = offsets

@@ -146,9 +264,11 @@ def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None:
     @classmethod
     def from_arrays(
         cls,
+        *,
         paths: np.ndarray[Any, np.dtypes.StringDType],
         offsets: np.ndarray[Any, np.dtype[np.uint64]],
         lengths: np.ndarray[Any, np.dtype[np.uint64]],
+        validate_paths: bool = True,
     ) -> "ChunkManifest":
         """
         Create manifest directly from numpy arrays containing the path and byte range information.
@@ -161,6 +281,9 @@
         paths: np.ndarray
         offsets: np.ndarray
         lengths: np.ndarray
+        validate_paths: bool
+            Check that entries in the manifest are valid paths (e.g. that local paths are absolute not relative).
+            Set to False to skip validation for performance reasons.
         """

         # check types
@@ -200,6 +323,12 @@
                 f"Shapes of the arrays must be consistent, but shapes of paths array and lengths array do not match: {paths.shape} vs {lengths.shape}"
             )

+        if validate_paths:
+            vectorized_validation_fn = np.vectorize(
+                validate_and_normalize_path_to_uri, otypes=[np.dtypes.StringDType()]
+            )  # type: ignore[attr-defined]
+            paths = vectorized_validation_fn(paths)
+
         obj = object.__new__(cls)
         obj._paths = paths
         obj._offsets = offsets
@@ -233,6 +362,7 @@ def __getitem__(self, key: ChunkKey) -> ChunkEntry:
         path = self._paths[indices]
         offset = self._offsets[indices]
         length = self._lengths[indices]
+        # TODO fix bug here - types of path, offset, length should be coerced
         return ChunkEntry(path=path, offset=offset, length=length)

     def __iter__(self) -> Iterator[ChunkKey]:
@@ -243,26 +373,35 @@ def __iter__(self) -> Iterator[ChunkKey]:
     def __len__(self) -> int:
         return self._paths.size

+    def keys(self) -> KeysView:
+        return self.dict().keys()
+
+    def values(self) -> ValuesView:
+        return self.dict().values()
+
+    def items(self) -> ItemsView:
+        return self.dict().items()
+
     def dict(self) -> ChunkDict:  # type: ignore[override]
         """
         Convert the entire manifest to a nested dictionary.

         The returned dict will be of the form
-        {
-            "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
-            "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
-            "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
-            "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
-        }
+        | {
+        |     "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        |     "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
+        |     "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
+        |     "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
+        | }

         Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary.
         """
-
         coord_vectors = np.mgrid[
             tuple(slice(None, length) for length in self.shape_chunk_grid)
         ]

+        # TODO consolidate each occurrence of this np.nditer pattern
         d = {
             join(inds): dict(
                 path=path.item(), offset=offset.item(), length=length.item()
@@ -301,24 +440,6 @@ def to_zarr_json(self, filepath: str) -> None:
         with open(filepath, "w") as json_file:
             json.dump(entries, json_file, indent=4, separators=(", ", ": "))

-    @classmethod
-    def _from_kerchunk_chunk_dict(
-        cls,
-        # The type hint requires `Dict` instead of `dict` due to
-        # the conflicting ChunkManifest.dict method.
-        kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]],
-    ) -> "ChunkManifest":
-        chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
-        for k, v in kerchunk_chunk_dict.items():
-            if isinstance(v, (str, bytes)):
-                raise NotImplementedError(
-                    "Reading inlined reference data is currently not supported. [ToDo]"
-                )
-            elif not isinstance(v, (tuple, list)):
-                raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
-            chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
-        return ChunkManifest(entries=chunk_entries)
-
     def rename_paths(
         self,
         new: str | Callable[[str], str],
@@ -370,6 +491,7 @@ def rename_paths(
             paths=renamed_paths,
             offsets=self._offsets,
             lengths=self._lengths,
+            validate_paths=True,
         )

diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py
index 5d74c412..451ecdc7 100644
--- a/virtualizarr/readers/dmrpp.py
+++ b/virtualizarr/readers/dmrpp.py
@@ -8,6 +8,7 @@
 from xarray import Coordinates, Dataset, Index, Variable

 from virtualizarr.manifests import ChunkManifest, ManifestArray
+from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
 from virtualizarr.readers.common import VirtualBackend
 from virtualizarr.types import ChunkKey
 from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions
@@ -41,6 +42,10 @@ def open_virtual_dataset(
                 "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
             )

+        filepath = validate_and_normalize_path_to_uri(
+            filepath, fs_root=Path.cwd().as_uri()
+        )
+
         fpath = _FsspecFSFromFilepath(
             filepath=filepath, reader_options=reader_options
         ).open_file()
@@ -90,6 +95,8 @@ class DMRParser:
     _DEFAULT_ZLIB_VALUE = 6
     # Encoding keys that should be removed from attributes and placed in xarray encoding dict
     _ENCODING_KEYS = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
+    root: ET.Element
+    data_filepath: str

     def __init__(self, root: ET.Element, data_filepath: Optional[str] = None):
         """
@@ -97,9 +104,8 @@ def __init__(self, root: ET.Element, data_filepath: Optional[str] = None):

         Parameters
         ----------
-        dmrpp_str : str
-            The dmrpp file contents as a string.
-
+        root: xml.ElementTree.Element
+            Root of the xml tree structure of a DMR++ file.
         data_filepath : str, optional
             The path to the actual data file that will be set in the chunk manifests.
             If None, the data file path is taken from the DMR++ file.
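
To make the normalization rules above concrete, here is roughly how the new `validate_and_normalize_path_to_uri` behaves for each flavour of input (the paths shown are illustrative):

```python
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri

# existing URIs pass through unchanged (after a file-suffix check)
validate_and_normalize_path_to_uri("s3://bucket/foo.nc")
# -> 's3://bucket/foo.nc'

# absolute posix paths become file:// URIs
validate_and_normalize_path_to_uri("/data/foo.nc")
# -> 'file:///data/foo.nc'

# relative posix paths are resolved against fs_root
validate_and_normalize_path_to_uri("./foo.nc", fs_root="file:///data/")
# -> 'file:///data/foo.nc'

# empty paths denote missing chunks and are passed through
validate_and_normalize_path_to_uri("")
# -> ''

# a relative path without fs_root, or any path without a file suffix, raises ValueError
```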
diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py
index 7b6221ff..1a218773 100644
--- a/virtualizarr/readers/fits.py
+++ b/virtualizarr/readers/fits.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Iterable, Mapping, Optional

 from xarray import Dataset, Index
@@ -42,6 +43,7 @@ def open_virtual_dataset(
             refs,
             loadable_variables,
             drop_variables,
+            fs_root=Path.cwd().as_uri(),
         )

         # TODO this wouldn't work until you had an xarray backend for FITS installed
diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py
index ca921bb0..5ff054a5 100644
--- a/virtualizarr/readers/hdf/hdf.py
+++ b/virtualizarr/readers/hdf/hdf.py
@@ -1,11 +1,17 @@
 import math
+from pathlib import Path
 from typing import TYPE_CHECKING, Dict, Iterable, List, Mapping, Optional, Union

 import numpy as np
 import xarray as xr
-from xarray import Index, Variable
+from xarray import Dataset, Index, Variable

-from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
+from virtualizarr.manifests import (
+    ChunkEntry,
+    ChunkManifest,
+    ManifestArray,
+)
+from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
@@ -51,6 +57,10 @@ def open_virtual_dataset(
             loadable_variables,
         )

+        filepath = validate_and_normalize_path_to_uri(
+            filepath, fs_root=Path.cwd().as_uri()
+        )
+
         virtual_vars = HDFVirtualBackend._virtual_vars_from_hdf(
             path=filepath,
             group=group,
@@ -105,11 +115,12 @@ def _dataset_chunk_manifest(path: str, dataset: Dataset) -> Optional[ChunkManife
         else:
             key_list = [0] * (len(dataset.shape) or 1)
             key = ".".join(map(str, key_list))
-            chunk_entry = ChunkEntry(
+
+            chunk_entry = ChunkEntry.with_validation(
                 path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
             )
             chunk_key = ChunkKey(key)
-            chunk_entries = {chunk_key: chunk_entry.dict()}
+            chunk_entries = {chunk_key: chunk_entry}
             chunk_manifest = ChunkManifest(entries=chunk_entries)
             return chunk_manifest
         else:
diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py
index ca910ab4..0226a5e9 100644
--- a/virtualizarr/readers/hdf5.py
+++ b/virtualizarr/readers/hdf5.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Iterable, Mapping, Optional

 from xarray import Dataset, Index
@@ -48,6 +49,7 @@ def open_virtual_dataset(
             refs,
             loadable_variables,
             drop_variables,
+            fs_root=Path.cwd().as_uri(),
         )

         loadable_vars, indexes = open_loadable_vars_and_indexes(
diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py
index 43ad5341..6f688d3c 100644
--- a/virtualizarr/readers/kerchunk.py
+++ b/virtualizarr/readers/kerchunk.py
@@ -25,9 +25,14 @@ def open_virtual_dataset(
     ) -> Dataset:
         """Reads existing kerchunk references (in JSON or parquet) format."""

+        if virtual_backend_kwargs is None:
+            virtual_backend_kwargs = {}
+
+        fs_root = virtual_backend_kwargs.pop("fs_root", None)
+
         if virtual_backend_kwargs:
             raise NotImplementedError(
-                "Kerchunk reader does not understand any virtual_backend_kwargs"
+                f"Kerchunk reader does not understand any of the virtual_backend_kwargs {virtual_backend_kwargs}"
             )

         if group:
@@ -57,7 +62,9 @@ def open_virtual_dataset(

             full_reference = {"refs": array_refs}

-            vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference))
+            vds = dataset_from_kerchunk_refs(
+                KerchunkStoreRefs(full_reference), fs_root=fs_root
+            )

         # JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version':
         # https://fsspec.github.io/kerchunk/spec.html
@@ -65,7 +72,7 @@ def open_virtual_dataset(
             with fs.open_file() as of:
                 refs = ujson.load(of)

-            vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs))
+            vds = dataset_from_kerchunk_refs(KerchunkStoreRefs(refs), fs_root=fs_root)

         else:
             raise ValueError(
diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py
index f93dd2a6..b90e67d9 100644
--- a/virtualizarr/readers/netcdf3.py
+++ b/virtualizarr/readers/netcdf3.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Iterable, Mapping, Optional

 from xarray import Dataset, Index
@@ -46,6 +47,7 @@ def open_virtual_dataset(
             refs,
             loadable_variables,
             drop_variables,
+            fs_root=Path.cwd().as_uri(),
         )

         loadable_vars, indexes = open_loadable_vars_and_indexes(
diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py
index 0383e899..c4755583 100644
--- a/virtualizarr/readers/tiff.py
+++ b/virtualizarr/readers/tiff.py
@@ -1,4 +1,5 @@
 import warnings
+from pathlib import Path
 from typing import Iterable, Mapping, Optional

 from xarray import Dataset, Index
@@ -57,6 +58,7 @@ def open_virtual_dataset(
             refs,
             loadable_variables,
             drop_variables,
+            fs_root=Path.cwd().as_uri(),
         )

         loadable_vars, indexes = open_loadable_vars_and_indexes(
diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py
index 1491d2d2..70bf66e8 100644
--- a/virtualizarr/readers/zarr_v3.py
+++ b/virtualizarr/readers/zarr_v3.py
@@ -28,6 +28,7 @@ def open_virtual_dataset(

         This is experimental - chunk manifests are not part of the Zarr v3 Spec.
         """
+
         if virtual_backend_kwargs:
             raise NotImplementedError(
                 "Zarr_v3 reader does not understand any virtual_backend_kwargs"
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index aee82542..b6884782 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -83,7 +83,7 @@ def create_manifestarray(
 def entry_from_chunk_key(ind: tuple[int, ...]) -> dict[str, str | int]:
     """Generate a (somewhat) unique manifest entry from a given chunk key"""
     entry = {
-        "path": f"file.{str(join(ind))}.nc",
+        "path": f"/foo.{str(join(ind))}.nc",
         "offset": offset_from_chunk_key(ind),
         "length": length_from_chunk_key(ind),
     }
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 11b5cc20..09a9fd83 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -1,3 +1,6 @@
+from os.path import relpath
+from pathlib import Path
+
 import numpy as np
 import pytest
 import xarray as xr
@@ -18,8 +21,8 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
     # Set up example xarray dataset
     chunks_dict = {
-        "0.0": {"path": "foo.nc", "offset": 100, "length": 100},
-        "0.1": {"path": "foo.nc", "offset": 200, "length": 100},
+        "0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
+        "0.1": {"path": "/foo.nc", "offset": 200, "length": 100},
     }
     manifest = ChunkManifest(entries=chunks_dict)
     marr = ManifestArray(
@@ -239,7 +242,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):

     def test_datetime64_dtype_fill_value(self, tmpdir, format):
         chunks_dict = {
-            "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
+            "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
         }
         manifest = ChunkManifest(entries=chunks_dict)
         chunks = (1, 1, 1)
@@ -292,3 +295,28 @@ def test_open_scalar_variable(tmpdir, hdf_backend):
     vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}, backend=hdf_backend)

     assert vds["a"].shape == ()
+
+
+class TestPathsToURIs:
+    @requires_kerchunk
+    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
+
+        expected_path = Path(netcdf4_file).as_uri()
+
+        manifest = vds["air"].data.manifest.dict()
+        path = manifest["0.0.0"]["path"]
+        assert path == expected_path
+
+    @requires_kerchunk
+    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend):
+        relative_path = relpath(netcdf4_file)
+        vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend)
+
+        expected_path = Path(netcdf4_file).as_uri()
+
+        manifest = vds["air"].data.manifest.dict()
+        path = manifest["0.0.0"]["path"]
+        assert path == expected_path
diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
index 06e54d95..2444e4ea 100644
--- a/virtualizarr/tests/test_manifests/test_array.py
+++ b/virtualizarr/tests/test_manifests/test_array.py
@@ -2,7 +2,7 @@
 import pytest

 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.tests import create_manifestarray, requires_kerchunk
+from virtualizarr.tests import create_manifestarray
 from virtualizarr.zarr import ZArray


@@ -35,33 +35,6 @@ def test_create_manifestarray(self):
         assert marr.size == 5 * 2 * 20
         assert marr.ndim == 3

-    @requires_kerchunk
-    def test_create_manifestarray_from_kerchunk_refs(self):
-        arr_refs = {
-            ".zarray": '{"chunks":[2,3],"compressor":null,"dtype":" str:

     def test_invalid_type(self):
         chunks = {
-            "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
+            "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
         }
         manifest = ChunkManifest(entries=chunks)

         with pytest.raises(TypeError):
+            # list is an invalid arg type
             manifest.rename_paths(["file1.nc", "file2.nc"])
+
+    def test_normalize_paths_to_uris(self):
+        chunks = {
+            "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        }
+        manifest = ChunkManifest(entries=chunks)
+
+        renamed = manifest.rename_paths("/home/directory/bar.nc")
+        assert renamed.dict() == {
+            "0.0.0": {
+                "path": "file:///home/directory/bar.nc",
+                "offset": 100,
+                "length": 100,
+            },
+        }
+
+    def test_catch_malformed_paths(self):
+        chunks = {
+            "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        }
+        manifest = ChunkManifest(entries=chunks)
+
+        with pytest.raises(ValueError):
+            # relative paths are not allowed in chunk manifests
+            manifest.rename_paths("./foo.nc")
diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_readers/test_dmrpp.py
index d6238ca5..4ef6b143 100644
--- a/virtualizarr/tests/test_readers/test_dmrpp.py
+++ b/virtualizarr/tests/test_readers/test_dmrpp.py
@@ -1,3 +1,4 @@
+import os
 import textwrap
 from pathlib import Path
 from xml.etree import ElementTree as ET
@@ -20,6 +21,7 @@
     # TODO: later add MUR, SWOT, TEMPO and others by using kerchunk JSON to read refs (rather than reading the whole netcdf file)
 ]

+
 DMRPP_XML_STRINGS = {
     "basic": textwrap.dedent(
         """\
@@ -390,3 +392,53 @@ def test_parse_chunks(
         lengths=expected_lengths, offsets=expected_offsets, paths=expected_paths
     )
     assert result == expected
+
+
+@pytest.fixture
+def basic_dmrpp_temp_filepath(tmp_path: Path) -> Path:
+    # TODO generalize key here? Would require the factory pattern
+    # (https://docs.pytest.org/en/stable/how-to/fixtures.html#factories-as-fixtures)
+    dmrpp_xml_str = DMRPP_XML_STRINGS["basic"]
+
+    # TODO generalize filename here?
+    filepath = tmp_path / "test.nc.dmrpp"
+
+    with open(filepath, "w") as f:
+        f.write(dmrpp_xml_str)
+
+    return filepath
+
+
+class TestRelativePaths:
+    def test_absolute_path_to_dmrpp_file_containing_relative_path(
+        self,
+        basic_dmrpp_temp_filepath: Path,
+    ):
+        vds = open_virtual_dataset(
+            str(basic_dmrpp_temp_filepath), indexes={}, filetype="dmrpp"
+        )
+        path = vds["x"].data.manifest["0"]["path"]
+
+        # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc
+        # and the manifest should only contain absolute file URIs
+        expected_datafile_path_uri = basic_dmrpp_temp_filepath.as_uri().removesuffix(
+            ".dmrpp"
+        )
+        assert path == expected_datafile_path_uri
+
+    def test_relative_path_to_dmrpp_file(self, basic_dmrpp_temp_filepath: Path):
+        # test that if a user supplies a relative path to a DMR++ file we still get an absolute path in the manifest
+        relative_dmrpp_filepath = os.path.relpath(
+            str(basic_dmrpp_temp_filepath), start=os.getcwd()
+        )
+
+        vds = open_virtual_dataset(
+            relative_dmrpp_filepath, indexes={}, filetype="dmrpp"
+        )
+        path = vds["x"].data.manifest["0"]["path"]
+
+        # by convention, if dmrpp file path is {PATH}.nc.dmrpp, the data filepath should be {PATH}.nc
+        expected_datafile_path_uri = basic_dmrpp_temp_filepath.as_uri().removesuffix(
+            ".dmrpp"
+        )
+        assert path == expected_datafile_path_uri
diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py
index dab049cb..c89d0f53 100644
--- a/virtualizarr/tests/test_readers/test_kerchunk.py
+++ b/virtualizarr/tests/test_readers/test_kerchunk.py
@@ -22,7 +22,7 @@ def gen_ds_refs(
     if zattrs is None:
         zattrs = '{"_ARRAY_DIMENSIONS":["x","y"]}'
     if chunks is None:
-        chunks = {"a/0.0": ["test1.nc", 6144, 48]}
+        chunks = {"a/0.0": ["/test1.nc", 6144, 48]}

     return {
         "version": 1,
@@ -78,7 +78,7 @@ def test_dataset_from_df_refs(refs_file_factory):
     assert vda.data.zarray.order == "C"

     assert vda.data.manifest.dict() == {
-        "0.0": {"path": "test1.nc", "offset": 6144, "length": 48}
+        "0.0": {"path": "file:///test1.nc", "offset": 6144, "length": 48}
     }


@@ -121,3 +121,53 @@ def test_empty_chunk_manifest(refs_file_factory):
     assert isinstance(vds["a"].data, ManifestArray)
     assert vds["a"].sizes == {"x": 100, "y": 200}
     assert vds["a"].chunksizes == {"x": 50, "y": 100}
+
+
+def test_handle_relative_paths(refs_file_factory):
+    # deliberately use relative path here, see https://github.com/zarr-developers/VirtualiZarr/pull/243#issuecomment-2492341326
+    refs_file = refs_file_factory(chunks={"a/0.0": ["test1.nc", 6144, 48]})
+
+    with pytest.raises(ValueError, match="must be absolute posix paths"):
+        open_virtual_dataset(refs_file, filetype="kerchunk")
+
+    refs_file = refs_file_factory(chunks={"a/0.0": ["./test1.nc", 6144, 48]})
+    with pytest.raises(ValueError, match="must be absolute posix paths"):
+        open_virtual_dataset(refs_file, filetype="kerchunk")
+
+    with pytest.raises(
+        ValueError, match="fs_root must be an absolute path to a filesystem directory"
+    ):
+        open_virtual_dataset(
+            refs_file,
+            filetype="kerchunk",
+            virtual_backend_kwargs={"fs_root": "some_directory/"},
+        )
+
+    with pytest.raises(
+        ValueError, match="fs_root must be an absolute path to a filesystem directory"
+    ):
+        open_virtual_dataset(
+            refs_file,
+            filetype="kerchunk",
+            virtual_backend_kwargs={"fs_root": "/some_directory/file.nc"},
+        )
+
+    vds = open_virtual_dataset(
+        refs_file,
+        filetype="kerchunk",
+        virtual_backend_kwargs={"fs_root": "/some_directory/"},
+    )
+    vda = vds["a"]
+    assert vda.data.manifest.dict() == {
+        "0.0": {"path": "file:///some_directory/test1.nc", "offset": 6144, "length": 48}
+    }
+
+    vds = open_virtual_dataset(
+        refs_file,
+        filetype="kerchunk",
+        virtual_backend_kwargs={"fs_root": "file:///some_directory/"},
+    )
+    vda = vds["a"]
+    assert vda.data.manifest.dict() == {
+        "0.0": {"path": "file:///some_directory/test1.nc", "offset": 6144, "length": 48}
+    }
diff --git a/virtualizarr/tests/test_writers/test_kerchunk.py b/virtualizarr/tests/test_writers/test_kerchunk.py
index a8c75fb4..8cc7f825 100644
--- a/virtualizarr/tests/test_writers/test_kerchunk.py
+++ b/virtualizarr/tests/test_writers/test_kerchunk.py
@@ -10,7 +10,7 @@ class TestAccessor:
     def test_accessor_to_kerchunk_dict(self):
         manifest = ChunkManifest(
-            entries={"0.0": dict(path="test.nc", offset=6144, length=48)}
+            entries={"0.0": dict(path="file:///test.nc", offset=6144, length=48)}
         )
         arr = ManifestArray(
             chunkmanifest=manifest,
@@ -33,7 +33,7 @@ def test_accessor_to_kerchunk_dict(self):
             ".zattrs": "{}",
             "a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":" tuple[Mapping[str, Variable], dict[str, Any], list[str]]:
     """
     Parses all useful information from a set of kerchunk references (for a single group).
+
+    Parameters
+    ----------
+    fs_root
+        The root of the fsspec filesystem on which these references were generated.
+        Required if any paths are relative in order to turn them into absolute paths (which virtualizarr requires).
     """
     virtual_vars = virtual_vars_from_kerchunk_refs(
         vds_refs,
         drop_variables=drop_variables + loadable_variables,
         virtual_array_class=virtual_array_class,
+        fs_root=fs_root,
     )
     ds_attrs = fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
     coord_names = ds_attrs.pop("coordinates", [])
@@ -76,6 +85,7 @@ def virtual_vars_from_kerchunk_refs(
     refs: KerchunkStoreRefs,
     drop_variables: list[str] | None = None,
     virtual_array_class=ManifestArray,
+    fs_root: str | None = None,
 ) -> dict[str, Variable]:
     """
     Translate a store-level kerchunk reference dict into a set of xarray Variables containing virtualized arrays.
@@ -97,7 +107,9 @@
     ]

     vars = {
-        var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class)
+        var_name: variable_from_kerchunk_refs(
+            refs, var_name, virtual_array_class, fs_root=fs_root
+        )
         for var_name in var_names_to_keep
     }
     return vars
@@ -108,6 +120,7 @@ def dataset_from_kerchunk_refs(
     drop_variables: list[str] = [],
     virtual_array_class: type = ManifestArray,
     indexes: MutableMapping[str, Index] | None = None,
+    fs_root: str | None = None,
 ) -> Dataset:
     """
     Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays.
@@ -119,7 +132,9 @@
         Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
""" - vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class) + vars = virtual_vars_from_kerchunk_refs( + refs, drop_variables, virtual_array_class, fs_root=fs_root + ) ds_attrs = fully_decode_arr_refs(refs["refs"]).get(".zattrs", {}) coord_names = ds_attrs.pop("coordinates", []) @@ -138,7 +153,10 @@ def dataset_from_kerchunk_refs( def variable_from_kerchunk_refs( - refs: KerchunkStoreRefs, var_name: str, virtual_array_class + refs: KerchunkStoreRefs, + var_name: str, + virtual_array_class, + fs_root: str | None = None, ) -> Variable: """Create a single xarray Variable by reading specific keys of a kerchunk references dict.""" @@ -147,7 +165,7 @@ def variable_from_kerchunk_refs( # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs dims = zattrs.pop("_ARRAY_DIMENSIONS") if chunk_dict: - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + manifest = manifest_from_kerchunk_chunk_dict(chunk_dict, fs_root=fs_root) varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) elif len(zarray.shape) != 0: # empty variables don't have physical chunks, but zarray shows that the variable @@ -164,6 +182,42 @@ def variable_from_kerchunk_refs( return Variable(data=varr, dims=dims, attrs=zattrs) +def manifest_from_kerchunk_chunk_dict( + kerchunk_chunk_dict: dict[ChunkKey, str | tuple[str] | tuple[str, int, int]], + fs_root: str | None = None, +) -> ChunkManifest: + """Create a single ChunkManifest from the mapping of keys to chunk information stored inside kerchunk array refs.""" + + chunk_entries: dict[ChunkKey, ChunkEntry] = {} + for k, v in kerchunk_chunk_dict.items(): + if isinstance(v, (str, bytes)): + raise NotImplementedError( + "Reading inlined reference data is currently not supported. [ToDo]" + ) + elif not isinstance(v, (tuple, list)): + raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}") + chunk_entries[k] = chunkentry_from_kerchunk(v, fs_root=fs_root) + return ChunkManifest(entries=chunk_entries) + + +def chunkentry_from_kerchunk( + path_and_byte_range_info: tuple[str] | tuple[str, int, int], + fs_root: str | None = None, +) -> ChunkEntry: + """Create a single validated ChunkEntry object from whatever kerchunk contains under that chunk key.""" + from upath import UPath + + if len(path_and_byte_range_info) == 1: + path = path_and_byte_range_info[0] + offset = 0 + length = UPath(path).stat().st_size + else: + path, offset, length = path_and_byte_range_info + return ChunkEntry.with_validation( # type: ignore[attr-defined] + path=path, offset=offset, length=length, fs_root=fs_root + ) + + def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: """Find the names of zarr variables in this store/group.""" diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 0ba95a36..ae86eed4 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -13,17 +13,6 @@ from zarr import Group # type: ignore -VALID_URI_PREFIXES = { - "s3://", - # "gs://", # https://github.com/earth-mover/icechunk/issues/265 - # "azure://", # https://github.com/earth-mover/icechunk/issues/266 - # "r2://", - # "cos://", - # "minio://", - "file:///", -} - - def dataset_to_icechunk(ds: Dataset, store: "IcechunkStore") -> None: """ Write an xarray dataset whose variables wrap ManifestArrays to an Icechunk store. 
@@ -171,6 +160,7 @@ def write_manifest_virtual_refs( # loop over every reference in the ChunkManifest for that array # TODO inefficient: this should be replaced with something that sets all (new) references for the array at once # but Icechunk need to expose a suitable API first + # See https://github.com/earth-mover/icechunk/issues/401 for performance benchmark it = np.nditer( [manifest._paths, manifest._offsets, manifest._lengths], # type: ignore[arg-type] flags=[ @@ -188,16 +178,7 @@ def write_manifest_virtual_refs( store.set_virtual_ref( # TODO it would be marginally neater if I could pass the group and name as separate args key=f"{key_prefix}/c/{chunk_key}", # should be of form 'group/arr_name/c/0/1/2', where c stands for chunks - location=as_file_uri(path.item()), + location=path.item(), offset=offset.item(), length=length.item(), ) - - -def as_file_uri(path): - # TODO a more robust solution to this requirement exists in https://github.com/zarr-developers/VirtualiZarr/pull/243 - if not any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES) and path != "": - # assume path is local - return f"file://{path}" - else: - return path diff --git a/virtualizarr/writers/kerchunk.py b/virtualizarr/writers/kerchunk.py index 3a0bd27b..b6d302b2 100644 --- a/virtualizarr/writers/kerchunk.py +++ b/virtualizarr/writers/kerchunk.py @@ -60,6 +60,13 @@ def dataset_to_kerchunk_refs(ds: Dataset) -> KerchunkStoreRefs: return cast(KerchunkStoreRefs, ds_refs) +def remove_file_uri_prefix(path: str): + if path.startswith("file:///"): + return path.removeprefix("file://") + else: + return path + + def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRefs: """ Create a dictionary containing kerchunk-style array references from a single xarray.Variable (which wraps either a ManifestArray or a numpy array). @@ -72,7 +79,11 @@ def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRe marr = var.data arr_refs: dict[str, str | list[str | int]] = { - str(chunk_key): [entry["path"], entry["offset"], entry["length"]] + str(chunk_key): [ + remove_file_uri_prefix(entry["path"]), + entry["offset"], + entry["length"], + ] for chunk_key, entry in marr.manifest.dict().items() }
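
Taken together with the reader-side `fs_root` handling, this writer change means local chunk locations round-trip through the kerchunk format as plain absolute paths rather than as `file://` URIs, presumably to match what downstream kerchunk readers expect. Illustrating the new helper's behaviour (paths are illustrative):

```python
from virtualizarr.writers.kerchunk import remove_file_uri_prefix

remove_file_uri_prefix("file:///data/air.nc")  # -> '/data/air.nc'
remove_file_uri_prefix("s3://bucket/air.nc")   # -> 's3://bucket/air.nc' (unchanged)
```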