zarr-developers · TomNicholas · Dec 3, 2024 · Oct 2, 2024 · Oct 2, 2024 · Oct 2, 2024
diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
@@ -8,7 +8,6 @@
     _isnan,
 )
 from virtualizarr.manifests.manifest import ChunkManifest
-from virtualizarr.types.kerchunk import KerchunkArrRefs
 from virtualizarr.zarr import ZArray
 
 
@@ -62,24 +61,6 @@ def __init__(
         self._zarray = _zarray
         self._manifest = _chunkmanifest
 
-    @classmethod
-    def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray":
-        from virtualizarr.translators.kerchunk import (
-            fully_decode_arr_refs,
-            parse_array_refs,
-        )
-
-        decoded_arr_refs = fully_decode_arr_refs(arr_refs)
-
-        chunk_dict, zarray, _zattrs = parse_array_refs(decoded_arr_refs)
-        manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict)
-
-        obj = object.__new__(cls)
-        obj._manifest = manifest
-        obj._zarray = zarray
-
-        return obj
-
     @property
     def manifest(self) -> ChunkManifest:
         return self._manifest

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
@@ -1,65 +1,118 @@
-import dataclasses
 import json
 import re
 from collections.abc import Iterable, Iterator
-from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast
+from pathlib import Path
+from typing import Any, Callable, NewType, Tuple, TypedDict, cast
 
 import numpy as np
 
 from virtualizarr.types import ChunkKey
 
+VALID_URI_PREFIXES = {  # a path starting with one of these is returned unchanged by validate_and_normalize_path_to_uri
+    "s3://",
+    "gs://",
+    "azure://",
+    "r2://",
+    "cos://",
+    "minio://",
+    "file:///",
+}
 _INTEGER = (
     r"([1-9]+\d*|0)"  # matches 0 or an unsigned integer that does not begin with zero
 )
 _SEPARATOR = r"\."
 _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$"  # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period)
 
 
-class ChunkDictEntry(TypedDict):
+class ChunkEntry(TypedDict):
     path: str
     offset: int
     length: int
 
+    @classmethod
+    def with_validation(
+        cls, *, path: str, offset: int, length: int, fs_root: str | None = None
+    ) -> "ChunkEntry":
+        """
+        Constructor which validates each part of the chunk entry.
 
-ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])
-
-
-@dataclasses.dataclass(frozen=True)
-class ChunkEntry:
-    """
-    Information for a single chunk in the manifest.
+        Parameters
+        ----------
+        path
+            Location of the file containing the chunk (an empty string denotes a missing chunk).
+            Stored in the entry as a fully-qualified absolute URI.
+        offset, length
+            Byte offset and byte length of the chunk; non-negative python or numpy integers.
+        fs_root
+            The root of the filesystem on which these references were generated.
+            Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires).
+        """
 
-    Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`.
-    """
+        # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg
 
-    path: str  # TODO stricter typing/validation of possible local / remote paths?
-    offset: int
-    length: int
+        path = validate_and_normalize_path_to_uri(path, fs_root=fs_root)
 
-    @classmethod
-    def from_kerchunk(
-        cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
-    ) -> "ChunkEntry":
-        from upath import UPath
+        if not isinstance(offset, (int, np.integer)):
+            raise TypeError(
+                f"chunk entry byte offset must be of type int, but got type {type(offset)}"
+            )
+        _offset = int(offset)  # numpy integers become builtin ints
+        if _offset < 0:
+            raise ValueError(
+                f"chunk entry byte offset must be a non-negative integer, but got offset={_offset}"
+            )
 
-        if len(path_and_byte_range_info) == 1:
-            path = path_and_byte_range_info[0]
-            offset = 0
-            length = UPath(path).stat().st_size
-        else:
-            path, offset, length = path_and_byte_range_info
-        return ChunkEntry(path=path, offset=offset, length=length)
+        if not isinstance(length, (int, np.integer)):
+            raise TypeError(
+                f"chunk entry byte length must be of type int, but got type {type(length)}"
+            )
+        _length = int(length)  # numpy integers become builtin ints
+        if _length < 0:
+            raise ValueError(
+                f"chunk entry byte length must be a non-negative integer, but got length={_length}"
+            )
+
+        # build the entry from the validated values, not from the raw arguments
+        return ChunkEntry(path=path, offset=_offset, length=_length)
 
-    def to_kerchunk(self) -> tuple[str, int, int]:
-        """Write out in the format that kerchunk uses for chunk entries."""
-        return (self.path, self.offset, self.length)
 
-    def dict(self) -> ChunkDictEntry:
-        return ChunkDictEntry(
-            path=self.path,
-            offset=self.offset,
-            length=self.length,
-        )
+def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) -> str:
+    """
+    Makes all paths into fully-qualified absolute URIs, or raises
+
+    See https://en.wikipedia.org/wiki/File_URI_scheme
+
+    Parameters
+    ----------
+    path
+        Empty string (a missing chunk), a URI with a recognised prefix, or a local filesystem path.
+    fs_root
+        The root of the filesystem on which these references were generated.
+        Required if any (likely kerchunk-generated) paths are relative in order to turn them into absolute paths (which virtualizarr requires).
+        May be given either as an absolute filesystem path or as a ``file://`` URI.
+
+    Raises
+    ------
+    ValueError
+        If the path is relative and no usable absolute fs_root was given.
+    """
+    if path == "" or any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES):
+        # (empty paths are allowed through as they represent missing chunks)
+        # TODO should we do other validation here? e.g. to prevent a malformed path like `file:///directory//filename.nc`?
+        return path
+
+    # test absoluteness directly rather than matching the text of pathlib's error message
+    _path = Path(path)
+    if not _path.is_absolute():
+        if fs_root is None:
+            raise ValueError(
+                f"paths in the manifest must be absolute, but got {path}, and fs_root was not specified"
+            )
+        _path = Path(fs_root.removeprefix("file://")) / _path
+    return _path.as_uri()
+
+
+ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkEntry])
 
 
 class ChunkManifest:
@@ -124,20 +177,20 @@ def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None:
 
         # populate the arrays
         for key, entry in entries.items():
-            try:
-                path, offset, length = entry.values()
-                entry = ChunkEntry(path=path, offset=offset, length=length)
-            except (ValueError, TypeError) as e:
+            if not isinstance(entry, dict) or len(entry) != 3:
                 msg = (
                     "Each chunk entry must be of the form dict(path=<str>, offset=<int>, length=<int>), "
                     f"but got {entry}"
                 )
-                raise ValueError(msg) from e
+                raise ValueError(msg)
+
+            path, offset, length = entry.values()
+            entry = ChunkEntry.with_validation(path=path, offset=offset, length=length)
 
             split_key = split(key)
-            paths[split_key] = entry.path
-            offsets[split_key] = entry.offset
-            lengths[split_key] = entry.length
+            paths[split_key] = entry["path"]
+            offsets[split_key] = entry["offset"]
+            lengths[split_key] = entry["length"]
 
         self._paths = paths
         self._offsets = offsets
@@ -249,12 +302,12 @@ def dict(self) -> ChunkDict:  # type: ignore[override]
 
         The returned dict will be of the form
 
-        {
-            "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
-            "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
-            "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
-            "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
-        }
+        |    {
+        |        "0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
+        |        "0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
+        |        "0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
+        |        "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
+        |    }
 
         Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary.
         """
@@ -301,24 +354,6 @@ def to_zarr_json(self, filepath: str) -> None:
         with open(filepath, "w") as json_file:
             json.dump(entries, json_file, indent=4, separators=(", ", ": "))
 
-    @classmethod
-    def _from_kerchunk_chunk_dict(
-        cls,
-        # The type hint requires `Dict` instead of `dict` due to
-        # the conflicting ChunkManifest.dict method.
-        kerchunk_chunk_dict: Dict[ChunkKey, str | tuple[str] | tuple[str, int, int]],
-    ) -> "ChunkManifest":
-        chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
-        for k, v in kerchunk_chunk_dict.items():
-            if isinstance(v, (str, bytes)):
-                raise NotImplementedError(
-                    "Reading inlined reference data is currently not supported. [ToDo]"
-                )
-            elif not isinstance(v, (tuple, list)):
-                raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
-            chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
-        return ChunkManifest(entries=chunk_entries)
-
     def rename_paths(
         self,
         new: str | Callable[[str], str],

diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py
@@ -3,9 +3,13 @@
 
 import numpy as np
 import xarray as xr
-from xarray import Index, Variable
+from xarray import Dataset, Index, Variable
 
-from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
+from virtualizarr.manifests import (
+    ChunkEntry,
+    ChunkManifest,
+    ManifestArray,
+)
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
@@ -99,11 +103,12 @@ def _dataset_chunk_manifest(path: str, dataset: Dataset) -> Optional[ChunkManife
             else:
                 key_list = [0] * (len(dataset.shape) or 1)
                 key = ".".join(map(str, key_list))
-                chunk_entry = ChunkEntry(
+
+                chunk_entry = ChunkEntry.with_validation(
                     path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
                 )
                 chunk_key = ChunkKey(key)
-                chunk_entries = {chunk_key: chunk_entry.dict()}
+                chunk_entries = {chunk_key: chunk_entry}
                 chunk_manifest = ChunkManifest(entries=chunk_entries)
                 return chunk_manifest
         else:

diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
@@ -83,7 +83,7 @@ def create_manifestarray(
 def entry_from_chunk_key(ind: tuple[int, ...]) -> dict[str, str | int]:
     """Generate a (somewhat) unique manifest entry from a given chunk key"""
     entry = {
-        "path": f"file.{str(join(ind))}.nc",
+        "path": f"/foo.{str(join(ind))}.nc",
         "offset": offset_from_chunk_key(ind),
         "length": length_from_chunk_key(ind),
     }

diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
@@ -18,8 +18,8 @@
 def test_kerchunk_roundtrip_in_memory_no_concat():
     # Set up example xarray dataset
     chunks_dict = {
-        "0.0": {"path": "foo.nc", "offset": 100, "length": 100},
-        "0.1": {"path": "foo.nc", "offset": 200, "length": 100},
+        "0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
+        "0.1": {"path": "/foo.nc", "offset": 200, "length": 100},
     }
     manifest = ChunkManifest(entries=chunks_dict)
     marr = ManifestArray(
@@ -239,7 +239,7 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
 
     def test_datetime64_dtype_fill_value(self, tmpdir, format):
         chunks_dict = {
-            "0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
+            "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
         }
         manifest = ChunkManifest(entries=chunks_dict)
         chunks = (1, 1, 1)