
Paths as URIs #243

Merged: 78 commits, Dec 3, 2024

Commits
c787da4
validate that paths can be coerced to valid URIs
TomNicholas Oct 2, 2024
b2f3f26
add a test that paths are converted to URIs
TomNicholas Oct 2, 2024
6b15d9e
added test and better error if local path is not absolute
TomNicholas Oct 2, 2024
045cc7a
raise more informative error if path is not absolute
TomNicholas Oct 2, 2024
ddc6b7f
test that empty paths are allowed
TomNicholas Oct 2, 2024
61bf5d6
add failing test for raising on malformed paths
TomNicholas Oct 2, 2024
2896bd8
fix paths in tests
TomNicholas Oct 2, 2024
584adf9
fix more tests
TomNicholas Oct 2, 2024
14b8ca3
remove the file:/// prefix when writing to kerchunk format
TomNicholas Oct 2, 2024
bad0b1b
Merge branch 'main' into paths_as_uris
TomNicholas Oct 19, 2024
739a6bd
absolute paths in recently-added tests
TomNicholas Oct 19, 2024
06f3a4d
absolute paths in recently-added tests
TomNicholas Oct 19, 2024
3d50def
fix one more test
TomNicholas Oct 20, 2024
b37f8d0
Merge branch 'main' into paths_as_uris
TomNicholas Nov 21, 2024
a9bd0a3
stop wrapping specific error in less useful one
TomNicholas Nov 21, 2024
f0bc445
moved remaining kerchunk parsing logic out into translator file
TomNicholas Nov 21, 2024
b224af8
add fs_root parameter to validation fn
TomNicholas Nov 21, 2024
c8d513c
demote ChunkEntry to a TypedDict to separate validation fn
TomNicholas Nov 21, 2024
4ad26cf
actually instead add new constructor method to TypedDict
TomNicholas Nov 21, 2024
7aaaabf
test kerchunk writer with absolute paths
TomNicholas Nov 21, 2024
f66ed79
make kerchunk reader tests pass
TomNicholas Nov 21, 2024
e30ab05
try to implement the fs_root concatenation
TomNicholas Nov 21, 2024
0a4a11d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 21, 2024
98644a7
implementation using cloudpathlib working
TomNicholas Nov 22, 2024
4499cf9
check that fs_root doesn't have a filetype suffix
TomNicholas Nov 22, 2024
d27bdf6
add cloudpathlib to dependencies
TomNicholas Nov 22, 2024
fefab90
allow http paths, and require file suffixes
TomNicholas Nov 22, 2024
927121c
unit tests for path validation
TomNicholas Nov 22, 2024
c561a5d
test whether we can catch malformed paths
TomNicholas Nov 22, 2024
d42ff33
test fs_root
TomNicholas Nov 22, 2024
4b7cd2d
add (unimplemented) validate_entries kwarg to .from_arrays
TomNicholas Nov 23, 2024
0f8b550
add .keys(), .values(), .items()
TomNicholas Nov 23, 2024
8055ba7
test validate_paths during .from_arrays
TomNicholas Nov 23, 2024
f18bf59
ensure validation actually normalizes paths to uris
TomNicholas Nov 23, 2024
96f6360
test .rename_paths correctly validates paths
TomNicholas Nov 23, 2024
b2ed1cd
some release notes
TomNicholas Nov 23, 2024
87fad80
remove now-superfluous coercion to URI in icechunk writer
TomNicholas Nov 23, 2024
1f41d69
added link to icechunk writer performance benchmark
TomNicholas Nov 23, 2024
41ec08f
add reader_kwargs argument to open_virtual_dataset, and pass it down …
TomNicholas Nov 23, 2024
b4cc617
Merge branch 'main' into reader_kwargs
TomNicholas Nov 23, 2024
ffff93e
Merge branch 'main' into paths_as_uris
TomNicholas Nov 23, 2024
26e8950
Merge branch 'main' into paths_as_uris
TomNicholas Nov 23, 2024
19c13ae
ensure relative paths containing .. can be normalized
TomNicholas Nov 23, 2024
6ce7ebd
ensure HDF5 reader always returns absolute URIs
TomNicholas Nov 23, 2024
24b6ea7
ensure HDF reader always returns absolute URIs
TomNicholas Nov 23, 2024
5f0f362
add relative path handling to other kerchunk-based readers
TomNicholas Nov 23, 2024
001f95b
Merge branch 'reader_kwargs' into paths_as_uris
TomNicholas Nov 23, 2024
15ae139
add dmrpp relative path integration test
ayushnag Nov 24, 2024
b3432c6
Merge branch 'main' into paths_as_uris
TomNicholas Nov 26, 2024
ae00ac3
Merge branch 'main' into paths_as_uris
TomNicholas Nov 27, 2024
ddc19c3
Merge branch 'main' into filepath_test_dmrpp
TomNicholas Nov 27, 2024
8353a9b
fix kerchunk relative paths test by pluggin through fs_root kwarg
TomNicholas Nov 27, 2024
251469f
Merge branch 'filepath_test_dmrpp' of https://github.com/ayushnag/Vir…
TomNicholas Nov 27, 2024
0915433
fix dmrpp tests by using absolute filepaths in DMR++ contents
TomNicholas Nov 27, 2024
3ca8f7f
clarify new dmrpp test
TomNicholas Nov 27, 2024
8a90ec7
test handling of relative filepaths to dmrpp files
TomNicholas Nov 27, 2024
4afa34c
group related tests
TomNicholas Nov 27, 2024
da9ee10
removed cloudpathlib from validation code
TomNicholas Dec 1, 2024
3557008
fix bug but restrict fs_root to only handle filesystem paths, not buc…
TomNicholas Dec 2, 2024
ca09096
global list of recognized URI prefixes
TomNicholas Dec 2, 2024
7727d4e
cleanup
TomNicholas Dec 2, 2024
03aa6bc
remove cloudpathlib from dependencies
TomNicholas Dec 2, 2024
4d8ceea
fix/ignore some typing errors
TomNicholas Dec 2, 2024
cd5bec1
rewrite tests to use a new dmrparser_factory
TomNicholas Dec 3, 2024
e521e20
rewrite using global dict of XML strings
TomNicholas Dec 3, 2024
4ff17d0
fix final test by explicitly passing in tmp_path instead of using a f…
TomNicholas Dec 3, 2024
7e0be24
Merge branch 'dmrpp_tests_tmp_path' into paths_as_uris
TomNicholas Dec 3, 2024
bf0bb87
fix bug with not converting Path objects to strings
TomNicholas Dec 3, 2024
25acdf9
Merge branch 'dmrpp_tests_tmp_path' into paths_as_uris
TomNicholas Dec 3, 2024
ac04cef
dmrpp relative paths tests passing
TomNicholas Dec 3, 2024
39bbfaf
fix type hint for filetype kwarg
TomNicholas Dec 3, 2024
9b6906b
Merge branch 'main' into paths_as_uris
TomNicholas Dec 3, 2024
0127f05
user documentation on fs_root
TomNicholas Dec 3, 2024
cd5da5d
change example manifests to use URIs
TomNicholas Dec 3, 2024
df6663f
reminder that rename_paths exists
TomNicholas Dec 3, 2024
be20994
Merge branch 'paths_as_uris' of https://github.com/TomNicholas/Virtua…
TomNicholas Dec 3, 2024
528f722
update release notes
TomNicholas Dec 3, 2024
a8d5a15
remove note about .rename_paths
TomNicholas Dec 3, 2024
68 changes: 54 additions & 14 deletions virtualizarr/manifests/manifest.py
@@ -2,12 +2,22 @@
import json
import re
from collections.abc import Iterable, Iterator
from pathlib import Path
from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast

import numpy as np

from virtualizarr.types import ChunkKey

VALID_URI_PREFIXES = {
"s3://",
"gs://",
"azure://",
"r2://",
"cos://",
"minio://",
"file:///",
}
_INTEGER = (
r"([1-9]+\d*|0)" # matches 0 or an unsigned integer that does not begin with zero
)
@@ -24,18 +34,48 @@ class ChunkDictEntry(TypedDict):
ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])


def validate_and_normalize_path_to_uri(path: str) -> str:
"""
Makes all paths into fully-qualified absolute URIs, or raises

See https://en.wikipedia.org/wiki/File_URI_scheme
"""
if not any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES) and path != "":
Contributor:
I've seen people use urlparse (https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse). Not sure if that would be easier to read or not.

Member Author:
Oh that does look neater, thanks!

Member Author:
Actually handling all these cases is such a PITA that maybe I should just use cloudpathlib.AnyPath

Member Author:
Using cloudpathlib did make this very easy, at the cost of adding a new dependency. Luckily for this use case (string validation) it's only a pure-python standalone dependency.
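For context, the reviewer's urlparse suggestion could look roughly like this (a hypothetical sketch, not the code this PR merged; the function name is invented for illustration):

```python
from urllib.parse import urlparse

def has_uri_scheme(path: str) -> bool:
    # urlparse splits "s3://bucket/foo.nc" into scheme="s3", netloc="bucket",
    # path="/foo.nc"; a plain local path like "/foo.nc" has an empty scheme.
    # (Caveat: a Windows drive path like "C:\\data" parses with scheme "c".)
    return urlparse(path).scheme != ""

print(has_uri_scheme("s3://bucket/foo.nc"))  # True
print(has_uri_scheme("/local/foo.nc"))       # False
```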

# assume the path is local
try:
return str(Path(path).as_uri())
except ValueError as e:
if str(e) == "relative path can't be expressed as a file URI":
# problem is that path is relative instead of absolute, so add context to error message that this is forbidden
raise ValueError(
f"paths in the manifest must be absolute, but got {path}"
) from e
else:
# must be some other problem with the path
raise
else:
# (empty paths are allowed through as they represent missing chunks)
# TODO should we do other validation here? e.g. to prevent a malformed path like `file:///directory//filename.nc`?
return path
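To summarize the behaviour of the function above, here is a standalone re-implementation of its core logic for demonstration (a sketch on POSIX paths; the merged function may differ in detail):

```python
from pathlib import Path

VALID_URI_PREFIXES = {"s3://", "gs://", "azure://", "r2://", "cos://", "minio://", "file:///"}

def normalize(path: str) -> str:
    # Recognized URIs and empty paths (missing chunks) pass through unchanged;
    # anything else is treated as a local path and must be absolute.
    if path == "" or any(path.startswith(p) for p in VALID_URI_PREFIXES):
        return path
    # Path.as_uri() raises ValueError for relative paths.
    return Path(path).as_uri()

print(normalize("/data/foo.nc"))        # file:///data/foo.nc
print(normalize("s3://bucket/foo.nc"))  # s3://bucket/foo.nc
```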


# TODO combine this with the ChunkDictEntry class?
@dataclasses.dataclass(frozen=True)
class ChunkEntry:
"""
Information for a single chunk in the manifest.

Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`.
Stored in the form `{"path": "s3://bucket//foo.nc", "offset": 100, "length": 100}`.
"""

path: str # TODO stricter typing/validation of possible local / remote paths?
path: str
offset: int
length: int

def __post_init__(self) -> None:
object.__setattr__(self, "path", validate_and_normalize_path_to_uri(self.path))

# TODO kerchunk-specific constructors and translators could just live in the kerchunk module as functions
@classmethod
def from_kerchunk(
cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
@@ -71,10 +111,10 @@ class ChunkManifest:
The manifest can be converted to or from a dictionary which looks like this

{
"0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "s3://bucket//foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "s3://bucket//foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "s3://bucket//foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "s3://bucket//foo.nc", "offset": 400, "length": 100},
}

using the .__init__() and .dict() methods, so users of this class can think of the manifest as if it were a dict mapping zarr chunk keys to byte ranges.
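Conceptually, the entries dict above maps each chunk's grid position to a byte range in some file; a minimal sketch of turning such a dict into parallel arrays indexed by chunk position (illustrative only, not the class's actual internals):

```python
import numpy as np

# One entry per chunk key, in the documented dict form.
entries = {
    "0.0": {"path": "file:///foo.nc", "offset": 100, "length": 100},
    "0.1": {"path": "file:///foo.nc", "offset": 200, "length": 100},
}

# Split the dict into dense arrays keyed by chunk grid index.
shape = (1, 2)
paths = np.empty(shape, dtype=object)
offsets = np.empty(shape, dtype=np.uint64)
lengths = np.empty(shape, dtype=np.uint64)
for key, entry in entries.items():
    idx = tuple(int(i) for i in key.split("."))
    paths[idx] = entry["path"]
    offsets[idx] = entry["offset"]
    lengths[idx] = entry["length"]

print(offsets)  # [[100 200]]
```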
@@ -99,10 +139,10 @@ def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None:
Chunk keys and byte range information, as a dictionary of the form

{
"0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "s3://bucket//foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "s3://bucket//foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "s3://bucket//foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "s3://bucket//foo.nc", "offset": 400, "length": 100},
}
"""
if shape is None and not entries:
@@ -250,10 +290,10 @@ def dict(self) -> ChunkDict:  # type: ignore[override]
The returned dict will be of the form

{
"0.0.0": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "s3://bucket/foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "s3://bucket/foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "s3://bucket//foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "s3://bucket//foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "s3://bucket//foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "s3://bucket//foo.nc", "offset": 400, "length": 100},
}

Entries whose path is an empty string will be interpreted as missing chunks and omitted from the dictionary.
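The missing-chunk convention described here can be sketched in plain Python (illustrative; the dict comprehension is not the library's actual code):

```python
entries = {
    "0.0": {"path": "file:///foo.nc", "offset": 100, "length": 100},
    "0.1": {"path": "", "offset": 0, "length": 0},  # empty path marks a missing chunk
}

# A .dict()-style view drops entries whose path is the empty string.
present = {k: v for k, v in entries.items() if v["path"] != ""}
print(list(present))  # ['0.0']
```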
2 changes: 1 addition & 1 deletion virtualizarr/tests/__init__.py
@@ -81,7 +81,7 @@ def create_manifestarray(
def entry_from_chunk_key(ind: tuple[int, ...]) -> dict[str, str | int]:
"""Generate a (somewhat) unique manifest entry from a given chunk key"""
entry = {
"path": f"file.{str(join(ind))}.nc",
"path": f"/foo.{str(join(ind))}.nc",
"offset": offset_from_chunk_key(ind),
"length": length_from_chunk_key(ind),
}
6 changes: 3 additions & 3 deletions virtualizarr/tests/test_integration.py
@@ -16,8 +16,8 @@
def test_kerchunk_roundtrip_in_memory_no_concat():
# Set up example xarray dataset
chunks_dict = {
"0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.1": {"path": "foo.nc", "offset": 200, "length": 100},
"0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
"0.1": {"path": "/foo.nc", "offset": 200, "length": 100},
}
manifest = ChunkManifest(entries=chunks_dict)
marr = ManifestArray(
@@ -212,7 +212,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):

def test_datetime64_dtype_fill_value(self, tmpdir, format):
chunks_dict = {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
}
manifest = ChunkManifest(entries=chunks_dict)
chunks = (1, 1, 1)
76 changes: 38 additions & 38 deletions virtualizarr/tests/test_manifests/test_array.py
@@ -107,15 +107,15 @@ def test_not_equal_chunk_entries(self):
)

chunks_dict1 = {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 200, "length": 100},
"0.0.0": {"path": "/oo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "/oo.nc", "offset": 200, "length": 100},
}
manifest1 = ChunkManifest(entries=chunks_dict1)
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "/oo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "/oo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)
@@ -132,9 +132,9 @@ def test_broadcast_existing_axis(self):
assert expanded.shape == (3, 2)
assert expanded.chunks == (1, 2)
assert expanded.manifest.dict() == {
"0.0": {"path": "file.0.0.nc", "offset": 0, "length": 5},
"1.0": {"path": "file.0.0.nc", "offset": 0, "length": 5},
"2.0": {"path": "file.0.0.nc", "offset": 0, "length": 5},
"0.0": {"path": "file:///foo.0.0.nc", "offset": 0, "length": 5},
"1.0": {"path": "file:///foo.0.0.nc", "offset": 0, "length": 5},
"2.0": {"path": "file:///foo.0.0.nc", "offset": 0, "length": 5},
}

def test_broadcast_new_axis(self):
@@ -143,9 +143,9 @@ def test_broadcast_new_axis(self):
assert expanded.shape == (1, 3)
assert expanded.chunks == (1, 1)
assert expanded.manifest.dict() == {
"0.0": {"path": "file.0.nc", "offset": 0, "length": 5},
"0.1": {"path": "file.1.nc", "offset": 10, "length": 6},
"0.2": {"path": "file.2.nc", "offset": 20, "length": 7},
"0.0": {"path": "file:///foo.0.nc", "offset": 0, "length": 5},
"0.1": {"path": "file:///foo.1.nc", "offset": 10, "length": 6},
"0.2": {"path": "file:///foo.2.nc", "offset": 20, "length": 7},
}

def test_broadcast_scalar(self):
@@ -154,14 +154,14 @@ def test_broadcast_scalar(self):
assert marr.shape == ()
assert marr.chunks == ()
assert marr.manifest.dict() == {
"0": {"path": "file.0.nc", "offset": 0, "length": 5},
"0": {"path": "file:///foo.0.nc", "offset": 0, "length": 5},
}

expanded = np.broadcast_to(marr, shape=(1,))
assert expanded.shape == (1,)
assert expanded.chunks == (1,)
assert expanded.manifest.dict() == {
"0": {"path": "file.0.nc", "offset": 0, "length": 5},
"0": {"path": "file:///foo.0.nc", "offset": 0, "length": 5},
}

@pytest.mark.parametrize(
@@ -253,15 +253,15 @@ def test_concat(self):
)

chunks_dict1 = {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 200, "length": 100},
"0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "/foo.nc", "offset": 200, "length": 100},
}
manifest1 = ChunkManifest(entries=chunks_dict1)
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "/foo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "/foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)
@@ -271,10 +271,10 @@
assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "file:///foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "file:///foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "file:///foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "file:///foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
@@ -300,8 +300,8 @@ def test_concat_empty(self):
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "/foo.nc", "offset": 300, "length": 100},
"0.0.1": {"path": "/foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)
Expand All @@ -311,8 +311,8 @@ def test_concat_empty(self):
assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.1.0": {"path": "file:///foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "file:///foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
@@ -336,15 +336,15 @@ def test_stack(self):
)

chunks_dict1 = {
"0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.1": {"path": "foo.nc", "offset": 200, "length": 100},
"0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
"0.1": {"path": "/foo.nc", "offset": 200, "length": 100},
}
manifest1 = ChunkManifest(entries=chunks_dict1)
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.0": {"path": "/foo.nc", "offset": 300, "length": 100},
"0.1": {"path": "/foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)
@@ -354,10 +354,10 @@
assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.0.0": {"path": "file:///foo.nc", "offset": 100, "length": 100},
"0.0.1": {"path": "file:///foo.nc", "offset": 200, "length": 100},
"0.1.0": {"path": "file:///foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "file:///foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
@@ -383,8 +383,8 @@ def test_stack_empty(self):
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1)

chunks_dict2 = {
"0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.0": {"path": "/foo.nc", "offset": 300, "length": 100},
"0.1": {"path": "/foo.nc", "offset": 400, "length": 100},
}
manifest2 = ChunkManifest(entries=chunks_dict2)
marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2)
@@ -394,8 +394,8 @@
assert result.shape == (5, 2, 20)
assert result.chunks == (5, 1, 10)
assert result.manifest.dict() == {
"0.1.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "foo.nc", "offset": 400, "length": 100},
"0.1.0": {"path": "file:///foo.nc", "offset": 300, "length": 100},
"0.1.1": {"path": "file:///foo.nc", "offset": 400, "length": 100},
}
assert result.zarray.compressor == zarray.compressor
assert result.zarray.filters == zarray.filters
@@ -418,11 +418,11 @@ def test_refuse_combine():
"zarr_format": 2,
}
chunks_dict1 = {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100},
}
chunkmanifest1 = ChunkManifest(entries=chunks_dict1)
chunks_dict2 = {
"0.0.0": {"path": "foo.nc", "offset": 300, "length": 100},
"0.0.0": {"path": "/foo.nc", "offset": 300, "length": 100},
}
chunkmanifest2 = ChunkManifest(entries=chunks_dict2)
marr1 = ManifestArray(zarray=zarray_common, chunkmanifest=chunkmanifest1)