
Commit

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed May 6, 2024
1 parent 7031d81 commit 8703afe
Showing 9 changed files with 97 additions and 25 deletions.
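
Every hunk in this commit is the same kind of mechanical fix: the repository's pre-commit hooks rewrapped statements that exceeded the formatter's line-length limit, so long one-liners are split across several lines with no change in behaviour. Below is a minimal sketch of reproducing one of these rewrites locally, assuming the hook responsible is black with its default 88-character limit (the commit itself does not name the hook):

import black

# One of the long lines from virtualizarr/kerchunk.py as it looked before this commit.
src = (
    "def extract_array_refs(ds_reference_dict: KerchunkStoreRefs, var_name: str)"
    " -> KerchunkArrRefs:\n"
    "    ...\n"
)

# black rewraps anything longer than its configured limit, which is exactly the
# shape of the edits shown in the hunks below.
print(black.format_str(src, mode=black.Mode(line_length=88)))

Running pre-commit run --all-files in a clone with the same hook configuration should produce the same rewrites; pre-commit.ci simply automates that and pushes the result as this commit.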
12 changes: 9 additions & 3 deletions virtualizarr/kerchunk.py
@@ -123,7 +123,9 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]:
     return found_var_names


-def extract_array_refs(ds_reference_dict: KerchunkStoreRefs, var_name: str) -> KerchunkArrRefs:
+def extract_array_refs(
+    ds_reference_dict: KerchunkStoreRefs, var_name: str
+) -> KerchunkArrRefs:
     """Extract only the part of the kerchunk reference dict that is relevant to this one zarr array"""

     found_var_names = find_var_names(ds_reference_dict)
@@ -133,7 +135,9 @@ def extract_array_refs(ds_reference_dict: KerchunkStoreRefs, var_name: str) -> KerchunkArrRefs:
     # TODO these function probably have more loops in them than they need to...

     arr_refs = {
-        key.split("/")[1]: refs[key] for key in refs.keys() if var_name == key.split("/")[0]
+        key.split("/")[1]: refs[key]
+        for key in refs.keys()
+        if var_name == key.split("/")[0]
     }

     return fully_decode_arr_refs(arr_refs)
@@ -175,7 +179,9 @@ def dataset_to_kerchunk_refs(ds: xr.Dataset) -> KerchunkStoreRefs:
     for var_name, var in ds.variables.items():
         arr_refs = variable_to_kerchunk_arr_refs(var)

-        prepended_with_var_name = {f"{var_name}/{key}": val for key, val in arr_refs.items()}
+        prepended_with_var_name = {
+            f"{var_name}/{key}": val for key, val in arr_refs.items()
+        }

         all_arr_refs.update(prepended_with_var_name)

8 changes: 6 additions & 2 deletions virtualizarr/manifests/array.py
@@ -71,7 +71,9 @@ def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray":

         zarray = ZArray.from_kerchunk_refs(decoded_arr_refs[".zarray"])

-        kerchunk_chunk_dict = {k: v for k, v in decoded_arr_refs.items() if re.match(_CHUNK_KEY, k)}
+        kerchunk_chunk_dict = {
+            k: v for k, v in decoded_arr_refs.items() if re.match(_CHUNK_KEY, k)
+        }
         chunkmanifest = ChunkManifest._from_kerchunk_chunk_dict(kerchunk_chunk_dict)

         obj = object.__new__(cls)
@@ -204,7 +206,9 @@ def __getitem__(
         indexer = _possibly_expand_trailing_ellipsis(key, self.ndim)

         if len(indexer) != self.ndim:
-            raise ValueError(f"Invalid indexer for array with ndim={self.ndim}: {indexer}")
+            raise ValueError(
+                f"Invalid indexer for array with ndim={self.ndim}: {indexer}"
+            )

         if all(
             isinstance(axis_indexer, slice) and axis_indexer == slice(None)
12 changes: 9 additions & 3 deletions virtualizarr/manifests/array_api.py
@@ -154,12 +154,16 @@ def _check_same_ndims(ndims: list[int]) -> None:

 def _check_same_shapes_except_on_concat_axis(shapes: list[tuple[int, ...]], axis: int):
     """Check that shapes are compatible for concatenation"""
-    shapes_without_concat_axis = [_remove_element_at_position(shape, axis) for shape in shapes]
+    shapes_without_concat_axis = [
+        _remove_element_at_position(shape, axis) for shape in shapes
+    ]

     first_shape, *other_shapes = shapes_without_concat_axis
     for other_shape in other_shapes:
         if other_shape != first_shape:
-            raise ValueError(f"Cannot concatenate arrays with shapes {[shape for shape in shapes]}")
+            raise ValueError(
+                f"Cannot concatenate arrays with shapes {[shape for shape in shapes]}"
+            )


 def _remove_element_at_position(t: tuple[int, ...], pos: int) -> tuple[int, ...]:
@@ -269,7 +273,9 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArray":
             # concatenate same array upon itself d_requested number of times along existing axis
             result = concatenate([result] * d_requested, axis=axis)
         else:
-            raise ValueError(f"Array with shape {x.shape} cannot be broadcast to shape {shape}")
+            raise ValueError(
+                f"Array with shape {x.shape} cannot be broadcast to shape {shape}"
+            )

     return result

24 changes: 18 additions & 6 deletions virtualizarr/manifests/manifest.py
@@ -8,7 +8,9 @@

 from ..types import ChunkKey

-_INTEGER = r"([1-9]+\d*|0)"  # matches 0 or an unsigned integer that does not begin with zero
+_INTEGER = (
+    r"([1-9]+\d*|0)"  # matches 0 or an unsigned integer that does not begin with zero
+)
 _SEPARATOR = r"\."
 _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$"  # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period)

@@ -30,7 +32,9 @@ def __repr__(self) -> str:
         return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})"

     @classmethod
-    def from_kerchunk(cls, path_and_byte_range_info: List[Union[str, int]]) -> "ChunkEntry":
+    def from_kerchunk(
+        cls, path_and_byte_range_info: List[Union[str, int]]
+    ) -> "ChunkEntry":
         path, offset, length = path_and_byte_range_info
         return ChunkEntry(path=path, offset=offset, length=length)

@@ -113,7 +117,9 @@ def from_zarr_json(cls, filepath: str) -> "ChunkManifest":
         with open(filepath, "r") as manifest_file:
             entries_dict = json.load(manifest_file)

-        entries = {cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items()}
+        entries = {
+            cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items()
+        }
         return cls(entries=entries)

     def to_zarr_json(self, filepath: str) -> None:
@@ -123,7 +129,9 @@ def to_zarr_json(self, filepath: str) -> None:

     @classmethod
     def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":
-        chunkentries = {k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items()}
+        chunkentries = {
+            k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items()
+        }
         return ChunkManifest(entries=chunkentries)


@@ -175,8 +183,12 @@ def check_keys_form_grid(chunk_keys: Iterable[ChunkKey]):
     chunk_grid_shape = get_chunk_grid_shape(chunk_keys)

     # create every possible combination
-    all_possible_combos = itertools.product(*[range(length) for length in chunk_grid_shape])
-    all_required_chunk_keys: set[ChunkKey] = set(join(inds) for inds in all_possible_combos)
+    all_possible_combos = itertools.product(
+        *[range(length) for length in chunk_grid_shape]
+    )
+    all_required_chunk_keys: set[ChunkKey] = set(
+        join(inds) for inds in all_possible_combos
+    )

     # check that every possible combination is represented once in the list of chunk keys
     if set(chunk_keys) != all_required_chunk_keys:
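
The manifest.py hunk above only re-wraps the _INTEGER definition, but its comments also describe what the chunk-key pattern is meant to accept. A quick sketch exercising those same patterns (the example keys are illustrative, not taken from the commit):

import re

# The regexes defined in virtualizarr/manifests/manifest.py, as shown in the hunk above.
_INTEGER = r"([1-9]+\d*|0)"  # matches 0 or an unsigned integer that does not begin with zero
_SEPARATOR = r"\."
_CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$"

# Chunk keys are dot-separated indices, one integer per array dimension.
assert re.match(_CHUNK_KEY, "0.0")
assert re.match(_CHUNK_KEY, "12.3.4")
assert re.match(_CHUNK_KEY, "1,2") is None  # wrong separator
assert re.match(_CHUNK_KEY, "a.b") is None  # not integers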
4 changes: 3 additions & 1 deletion virtualizarr/tests/test_kerchunk.py
@@ -43,7 +43,9 @@ def test_dataset_from_df_refs():
     assert da.data.zarray.fill_value is None
     assert da.data.zarray.order == "C"

-    assert da.data.manifest.dict() == {"0.0": {"path": "test1.nc", "offset": 6144, "length": 48}}
+    assert da.data.manifest.dict() == {
+        "0.0": {"path": "test1.nc", "offset": 6144, "length": 48}
+    }


 def test_dataset_from_df_refs_with_filters():
4 changes: 3 additions & 1 deletion virtualizarr/tests/test_zarr.py
@@ -22,6 +22,8 @@ def test_zarr_v3_roundtrip(tmpdir):
     original = xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})

     original.virtualize.to_zarr(tmpdir / "store.zarr")
-    roundtrip = open_virtual_dataset(tmpdir / "store.zarr", filetype="zarr_v3", indexes={})
+    roundtrip = open_virtual_dataset(
+        tmpdir / "store.zarr", filetype="zarr_v3", indexes={}
+    )

     xrt.assert_identical(roundtrip, original)
7 changes: 6 additions & 1 deletion virtualizarr/vendor/zarr/utils.py
@@ -18,5 +18,10 @@ def default(self, o):
 def json_dumps(o: Any) -> bytes:
     """Write JSON in a consistent, human-readable way."""
     return json.dumps(
-        o, indent=4, sort_keys=True, ensure_ascii=True, separators=(",", ": "), cls=NumberEncoder
+        o,
+        indent=4,
+        sort_keys=True,
+        ensure_ascii=True,
+        separators=(",", ": "),
+        cls=NumberEncoder,
     ).encode("ascii")
29 changes: 24 additions & 5 deletions virtualizarr/xarray.py
@@ -1,5 +1,14 @@
 from pathlib import Path
-from typing import List, Literal, Mapping, Optional, Union, overload, MutableMapping, Iterable
+from typing import (
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Union,
+    overload,
+    MutableMapping,
+    Iterable,
+)

 import ujson  # type: ignore
 import xarray as xr
@@ -11,7 +20,11 @@
 import virtualizarr.kerchunk as kerchunk
 from virtualizarr.kerchunk import KerchunkStoreRefs, FileType
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.zarr import dataset_to_zarr, attrs_from_zarr_group_json, metadata_from_zarr_json
+from virtualizarr.zarr import (
+    dataset_to_zarr,
+    attrs_from_zarr_group_json,
+    metadata_from_zarr_json,
+)


 class ManifestBackendArray(ManifestArray, BackendArray):
@@ -116,7 +129,9 @@ def open_virtual_dataset(
                 indexes = dict(**indexes)  # for type hinting: to allow mutation

             loadable_vars = {
-                name: var for name, var in ds.variables.items() if name in loadable_variables
+                name: var
+                for name, var in ds.variables.items()
+                if name in loadable_variables
             }

             # if we only read the indexes we can just close the file right away as nothing is lazy
@@ -211,7 +226,9 @@ def virtual_vars_from_kerchunk_refs(
     var_names = kerchunk.find_var_names(refs)
     if drop_variables is None:
         drop_variables = []
-    var_names_to_keep = [var_name for var_name in var_names if var_name not in drop_variables]
+    var_names_to_keep = [
+        var_name for var_name in var_names if var_name not in drop_variables
+    ]

     vars = {
         var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class)
@@ -337,7 +354,9 @@ def to_zarr(self, storepath: str) -> None:
         dataset_to_zarr(self.ds, storepath)

     @overload
-    def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs: ...
+    def to_kerchunk(
+        self, filepath: None, format: Literal["dict"]
+    ) -> KerchunkStoreRefs: ...

     @overload
     def to_kerchunk(self, filepath: str, format: Literal["json"]) -> None: ...
22 changes: 19 additions & 3 deletions virtualizarr/zarr.py
@@ -1,5 +1,15 @@
 from pathlib import Path
-from typing import Any, Literal, NewType, Optional, Tuple, Union, List, Dict, TYPE_CHECKING
+from typing import (
+    Any,
+    Literal,
+    NewType,
+    Optional,
+    Tuple,
+    Union,
+    List,
+    Dict,
+    TYPE_CHECKING,
+)
 import json

 import numpy as np
@@ -193,14 +203,20 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: List[str], attrs: dict) ->
         "name": "regular",
         "configuration": {"chunk_shape": metadata.pop("chunks")},
     }
-    metadata["chunk_key_encoding"] = {"name": "default", "configuration": {"separator": "/"}}
+    metadata["chunk_key_encoding"] = {
+        "name": "default",
+        "configuration": {"separator": "/"},
+    }
     metadata["codecs"] = metadata.pop("filters")
     metadata.pop("compressor")  # TODO this should be entered in codecs somehow
     metadata.pop("order")  # TODO this should be replaced by a transpose codec

     # indicate that we're using the manifest storage transformer ZEP
     metadata["storage_transformers"] = [
-        {"name": "chunk-manifest-json", "configuration": {"manifest": "./manifest.json"}}
+        {
+            "name": "chunk-manifest-json",
+            "configuration": {"manifest": "./manifest.json"},
+        }
     ]

     # add information from xarray object
