diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ec71436d..bb486943 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: args: [ --fix ] # Run the formatter. - id: ruff-format - + - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.9.0 hooks: diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 0d5c507a..53e12cfe 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -20,12 +20,14 @@ from enum import Enum, auto + class AutoName(Enum): # Recommended by official Python docs for auto naming: # https://docs.python.org/3/library/enum.html#using-automatic-values def _generate_next_value_(name, start, count, last_values): return name + class FileType(AutoName): netcdf3 = auto() netcdf4 = auto() @@ -34,6 +36,7 @@ class FileType(AutoName): fits = auto() zarr = auto() + def read_kerchunk_references_from_file( filepath: str, filetype: Optional[FileType] ) -> KerchunkStoreRefs: @@ -57,6 +60,7 @@ def read_kerchunk_references_from_file( if filetype.name.lower() == "netcdf3": from kerchunk.netCDF3 import NetCDF3ToZarr + refs = NetCDF3ToZarr(filepath, inline_threshold=0).translate() elif filetype.name.lower() == "netcdf4": @@ -87,7 +91,7 @@ def _automatically_determine_filetype(filepath: str) -> FileType: if file_extension == ".nc": # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 - with open(filepath, 'rb') as f: + with open(filepath, "rb") as f: magic = f.read() if magic[0:3] == b"CDF": filetype = FileType.netcdf3 @@ -119,9 +123,7 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: return found_var_names -def extract_array_refs( - ds_reference_dict: KerchunkStoreRefs, var_name: str -) -> KerchunkArrRefs: +def extract_array_refs(ds_reference_dict: KerchunkStoreRefs, var_name: str) -> KerchunkArrRefs: """Extract only the part of the kerchunk reference dict that is relevant to this one zarr array""" found_var_names = find_var_names(ds_reference_dict) @@ -131,9 +133,7 @@ def extract_array_refs( # TODO these function probably have more loops in them than they need to... arr_refs = { - key.split("/")[1]: refs[key] - for key in refs.keys() - if var_name == key.split("/")[0] + key.split("/")[1]: refs[key] for key in refs.keys() if var_name == key.split("/")[0] } return fully_decode_arr_refs(arr_refs) @@ -175,9 +175,7 @@ def dataset_to_kerchunk_refs(ds: xr.Dataset) -> KerchunkStoreRefs: for var_name, var in ds.variables.items(): arr_refs = variable_to_kerchunk_arr_refs(var) - prepended_with_var_name = { - f"{var_name}/{key}": val for key, val in arr_refs.items() - } + prepended_with_var_name = {f"{var_name}/{key}": val for key, val in arr_refs.items()} all_arr_refs.update(prepended_with_var_name) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 0b21e0a7..7100b06a 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -71,9 +71,7 @@ def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray": zarray = ZArray.from_kerchunk_refs(decoded_arr_refs[".zarray"]) - kerchunk_chunk_dict = { - k: v for k, v in decoded_arr_refs.items() if re.match(_CHUNK_KEY, k) - } + kerchunk_chunk_dict = {k: v for k, v in decoded_arr_refs.items() if re.match(_CHUNK_KEY, k)} chunkmanifest = ChunkManifest._from_kerchunk_chunk_dict(kerchunk_chunk_dict) obj = object.__new__(cls) @@ -206,9 +204,7 @@ def __getitem__( indexer = _possibly_expand_trailing_ellipsis(key, self.ndim) if len(indexer) != self.ndim: - raise ValueError( - f"Invalid indexer for array with ndim={self.ndim}: {indexer}" - ) + raise ValueError(f"Invalid indexer for array with ndim={self.ndim}: {indexer}") if all( isinstance(axis_indexer, slice) and axis_indexer == slice(None) diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 9333d88e..0211985b 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -154,16 +154,12 @@ def _check_same_ndims(ndims: list[int]) -> None: def _check_same_shapes_except_on_concat_axis(shapes: list[tuple[int, ...]], axis: int): """Check that shapes are compatible for concatenation""" - shapes_without_concat_axis = [ - _remove_element_at_position(shape, axis) for shape in shapes - ] + shapes_without_concat_axis = [_remove_element_at_position(shape, axis) for shape in shapes] first_shape, *other_shapes = shapes_without_concat_axis for other_shape in other_shapes: if other_shape != first_shape: - raise ValueError( - f"Cannot concatenate arrays with shapes {[shape for shape in shapes]}" - ) + raise ValueError(f"Cannot concatenate arrays with shapes {[shape for shape in shapes]}") def _remove_element_at_position(t: tuple[int, ...], pos: int) -> tuple[int, ...]: @@ -273,9 +269,7 @@ def broadcast_to(x: "ManifestArray", /, shape: Tuple[int, ...]) -> "ManifestArra # concatenate same array upon itself d_requested number of times along existing axis result = concatenate([result] * d_requested, axis=axis) else: - raise ValueError( - f"Array with shape {x.shape} cannot be broadcast to shape {shape}" - ) + raise ValueError(f"Array with shape {x.shape} cannot be broadcast to shape {shape}") return result diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index b12813eb..57019ee1 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -8,9 +8,7 @@ from ..types import ChunkKey -_INTEGER = ( - r"([1-9]+\d*|0)" # matches 0 or an unsigned integer that does not begin with zero -) +_INTEGER = r"([1-9]+\d*|0)" # matches 0 or an unsigned integer that does not begin with zero _SEPARATOR = r"\." _CHUNK_KEY = rf"^{_INTEGER}+({_SEPARATOR}{_INTEGER})*$" # matches 1 integer, optionally followed by more integers each separated by a separator (i.e. a period) @@ -32,9 +30,7 @@ def __repr__(self) -> str: return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})" @classmethod - def from_kerchunk( - cls, path_and_byte_range_info: List[Union[str, int]] - ) -> "ChunkEntry": + def from_kerchunk(cls, path_and_byte_range_info: List[Union[str, int]]) -> "ChunkEntry": path, offset, length = path_and_byte_range_info return ChunkEntry(path=path, offset=offset, length=length) @@ -127,9 +123,7 @@ def to_zarr_json(self, filepath: str) -> None: @classmethod def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest": - chunkentries = { - k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items() - } + chunkentries = {k: ChunkEntry.from_kerchunk(v) for k, v in kerchunk_chunk_dict.items()} return ChunkManifest(entries=chunkentries) @@ -181,12 +175,8 @@ def check_keys_form_grid(chunk_keys: Iterable[ChunkKey]): chunk_grid_shape = get_chunk_grid_shape(chunk_keys) # create every possible combination - all_possible_combos = itertools.product( - *[range(length) for length in chunk_grid_shape] - ) - all_required_chunk_keys: set[ChunkKey] = set( - join(inds) for inds in all_possible_combos - ) + all_possible_combos = itertools.product(*[range(length) for length in chunk_grid_shape]) + all_required_chunk_keys: set[ChunkKey] = set(join(inds) for inds in all_possible_combos) # check that every possible combination is represented once in the list of chunk keys if set(chunk_keys) != all_required_chunk_keys: diff --git a/virtualizarr/tests/conftest.py b/virtualizarr/tests/conftest.py index 1ffa50e1..51d672d7 100644 --- a/virtualizarr/tests/conftest.py +++ b/virtualizarr/tests/conftest.py @@ -1,6 +1,7 @@ import pytest import xarray as xr + @pytest.fixture def netcdf4_file(tmpdir): # Set up example xarray dataset diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 7a2ffb88..0d4bb87d 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -9,11 +9,12 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs + def gen_ds_refs( - zgroup: str = '{"zarr_format":2}', - zarray: str = '{"chunks":[2,3],"compressor":null,"dtype":" None: dataset_to_zarr(self.ds, storepath) @overload - def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs: - ... + def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs: ... @overload - def to_kerchunk(self, filepath: str, format: Literal["json"]) -> None: - ... + def to_kerchunk(self, filepath: str, format: Literal["json"]) -> None: ... @overload - def to_kerchunk(self, filepath: str, format: Literal["parquet"]) -> None: - ... + def to_kerchunk(self, filepath: str, format: Literal["parquet"]) -> None: ... def to_kerchunk( self, diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 54774131..d92f6a8c 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,4 +1,3 @@ - from pathlib import Path from typing import Any, Literal, NewType, Optional, Tuple, Union, List, Dict, TYPE_CHECKING import json @@ -134,12 +133,8 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: Path.mkdir(_storepath, exist_ok=False) # should techically loop over groups in a tree but a dataset corresponds to only one group - group_metadata = { - "zarr_format": 3, - "node_type": "group", - "attributes": ds.attrs - } - with open(_storepath / 'zarr.json', "wb") as group_metadata_file: + group_metadata = {"zarr_format": 3, "node_type": "group", "attributes": ds.attrs} + with open(_storepath / "zarr.json", "wb") as group_metadata_file: group_metadata_file.write(json_dumps(group_metadata)) for name, var in ds.variables.items(): @@ -177,10 +172,10 @@ def to_zarr_json(var: xr.Variable, array_dir: Path) -> None: marr = var.data - marr.manifest.to_zarr_json(array_dir / 'manifest.json') + marr.manifest.to_zarr_json(array_dir / "manifest.json") metadata = zarr_v3_array_metadata(marr.zarray, list(var.dims), var.attrs) - with open(array_dir / 'zarr.json', "wb") as metadata_file: + with open(array_dir / "zarr.json", "wb") as metadata_file: metadata_file.write(json_dumps(metadata)) @@ -194,25 +189,18 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: List[str], attrs: dict) -> metadata["zarr_format"] = 3 metadata["node_type"] = "array" metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) - metadata["chunk_grid"] = {"name": "regular", "configuration": {"chunk_shape": metadata.pop("chunks")}} - metadata["chunk_key_encoding"] = { - "name": "default", - "configuration": { - "separator": "/" - } + metadata["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": metadata.pop("chunks")}, } + metadata["chunk_key_encoding"] = {"name": "default", "configuration": {"separator": "/"}} metadata["codecs"] = metadata.pop("filters") metadata.pop("compressor") # TODO this should be entered in codecs somehow metadata.pop("order") # TODO this should be replaced by a transpose codec # indicate that we're using the manifest storage transformer ZEP metadata["storage_transformers"] = [ - { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json" - } - } + {"name": "chunk-manifest-json", "configuration": {"manifest": "./manifest.json"}} ] # add information from xarray object @@ -233,12 +221,14 @@ def metadata_from_zarr_json(filepath: Path) -> Tuple[ZArray, List[str], dict]: metadata = json.load(metadata_file) if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - } - } not in metadata.get("storage_transformers", []): - raise ValueError("Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP.") + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + }, + } not in metadata.get("storage_transformers", []): + raise ValueError( + "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." + ) attrs = metadata.pop("attributes") dim_names = metadata.pop("dimension_names")