From 3a0fa4cf7f3e785464be2124e38d2802f8d34778 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 May 2024 11:38:06 -0600 Subject: [PATCH 1/2] [pre-commit.ci] pre-commit autoupdate (#101) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.5.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.5.0...v4.6.0) - [github.com/astral-sh/ruff-pre-commit: v0.3.5 → v0.4.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.3.5...v0.4.3) - [github.com/pre-commit/mirrors-mypy: v1.9.0 → v1.10.0](https://github.com/pre-commit/mirrors-mypy/compare/v1.9.0...v1.10.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2bf64990..49e28b73 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,14 +3,14 @@ ci: autoupdate_schedule: monthly repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.3.5" + rev: "v0.4.3" hooks: - id: ruff args: ["--fix"] @@ -20,7 +20,7 @@ repos: # - id: velin # args: ["--write", "--compact"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.9.0 + rev: v1.10.0 hooks: - id: mypy # Copied from setup.cfg From a3dab6c970c062a756f664abe373b3b3b53ba487 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 13:54:00 -0400 Subject: [PATCH 2/2] Bump Ruff version and add formatting (#98) * Bump Ruff version and add formatting - [ ] Closes #96 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add config options to conform closer to black for linting * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update pyproject.toml * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tom Nicholas --- .pre-commit-config.yaml | 12 ++-- pyproject.toml | 17 +++--- virtualizarr/kerchunk.py | 6 +- virtualizarr/manifests/manifest.py | 4 +- virtualizarr/tests/conftest.py | 1 + virtualizarr/tests/test_kerchunk.py | 47 +++++++++------- .../tests/test_manifests/test_array.py | 3 +- .../tests/test_manifests/test_manifest.py | 6 +- virtualizarr/tests/test_xarray.py | 5 +- virtualizarr/tests/test_zarr.py | 33 ++++++----- virtualizarr/vendor/zarr/utils.py | 8 ++- virtualizarr/xarray.py | 49 +++++++++++------ virtualizarr/zarr.py | 55 +++++++++++-------- 13 files changed, 145 insertions(+), 101 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 49e28b73..a1af21fa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,15 +10,15 @@ repos: - id: check-yaml - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. rev: "v0.4.3" hooks: + # Run the linter. - id: ruff - args: ["--fix"] - # - repo: https://github.com/Carreau/velin - # rev: 0.0.8 - # hooks: - # - id: velin - # args: ["--write", "--compact"] + args: [ --fix ] + # Run the formatter. + - id: ruff-format + - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.10.0 hooks: diff --git a/pyproject.toml b/pyproject.toml index 7bde54f5..afdde7a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,24 +75,21 @@ datatree = ["py.typed"] files = "virtualizarr/**/*.py" show_error_codes = true - - - - [tool.ruff] -line-length = 100 +# Same as Black. +line-length = 88 +indent-width = 4 target-version = "py39" exclude = [ "docs", ".eggs"] - [tool.ruff.lint] # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or # McCabe complexity (`C901`) by default. -select = ["E4", "E7", "E9", "F"] +select = ["E4", "E7", "E9", "F", "I"] per-file-ignores = {} # E402: module level import not at top of file @@ -101,7 +98,13 @@ per-file-ignores = {} ignore = ["E402", "E731"] +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + [tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" # Indent with spaces, rather than tabs. indent-style = "space" # Respect magic trailing commas. diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 0d5c507a..f826502e 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -20,12 +20,14 @@ from enum import Enum, auto + class AutoName(Enum): # Recommended by official Python docs for auto naming: # https://docs.python.org/3/library/enum.html#using-automatic-values def _generate_next_value_(name, start, count, last_values): return name + class FileType(AutoName): netcdf3 = auto() netcdf4 = auto() @@ -34,6 +36,7 @@ class FileType(AutoName): fits = auto() zarr = auto() + def read_kerchunk_references_from_file( filepath: str, filetype: Optional[FileType] ) -> KerchunkStoreRefs: @@ -57,6 +60,7 @@ def read_kerchunk_references_from_file( if filetype.name.lower() == "netcdf3": from kerchunk.netCDF3 import NetCDF3ToZarr + refs = NetCDF3ToZarr(filepath, inline_threshold=0).translate() elif filetype.name.lower() == "netcdf4": @@ -87,7 +91,7 @@ def _automatically_determine_filetype(filepath: str) -> FileType: if file_extension == ".nc": # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 - with open(filepath, 'rb') as f: + with open(filepath, "rb") as f: magic = f.read() if magic[0:3] == b"CDF": filetype = FileType.netcdf3 diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index b12813eb..a0c6922d 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -117,7 +117,9 @@ def from_zarr_json(cls, filepath: str) -> "ChunkManifest": with open(filepath, "r") as manifest_file: entries_dict = json.load(manifest_file) - entries = {cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items()} + entries = { + cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items() + } return cls(entries=entries) def to_zarr_json(self, filepath: str) -> None: diff --git a/virtualizarr/tests/conftest.py b/virtualizarr/tests/conftest.py index 1ffa50e1..51d672d7 100644 --- a/virtualizarr/tests/conftest.py +++ b/virtualizarr/tests/conftest.py @@ -1,6 +1,7 @@ import pytest import xarray as xr + @pytest.fixture def netcdf4_file(tmpdir): # Set up example xarray dataset diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 7a2ffb88..7a10f465 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -1,19 +1,19 @@ import numpy as np +import pytest import ujson # type: ignore import xarray as xr import xarray.testing as xrt -import pytest - -from virtualizarr.kerchunk import _automatically_determine_filetype, FileType +from virtualizarr.kerchunk import FileType, _automatically_determine_filetype from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs + def gen_ds_refs( - zgroup: str = '{"zarr_format":2}', - zarray: str = '{"chunks":[2,3],"compressor":null,"dtype":" bytes: """Write JSON in a consistent, human-readable way.""" return json.dumps( - o, indent=4, sort_keys=True, ensure_ascii=True, separators=(",", ": "), cls=NumberEncoder + o, + indent=4, + sort_keys=True, + ensure_ascii=True, + separators=(",", ": "), + cls=NumberEncoder, ).encode("ascii") diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 1fa83577..8ab69d09 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -1,5 +1,14 @@ from pathlib import Path -from typing import List, Literal, Mapping, Optional, Union, overload, MutableMapping, Iterable +from typing import ( + Iterable, + List, + Literal, + Mapping, + MutableMapping, + Optional, + Union, + overload, +) import ujson # type: ignore import xarray as xr @@ -9,9 +18,13 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType +from virtualizarr.kerchunk import FileType, KerchunkStoreRefs from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.zarr import dataset_to_zarr, attrs_from_zarr_group_json, metadata_from_zarr_json +from virtualizarr.zarr import ( + attrs_from_zarr_group_json, + dataset_to_zarr, + metadata_from_zarr_json, +) class ManifestBackendArray(ManifestArray, BackendArray): @@ -78,13 +91,14 @@ def open_virtual_dataset( if common: raise ValueError(f"Cannot both load and drop variables {common}") - if virtual_array_class is not ManifestArray: raise NotImplementedError() if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? - return open_virtual_dataset_from_v3_store(storepath=filepath, drop_variables=drop_variables, indexes=indexes) + return open_virtual_dataset_from_v3_store( + storepath=filepath, drop_variables=drop_variables, indexes=indexes + ) else: # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? @@ -114,7 +128,11 @@ def open_virtual_dataset( else: indexes = dict(**indexes) # for type hinting: to allow mutation - loadable_vars = {name: var for name, var in ds.variables.items() if name in loadable_variables} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } # if we only read the indexes we can just close the file right away as nothing is lazy if loadable_vars == {}: @@ -212,14 +230,14 @@ def virtual_vars_from_kerchunk_refs( var_name for var_name in var_names if var_name not in drop_variables ] - vars = {var_name: variable_from_kerchunk_refs( - refs, var_name, virtual_array_class - ) for var_name in var_names_to_keep} + vars = { + var_name: variable_from_kerchunk_refs(refs, var_name, virtual_array_class) + for var_name in var_names_to_keep + } return vars - def dataset_from_kerchunk_refs( refs: KerchunkStoreRefs, drop_variables: List[str] = [], @@ -336,16 +354,15 @@ def to_zarr(self, storepath: str) -> None: dataset_to_zarr(self.ds, storepath) @overload - def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs: - ... + def to_kerchunk( + self, filepath: None, format: Literal["dict"] + ) -> KerchunkStoreRefs: ... @overload - def to_kerchunk(self, filepath: str, format: Literal["json"]) -> None: - ... + def to_kerchunk(self, filepath: str, format: Literal["json"]) -> None: ... @overload - def to_kerchunk(self, filepath: str, format: Literal["parquet"]) -> None: - ... + def to_kerchunk(self, filepath: str, format: Literal["parquet"]) -> None: ... def to_kerchunk( self, diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 54774131..ac3c2c45 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,12 +1,22 @@ - -from pathlib import Path -from typing import Any, Literal, NewType, Optional, Tuple, Union, List, Dict, TYPE_CHECKING import json +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Literal, + NewType, + Optional, + Tuple, + Union, +) import numpy as np import ujson # type: ignore import xarray as xr from pydantic import BaseModel, ConfigDict, field_validator + from virtualizarr.vendor.zarr.utils import json_dumps if TYPE_CHECKING: @@ -134,12 +144,8 @@ def dataset_to_zarr(ds: xr.Dataset, storepath: str) -> None: Path.mkdir(_storepath, exist_ok=False) # should techically loop over groups in a tree but a dataset corresponds to only one group - group_metadata = { - "zarr_format": 3, - "node_type": "group", - "attributes": ds.attrs - } - with open(_storepath / 'zarr.json', "wb") as group_metadata_file: + group_metadata = {"zarr_format": 3, "node_type": "group", "attributes": ds.attrs} + with open(_storepath / "zarr.json", "wb") as group_metadata_file: group_metadata_file.write(json_dumps(group_metadata)) for name, var in ds.variables.items(): @@ -177,10 +183,10 @@ def to_zarr_json(var: xr.Variable, array_dir: Path) -> None: marr = var.data - marr.manifest.to_zarr_json(array_dir / 'manifest.json') + marr.manifest.to_zarr_json(array_dir / "manifest.json") metadata = zarr_v3_array_metadata(marr.zarray, list(var.dims), var.attrs) - with open(array_dir / 'zarr.json', "wb") as metadata_file: + with open(array_dir / "zarr.json", "wb") as metadata_file: metadata_file.write(json_dumps(metadata)) @@ -194,12 +200,13 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: List[str], attrs: dict) -> metadata["zarr_format"] = 3 metadata["node_type"] = "array" metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) - metadata["chunk_grid"] = {"name": "regular", "configuration": {"chunk_shape": metadata.pop("chunks")}} + metadata["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": metadata.pop("chunks")}, + } metadata["chunk_key_encoding"] = { "name": "default", - "configuration": { - "separator": "/" - } + "configuration": {"separator": "/"}, } metadata["codecs"] = metadata.pop("filters") metadata.pop("compressor") # TODO this should be entered in codecs somehow @@ -209,9 +216,7 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: List[str], attrs: dict) -> metadata["storage_transformers"] = [ { "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json" - } + "configuration": {"manifest": "./manifest.json"}, } ] @@ -233,12 +238,14 @@ def metadata_from_zarr_json(filepath: Path) -> Tuple[ZArray, List[str], dict]: metadata = json.load(metadata_file) if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - } - } not in metadata.get("storage_transformers", []): - raise ValueError("Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP.") + "name": "chunk-manifest-json", + "configuration": { + "manifest": "./manifest.json", + }, + } not in metadata.get("storage_transformers", []): + raise ValueError( + "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." + ) attrs = metadata.pop("attributes") dim_names = metadata.pop("dimension_names")