From 3fa5cffefcd9af1f536d11fab81972a6e84554ad Mon Sep 17 00:00:00 2001 From: Ayush Nag <35325113+ayushnag@users.noreply.github.com> Date: Tue, 5 Nov 2024 09:56:27 -0800 Subject: [PATCH 1/5] Search for coord_names in separate_coords (#191) * find coord_names in vars * resolve merge conflict * add 2d coords test * add kerchunk dep and add 1d coord test --------- Co-authored-by: Tom Nicholas --- conftest.py | 9 +++++++++ virtualizarr/readers/common.py | 7 ++++++- virtualizarr/tests/test_backend.py | 22 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 810fd833..55c07823 100644 --- a/conftest.py +++ b/conftest.py @@ -35,6 +35,15 @@ def netcdf4_file(tmpdir): return filepath +@pytest.fixture +def netcdf4_file_with_2d_coords(tmpdir): + ds = xr.tutorial.open_dataset("ROMS_example") + filepath = f"{tmpdir}/ROMS_example.nc" + ds.to_netcdf(filepath, format="NETCDF4") + ds.close() + return filepath + + @pytest.fixture def netcdf4_virtual_dataset(netcdf4_file): from virtualizarr import open_virtual_dataset diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index f6f5dff4..646d26ca 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -144,8 +144,13 @@ def separate_coords( coord_vars: dict[ str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | Variable ] = {} + found_coord_names: set[str] = set() + # Search through variable attributes for coordinate names + for var in vars.values(): + if "coordinates" in var.attrs: + found_coord_names.update(var.attrs["coordinates"].split(" ")) for name, var in vars.items(): - if name in coord_names or var.dims == (name,): + if name in coord_names or var.dims == (name,) or name in found_coord_names: # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263 if len(var.dims) == 1: dim1d, *_ = var.dims diff --git a/virtualizarr/tests/test_backend.py 
b/virtualizarr/tests/test_backend.py index 43a6bbd8..e9b60814 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -156,6 +156,28 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file): } +@requires_kerchunk +class TestDetermineCoords: + def test_infer_one_dimensional_coords(self, netcdf4_file): + vds = open_virtual_dataset(netcdf4_file, indexes={}) + assert set(vds.coords) == {"time", "lat", "lon"} + + def test_var_attr_coords(self, netcdf4_file_with_2d_coords): + vds = open_virtual_dataset(netcdf4_file_with_2d_coords, indexes={}) + + expected_dimension_coords = ["ocean_time", "s_rho"] + expected_2d_coords = ["lon_rho", "lat_rho", "h"] + expected_1d_non_dimension_coords = ["Cs_r"] + expected_scalar_coords = ["hc", "Vtransform"] + expected_coords = ( + expected_dimension_coords + + expected_2d_coords + + expected_1d_non_dimension_coords + + expected_scalar_coords + ) + assert set(vds.coords) == set(expected_coords) + + @network @requires_s3fs class TestReadFromS3: From 2316fcbff7cbfbf93faf1884ba4482908ae1d50e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Nov 2024 08:54:52 -0700 Subject: [PATCH 2/5] [pre-commit.ci] pre-commit autoupdate (#283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.9 → v0.7.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.9...v0.7.2) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3bae6a6c..9990375b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: "v0.6.9" + rev: "v0.7.2" hooks: # Run the linter. 
- id: ruff From efbc4930ff80ff086b67192de589e71eff23bb1c Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 7 Nov 2024 09:40:44 -0700 Subject: [PATCH 3/5] Bump minimum Xarray dependency to 2024.10.0 (#284) * change upstream CI dependency * change non-upstream CI dependencies * change entire project dependency * use explicit import of xarray.DataTree for type hint * release note --- ci/environment.yml | 2 +- ci/min-deps.yml | 2 +- ci/upstream.yml | 2 +- docs/releases.rst | 2 ++ pyproject.toml | 2 +- virtualizarr/readers/common.py | 10 ++-------- 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 883463a2..0bb5b366 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,7 +7,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.6.0 + - xarray>=2024.10.0 - kerchunk>=0.2.5 - numpy>=2.0.0 - ujson diff --git a/ci/min-deps.yml b/ci/min-deps.yml index 7ca8c0b3..05778382 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -7,7 +7,7 @@ dependencies: - h5py - hdf5 - netcdf4 - - xarray>=2024.6.0 + - xarray>=2024.10.0 - numpy>=2.0.0 - numcodecs - packaging diff --git a/ci/upstream.yml b/ci/upstream.yml index 2c2680bc..035d76f8 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -3,6 +3,7 @@ channels: - conda-forge - nodefaults dependencies: + - xarray>=2024.10.0 - h5netcdf - h5py - hdf5 @@ -25,6 +26,5 @@ dependencies: - pip - pip: - icechunk # Installs zarr v3 as dependency - - git+https://github.com/pydata/xarray@zarr-v3 # zarr-v3 compatibility branch - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs # zarr-v3 compatibility branch # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) diff --git a/docs/releases.rst b/docs/releases.rst index 93a5fec9..1ca594a1 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -42,6 +42,8 @@ Breaking changes - VirtualiZarr's `ZArray`, `ChunkEntry`, 
and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`) +- Minimum required version of Xarray is now v2024.10.0. + (:pull:`284`) By `Tom Nicholas `_. Deprecations ~~~~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index d216b269..749afb94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ - "xarray>=2024.06.0", + "xarray>=2024.10.0", "numpy>=2.0.0", "packaging", "universal-pathlib", diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 646d26ca..1ad24629 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -4,7 +4,6 @@ from collections.abc import Iterable, Mapping, MutableMapping from io import BufferedIOBase from typing import ( - TYPE_CHECKING, Any, Hashable, Optional, @@ -14,6 +13,7 @@ from xarray import ( Coordinates, Dataset, + DataTree, Index, IndexVariable, Variable, @@ -26,12 +26,6 @@ XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore -if TYPE_CHECKING: - try: - from xarray import DataTree # type: ignore[attr-defined] - except ImportError: - DataTree = Any - def open_loadable_vars_and_indexes( filepath: str, @@ -194,5 +188,5 @@ def open_virtual_datatree( decode_times: bool | None = None, indexes: Mapping[str, Index] | None = None, reader_options: Optional[dict] = None, - ) -> "DataTree": + ) -> DataTree: raise NotImplementedError() From 2d7b4ee3ff38a3363ea6de20799cce0c04f19769 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 7 Nov 2024 09:45:14 -0700 Subject: [PATCH 4/5] Dont write _ARRAY_DIMENSIONS to icechunk (#286) * dont write _ARRAY_DIMENSIONS * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * release note * change test * add xarray 2024.10.0 dependency to icechunk CI --------- Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/releases.rst | 2 ++ virtualizarr/tests/test_writers/test_icechunk.py | 2 +- virtualizarr/writers/icechunk.py | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 1ca594a1..1e2bdb90 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -59,6 +59,8 @@ Bug fixes - Fixed regression in `fill_value` handling for datetime dtypes making virtual Zarr stores unreadable (:pull:`206`) By `Timothy Hodson `_ +- Fixed bug with writing of `dimension_names` into zarr metadata. + (:pull:`286`) By `Tom Nicholas `_. Documentation ~~~~~~~~~~~~~ diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index f99b2718..7a22defa 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -68,7 +68,7 @@ def test_write_new_virtual_variable( # assert dict(arr.attrs) == {"units": "km"} # check dimensions - assert arr.attrs["_ARRAY_DIMENSIONS"] == ["x", "y"] + assert arr.metadata.dimension_names == ("x", "y") def test_set_single_virtual_ref_without_encoding( diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 6dadbc08..0ba95a36 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -144,7 +144,6 @@ def write_virtual_variable_to_icechunk( # TODO it would be nice if we could assign directly to the .attrs property for k, v in var.attrs.items(): arr.attrs[k] = encode_zarr_attr_value(v) - arr.attrs["_ARRAY_DIMENSIONS"] = encode_zarr_attr_value(var.dims) _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"} for k, v in var.encoding.items(): From 4ae7a19c2bb2996dc4739dfd3ebbe32b17ac1658 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 7 Nov 2024 10:12:13 -0700 Subject: [PATCH 5/5] Fix release notes for v1.1.0 (#288) * add new section to release notes for unreleased 
additions * add release note for #191 * add release note for #266 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pre-commit --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/releases.rst | 49 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 1e2bdb90..cadbc855 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -1,36 +1,62 @@ Release notes ============= -.. _v1.0.1: +.. _v1.1.1: -v1.0.1 (unreleased) +v1.1.1 (unreleased) ------------------- New Features ~~~~~~~~~~~~ +Breaking changes +~~~~~~~~~~~~~~~~ + +- Minimum required version of Xarray is now v2024.10.0. + (:pull:`284`) By `Tom Nicholas `_. + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +- Fixed bug with writing of `dimension_names` into zarr metadata. + (:pull:`286`) By `Tom Nicholas `_. +- Fixed bug causing CF-compliant variables not to be identified as coordinates (:pull:`191`) + By `Ayush Nag `_. + +Documentation +~~~~~~~~~~~~~ + +- FAQ answers on Icechunk compatibility, converting from existing Kerchunk references to Icechunk, and how to add a new reader for a custom file format. + (:pull:`266`) By `Tom Nicholas `_. + +Internal Changes +~~~~~~~~~~~~~~~~ + +.. _v1.1.0: + +v1.1.0 (22nd Oct 2024) +---------------------- + +New Features +~~~~~~~~~~~~ - Can open `kerchunk` reference files with ``open_virtual_dataset``. (:pull:`251`, :pull:`186`) By `Raphael Hagen `_ & `Kristen Thyng `_. - - Adds defaults for `open_virtual_dataset_from_v3_store` in (:pull:`234`) By `Raphael Hagen `_. - - New ``group`` option on ``open_virtual_dataset`` enables extracting specific HDF Groups. (:pull:`165`) By `Scott Henderson `_. - - Adds `decode_times` to open_virtual_dataset (:pull:`232`) By `Raphael Hagen `_. 
- - Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`) By `Ayush Nag `_. - - Load scalar variables by default. (:pull:`205`) By `Gustavo Hidalgo `_. - - Support empty files (:pull:`260`) By `Justus Magin `_. - - Can write virtual datasets to Icechunk stores using `vitualize.to_icechunk` (:pull:`256`) By `Matt Iannucci `_. @@ -42,8 +68,6 @@ Breaking changes - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`) -- Minimum required version of Xarray is now v2024.10.0. - (:pull:`284`) By `Tom Nicholas `_. Deprecations ~~~~~~~~~~~~ @@ -59,8 +83,6 @@ Bug fixes - Fixed regression in `fill_value` handling for datetime dtypes making virtual Zarr stores unreadable (:pull:`206`) By `Timothy Hodson `_ -- Fixed bug with writing of `dimension_names` into zarr metadata. - (:pull:`286`) By `Tom Nicholas `_. Documentation ~~~~~~~~~~~~~ @@ -68,7 +90,6 @@ Documentation - Adds virtualizarr + coiled serverless example notebook (:pull:`223`) By `Raphael Hagen `_. - Internal Changes ~~~~~~~~~~~~~~~~