From e6407e01e4dc2536a64c3a418a59ead30bb353bc Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 17 Oct 2024 14:28:58 -0600 Subject: [PATCH] Skip tests that require kerchunk (#259) * try making ujson an optional dep * skip all tests which require kerchunk * add new CI job * rename git workflow * move numcodecs import inside * add numcodecs to CI env * make ujson required * unskip tests for parsing in-memory kerchunk dicts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add ujson to CI environment * in-memory roundtrip doesn't require kerchunk * move in-memory kerchunk roundtrip test to test_integration.py * remove now-empty test_kerchunk.py file --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/min-deps.yml | 60 +++++++++++++++++++ ci/min-deps.yml | 26 ++++++++ pyproject.toml | 7 ++- virtualizarr/accessor.py | 3 +- virtualizarr/manifests/manifest.py | 3 +- virtualizarr/readers/kerchunk.py | 3 +- virtualizarr/tests/__init__.py | 2 + virtualizarr/tests/test_backend.py | 19 +++++- virtualizarr/tests/test_integration.py | 48 ++++++++++++++- virtualizarr/tests/test_kerchunk.py | 46 -------------- .../tests/test_manifests/test_array.py | 4 +- .../tests/test_readers/test_kerchunk.py | 3 +- virtualizarr/tests/test_utils.py | 2 + .../tests/test_writers/test_kerchunk.py | 7 ++- virtualizarr/tests/test_xarray.py | 3 + virtualizarr/writers/kerchunk.py | 3 +- virtualizarr/zarr.py | 6 +- 17 files changed, 183 insertions(+), 62 deletions(-) create mode 100644 .github/workflows/min-deps.yml create mode 100644 ci/min-deps.yml delete mode 100644 virtualizarr/tests/test_kerchunk.py diff --git a/.github/workflows/min-deps.yml b/.github/workflows/min-deps.yml new file mode 100644 index 00000000..066e1ba3 --- /dev/null +++ b/.github/workflows/min-deps.yml @@ -0,0 +1,60 @@ +name: min-deps + +on: + push: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + pull_request: + branches: [ "main" ] + paths-ignore: + - 'docs/**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + + test: + name: ${{ matrix.python-version }}-build + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Setup micromamba + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: ci/min-deps.yml + cache-environment: true + create-args: >- + python=${{matrix.python-version}} + + - name: Install virtualizarr + run: | + python -m pip install -e . --no-deps + - name: Conda list information + run: | + conda env list + conda list + + - name: Running Tests + run: | + python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose + + - name: Upload code coverage to Codecov + uses: codecov/codecov-action@v3.1.4 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false diff --git a/ci/min-deps.yml b/ci/min-deps.yml new file mode 100644 index 00000000..7ca8c0b3 --- /dev/null +++ b/ci/min-deps.yml @@ -0,0 +1,26 @@ +name: virtualizarr-min-deps +channels: + - conda-forge + - nodefaults +dependencies: + - h5netcdf + - h5py + - hdf5 + - netcdf4 + - xarray>=2024.6.0 + - numpy>=2.0.0 + - numcodecs + - packaging + - ujson + - universal_pathlib + # Testing + - codecov + - pre-commit + - mypy + - ruff + - pandas-stubs + - pytest-mypy + - pytest-cov + - pytest + - pooch + - fsspec diff --git a/pyproject.toml b/pyproject.toml index 5af632ce..d216b269 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,12 +22,11 @@ requires-python = ">=3.10" dynamic = ["version"] dependencies = [ "xarray>=2024.06.0", - "kerchunk>=0.2.5", - "h5netcdf", "numpy>=2.0.0", - "ujson", "packaging", "universal-pathlib", + "numcodecs", + "ujson", ] [project.optional-dependencies] @@ -35,7 +34,9 @@ test = [ "codecov", "fastparquet", "fsspec", + "h5netcdf", "h5py", + "kerchunk>=0.2.5", "mypy", "netcdf4", "pandas-stubs", diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py index 0a97237e..cc251e63 100644 --- a/virtualizarr/accessor.py +++ b/virtualizarr/accessor.py @@ -5,7 +5,6 @@ overload, ) -import ujson # type: ignore from xarray import Dataset, register_dataset_accessor from virtualizarr.manifests import ManifestArray @@ -91,6 +90,8 @@ def to_kerchunk( if format == "dict": return refs elif format == "json": + import ujson + if filepath is None: raise ValueError("Filepath must be provided when format is 'json'") diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index a6d160ed..88ac9a91 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -5,7 +5,6 @@ from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast import numpy as np -from upath import UPath from virtualizarr.types import ChunkKey @@ -41,6 +40,8 @@ class ChunkEntry: def from_kerchunk( cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] ) -> "ChunkEntry": + from upath import UPath + if len(path_and_byte_range_info) == 1: path = path_and_byte_range_info[0] offset = 0 diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index c274ee5a..d3632b68 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import Any, MutableMapping, Optional, cast -import ujson # type: ignore from xarray import Dataset from xarray.core.indexes import Index from xarray.core.variable import Variable @@ -300,6 +299,8 @@ def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs: """ Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 . """ + import ujson + sanitized = d.copy() for k, v in d.items(): if k.startswith("."): diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 7df13d10..70f613ce 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -33,7 +33,9 @@ def _importorskip( has_astropy, requires_astropy = _importorskip("astropy") +has_kerchunk, requires_kerchunk = _importorskip("kerchunk") has_s3fs, requires_s3fs = _importorskip("s3fs") +has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 731c4acc..81a23e0c 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -1,7 +1,6 @@ from collections.abc import Mapping from unittest.mock import patch -import fsspec import numpy as np import pytest import xarray as xr @@ -13,9 +12,17 @@ from virtualizarr.backend import FileType from virtualizarr.manifests import ManifestArray from virtualizarr.readers.kerchunk import _automatically_determine_filetype -from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs +from virtualizarr.tests import ( + has_astropy, + has_tifffile, + network, + requires_kerchunk, + requires_s3fs, + requires_scipy, +) +@requires_scipy def test_automatically_determine_filetype_netcdf3_netcdf4(): # test the NetCDF3 vs NetCDF4 automatic file type selection @@ -75,6 +82,7 @@ def test_FileType(): FileType(None) +@requires_kerchunk class TestOpenVirtualDatasetIndexes: def test_no_indexes(self, netcdf4_file): vds = open_virtual_dataset(netcdf4_file, indexes={}) @@ -105,6 +113,7 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I return True +@requires_kerchunk def test_cftime_index(tmpdir): """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 @@ -130,6 +139,7 @@ def test_cftime_index(tmpdir): assert vds.attrs == ds.attrs +@requires_kerchunk class TestOpenVirtualDatasetAttrs: def test_drop_array_dimensions(self, netcdf4_file): # regression test for GH issue #150 @@ -237,6 +247,8 @@ def test_read_from_url(self, filetype, url): assert isinstance(vds, xr.Dataset) def test_virtualizarr_vs_local_nisar(self): + import fsspec + # Open group directly from locally cached file with xarray url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5" tmpfile = fsspec.open_local( @@ -266,6 +278,7 @@ def test_virtualizarr_vs_local_nisar(self): xrt.assert_equal(dsXR, dsV) +@requires_kerchunk class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): vars_to_load = ["air", "time"] @@ -338,6 +351,7 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): assert vds.scalar.attrs == {"scalar": "true"} +@requires_kerchunk @pytest.mark.parametrize( "reference_format", ["json", "parquet", "invalid"], @@ -395,6 +409,7 @@ def test_open_virtual_dataset_existing_kerchunk_refs( assert set(vds.variables) == set(netcdf4_virtual_dataset.variables) +@requires_kerchunk def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref): # For now, we raise a NotImplementedError if we read existing references that have inlined data # https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932 diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 5894f643..434d12d7 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -4,11 +4,53 @@ import xarray.testing as xrt from virtualizarr import open_virtual_dataset -from virtualizarr.manifests.array import ManifestArray -from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.kerchunk import ( + dataset_from_kerchunk_refs, + find_var_names, +) +from virtualizarr.tests import requires_kerchunk from virtualizarr.zarr import ZArray +def test_kerchunk_roundtrip_in_memory_no_concat(): + # Set up example xarray dataset + chunks_dict = { + "0.0": {"path": "foo.nc", "offset": 100, "length": 100}, + "0.1": {"path": "foo.nc", "offset": 200, "length": 100}, + } + manifest = ChunkManifest(entries=chunks_dict) + marr = ManifestArray( + zarray=dict( + shape=(2, 4), + dtype=np.dtype(" KerchunkStoreRefs: Create a dictionary containing kerchunk-style store references from a single xarray.Dataset (which wraps ManifestArray objects). """ + import ujson + all_arr_refs = {} for var_name, var in ds.variables.items(): arr_refs = variable_to_kerchunk_arr_refs(var, str(var_name)) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index f62b1269..cd83a67d 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -1,9 +1,7 @@ import dataclasses from typing import TYPE_CHECKING, Any, Literal, NewType, cast -import numcodecs import numpy as np -import ujson # type: ignore if TYPE_CHECKING: pass @@ -100,6 +98,8 @@ def dict(self) -> dict[str, Any]: return zarray_dict def to_kerchunk_json(self) -> str: + import ujson + zarray_dict = self.dict() if zarray_dict["fill_value"] is np.nan: zarray_dict["fill_value"] = None @@ -153,6 +153,8 @@ def _v3_codec_pipeline(self) -> list: post_compressor: Iterable[BytesBytesCodec] #optional ``` """ + import numcodecs + if self.filters: filter_codecs_configs = [ numcodecs.get_codec(filter).get_config() for filter in self.filters