Merge branch 'main' into dmrpp_root_group_fix
ayushnag committed Oct 17, 2024
2 parents f1c5cbf + e6407e0 commit b8fbb15
Showing 17 changed files with 183 additions and 62 deletions.
60 changes: 60 additions & 0 deletions .github/workflows/min-deps.yml
@@ -0,0 +1,60 @@
name: min-deps

on:
push:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
pull_request:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:

test:
name: ${{ matrix.python-version }}-build
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@v4

- name: Setup micromamba
uses: mamba-org/setup-micromamba@v1
with:
environment-file: ci/min-deps.yml
cache-environment: true
create-args: >-
python=${{matrix.python-version}}
- name: Install virtualizarr
run: |
python -m pip install -e . --no-deps
- name: Conda list information
run: |
conda env list
conda list
- name: Running Tests
run: |
python -m pytest ./virtualizarr --cov=./ --cov-report=xml --verbose
- name: Upload code coverage to Codecov
uses: codecov/codecov-action@v3.1.4
with:
file: ./coverage.xml
flags: unittests
env_vars: OS,PYTHON
name: codecov-umbrella
fail_ci_if_error: false
26 changes: 26 additions & 0 deletions ci/min-deps.yml
@@ -0,0 +1,26 @@
name: virtualizarr-min-deps
channels:
- conda-forge
- nodefaults
dependencies:
- h5netcdf
- h5py
- hdf5
- netcdf4
- xarray>=2024.6.0
- numpy>=2.0.0
- numcodecs
- packaging
- ujson
- universal_pathlib
# Testing
- codecov
- pre-commit
- mypy
- ruff
- pandas-stubs
- pytest-mypy
- pytest-cov
- pytest
- pooch
- fsspec
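
Note that ci/min-deps.yml deliberately omits kerchunk (along with astropy, s3fs, scipy, and tifffile), which is why the test-marker changes below are needed. A minimal sketch of what the min-deps job effectively verifies; this check is hypothetical and not part of the diff:

    import importlib.util

    # The minimal environment must not contain kerchunk...
    assert importlib.util.find_spec("kerchunk") is None

    # ...yet the core package must still import cleanly without it, which is
    # what `pip install -e . --no-deps` followed by pytest exercises above.
    import virtualizarr  # noqa: F401
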
7 changes: 4 additions & 3 deletions pyproject.toml
@@ -22,20 +22,21 @@ requires-python = ">=3.10"
dynamic = ["version"]
dependencies = [
"xarray>=2024.06.0",
"kerchunk>=0.2.5",
"h5netcdf",
"numpy>=2.0.0",
"ujson",
"packaging",
"universal-pathlib",
"numcodecs",
"ujson",
]

[project.optional-dependencies]
test = [
"codecov",
"fastparquet",
"fsspec",
"h5netcdf",
"h5py",
"kerchunk>=0.2.5",
"mypy",
"netcdf4",
"pandas-stubs",
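
With kerchunk demoted from a core dependency to the optional test extra, any runtime code path that still needs it has to guard the import. A hedged sketch of that guard pattern; the helper name is hypothetical, not from this diff:

    def _require_kerchunk() -> None:
        # kerchunk now lives in the optional `test` extra, so raise a clear,
        # actionable error instead of a bare ImportError deep in a call stack.
        try:
            import kerchunk  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "kerchunk is required for this code path; install it with "
                "`pip install kerchunk` or `pip install 'virtualizarr[test]'`"
            ) from err
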
3 changes: 2 additions & 1 deletion virtualizarr/accessor.py
@@ -5,7 +5,6 @@
overload,
)

import ujson # type: ignore
from xarray import Dataset, register_dataset_accessor

from virtualizarr.manifests import ManifestArray
@@ -91,6 +90,8 @@ def to_kerchunk(
if format == "dict":
return refs
elif format == "json":
import ujson

if filepath is None:
raise ValueError("Filepath must be provided when format is 'json'")

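
The accessor change above is the deferred-import pattern: ujson is imported only on the branch that serializes to JSON, so `import virtualizarr` no longer requires it. A condensed sketch with a simplified signature:

    def to_kerchunk_json(refs: dict, filepath: str) -> None:
        # Deferred import: the ujson dependency is only paid for when the
        # "json" output format is actually requested.
        import ujson

        with open(filepath, "w") as f:
            f.write(ujson.dumps(refs))
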
3 changes: 2 additions & 1 deletion virtualizarr/manifests/manifest.py
@@ -5,7 +5,6 @@
from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast

import numpy as np
from upath import UPath

from virtualizarr.types import ChunkKey

@@ -41,6 +40,8 @@ class ChunkEntry:
def from_kerchunk(
cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
) -> "ChunkEntry":
from upath import UPath

if len(path_and_byte_range_info) == 1:
path = path_and_byte_range_info[0]
offset = 0
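
The same pattern is applied in ChunkEntry.from_kerchunk, which now imports UPath only when a kerchunk reference is actually parsed. For context, UPath (from universal_pathlib) normalizes local paths and remote URLs uniformly, roughly like this:

    from upath import UPath

    # UPath accepts both filesystem paths and fsspec-style URLs;
    # str() returns the normalized form in either case.
    print(str(UPath("s3://bucket/data/foo.nc")))  # s3://bucket/data/foo.nc
    print(str(UPath("/local/data/foo.nc")))       # /local/data/foo.nc
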
3 changes: 2 additions & 1 deletion virtualizarr/readers/kerchunk.py
@@ -2,7 +2,6 @@
from pathlib import Path
from typing import Any, MutableMapping, Optional, cast

import ujson # type: ignore
from xarray import Dataset
from xarray.core.indexes import Index
from xarray.core.variable import Variable
@@ -300,6 +299,8 @@ def fully_decode_arr_refs(d: dict) -> KerchunkArrRefs:
"""
Only have to do this because kerchunk.SingleHdf5ToZarr apparently doesn't bother converting .zarray and .zattrs contents to dicts, see https://github.com/fsspec/kerchunk/issues/415 .
"""
import ujson

sanitized = d.copy()
for k, v in d.items():
if k.startswith("."):
2 changes: 2 additions & 0 deletions virtualizarr/tests/__init__.py
@@ -33,7 +33,9 @@ def _importorskip(


has_astropy, requires_astropy = _importorskip("astropy")
has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
has_s3fs, requires_s3fs = _importorskip("s3fs")
has_scipy, requires_scipy = _importorskip("scipy")
has_tifffile, requires_tifffile = _importorskip("tifffile")


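
The new has_kerchunk/requires_kerchunk pair comes from the _importorskip helper visible in the hunk header. A sketch of how that xarray-style helper typically works (the real one also accepts a minimum version):

    import importlib

    import pytest

    def _importorskip(modname: str) -> tuple[bool, pytest.MarkDecorator]:
        # Probe for an optional dependency and return a flag plus a skipif
        # marker, so tests can be gated with e.g. @requires_kerchunk.
        try:
            importlib.import_module(modname)
            has = True
        except ImportError:
            has = False
        return has, pytest.mark.skipif(not has, reason=f"requires {modname}")
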
19 changes: 17 additions & 2 deletions virtualizarr/tests/test_backend.py
@@ -1,7 +1,6 @@
from collections.abc import Mapping
from unittest.mock import patch

import fsspec
import numpy as np
import pytest
import xarray as xr
@@ -13,9 +12,17 @@
from virtualizarr.backend import FileType
from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.kerchunk import _automatically_determine_filetype
from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs
from virtualizarr.tests import (
has_astropy,
has_tifffile,
network,
requires_kerchunk,
requires_s3fs,
requires_scipy,
)


@requires_scipy
def test_automatically_determine_filetype_netcdf3_netcdf4():
# test the NetCDF3 vs NetCDF4 automatic file type selection

@@ -75,6 +82,7 @@ def test_FileType():
FileType(None)


@requires_kerchunk
class TestOpenVirtualDatasetIndexes:
def test_no_indexes(self, netcdf4_file):
vds = open_virtual_dataset(netcdf4_file, indexes={})
@@ -105,6 +113,7 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
return True


@requires_kerchunk
def test_cftime_index(tmpdir):
"""Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
# Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
@@ -130,6 +139,7 @@ def test_cftime_index(tmpdir):
assert vds.attrs == ds.attrs


@requires_kerchunk
class TestOpenVirtualDatasetAttrs:
def test_drop_array_dimensions(self, netcdf4_file):
# regression test for GH issue #150
@@ -237,6 +247,8 @@ def test_read_from_url(self, filetype, url):
assert isinstance(vds, xr.Dataset)

def test_virtualizarr_vs_local_nisar(self):
import fsspec

# Open group directly from locally cached file with xarray
url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5"
tmpfile = fsspec.open_local(
@@ -266,6 +278,7 @@ def test_virtualizarr_vs_local_nisar(self):
xrt.assert_equal(dsXR, dsV)


@requires_kerchunk
class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
vars_to_load = ["air", "time"]
@@ -338,6 +351,7 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
assert vds.scalar.attrs == {"scalar": "true"}


@requires_kerchunk
@pytest.mark.parametrize(
"reference_format",
["json", "parquet", "invalid"],
@@ -395,6 +409,7 @@ def test_open_virtual_dataset_existing_kerchunk_refs(
assert set(vds.variables) == set(netcdf4_virtual_dataset.variables)


@requires_kerchunk
def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref):
# For now, we raise a NotImplementedError if we read existing references that have inlined data
# https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932
48 changes: 46 additions & 2 deletions virtualizarr/tests/test_integration.py
@@ -4,11 +4,53 @@
import xarray.testing as xrt

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests.array import ManifestArray
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.readers.kerchunk import (
dataset_from_kerchunk_refs,
find_var_names,
)
from virtualizarr.tests import requires_kerchunk
from virtualizarr.zarr import ZArray


def test_kerchunk_roundtrip_in_memory_no_concat():
# Set up example xarray dataset
chunks_dict = {
"0.0": {"path": "foo.nc", "offset": 100, "length": 100},
"0.1": {"path": "foo.nc", "offset": 200, "length": 100},
}
manifest = ChunkManifest(entries=chunks_dict)
marr = ManifestArray(
zarray=dict(
shape=(2, 4),
dtype=np.dtype("<i8"),
chunks=(2, 2),
compressor=None,
filters=None,
fill_value=np.nan,
order="C",
),
chunkmanifest=manifest,
)
ds = xr.Dataset({"a": (["x", "y"], marr)})

# Use accessor to write it out to kerchunk reference dict
ds_refs = ds.virtualize.to_kerchunk(format="dict")

# Use dataset_from_kerchunk_refs to reconstruct the dataset
roundtrip = dataset_from_kerchunk_refs(ds_refs)

# Assert equal to original dataset
xrt.assert_equal(roundtrip, ds)


def test_no_duplicates_find_var_names():
"""Verify that we get a deduplicated list of var names"""
ref_dict = {"refs": {"x/something": {}, "x/otherthing": {}}}
assert len(find_var_names(ref_dict)) == 1


@requires_kerchunk
@pytest.mark.parametrize(
"inline_threshold, vars_to_inline",
[
@@ -45,6 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
assert refs["refs"]["time/0"] == expected["refs"]["time/0"]


@requires_kerchunk
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
class TestKerchunkRoundtrip:
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
@@ -212,6 +255,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
assert roundtrip.a.attrs == ds.a.attrs


@requires_kerchunk
def test_open_scalar_variable(tmpdir):
# regression test for GH issue #100

46 changes: 0 additions & 46 deletions virtualizarr/tests/test_kerchunk.py

This file was deleted.

(Diffs for the remaining changed files are not shown here.)
