More indexes fixes (#358)
* simplify logic

* remove some outdated occurrences of passing indexes={}

* add PR number to release notes

* ensure FITS reader never tries to load variables

* rename function to indicate the fact that it might not open the file again

* rename function in patched test too

* add type hint

* remove redefined type hint
TomNicholas authored Dec 18, 2024
1 parent eb4d6e0 commit 7a35bd4
Showing 10 changed files with 69 additions and 73 deletions.
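
The user-facing upshot of #357/#358 is that `indexes={}` no longer needs to be passed to `open_virtual_dataset`. A minimal before/after sketch, using the `air.nc` example file from the docs changed below:

```python
from virtualizarr import open_virtual_dataset

# Old recommendation (now unnecessary): suppress index creation by hand.
# vds = open_virtual_dataset('air.nc', indexes={})

# New default: indexes are created only for loadable one-dimensional
# coordinate variables, so the argument can simply be dropped.
vds = open_virtual_dataset('air.nc', loadable_variables=['time'])
```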
2 changes: 1 addition & 1 deletion docs/releases.rst
@@ -21,7 +21,7 @@ Breaking changes
- Indexes are now created by default for any loadable one-dimensional coordinate variables.
Also a warning is no longer thrown when ``indexes=None`` is passed to ``open_virtual_dataset``, and the recommendations in the docs updated to match.
This also means that ``xarray.combine_by_coords`` will now work when the necessary dimension coordinates are specified in ``loadable_variables``.
-(:issue:`18`, :pull:`357`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
+(:issue:`18`, :pull:`357`, :pull:`358`) By `Tom Nicholas <https://github.com/TomNicholas>`_.

Deprecations
~~~~~~~~~~~~
8 changes: 3 additions & 5 deletions docs/usage.md
@@ -205,7 +205,7 @@ But before we combine our data, we might want to consider loading some variables
Whilst the values of virtual variables (i.e. those backed by `ManifestArray` objects) cannot be loaded into memory, you do have the option of opening specific variables from the file as loadable lazy numpy/dask arrays, just like `xr.open_dataset` normally returns. These variables are specified using the `loadable_variables` argument:

```python
-vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time'], indexes={})
+vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time'])
```
```python
<xarray.Dataset> Size: 31MB
...
```
@@ -228,8 +228,8 @@ You can see that the dataset contains a mixture of virtual variables backed by `
Loading variables can be useful in a few scenarios:
1. You need to look at the actual values of a multi-dimensional variable in order to decide what to do next,
2. You want in-memory indexes to use with ``xr.combine_by_coords``,
-3. Storing a variable on-disk as a set of references would be inefficient, e.g. because it's a very small array (saving the values like this is similar to kerchunk's concept of "inlining" data),
-4. The variable has encoding, and the simplest way to decode it correctly is to let xarray's standard decoding machinery load it into memory and apply the decoding.
+3. Storing a variable on-disk as a set of references would be inefficient, e.g. because each chunk is very small (saving the values like this is similar to kerchunk's concept of "inlining" data),
+4. The variable has complicated encoding, and the simplest way to decode it correctly is to let xarray's standard decoding machinery load it into memory and apply the decoding.

### CF-encoded time variables

@@ -240,7 +240,6 @@
vds = open_virtual_dataset(
    'air.nc',
    loadable_variables=['air', 'time'],
    decode_times=True,
-    indexes={},
)
```
```python
...
```
@@ -341,7 +340,6 @@ In future we would like for it to be possible to just use `xr.open_mfdataset` to
    concat_dim=['time'],
    coords='minimal',
    compat='override',
-    indexes={},
)
but this requires some [upstream changes](https://github.com/TomNicholas/VirtualiZarr/issues/35) in xarray.
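
Until those upstream changes land, the files can still be combined manually. A minimal sketch of that workflow under the new defaults (the file paths are hypothetical, and `time` is assumed to be a dimension coordinate present in each file):

```python
import xarray as xr

from virtualizarr import open_virtual_dataset

paths = ['air1.nc', 'air2.nc']  # hypothetical input files

# Loading the 'time' coordinate gives each virtual dataset an in-memory
# index, which is what xr.combine_by_coords needs to order the datasets.
vds_list = [open_virtual_dataset(p, loadable_variables=['time']) for p in paths]

combined_vds = xr.combine_by_coords(vds_list, coords='minimal', compat='override')
```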
83 changes: 42 additions & 41 deletions virtualizarr/readers/common.py
@@ -21,7 +21,7 @@
from virtualizarr.utils import _FsspecFSFromFilepath


-def open_loadable_vars_and_indexes(
+def maybe_open_loadable_vars_and_indexes(
    filepath: str,
    loadable_variables,
    reader_options,
@@ -36,50 +36,51 @@ def open_loadable_vars_and_indexes(
    Relies on xr.open_dataset and its auto-detection of filetypes to find the correct installed backend.
    """

-    # TODO get rid of this if?
-    if indexes is None or len(loadable_variables) > 0:
-        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
-        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-        fpath = _FsspecFSFromFilepath(
-            filepath=filepath, reader_options=reader_options
-        ).open_file()
-
-        # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
-        ds = open_dataset(
-            fpath,  # type: ignore[arg-type]
-            drop_variables=drop_variables,
-            group=group,
-            decode_times=decode_times,
-        )
-
-        if indexes is None:
-            # add default indexes by reading data from file
-            # but avoid creating an in-memory index for virtual variables by default
-            indexes = {
-                name: index
-                for name, index in ds.xindexes.items()
-                if name in loadable_variables
-            }
-        elif indexes != {}:
-            # TODO allow manual specification of index objects
-            raise NotImplementedError()
-        else:
-            indexes = dict(**indexes)  # for type hinting: to allow mutation
-
-        # TODO we should drop these earlier by using drop_variables
-        loadable_vars = {
-            str(name): var
-            for name, var in ds.variables.items()
-            if name in loadable_variables
-        }
-
-        # if we only read the indexes we can just close the file right away as nothing is lazy
-        if loadable_vars == {}:
-            ds.close()
-    else:
-        loadable_vars = {}
-        indexes = {}
+    if loadable_variables == [] and indexes == {}:
+        # no need to even attempt to open the file using xarray
+        return {}, {}
+
+    # TODO We are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+    # TODO It would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+    # TODO Really we probably want a dedicated backend that iterates over all variables only once
+    # TODO See issue #124 for a suggestion of how to avoid calling xarray here.
+
+    fpath = _FsspecFSFromFilepath(
+        filepath=filepath, reader_options=reader_options
+    ).open_file()
+
+    # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
+    ds = open_dataset(
+        fpath,  # type: ignore[arg-type]
+        drop_variables=drop_variables,
+        group=group,
+        decode_times=decode_times,
+    )
+
+    if indexes is None:
+        # add default indexes by reading data from file
+        # but avoid creating an in-memory index for virtual variables by default
+        indexes = {
+            name: index
+            for name, index in ds.xindexes.items()
+            if name in loadable_variables
+        }
+    elif indexes != {}:
+        # TODO allow manual specification of index objects
+        raise NotImplementedError()
+    else:
+        indexes = dict(**indexes)  # for type hinting: to allow mutation
+
+    # TODO we should drop these earlier by using drop_variables
+    loadable_vars = {
+        str(name): var
+        for name, var in ds.variables.items()
+        if name in loadable_variables
+    }
+
+    # if we only read the indexes we can just close the file right away as nothing is lazy
+    if loadable_vars == {}:
+        ds.close()

    return loadable_vars, indexes

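
The rename above is the behavioural heart of the PR: when there is nothing to load and no indexes are requested, the function now returns before touching the file at all. A minimal sketch of that contract (the argument values are illustrative, and `reader_options` is assumed to accept an empty dict):

```python
from virtualizarr.readers.common import maybe_open_loadable_vars_and_indexes

# With no loadable variables and explicitly empty indexes, the early
# return fires and 'air.nc' is never actually opened.
loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
    'air.nc',
    loadable_variables=[],
    reader_options={},
    drop_variables=[],
    indexes={},
    group=None,
    decode_times=False,
)
assert loadable_vars == {} and indexes == {}
```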
23 changes: 10 additions & 13 deletions virtualizarr/readers/fits.py
@@ -1,12 +1,11 @@
from pathlib import Path
from typing import Iterable, Mapping, Optional

-from xarray import Dataset, Index
+from xarray import Dataset, Index, Variable

from virtualizarr.readers.common import (
    VirtualBackend,
    construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
)
from virtualizarr.translators.kerchunk import (
    extract_group,
@@ -41,24 +40,22 @@ def open_virtual_dataset(
        if group:
            refs = extract_group(refs, group)

+        # TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly
+        # TODO Once we have one of those we can use ``maybe_open_loadable_vars_and_indexes`` here
+        if loadable_variables != [] or indexes != {} or decode_times:
+            raise NotImplementedError(
+                "Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS"
+            )
+        loadable_vars: dict[str, Variable] = {}
+        indexes = {}

        virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
            refs,
            loadable_variables,
            drop_variables,
            fs_root=Path.cwd().as_uri(),
        )

-        # TODO this wouldn't work until you had an xarray backend for FITS installed
-        loadable_vars, indexes = open_loadable_vars_and_indexes(
-            filepath,
-            loadable_variables=loadable_variables,
-            reader_options=reader_options,
-            drop_variables=drop_variables,
-            indexes=indexes,
-            group=group,
-            decode_times=decode_times,
-        )

        return construct_virtual_dataset(
            virtual_vars=virtual_vars,
            loadable_vars=loadable_vars,
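
What the new guard means for callers: purely virtual access (empty `loadable_variables`) still goes through the kerchunk reference path, but any request to load values now fails fast. A sketch with a hypothetical FITS file, assuming the filetype is passed explicitly rather than auto-detected:

```python
from virtualizarr import open_virtual_dataset

# Asking the FITS reader to load values or build indexes now raises
# immediately, instead of erroring deep inside xarray's backend machinery.
try:
    open_virtual_dataset(
        'observations.fits', filetype='fits', loadable_variables=['PRIMARY']
    )
except NotImplementedError as err:
    print(err)
```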
4 changes: 2 additions & 2 deletions virtualizarr/readers/hdf/__init__.py
@@ -1,11 +1,11 @@
from .hdf import (
    HDFVirtualBackend,
    construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
)

__all__ = [
    "HDFVirtualBackend",
    "construct_virtual_dataset",
-    "open_loadable_vars_and_indexes",
+    "maybe_open_loadable_vars_and_indexes",
]
4 changes: 2 additions & 2 deletions virtualizarr/readers/hdf/hdf.py
@@ -15,7 +15,7 @@
from virtualizarr.readers.common import (
    VirtualBackend,
    construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
)
from virtualizarr.readers.hdf.filters import cfcodec_from_dataset, codecs_from_dataset
from virtualizarr.types import ChunkKey
@@ -68,7 +68,7 @@ def open_virtual_dataset(
            reader_options=reader_options,
        )

-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
            filepath,
            loadable_variables=loadable_variables,
            reader_options=reader_options,
4 changes: 2 additions & 2 deletions virtualizarr/readers/hdf5.py
@@ -6,7 +6,7 @@
from virtualizarr.readers.common import (
    VirtualBackend,
    construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
)
from virtualizarr.translators.kerchunk import (
    extract_group,
@@ -54,7 +54,7 @@ def open_virtual_dataset(
            fs_root=Path.cwd().as_uri(),
        )

-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
            filepath,
            loadable_variables=loadable_variables,
            reader_options=reader_options,
4 changes: 2 additions & 2 deletions virtualizarr/readers/netcdf3.py
@@ -6,7 +6,7 @@
from virtualizarr.readers.common import (
    VirtualBackend,
    construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
)
from virtualizarr.translators.kerchunk import (
    virtual_vars_and_metadata_from_kerchunk_refs,
@@ -53,7 +53,7 @@ def open_virtual_dataset(
            fs_root=Path.cwd().as_uri(),
        )

-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
            filepath,
            loadable_variables=loadable_variables,
            reader_options=reader_options,
4 changes: 2 additions & 2 deletions virtualizarr/readers/tiff.py
@@ -7,7 +7,7 @@
from virtualizarr.readers.common import (
    VirtualBackend,
    construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
)
from virtualizarr.translators.kerchunk import (
    extract_group,
@@ -63,7 +63,7 @@ def open_virtual_dataset(
            fs_root=Path.cwd().as_uri(),
        )

-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
            filepath,
            loadable_variables=loadable_variables,
            reader_options=reader_options,
6 changes: 3 additions & 3 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -168,13 +168,13 @@ def test_non_group_error(self, group_hdf5_file):
@requires_imagecodecs
class TestOpenVirtualDataset:
@patch("virtualizarr.readers.hdf.hdf.construct_virtual_dataset")
@patch("virtualizarr.readers.hdf.hdf.open_loadable_vars_and_indexes")
@patch("virtualizarr.readers.hdf.hdf.maybe_open_loadable_vars_and_indexes")
def test_coord_names(
self,
open_loadable_vars_and_indexes,
maybe_open_loadable_vars_and_indexes,
construct_virtual_dataset,
root_coordinates_hdf5_file,
):
open_loadable_vars_and_indexes.return_value = (0, 0)
maybe_open_loadable_vars_and_indexes.return_value = (0, 0)
HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file)
assert construct_virtual_dataset.call_args[1]["coord_names"] == ["lat", "lon"]
