diff --git a/docs/releases.rst b/docs/releases.rst
index 8f0b2a8a..4146948f 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -21,7 +21,7 @@ Breaking changes
 - Indexes are now created by default for any loadable one-dimensional coordinate variables.
   Also a warning is no longer thrown when ``indexes=None`` is passed to ``open_virtual_dataset``, and the recommendations in the docs updated to match.
   This also means that ``xarray.combine_by_coords`` will now work when the necessary dimension coordinates are specified in ``loadable_variables``.
-  (:issue:`18`, :pull:`357`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
+  (:issue:`18`, :pull:`357`, :pull:`358`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
 
 Deprecations
 ~~~~~~~~~~~~
diff --git a/docs/usage.md b/docs/usage.md
index e6cd093d..69b2a386 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -205,7 +205,7 @@ But before we combine our data, we might want to consider loading some variables
 Whilst the values of virtual variables (i.e. those backed by `ManifestArray` objects) cannot be loaded into memory, you do have the option of opening specific variables from the file as loadable lazy numpy/dask arrays, just like `xr.open_dataset` normally returns. These variables are specified using the `loadable_variables` argument:
 
 ```python
-vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time'], indexes={})
+vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time'])
 ```
 ```python
 <xarray.Dataset> Size: 31MB
@@ -228,8 +228,8 @@ You can see that the dataset contains a mixture of virtual variables backed by `
 Loading variables can be useful in a few scenarios:
 1. You need to look at the actual values of a multi-dimensional variable in order to decide what to do next,
 2. You want in-memory indexes to use with ``xr.combine_by_coords``,
-3. Storing a variable on-disk as a set of references would be inefficient, e.g. because it's a very small array (saving the values like this is similar to kerchunk's concept of "inlining" data),
-4. The variable has encoding, and the simplest way to decode it correctly is to let xarray's standard decoding machinery load it into memory and apply the decoding.
+3. Storing a variable on-disk as a set of references would be inefficient, e.g. because each chunk is very small (saving the values like this is similar to kerchunk's concept of "inlining" data),
+4. The variable has complicated encoding, and the simplest way to decode it correctly is to let xarray's standard decoding machinery load it into memory and apply the decoding.
 
 ### CF-encoded time variables
@@ -240,7 +240,6 @@ vds = open_virtual_dataset(
     'air.nc',
     loadable_variables=['air', 'time'],
     decode_times=True,
-    indexes={},
 )
 ```
 ```python
@@ -341,7 +340,6 @@ In future we would like for it to be possible to just use `xr.open_mfdataset` to
     concat_dim=['time'],
     coords='minimal',
     compat='override',
-    indexes={},
 )
 
 but this requires some [upstream changes](https://github.com/TomNicholas/VirtualiZarr/issues/35) in xarray.
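For context on the docs change above: with indexes now built by default for loadable one-dimensional coordinates, the `combine_by_coords` workflow the changelog mentions no longer needs any `indexes={}` plumbing. A minimal sketch, assuming two hypothetical NetCDF files `air1.nc` and `air2.nc` that are contiguous along `time`:

```python
# Sketch only: 'air1.nc' and 'air2.nc' are hypothetical files contiguous along 'time'.
import xarray as xr
from virtualizarr import open_virtual_dataset

# Loading the 1D dimension coordinate 'time' now also creates an in-memory index for it.
vds1 = open_virtual_dataset('air1.nc', loadable_variables=['time'])
vds2 = open_virtual_dataset('air2.nc', loadable_variables=['time'])

# combine_by_coords can use that index to order the datasets automatically.
combined = xr.combine_by_coords([vds2, vds1], coords='minimal', compat='override')
```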
diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py
index b53b1f1c..0a7ad36e 100644
--- a/virtualizarr/readers/common.py
+++ b/virtualizarr/readers/common.py
@@ -21,7 +21,7 @@ from virtualizarr.utils import _FsspecFSFromFilepath
 
 
-def open_loadable_vars_and_indexes(
+def maybe_open_loadable_vars_and_indexes(
     filepath: str,
     loadable_variables,
     reader_options,
     drop_variables,
@@ -36,50 +36,51 @@
     Relies on xr.open_dataset and its auto-detection of filetypes to find the correct installed backend.
     """
 
-    # TODO get rid of this if?
-    if indexes is None or len(loadable_variables) > 0:
-        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
-        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-        fpath = _FsspecFSFromFilepath(
-            filepath=filepath, reader_options=reader_options
-        ).open_file()
-
-        # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
-        ds = open_dataset(
-            fpath,  # type: ignore[arg-type]
-            drop_variables=drop_variables,
-            group=group,
-            decode_times=decode_times,
-        )
-
-        if indexes is None:
-            # add default indexes by reading data from file
-            # but avoid creating an in-memory index for virtual variables by default
-            indexes = {
-                name: index
-                for name, index in ds.xindexes.items()
-                if name in loadable_variables
-            }
-        elif indexes != {}:
-            # TODO allow manual specification of index objects
-            raise NotImplementedError()
-        else:
-            indexes = dict(**indexes)  # for type hinting: to allow mutation
+    if loadable_variables == [] and indexes == {}:
+        # no need to even attempt to open the file using xarray
+        return {}, {}
+
+    # TODO We are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+    # TODO It would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+    # TODO Really we probably want a dedicated backend that iterates over all variables only once
+    # TODO See issue #124 for a suggestion of how to avoid calling xarray here.
+
+    fpath = _FsspecFSFromFilepath(
+        filepath=filepath, reader_options=reader_options
+    ).open_file()
+
+    # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
+    ds = open_dataset(
+        fpath,  # type: ignore[arg-type]
+        drop_variables=drop_variables,
+        group=group,
+        decode_times=decode_times,
+    )
 
-        # TODO we should drop these earlier by using drop_variables
-        loadable_vars = {
-            str(name): var
-            for name, var in ds.variables.items()
+    if indexes is None:
+        # add default indexes by reading data from file
+        # but avoid creating an in-memory index for virtual variables by default
+        indexes = {
+            name: index
+            for name, index in ds.xindexes.items()
             if name in loadable_variables
         }
-
-        # if we only read the indexes we can just close the file right away as nothing is lazy
-        if loadable_vars == {}:
-            ds.close()
+    elif indexes != {}:
+        # TODO allow manual specification of index objects
+        raise NotImplementedError()
     else:
-        loadable_vars = {}
-        indexes = {}
+        indexes = dict(**indexes)  # for type hinting: to allow mutation
+
+    # TODO we should drop these earlier by using drop_variables
+    loadable_vars = {
+        str(name): var
+        for name, var in ds.variables.items()
+        if name in loadable_variables
+    }
+
+    # if we only read the indexes we can just close the file right away as nothing is lazy
+    if loadable_vars == {}:
+        ds.close()
 
     return loadable_vars, indexes
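The rename to `maybe_open_loadable_vars_and_indexes` reflects the new early return: when nothing needs loading, the file is never opened at all. A minimal sketch of both paths, assuming a local `air.nc` file (hypothetical):

```python
# Sketch only, assuming a local 'air.nc' (hypothetical file).
from virtualizarr.readers.common import maybe_open_loadable_vars_and_indexes

# Early-return path: nothing to load, so xr.open_dataset is never called.
loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
    'air.nc',
    loadable_variables=[],
    reader_options={},
    drop_variables=[],
    indexes={},
    group=None,
    decode_times=False,
)
assert loadable_vars == {} and indexes == {}

# Loading path: the file is opened, 'time' comes back as an xarray Variable,
# and a default index is created for it (because indexes=None).
loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
    'air.nc',
    loadable_variables=['time'],
    reader_options={},
    drop_variables=[],
    indexes=None,
    group=None,
    decode_times=False,
)
```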
diff --git a/virtualizarr/readers/fits.py b/virtualizarr/readers/fits.py
index d285187c..16ebe0ad 100644
--- a/virtualizarr/readers/fits.py
+++ b/virtualizarr/readers/fits.py
@@ -1,12 +1,11 @@
 from pathlib import Path
 from typing import Iterable, Mapping, Optional
 
-from xarray import Dataset, Index
+from xarray import Dataset, Index, Variable
 
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
 )
 from virtualizarr.translators.kerchunk import (
     extract_group,
@@ -41,6 +40,15 @@ def open_virtual_dataset(
         if group:
             refs = extract_group(refs, group)
 
+        # TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly
+        # TODO Once we have one of those we can use ``maybe_open_loadable_vars_and_indexes`` here
+        if loadable_variables != [] or indexes != {} or decode_times:
+            raise NotImplementedError(
+                "Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS"
+            )
+        loadable_vars: dict[str, Variable] = {}
+        indexes = {}
+
         virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
             refs,
             loadable_variables,
@@ -48,17 +56,6 @@ def open_virtual_dataset(
             fs_root=Path.cwd().as_uri(),
         )
 
-        # TODO this wouldn't work until you had an xarray backend for FITS installed
-        loadable_vars, indexes = open_loadable_vars_and_indexes(
-            filepath,
-            loadable_variables=loadable_variables,
-            reader_options=reader_options,
-            drop_variables=drop_variables,
-            indexes=indexes,
-            group=group,
-            decode_times=decode_times,
-        )
-
         return construct_virtual_dataset(
             virtual_vars=virtual_vars,
             loadable_vars=loadable_vars,
diff --git a/virtualizarr/readers/hdf/__init__.py b/virtualizarr/readers/hdf/__init__.py
index d1519383..61aa1ae6 100644
--- a/virtualizarr/readers/hdf/__init__.py
+++ b/virtualizarr/readers/hdf/__init__.py
@@ -1,11 +1,11 @@
 from .hdf import (
     HDFVirtualBackend,
     construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
 )
 
 __all__ = [
     "HDFVirtualBackend",
     "construct_virtual_dataset",
-    "open_loadable_vars_and_indexes",
+    "maybe_open_loadable_vars_and_indexes",
 ]
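Since there is no xarray backend engine for FITS, the reader above now fails fast instead of deferring to the common helper. A sketch of the new guard's behaviour, assuming the backend class is exposed as `FITSVirtualBackend` and using a hypothetical `image.fits` file with a variable named `'DATA'`:

```python
# Sketch only: 'image.fits' and the variable name 'DATA' are hypothetical.
import pytest
from virtualizarr.readers.fits import FITSVirtualBackend

# Any request to load variables (or indexes, or decode times) now raises immediately.
with pytest.raises(NotImplementedError):
    FITSVirtualBackend.open_virtual_dataset(
        'image.fits', loadable_variables=['DATA'], indexes={}
    )

# A purely virtual open (no loadable variables, no indexes) still works as before,
# assuming kerchunk's FITS support is installed.
vds = FITSVirtualBackend.open_virtual_dataset(
    'image.fits', loadable_variables=[], indexes={}
)
```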
diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py
index 5447e068..a8a49836 100644
--- a/virtualizarr/readers/hdf/hdf.py
+++ b/virtualizarr/readers/hdf/hdf.py
@@ -15,7 +15,7 @@
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
 )
 from virtualizarr.readers.hdf.filters import cfcodec_from_dataset, codecs_from_dataset
 from virtualizarr.types import ChunkKey
@@ -68,7 +68,7 @@ def open_virtual_dataset(
             reader_options=reader_options,
         )
 
-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
             filepath,
             loadable_variables=loadable_variables,
             reader_options=reader_options,
diff --git a/virtualizarr/readers/hdf5.py b/virtualizarr/readers/hdf5.py
index eafcfb99..94821357 100644
--- a/virtualizarr/readers/hdf5.py
+++ b/virtualizarr/readers/hdf5.py
@@ -6,7 +6,7 @@
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
 )
 from virtualizarr.translators.kerchunk import (
     extract_group,
@@ -54,7 +54,7 @@ def open_virtual_dataset(
             fs_root=Path.cwd().as_uri(),
         )
 
-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
             filepath,
             loadable_variables=loadable_variables,
             reader_options=reader_options,
diff --git a/virtualizarr/readers/netcdf3.py b/virtualizarr/readers/netcdf3.py
index a32fb822..816df75e 100644
--- a/virtualizarr/readers/netcdf3.py
+++ b/virtualizarr/readers/netcdf3.py
@@ -6,7 +6,7 @@
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
 )
 from virtualizarr.translators.kerchunk import (
     virtual_vars_and_metadata_from_kerchunk_refs,
@@ -53,7 +53,7 @@ def open_virtual_dataset(
             fs_root=Path.cwd().as_uri(),
         )
 
-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
             filepath,
             loadable_variables=loadable_variables,
             reader_options=reader_options,
diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py
index b9230e42..8a360dfe 100644
--- a/virtualizarr/readers/tiff.py
+++ b/virtualizarr/readers/tiff.py
@@ -7,7 +7,7 @@
 from virtualizarr.readers.common import (
     VirtualBackend,
     construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
+    maybe_open_loadable_vars_and_indexes,
 )
 from virtualizarr.translators.kerchunk import (
     extract_group,
@@ -63,7 +63,7 @@ def open_virtual_dataset(
             fs_root=Path.cwd().as_uri(),
         )
 
-        loadable_vars, indexes = open_loadable_vars_and_indexes(
+        loadable_vars, indexes = maybe_open_loadable_vars_and_indexes(
             filepath,
             loadable_variables=loadable_variables,
             reader_options=reader_options,
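The reader changes above are mechanical renames at each call site. Any downstream code that imported the helper directly would need the same one-line update; a hypothetical migration:

```python
# Hypothetical downstream code: only the imported (and called) name changes.
# Before:
# from virtualizarr.readers.common import open_loadable_vars_and_indexes
# After:
from virtualizarr.readers.common import maybe_open_loadable_vars_and_indexes
```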
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
index b75c56b0..13a051d4 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -168,13 +168,13 @@ def test_non_group_error(self, group_hdf5_file):
 @requires_imagecodecs
 class TestOpenVirtualDataset:
     @patch("virtualizarr.readers.hdf.hdf.construct_virtual_dataset")
-    @patch("virtualizarr.readers.hdf.hdf.open_loadable_vars_and_indexes")
+    @patch("virtualizarr.readers.hdf.hdf.maybe_open_loadable_vars_and_indexes")
     def test_coord_names(
         self,
-        open_loadable_vars_and_indexes,
+        maybe_open_loadable_vars_and_indexes,
         construct_virtual_dataset,
         root_coordinates_hdf5_file,
     ):
-        open_loadable_vars_and_indexes.return_value = (0, 0)
+        maybe_open_loadable_vars_and_indexes.return_value = (0, 0)
         HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file)
         assert construct_virtual_dataset.call_args[1]["coord_names"] == ["lat", "lon"]
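For an end-to-end check of the renamed path without mocks, one could open a real file through the HDF backend. A sketch, assuming a local NetCDF4/HDF5 file `air.nc` (hypothetical) and the HDF reader's optional dependencies (h5py, imagecodecs):

```python
# Sketch only, assuming a local NetCDF4/HDF5 file 'air.nc' (hypothetical).
from virtualizarr.readers.hdf import HDFVirtualBackend

# Internally this now goes through maybe_open_loadable_vars_and_indexes.
vds = HDFVirtualBackend.open_virtual_dataset('air.nc', loadable_variables=['time'])
print(vds['time'].values)  # 'time' is loaded into memory; other variables stay virtual
```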