diff --git a/docs/api.rst b/docs/api.rst
index a8bee2c3..b712cb97 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -33,6 +33,16 @@ Serialization
     VirtualiZarrDatasetAccessor.to_zarr
     VirtualiZarrDatasetAccessor.to_icechunk
 
+Information
+-----------
+
+.. currentmodule:: virtualizarr.accessor
+.. autosummary::
+    :nosignatures:
+    :toctree: generated/
+
+    VirtualiZarrDatasetAccessor.nbytes
+
 Rewriting
 ---------
 
diff --git a/docs/releases.rst b/docs/releases.rst
index a744cb9b..8f0b2a8a 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -9,6 +9,9 @@ v1.2.1 (unreleased)
 New Features
 ~~~~~~~~~~~~
 
+- Added a ``.nbytes`` accessor property which returns the number of bytes needed to hold the virtual references in memory.
+  (:issue:`167`, :pull:`227`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/docs/usage.md b/docs/usage.md
index 3118dfeb..e6cd093d 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -60,11 +60,25 @@ Attributes:
     title:   4x daily NMC reanalysis (1948)
 ```
 
-
 Generally a "virtual dataset" is any `xarray.Dataset` which wraps one or more {py:class}`ManifestArray <virtualizarr.manifests.ManifestArray>` objects. These particular {py:class}`ManifestArray <virtualizarr.manifests.ManifestArray>` objects are each a virtual reference to some data in the `air.nc` netCDF file, with the references stored in the form of "Chunk Manifests".
 
+As the manifest contains only addresses at which to find large binary chunks, the virtual dataset takes up far less space in memory than the original dataset does:
+
+```python
+ds.nbytes
+```
+```
+30975672
+```
+```python
+vds.virtualize.nbytes
+```
+```
+128
+```
+
 ```{important} Virtual datasets are not normal xarray datasets!
 
 Although the top-level type is still `xarray.Dataset`, they are intended only as an abstract representation of a set of data files, not as something you can do analysis with. If you try to load, view, or plot any data you will get a `NotImplementedError`. Virtual datasets only support a very limited subset of normal xarray operations, particularly functions and methods for concatenating, merging and extracting variables, as well as operations for renaming dimensions and variables.
 ```
diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py
index 969d21af..48482df3 100644
--- a/virtualizarr/accessor.py
+++ b/virtualizarr/accessor.py
@@ -183,3 +183,21 @@ def rename_paths(
             new_ds[var_name].data = data.rename_paths(new=new)
 
         return new_ds
+
+    @property
+    def nbytes(self) -> int:
+        """
+        Size required to hold these references in memory, in bytes.
+
+        Note this is not the size of the referenced chunks if they were actually loaded into memory;
+        it is only the size of the pointers to the chunk locations.
+        If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
+
+        In-memory (loadable) variables are included in the total using xarray's normal ``.nbytes`` method.
+        """
+        return sum(
+            var.data.nbytes_virtual
+            if isinstance(var.data, ManifestArray)
+            else var.nbytes
+            for var in self.ds.variables.values()
+        )
diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
index 44c0546c..89f648aa 100644
--- a/virtualizarr/manifests/array.py
+++ b/virtualizarr/manifests/array.py
@@ -93,6 +93,18 @@ def size(self) -> int:
     def __repr__(self) -> str:
         return f"ManifestArray<shape={self.shape}, dtype={self.dtype}, chunks={self.chunks}>"
 
+    @property
+    def nbytes_virtual(self) -> int:
+        """
+        Size required to hold these references in memory, in bytes.
+
+        Note this is not the size of the referenced array if it were actually loaded into memory;
+        it is only the size of the pointers to the chunk locations.
+        If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
+        """
+        # note: we don't name this property `.nbytes` as we don't want xarray's repr to use it
+        return self.manifest.nbytes
+
     def __array_function__(self, func, types, args, kwargs) -> Any:
         """
         Hook to teach this class what to do if np.concat etc. is called on it.
diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
index cc970fb2..666f4854 100644
--- a/virtualizarr/manifests/manifest.py
+++ b/virtualizarr/manifests/manifest.py
@@ -357,6 +357,17 @@ def shape_chunk_grid(self) -> tuple[int, ...]:
     def __repr__(self) -> str:
         return f"ChunkManifest<shape={self.shape_chunk_grid}>"
 
+    @property
+    def nbytes(self) -> int:
+        """
+        Size required to hold these references in memory, in bytes.
+
+        Note this is not the size of the referenced chunks if they were actually loaded into memory;
+        it is only the size of the pointers to the chunk locations.
+        If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
+        """
+        return self._paths.nbytes + self._offsets.nbytes + self._lengths.nbytes
+
     def __getitem__(self, key: ChunkKey) -> ChunkEntry:
         indices = split(key)
         path = self._paths[indices]
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index 0d874565..856ff395 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 import xarray as xr
+from xarray import open_dataset
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -310,3 +311,16 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(
         == "s3://bucket/air.nc"
     )
     assert isinstance(renamed_vds["lat"].data, np.ndarray)
+
+
+@requires_kerchunk
+def test_nbytes(simple_netcdf4):
+    vds = open_virtual_dataset(simple_netcdf4)
+    assert vds.virtualize.nbytes == 32
+    assert vds.nbytes == 48
+
+    vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"])
+    assert vds.virtualize.nbytes == 48
+
+    ds = open_dataset(simple_netcdf4)
+    assert ds.virtualize.nbytes == ds.nbytes
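
For reviewers, here is a minimal end-to-end sketch of the behaviour this diff adds, based on the `docs/usage.md` example above. The `air.nc` path is a placeholder for any netCDF file that `open_virtual_dataset` can read, and the printed sizes will vary with the file:

```python
import xarray as xr

from virtualizarr import open_virtual_dataset

# Opened normally, .nbytes counts the actual array data held in memory.
ds = xr.open_dataset("air.nc")  # placeholder path
print(ds.nbytes)  # e.g. 30975672 for the tutorial file shown in usage.md

# Opened virtually, only chunk references exist; the new accessor sums the
# byte sizes of the manifest's paths/offsets/lengths arrays instead.
vds = open_virtual_dataset("air.nc")
print(vds.virtualize.nbytes)  # e.g. 128 (pointers to chunks, not the chunks)
```

Note that any variables opened with `loadable_variables` are held in memory as real arrays, so the accessor counts them at their full `.nbytes` size; that is why `test_nbytes` expects the total to grow from 32 to 48 once `"foo"` is made loadable.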