From 303e5cb1e4ceb2827b96b23fbf1cca56e4fc0424 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Mon, 21 Oct 2024 21:31:10 -0400 Subject: [PATCH] Add accessor, simple docs --- docs/usage.md | 17 +++++++++++++++++ virtualizarr/accessor.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index a0f9d058..30eab144 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -396,6 +396,23 @@ combined_ds = xr.open_dataset('combined.parq', engine="kerchunk") By default references are placed in separate parquet file when the total number of references exceeds `record_size`. If there are fewer than `categorical_threshold` unique urls referenced by a particular variable, url will be stored as a categorical variable. +### Writing to an Icechunk Store + +We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is an open-source, cloud-native transactional tensor storage engine that is compatible with Zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`ds.virtualize.to_icechunk` accessor method. + +```python +# create an icechunk store +from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig +storage = StorageConfig.filesystem(str('combined')) +store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig( + virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'), +)) + +combined_vds.virtualize.to_icechunk(store) +``` + +See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details. + ### Writing as Zarr Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr ` accessor method. 
diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py index cc251e63..d6091613 100644 --- a/virtualizarr/accessor.py +++ b/virtualizarr/accessor.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import ( + TYPE_CHECKING, Callable, Literal, overload, @@ -9,9 +10,13 @@ from virtualizarr.manifests import ManifestArray from virtualizarr.types.kerchunk import KerchunkStoreRefs +from virtualizarr.writers.icechunk import dataset_to_icechunk from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs from virtualizarr.writers.zarr import dataset_to_zarr +if TYPE_CHECKING: + from icechunk import IcechunkStore # type: ignore[import-not-found] + @register_dataset_accessor("virtualize") class VirtualiZarrDatasetAccessor: @@ -39,6 +44,18 @@ def to_zarr(self, storepath: str) -> None: """ dataset_to_zarr(self.ds, storepath) + def to_icechunk(self, store: "IcechunkStore") -> None: + """ + Write an xarray dataset whose variables wrap ManifestArrays to an Icechunk store. + + Currently requires all variables to be backed by ManifestArray objects. + + Parameters + ---------- + store: IcechunkStore + """ + dataset_to_icechunk(self.ds, store) + @overload def to_kerchunk( self, filepath: None, format: Literal["dict"]