From d18099a8f8b2e5674efdaf646e5ef0b968c94442 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Sat, 13 Jul 2024 15:55:27 -0700 Subject: [PATCH 1/2] changes to main to get running Most changes brought in from main branch to fix a few errors. Also a change so that if a parquet file is input the NotImplemented exception is raised, and a docstring added for the kerchunk filetype. --- virtualizarr/kerchunk.py | 3 +++ virtualizarr/utils.py | 20 ++------------------ virtualizarr/xarray.py | 21 +++++++++++++++------ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index a98a4045..2493ee42 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -139,6 +139,9 @@ def _automatically_determine_filetype( fpath = _fsspec_openfile_from_filepath( filepath=filepath, reader_options=reader_options ) + magic_bytes = fpath.read(8) + fpath.close() + if magic_bytes.startswith(b"CDF"): filetype = FileType.netcdf3 elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index a4218571..4899d41d 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -44,24 +44,8 @@ def _fsspec_openfile_from_filepath( universal_filepath = UPath(filepath) protocol = universal_filepath.protocol - if protocol == "": - fpath = fsspec.open(universal_filepath, "rb") - if universal_filepath.is_file(): - fpath = fpath.open() - - elif protocol in ["s3"]: - s3_anon_defaults = {"key": "", "secret": "", "anon": True} - if not bool(reader_options): - storage_options = s3_anon_defaults - - else: - storage_options = reader_options.get("storage_options") # type: ignore - - # using dict merge operator to add in defaults if keys are not specified - storage_options = s3_anon_defaults | storage_options - fpath = fsspec.filesystem(protocol, **storage_options) - if universal_filepath.is_file(): - fpath = fpath.open(filepath) + if protocol == "s3": + protocol_defaults = {"key": "", 
"secret": "", "anon": True} else: protocol_defaults = {} diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index aaa4ee6f..aa73aee2 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -57,8 +57,10 @@ def open_virtual_dataset( File path to open as a set of virtualized zarr arrays. filetype : FileType, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk'}. If not provided will attempt to automatically infer the correct filetype from header bytes. + For type of 'kerchunk' the file must be a JSON containing kerchunk references + and the filetype string must be input. drop_variables: list[str], default is None Variables in the file to drop before returning. loadable_variables: list[str], default is None @@ -125,9 +127,16 @@ def open_virtual_dataset( ) if filetype == "kerchunk": - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options - ) + + try: + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) + except IsADirectoryError: + # moved this here because fsspec wasn't finding the + # suffix of the parquet file due to thinking it was a directory + # but left the other code since it should ultimately be back down there + raise NotImplementedError() from upath import UPath @@ -140,8 +149,8 @@ def open_virtual_dataset( vds = dataset_from_kerchunk_refs(refs_dict) return vds - elif kerchunk_storage_ftype == ".parquet": - raise NotImplementedError + elif kerchunk_storage_ftype in [".parquet", ".parq"]: + raise NotImplementedError() # Question: How should we read the parquet files # into a dict to pass into dataset_from_kerchunk_refs? 
From 9308fb4b499ee89d1c44f12e61d9acc3823ffdf4 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Sat, 13 Jul 2024 16:29:14 -0700 Subject: [PATCH 2/2] added release and docs --- docs/releases.rst | 3 +++ docs/usage.md | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index c44ff245..3b1ae34a 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.0.1 (unreleased) New Features ~~~~~~~~~~~~ +- Can open ``kerchunk`` reference files with ``open_virtual_dataset`` if they are JSON + (:pull:`119`, :pull:`186`) By `Raphael Hagen <https://github.com/norlandrhagen>`_, `Kristen Thyng <https://github.com/kthyng>`_. + Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/docs/usage.md b/docs/usage.md index b0935286..316641bd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -419,6 +419,14 @@ Currently there are not yet any zarr v3 readers which understand the chunk manif This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`. ``` +## Opening kerchunk files from disk as virtual datasets + +You can open kerchunk files from disk as virtual datasets if they are JSON (Parquet is not yet supported). This is helpful, for example, when you have existing model output that you want to create a kerchunk file for, but also want to be able to add to later. One way to do this is to create derivative JSON kerchunk files to represent the available model output and combine those into a single kerchunk file for all available model output. Then, as more model output becomes available, create more derivative kerchunk files to represent the new model output and, as needed, recreate the single combined kerchunk file that represents the full model dataset. + +```python +open_virtual_dataset('combined.json', filetype='kerchunk') +``` + ## Rewriting existing manifests Sometimes it can be useful to rewrite the contents of an already-generated manifest or virtual dataset.