Merge branch 'main' into pre-commit-ci-update-config
TomNicholas authored Oct 17, 2024
2 parents c1b3c7e + ec8e465 commit 07b2fa2
Showing 15 changed files with 255 additions and 53 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/main.yml
@@ -46,10 +46,6 @@ jobs:
conda env list
conda list
- name: Type check
run: |
mypy virtualizarr
- name: Running Tests
run: |
python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose
38 changes: 38 additions & 0 deletions .github/workflows/typing.yml
@@ -0,0 +1,38 @@
name: Typing

on:
push:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
pull_request:
branches: [ "main" ]
paths-ignore:
- 'docs/**'
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
mypy:
name: mypy
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.12'

- name: Install deps
run: |
# We need to install the optional test deps to pull in all the library stubs
pip install -e '.[test]'
- name: Type check
run: |
mypy virtualizarr
1 change: 1 addition & 0 deletions .gitignore
@@ -160,3 +160,4 @@ cython_debug/
#.idea/
virtualizarr/_version.py
docs/generated/
examples/
1 change: 0 additions & 1 deletion ci/doc.yml
@@ -13,4 +13,3 @@ dependencies:
- "sphinx_design"
- "sphinx_togglebutton"
- "sphinx-autodoc-typehints"
- -e "..[test]"
14 changes: 14 additions & 0 deletions conftest.py
@@ -33,6 +33,20 @@ def netcdf4_file(tmpdir):
return filepath


@pytest.fixture
def netcdf4_virtual_dataset(netcdf4_file):
from virtualizarr import open_virtual_dataset

return open_virtual_dataset(netcdf4_file, indexes={})


@pytest.fixture
def netcdf4_inlined_ref(netcdf4_file):
from kerchunk.hdf import SingleHdf5ToZarr

return SingleHdf5ToZarr(netcdf4_file, inline_threshold=1000).translate()


@pytest.fixture
def hdf5_groups_file(tmpdir):
# Set up example xarray dataset
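These fixtures build on `netcdf4_file`, so a test can request a ready-made virtual dataset or inlined reference directly. A minimal sketch of a test consuming the new fixture (hypothetical test, not part of this commit):

```python
from virtualizarr.manifests import ManifestArray


def test_netcdf4_virtual_dataset(netcdf4_virtual_dataset):
    # with indexes={} and no loadable_variables, every data variable
    # wraps a ManifestArray of byte ranges rather than in-memory values
    for var in netcdf4_virtual_dataset.data_vars.values():
        assert isinstance(var.data, ManifestArray)
```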
5 changes: 5 additions & 0 deletions docs/releases.rst
@@ -8,6 +8,11 @@ v1.0.1 (unreleased)

New Features
~~~~~~~~~~~~


- Can open `kerchunk` reference files with ``open_virtual_dataset``.
(:pull:`251`, :pull:`186`) By `Raphael Hagen <https://github.com/norlandrhagen>`_ & `Kristen Thyng <https://github.com/kthyng>`_.

- Adds defaults for `open_virtual_dataset_from_v3_store` (:pull:`234`)
By `Raphael Hagen <https://github.com/norlandrhagen>`_.

12 changes: 12 additions & 0 deletions docs/usage.md
@@ -421,6 +421,18 @@ Currently there are not yet any zarr v3 readers which understand the chunk manifest
This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`.
```

## Opening Kerchunk references as virtual datasets

You can open existing Kerchunk `json` or `parquet` references as VirtualiZarr virtual datasets. This may be useful for converting existing Kerchunk-formatted references to storage formats like [Icechunk](https://icechunk.io/).

```python
from virtualizarr import open_virtual_dataset

vds = open_virtual_dataset('combined.json', filetype='kerchunk')
# or
vds = open_virtual_dataset('combined.parquet', filetype='kerchunk')
```
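Once opened, the references can be written straight back out with the `virtualize` accessor described earlier in this guide. A minimal round-trip sketch, assuming `combined.json` exists:

```python
# open existing JSON references, then persist them in the parquet layout
vds = open_virtual_dataset('combined.json', filetype='kerchunk')
vds.virtualize.to_kerchunk('combined.parquet', format='parquet')
```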

## Rewriting existing manifests

Sometimes it can be useful to rewrite the contents of an already-generated manifest or virtual dataset.
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -87,6 +87,10 @@ ignore_missing_imports = true
module = "kerchunk.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "ujson.*"
ignore_missing_imports = true

[tool.ruff]
# Same as Black.
line-length = 88
50 changes: 44 additions & 6 deletions virtualizarr/backend.py
@@ -16,7 +16,8 @@
from xarray.core.variable import IndexVariable

from virtualizarr.manifests import ManifestArray
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.utils import _FsspecFSFromFilepath

XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore

@@ -39,6 +40,7 @@ class FileType(AutoName):
zarr = auto()
dmrpp = auto()
zarr_v3 = auto()
kerchunk = auto()


class ManifestBackendArray(ManifestArray, BackendArray):
@@ -67,13 +69,14 @@ def open_virtual_dataset(
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``.
Parameters
----------
filepath : str, default None
File path to open as a set of virtualized zarr arrays.
filetype : FileType, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk'}.
If not provided will attempt to automatically infer the correct filetype from header bytes.
group : str, default is None
Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes “netcdf4” and “hdf5”.
@@ -133,9 +136,44 @@ def open_virtual_dataset(
raise NotImplementedError()

# if filetype is user defined, convert to FileType

if filetype is not None:
filetype = FileType(filetype)

if filetype == FileType.kerchunk:
from virtualizarr.readers.kerchunk import dataset_from_kerchunk_refs

fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options)

# The kerchunk .parquet storage format isn't actually a single parquet file, but a directory containing named parquet files for each group/variable.
if fs.filepath.endswith("ref.parquet"):
from fsspec.implementations.reference import LazyReferenceMapper

lrm = LazyReferenceMapper(filepath, fs.fs)

# build reference dict from KV pairs in LazyReferenceMapper
# is there a better / more performant way to extract this?
array_refs = {k: lrm[k] for k in lrm.keys()}

full_reference = {"refs": array_refs}

return dataset_from_kerchunk_refs(KerchunkStoreRefs(full_reference))

# JSON has no magic bytes, but the Kerchunk version 1 spec starts with 'version':
# https://fsspec.github.io/kerchunk/spec.html
elif fs.read_bytes(9).startswith(b'{"version'):
import ujson

with fs.open_file() as of:
refs = ujson.load(of)

return dataset_from_kerchunk_refs(KerchunkStoreRefs(refs))

else:
raise ValueError(
"The input Kerchunk reference did not seem to be in Kerchunk's JSON or Parquet spec: https://fsspec.github.io/kerchunk/spec.html. The Kerchunk format autodetection is quite flaky, so if your reference matches the Kerchunk spec feel free to open an issue: https://github.com/zarr-developers/VirtualiZarr/issues"
)

if filetype == FileType.zarr_v3:
# TODO is there a neat way of auto-detecting this?
from virtualizarr.readers.zarr import open_virtual_dataset_from_v3_store
@@ -151,9 +189,9 @@ def open_virtual_dataset(
"Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
)

fpath = _fsspec_openfile_from_filepath(
fpath = _FsspecFSFromFilepath(
filepath=filepath, reader_options=reader_options
)
).open_file()
parser = DMRParser(fpath.read(), data_filepath=filepath.strip(".dmrpp"))
vds = parser.parse_dataset()
vds.drop_vars(drop_variables)
Expand Down Expand Up @@ -189,9 +227,9 @@ def open_virtual_dataset(
# TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
# TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
# TODO really we probably want a dedicated xarray backend that iterates over all variables only once
fpath = _fsspec_openfile_from_filepath(
fpath = _FsspecFSFromFilepath(
filepath=filepath, reader_options=reader_options
)
).open_file()

# fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
# We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through.
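The new detection logic reduces to two checks: the directory-style parquet layout is recognised by its `ref.parquet` suffix, and JSON references by their leading `{"version` bytes. A simplified standalone sketch of that order (hypothetical helper, not part of this commit):

```python
import fsspec


def guess_kerchunk_format(filepath: str) -> str:
    # mirrors the detection order in open_virtual_dataset above
    fs, path = fsspec.core.url_to_fs(filepath)
    if path.endswith("ref.parquet"):
        # kerchunk parquet refs are a directory of per-variable parquet files
        return "parquet"
    with fs.open(path, "rb") as f:
        # JSON has no magic bytes; the kerchunk v1 spec starts with '{"version'
        if f.read(9).startswith(b'{"version'):
            return "json"
    raise ValueError(f"{path} does not match the Kerchunk JSON or parquet spec")
```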
21 changes: 14 additions & 7 deletions virtualizarr/manifests/array_api.py
@@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Callable, Iterable
from typing import TYPE_CHECKING, Any, Callable, Iterable, cast

import numpy as np

@@ -217,9 +217,12 @@ def stack(
new_shape.insert(axis, length_along_new_stacked_axis)

# do stacking of entries in manifest
stacked_paths = np.stack(
[arr.manifest._paths for arr in arrays],
axis=axis,
stacked_paths = cast( # `np.stack` apparently is type hinted as if the output could have Any dtype
np.ndarray[Any, np.dtypes.StringDType],
np.stack(
[arr.manifest._paths for arr in arrays],
axis=axis,
),
)
stacked_offsets = np.stack(
[arr.manifest._offsets for arr in arrays],
@@ -296,10 +299,14 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArray":
)

# do broadcasting of entries in manifest
broadcasted_paths = np.broadcast_to(
x.manifest._paths,
shape=new_chunk_grid_shape,
broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype
np.ndarray[Any, np.dtypes.StringDType],
np.broadcast_to(
x.manifest._paths,
shape=new_chunk_grid_shape,
),
)

broadcasted_offsets = np.broadcast_to(
x.manifest._offsets,
shape=new_chunk_grid_shape,
13 changes: 9 additions & 4 deletions virtualizarr/manifests/manifest.py
@@ -84,7 +84,7 @@ class ChunkManifest:
so it's not possible to have a ChunkManifest object that does not represent a valid grid of chunks.
"""

_paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined]
_paths: np.ndarray[Any, np.dtypes.StringDType]
_offsets: np.ndarray[Any, np.dtype[np.uint64]]
_lengths: np.ndarray[Any, np.dtype[np.uint64]]

@@ -113,7 +113,10 @@ def __init__(self, entries: dict) -> None:
shape = get_chunk_grid_shape(entries.keys())

# Initializing to empty implies that entries with path='' are treated as missing chunks
paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined]
paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype
np.ndarray[Any, np.dtypes.StringDType],
np.empty(shape=shape, dtype=np.dtypes.StringDType()),
)
offsets = np.empty(shape=shape, dtype=np.dtype("uint64"))
lengths = np.empty(shape=shape, dtype=np.dtype("uint64"))

@@ -141,7 +144,7 @@ def __init__(self, entries: dict) -> None:
@classmethod
def from_arrays(
cls,
paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined]
paths: np.ndarray[Any, np.dtypes.StringDType],
offsets: np.ndarray[Any, np.dtype[np.uint64]],
lengths: np.ndarray[Any, np.dtype[np.uint64]],
) -> "ChunkManifest":
@@ -306,7 +309,9 @@ def _from_kerchunk_chunk_dict(
chunk_entries: dict[ChunkKey, ChunkDictEntry] = {}
for k, v in kerchunk_chunk_dict.items():
if isinstance(v, (str, bytes)):
raise NotImplementedError("TODO: handle inlined data")
raise NotImplementedError(
"Reading inlined reference data is currently not supported. [ToDo]"
)
elif not isinstance(v, (tuple, list)):
raise TypeError(f"Unexpected type {type(v)} for chunk value: {v}")
chunk_entries[k] = ChunkEntry.from_kerchunk(v).dict()
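For context, `from_arrays` (whose `paths` annotation is corrected above) builds a manifest from three numpy arrays of identical shape. An illustrative sketch with made-up values (requires numpy >= 2.0 for `StringDType`):

```python
import numpy as np

from virtualizarr.manifests import ChunkManifest

# a 1x2 chunk grid: two chunks, each stored in a different file
paths = np.array(
    [["s3://bucket/a.nc", "s3://bucket/b.nc"]], dtype=np.dtypes.StringDType()
)
offsets = np.array([[0, 0]], dtype="uint64")
lengths = np.array([[100, 100]], dtype="uint64")

manifest = ChunkManifest.from_arrays(paths=paths, offsets=offsets, lengths=lengths)
```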
6 changes: 3 additions & 3 deletions virtualizarr/readers/kerchunk.py
@@ -13,7 +13,7 @@
KerchunkArrRefs,
KerchunkStoreRefs,
)
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.utils import _FsspecFSFromFilepath
from virtualizarr.zarr import ZArray, ZAttrs


@@ -28,9 +28,9 @@ def _automatically_determine_filetype(
raise NotImplementedError()

# Read magic bytes from local or remote file
fpath = _fsspec_openfile_from_filepath(
fpath = _FsspecFSFromFilepath(
filepath=filepath, reader_options=reader_options
)
).open_file()
magic_bytes = fpath.read(8)
fpath.close()
