Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set ZArray default fill_value as NaT for datetime64 #206

Merged
merged 4 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ Bug fixes

- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- Fixed regression in `fill_value` handling for datetime dtypes making virtual
Zarr stores unreadable (:pr:`206`)
By `Timothy Hodson <https://github.com/thodson-usgs>`_

Documentation
~~~~~~~~~~~~~
Expand Down
47 changes: 47 additions & 0 deletions virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import xarray.testing as xrt

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests.array import ManifestArray
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.zarr import ZArray


@pytest.mark.parametrize(
Expand Down Expand Up @@ -166,6 +169,50 @@ def test_non_dimension_coordinates(self, tmpdir, format):
# assert equal to original dataset
xrt.assert_identical(roundtrip, ds)

def test_datetime64_dtype_fill_value(self, tmpdir, format):
chunks_dict = {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
}
manifest = ChunkManifest(entries=chunks_dict)
chunks = (1, 1, 1)
shape = (1, 1, 1)
zarray = ZArray(
chunks=chunks,
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("<M8[ns]"),
# fill_value=0.0,
filters=None,
order="C",
shape=shape,
zarr_format=2,
)
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest)
ds = xr.Dataset(
{
"a": xr.DataArray(
marr1,
attrs={
"_FillValue": np.datetime64("1970-01-01T00:00:00.000000000")
},
)
}
)

if format == "dict":
# write those references to an in-memory kerchunk-formatted references dictionary
ds_refs = ds.virtualize.to_kerchunk(format=format)

# use fsspec to read the dataset from the kerchunk references dict
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk")
else:
# write those references to disk as kerchunk references format
ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)

# use fsspec to read the dataset from disk via the kerchunk references
roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")

assert roundtrip.a.attrs == ds.a.attrs


def test_open_scalar_variable(tmpdir):
# regression test for GH issue #100
Expand Down
15 changes: 8 additions & 7 deletions virtualizarr/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,14 @@
) # just the .zattrs (for one array or for the whole store/group)
FillValueT = bool | str | float | int | list | None

ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = {
ZARR_DEFAULT_FILL_VALUE: dict[str, FillValueT] = {
# numpy dtypes's hierarchy lets us avoid checking for all the widths
# https://numpy.org/doc/stable/reference/arrays.scalars.html
np.dtype("bool"): False,
np.dtype("int"): 0,
np.dtype("float"): 0.0,
np.dtype("complex"): [0.0, 0.0],
np.dtype("bool").kind: False,
np.dtype("int").kind: 0,
np.dtype("float").kind: 0.0,
np.dtype("complex").kind: [0.0, 0.0],
np.dtype("datetime64").kind: 0,
}
"""
The value and format of the fill_value depend on the `data_type` of the array.
Expand Down Expand Up @@ -67,7 +68,7 @@ class ZArray(BaseModel):
chunks: tuple[int, ...]
compressor: dict | None = None
dtype: np.dtype
fill_value: FillValueT = Field(default=0.0, validate_default=True)
fill_value: FillValueT = Field(None, validate_default=True)
filters: list[dict] | None = None
order: Literal["C", "F"]
shape: tuple[int, ...]
Expand All @@ -90,7 +91,7 @@ def __post_init__(self) -> None:
@model_validator(mode="after")
def _check_fill_value(self) -> Self:
if self.fill_value is None:
self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0)
self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0)
return self

@property
Expand Down
Loading