Commit: Linting
abarciauskas-bgse committed Oct 30, 2024
1 parent 5846d7e commit 66bbd6e
Showing 4 changed files with 76 additions and 23 deletions.
22 changes: 22 additions & 0 deletions conftest.py
@@ -35,6 +35,28 @@ def netcdf4_file(tmpdir):
    return filepath


@pytest.fixture
def compressed_netcdf4_files(tmpdir):
    ds = xr.tutorial.open_dataset("air_temperature")
    # Define compression options for NetCDF
    encoding = {
        var: dict(compression="gzip", compression_opts=4) for var in ds.data_vars
    }

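    # Split the tutorial dataset's 2920 time steps into two equal halves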
    ds1 = ds.isel(time=slice(None, 1460))
    ds2 = ds.isel(time=slice(1460, None))

    # Save each half to disk as netCDF (in the temporary directory)
    filepath1 = f"{tmpdir}/air1_compressed.nc"
    filepath2 = f"{tmpdir}/air2_compressed.nc"
    ds1.to_netcdf(filepath1, engine="h5netcdf", encoding=encoding)
    ds2.to_netcdf(filepath2, engine="h5netcdf", encoding=encoding)
    ds1.close()
    ds2.close()

    return filepath1, filepath2


@pytest.fixture
def netcdf4_virtual_dataset(netcdf4_file):
    from virtualizarr import open_virtual_dataset
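The new compressed_netcdf4_files fixture returns the two compressed file paths as a tuple, so a test can unpack them directly. A minimal sketch of a consumer, assuming h5py is available to inspect the gzip filter (this test function is illustrative, not part of the commit):

import h5py

def test_files_are_gzip_compressed(compressed_netcdf4_files):
    filepath1, filepath2 = compressed_netcdf4_files
    for path in (filepath1, filepath2):
        with h5py.File(path, "r") as f:
            # h5netcdf writes HDF5 files, so the gzip filter is visible per dataset
            assert f["air"].compression == "gzip"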
57 changes: 44 additions & 13 deletions virtualizarr/tests/test_writers/test_icechunk_append.py
@@ -69,7 +69,7 @@ def gen_virtual_dataset(
    shape: tuple[int, int] = (3, 4),
    chunk_shape: tuple[int, int] = (3, 4),
    dtype: np.dtype = np.dtype("int32"),
-    compressor: str = None,
+    compressor: dict = None,
    filters: str = None,
    fill_value: str = None,
    encoding: dict = None,
@@ -95,7 +95,7 @@
    )
    ma = ManifestArray(chunkmanifest=manifest, zarray=zarray)
    ds = open_dataset(file_uri)
-    dims = dims or list(ds.dims.keys())
+    dims = dims or ds.sizes.keys()
    var = Variable(
        data=ma,
        dims=dims,
@@ -161,7 +161,7 @@ def test_append_virtual_ref_with_encoding(
            dims=["time", "lat", "lon"],
            dtype=np.dtype("int16"),
            variable_name="air",
-            encoding={"_FillValue": -9999, "scale_factor": scale_factor},
+            encoding={"scale_factor": scale_factor},
            base_offset=15419,
            length=3869000,
        ),
@@ -172,7 +172,7 @@
            dims=["time", "lat", "lon"],
            dtype=np.dtype("int16"),
            variable_name="air",
-            encoding={"_FillValue": -9999, "scale_factor": scale_factor},
+            encoding={"scale_factor": scale_factor},
            base_offset=15419,
            length=3869000,
        ),
@@ -201,31 +201,60 @@ def test_append_virtual_ref_with_encoding(


## When appending to a virtual ref with compression, it succeeds
-@pytest.mark.skip(reason="working on this")
+@pytest.mark.skip(reason="Failing with gzip.BadGzipFile: Not a gzipped file")
def test_append_with_compression_succeeds(
-    icechunk_storage: "StorageConfig", simple_netcdf4: str
+    icechunk_storage: "StorageConfig", compressed_netcdf4_files: str
):
    import xarray as xr
    from icechunk import IcechunkStore

    file1, file2 = compressed_netcdf4_files
    # Generate two compressed virtual datasets
-    vds = gen_virtual_dataset(
-        file_uri=simple_netcdf4, compressor="zlib", dtype=np.dtype("int16")
+    vds1, vds2 = (
        gen_virtual_dataset(
            file_uri=file1,
            # https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata
            compressor={"id": "gzip", "level": 4},
            dtype=np.dtype("int16"),
            variable_name="air",
            shape=(1460, 25, 53),
            chunk_shape=(1460, 25, 53),
            base_offset=15419,
            length=3869000,
        ),
        gen_virtual_dataset(
            file_uri=file2,
            compressor={"id": "gzip", "level": 4},
            dtype=np.dtype("int16"),
            variable_name="air",
            shape=(1460, 25, 53),
            chunk_shape=(1460, 25, 53),
            base_offset=15419,
            length=3869000,
        ),
    )

    # Create icechunk store and commit the compressed dataset
    icechunk_filestore = IcechunkStore.create(storage=icechunk_storage)
-    dataset_to_icechunk(vds, icechunk_filestore)
+    dataset_to_icechunk(vds1, icechunk_filestore)
    icechunk_filestore.commit("test commit")

    # Append another dataset with compatible compression
    icechunk_filestore_append = IcechunkStore.open_existing(
        storage=icechunk_storage, mode="a"
    )
-    dataset_to_icechunk(vds, icechunk_filestore_append, append_dim="x")
+    dataset_to_icechunk(vds2, icechunk_filestore_append, append_dim="time")
    root_group = group(store=icechunk_filestore_append)
    array = root_group["air"]

    expected_ds1, expected_ds2 = open_dataset(file1), open_dataset(file2)
    expected_array = xr.concat(
        [expected_ds1["air"], expected_ds2["air"]], dim="time"
    ).to_numpy()
    npt.assert_equal(array, expected_array)


## When chunk shapes are different it fails
@pytest.mark.skip(reason="working on this")
def test_append_with_different_chunking_fails(
    icechunk_storage: "StorageConfig", simple_netcdf4: str
):
@@ -246,14 +246,16 @@ def test_append_with_different_chunking_fails(
    icechunk_filestore_append = IcechunkStore.open_existing(
        storage=icechunk_storage, mode="a"
    )
-    with pytest.raises(ValueError, match="incompatible chunking"):
+    with pytest.raises(
+        ValueError, match="Cannot concatenate arrays with inconsistent chunk shapes"
+    ):
        dataset_to_icechunk(
            vds_different_chunking, icechunk_filestore_append, append_dim="x"
        )


## When encoding is different it fails
-@pytest.mark.skip(reason="working on this")
+# @pytest.mark.skip(reason="working on this")
def test_append_with_different_encoding_fails(
    icechunk_storage: "StorageConfig", simple_netcdf4: str
):
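For reference, the round-trip assertion in test_append_with_compression_succeeds above expects the appended store to hold both halves of the air_temperature tutorial dataset stacked along time. A standalone sketch of that expectation, assuming the tutorial dataset's (time, lat, lon) shape of (2920, 25, 53):

import xarray as xr

ds = xr.tutorial.open_dataset("air_temperature")
first = ds.isel(time=slice(None, 1460))["air"]
second = ds.isel(time=slice(1460, None))["air"]
combined = xr.concat([first, second], dim="time")
# The two (1460, 25, 53) halves recombine into the original array
assert combined.shape == (2920, 25, 53)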
18 changes: 9 additions & 9 deletions virtualizarr/writers/icechunk.py
@@ -224,16 +224,16 @@ def write_virtual_variable_to_icechunk(
        # TODO fill_value?
    )

-    # TODO it would be nice if we could assign directly to the .attrs property
-    # Aimee: assert that new attributes are the same as existing attributes
-    for k, v in var.attrs.items():
-        arr.attrs[k] = encode_zarr_attr_value(v)
-    arr.attrs["_ARRAY_DIMENSIONS"] = encode_zarr_attr_value(var.dims)
-
-    _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
-    for k, v in var.encoding.items():
-        if k in _encoding_keys:
+    # TODO it would be nice if we could assign directly to the .attrs property
+    # Aimee: Can we assume that the attributes are the same for the new array?
+    for k, v in var.attrs.items():
+        arr.attrs[k] = encode_zarr_attr_value(v)
+    arr.attrs["_ARRAY_DIMENSIONS"] = encode_zarr_attr_value(var.dims)
+
+    _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
+    for k, v in var.encoding.items():
+        if k in _encoding_keys:
            arr.attrs[k] = encode_zarr_attr_value(v)

    write_manifest_virtual_refs(
        store=store,
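In the block above, only the four CF-style keys in _encoding_keys are promoted from var.encoding onto the array attributes; any other encoding entries are left out. A small illustration of that filter (the example encoding dict is hypothetical):

_encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
encoding = {"scale_factor": 0.01, "add_offset": 273.15, "chunksizes": (1460, 25, 53)}
promoted = {k: v for k, v in encoding.items() if k in _encoding_keys}
# promoted == {"scale_factor": 0.01, "add_offset": 273.15}; "chunksizes" is dropped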
2 changes: 1 addition & 1 deletion virtualizarr/zarr.py
@@ -227,5 +227,5 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict:
        return num_codec

    num_codec_copy = num_codec.copy()
-    name = "numcodecs." + num_codec_copy.pop("id")
+    name = num_codec_copy.pop("id")
    return {"name": name, "configuration": num_codec_copy}
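With the numcodecs. prefix gone, a numcodecs-style codec config now maps to a plain name/configuration pair, matching what the compression test passes as compressor. A quick sketch of the before/after behavior:

from virtualizarr.zarr import _num_codec_config_to_configurable

result = _num_codec_config_to_configurable({"id": "gzip", "level": 4})
# after this commit:  {"name": "gzip", "configuration": {"level": 4}}
# before this commit: {"name": "numcodecs.gzip", "configuration": {"level": 4}}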
