Skip to content

Commit

Permalink
move some tests that specifically only apply to the kerchunk reader and writer to a different module (#351)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomNicholas authored Dec 16, 2024
1 parent 1dbd119 commit 6f3fb1c
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 78 deletions.
80 changes: 2 additions & 78 deletions virtualizarr/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,89 +428,13 @@ def test_open_virtual_dataset_passes_expected_args(
mock_read_kerchunk.assert_called_once_with(**args)

@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend):
    """Open the ``hdf5_empty`` fixture with each HDF backend.

    The resulting virtual dataset must expose an ``empty`` variable that has
    no dimensions and carries the attribute ``{"empty": "true"}``.
    """
    # The unused ``tmpdir`` fixture from the earlier signature is not requested.
    vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
    assert vds.empty.dims == ()
    assert vds.empty.attrs == {"empty": "true"}

@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend):
    """Open the ``hdf5_scalar`` fixture with each HDF backend.

    The resulting virtual dataset must expose a ``scalar`` variable that has
    no dimensions and carries the attribute ``{"scalar": "true"}``.
    """
    # The unused ``tmpdir`` fixture from the earlier signature is not requested.
    vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
    assert vds.scalar.dims == ()
    assert vds.scalar.attrs == {"scalar": "true"}


@requires_kerchunk
@pytest.mark.parametrize(
    "reference_format",
    ["json", "parquet", "invalid"],
)
def test_open_virtual_dataset_existing_kerchunk_refs(
    tmp_path, netcdf4_virtual_dataset, reference_format
):
    """Reopen kerchunk references that were previously written to disk.

    The references generated from ``netcdf4_virtual_dataset`` are stored as
    JSON or parquet under ``tmp_path`` and opened with ``filetype="kerchunk"``;
    the reopened dataset must yield the same selected chunk references and the
    same variable/coordinate names. The ``"invalid"`` case instead writes a
    ``ref.csv`` file and expects opening it to raise ``ValueError``.
    """
    source_refs = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict")

    if reference_format == "invalid":
        # An unsupported reference file must be rejected with a ValueError.
        bad_path = tmp_path / "ref.csv"
        with open(bad_path.as_posix(), mode="w") as handle:
            handle.write("tmp")

        with pytest.raises(ValueError):
            open_virtual_dataset(
                filepath=bad_path.as_posix(), filetype="kerchunk", indexes={}
            )
        return

    # Valid formats: serialise the references in the requested on-disk layout.
    if reference_format == "json":
        refs_path = tmp_path / "ref.json"

        import ujson

        with open(refs_path, "w") as handle:
            ujson.dump(source_refs, handle)
    elif reference_format == "parquet":
        from kerchunk.df import refs_to_dataframe

        refs_path = tmp_path / "ref.parquet"
        refs_to_dataframe(fo=source_refs, url=refs_path.as_posix())

    vds = open_virtual_dataset(
        filepath=refs_path.as_posix(), filetype="kerchunk", indexes={}
    )

    # Whole-dict equality is not asserted because results are inconsistent, see
    # https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202
    # so only these specific chunk entries are compared.
    actual = vds.virtualize.to_kerchunk(format="dict")["refs"]
    expected = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict")["refs"]
    for chunk_key in ("air/0.0.0", "lon/0", "lat/0", "time/0"):
        assert actual[chunk_key] == expected[chunk_key]

    # The reopened dataset must carry the same names as the original one.
    assert list(vds) == list(netcdf4_virtual_dataset)
    assert set(vds.coords) == set(netcdf4_virtual_dataset.coords)
    assert set(vds.variables) == set(netcdf4_virtual_dataset.variables)


@requires_kerchunk
def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref):
    """Reading kerchunk references that contain inlined data is not supported.

    The ``netcdf4_inlined_ref`` fixture is dumped to ``ref.json`` under
    ``tmp_path``; opening that file must raise ``NotImplementedError``. See
    https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932
    """
    inlined_refs_path = tmp_path / "ref.json"

    import ujson

    with open(inlined_refs_path, "w") as sink:
        ujson.dump(netcdf4_inlined_ref, sink)

    reader_kwargs = {
        "filepath": inlined_refs_path.as_posix(),
        "filetype": "kerchunk",
        "indexes": {},
    }
    with pytest.raises(NotImplementedError):
        open_virtual_dataset(**reader_kwargs)
77 changes: 77 additions & 0 deletions virtualizarr/tests/test_readers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from virtualizarr.backend import open_virtual_dataset
from virtualizarr.manifests import ManifestArray
from virtualizarr.tests import requires_kerchunk


def gen_ds_refs(
Expand Down Expand Up @@ -171,3 +172,79 @@ def test_handle_relative_paths(refs_file_factory):
assert vda.data.manifest.dict() == {
"0.0": {"path": "file:///some_directory/test1.nc", "offset": 6144, "length": 48}
}


@requires_kerchunk
@pytest.mark.parametrize(
    "reference_format",
    ["json", "parquet", "invalid"],
)
def test_open_virtual_dataset_existing_kerchunk_refs(
    tmp_path, netcdf4_virtual_dataset, reference_format
):
    """Reopen kerchunk references that were previously written to disk.

    The references generated from ``netcdf4_virtual_dataset`` are stored as
    JSON or parquet under ``tmp_path`` and opened with ``filetype="kerchunk"``;
    the reopened dataset must yield the same selected chunk references and the
    same variable/coordinate names. The ``"invalid"`` case instead writes a
    ``ref.csv`` file and expects opening it to raise ``ValueError``.
    """
    source_refs = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict")

    if reference_format == "invalid":
        # An unsupported reference file must be rejected with a ValueError.
        bad_path = tmp_path / "ref.csv"
        with open(bad_path.as_posix(), mode="w") as handle:
            handle.write("tmp")

        with pytest.raises(ValueError):
            open_virtual_dataset(
                filepath=bad_path.as_posix(), filetype="kerchunk", indexes={}
            )
        return

    # Valid formats: serialise the references in the requested on-disk layout.
    if reference_format == "json":
        refs_path = tmp_path / "ref.json"

        import ujson

        with open(refs_path, "w") as handle:
            ujson.dump(source_refs, handle)
    elif reference_format == "parquet":
        from kerchunk.df import refs_to_dataframe

        refs_path = tmp_path / "ref.parquet"
        refs_to_dataframe(fo=source_refs, url=refs_path.as_posix())

    vds = open_virtual_dataset(
        filepath=refs_path.as_posix(), filetype="kerchunk", indexes={}
    )

    # Whole-dict equality is not asserted because results are inconsistent, see
    # https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202
    # so only these specific chunk entries are compared.
    actual = vds.virtualize.to_kerchunk(format="dict")["refs"]
    expected = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict")["refs"]
    for chunk_key in ("air/0.0.0", "lon/0", "lat/0", "time/0"):
        assert actual[chunk_key] == expected[chunk_key]

    # The reopened dataset must carry the same names as the original one.
    assert list(vds) == list(netcdf4_virtual_dataset)
    assert set(vds.coords) == set(netcdf4_virtual_dataset.coords)
    assert set(vds.variables) == set(netcdf4_virtual_dataset.variables)


@requires_kerchunk
def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref):
    """Reading kerchunk references that contain inlined data is not supported.

    The ``netcdf4_inlined_ref`` fixture is dumped to ``ref.json`` under
    ``tmp_path``; opening that file must raise ``NotImplementedError``. See
    https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932
    """
    inlined_refs_path = tmp_path / "ref.json"

    import ujson

    with open(inlined_refs_path, "w") as sink:
        ujson.dump(netcdf4_inlined_ref, sink)

    reader_kwargs = {
        "filepath": inlined_refs_path.as_posix(),
        "filetype": "kerchunk",
        "indexes": {},
    }
    with pytest.raises(NotImplementedError):
        open_virtual_dataset(**reader_kwargs)

0 comments on commit 6f3fb1c

Please sign in to comment.