diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
index 0c054a3b..9db0a370 100644
--- a/virtualizarr/tests/test_backend.py
+++ b/virtualizarr/tests/test_backend.py
@@ -428,89 +428,13 @@ def test_open_virtual_dataset_passes_expected_args(
         mock_read_kerchunk.assert_called_once_with(**args)
 
     @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
-    def test_open_dataset_with_empty(self, hdf5_empty, tmpdir, hdf_backend):
+    def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend):
         vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
         assert vds.empty.dims == ()
         assert vds.empty.attrs == {"empty": "true"}
 
     @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
-    def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir, hdf_backend):
+    def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend):
         vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
         assert vds.scalar.dims == ()
         assert vds.scalar.attrs == {"scalar": "true"}
-
-
-@requires_kerchunk
-@pytest.mark.parametrize(
-    "reference_format",
-    ["json", "parquet", "invalid"],
-)
-def test_open_virtual_dataset_existing_kerchunk_refs(
-    tmp_path, netcdf4_virtual_dataset, reference_format
-):
-    example_reference_dict = netcdf4_virtual_dataset.virtualize.to_kerchunk(
-        format="dict"
-    )
-
-    if reference_format == "invalid":
-        # Test invalid file format leads to ValueError
-        ref_filepath = tmp_path / "ref.csv"
-        with open(ref_filepath.as_posix(), mode="w") as of:
-            of.write("tmp")
-
-        with pytest.raises(ValueError):
-            open_virtual_dataset(
-                filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={}
-            )
-
-    else:
-        # Test valid json and parquet reference formats
-
-        if reference_format == "json":
-            ref_filepath = tmp_path / "ref.json"
-
-            import ujson
-
-            with open(ref_filepath, "w") as json_file:
-                ujson.dump(example_reference_dict, json_file)
-
-        if reference_format == "parquet":
-            from kerchunk.df import refs_to_dataframe
-
-            ref_filepath = tmp_path / "ref.parquet"
-            refs_to_dataframe(fo=example_reference_dict, url=ref_filepath.as_posix())
-
-        vds = open_virtual_dataset(
-            filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={}
-        )
-
-        # Inconsistent results! https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202
-        # assert vds.virtualize.to_kerchunk(format='dict') == example_reference_dict
-        refs = vds.virtualize.to_kerchunk(format="dict")
-        expected_refs = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict")
-        assert refs["refs"]["air/0.0.0"] == expected_refs["refs"]["air/0.0.0"]
-        assert refs["refs"]["lon/0"] == expected_refs["refs"]["lon/0"]
-        assert refs["refs"]["lat/0"] == expected_refs["refs"]["lat/0"]
-        assert refs["refs"]["time/0"] == expected_refs["refs"]["time/0"]
-
-        assert list(vds) == list(netcdf4_virtual_dataset)
-        assert set(vds.coords) == set(netcdf4_virtual_dataset.coords)
-        assert set(vds.variables) == set(netcdf4_virtual_dataset.variables)
-
-
-@requires_kerchunk
-def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref):
-    # For now, we raise a NotImplementedError if we read existing references that have inlined data
-    # https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932
-
-    ref_filepath = tmp_path / "ref.json"
-
-    import ujson
-
-    with open(ref_filepath, "w") as json_file:
-        ujson.dump(netcdf4_inlined_ref, json_file)
-
-    with pytest.raises(NotImplementedError):
-        open_virtual_dataset(
-            filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={}
-        )
diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py
index c89d0f53..10f385d6 100644
--- a/virtualizarr/tests/test_readers/test_kerchunk.py
+++ b/virtualizarr/tests/test_readers/test_kerchunk.py
@@ -7,6 +7,7 @@
 
 from virtualizarr.backend import open_virtual_dataset
 from virtualizarr.manifests import ManifestArray
+from virtualizarr.tests import requires_kerchunk
 
 
 def gen_ds_refs(
@@ -171,3 +172,79 @@ def test_handle_relative_paths(refs_file_factory):
     assert vda.data.manifest.dict() == {
         "0.0": {"path": "file:///some_directory/test1.nc", "offset": 6144, "length": 48}
     }
+
+
+@requires_kerchunk
+@pytest.mark.parametrize(
+    "reference_format",
+    ["json", "parquet", "invalid"],
+)
+def test_open_virtual_dataset_existing_kerchunk_refs(
+    tmp_path, netcdf4_virtual_dataset, reference_format
+):
+    example_reference_dict = netcdf4_virtual_dataset.virtualize.to_kerchunk(
+        format="dict"
+    )
+
+    if reference_format == "invalid":
+        # Test invalid file format leads to ValueError
+        ref_filepath = tmp_path / "ref.csv"
+        with open(ref_filepath.as_posix(), mode="w") as of:
+            of.write("tmp")
+
+        with pytest.raises(ValueError):
+            open_virtual_dataset(
+                filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={}
+            )
+
+    else:
+        # Test valid json and parquet reference formats
+
+        if reference_format == "json":
+            ref_filepath = tmp_path / "ref.json"
+
+            import ujson
+
+            with open(ref_filepath, "w") as json_file:
+                ujson.dump(example_reference_dict, json_file)
+
+        if reference_format == "parquet":
+            from kerchunk.df import refs_to_dataframe
+
+            ref_filepath = tmp_path / "ref.parquet"
+            refs_to_dataframe(fo=example_reference_dict, url=ref_filepath.as_posix())
+
+        vds = open_virtual_dataset(
+            filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={}
+        )
+
+        # Inconsistent results! https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202
+        # assert vds.virtualize.to_kerchunk(format='dict') == example_reference_dict
+        refs = vds.virtualize.to_kerchunk(format="dict")
+        expected_refs = netcdf4_virtual_dataset.virtualize.to_kerchunk(format="dict")
+        assert refs["refs"]["air/0.0.0"] == expected_refs["refs"]["air/0.0.0"]
+        assert refs["refs"]["lon/0"] == expected_refs["refs"]["lon/0"]
+        assert refs["refs"]["lat/0"] == expected_refs["refs"]["lat/0"]
+        assert refs["refs"]["time/0"] == expected_refs["refs"]["time/0"]
+
+        assert list(vds) == list(netcdf4_virtual_dataset)
+        assert set(vds.coords) == set(netcdf4_virtual_dataset.coords)
+        assert set(vds.variables) == set(netcdf4_virtual_dataset.variables)
+
+
+@requires_kerchunk
+def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref):
+    # For now, we raise a NotImplementedError if we read existing references that have inlined data
+    # https://github.com/zarr-developers/VirtualiZarr/pull/251#pullrequestreview-2361916932
+
+    ref_filepath = tmp_path / "ref.json"
+
+    import ujson
+
+    with open(ref_filepath, "w") as json_file:
+        ujson.dump(netcdf4_inlined_ref, json_file)
+
+    with pytest.raises(NotImplementedError):
+        open_virtual_dataset(
+            filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={}
+        )