diff --git a/pyproject.toml b/pyproject.toml index 6a45df11..33b304f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ test = [ "pre-commit", "pytest-mypy", "pytest", + "netCDF4" ] diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 48e4b5af..b507f15d 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -1,3 +1,4 @@ +import importlib.util from pathlib import Path from typing import List, NewType, Optional, Tuple, Union, cast @@ -61,10 +62,26 @@ def _automatically_determine_filetype(filepath: str) -> str: file_extension = Path(filepath).suffix if file_extension == ".nc": - # TODO how can we automatically distinguish netCDF3 and 4? - raise NotImplementedError( - "Cannot unambiguously automatically determine which kerchunk file format reader to use" - ) + # Check that the netCDF4 library is installed. + # It is currently not a hard requirement in pyproject.toml. + + if importlib.util.find_spec("netCDF4") is None: + raise ImportError( + "netCDF4 library is required to determine NetCDF file type." + ) + + import netCDF4 + + with netCDF4.Dataset(filepath, "r") as dataset: + if dataset.data_model == "NETCDF4": + filetype = "netCDF4" + elif dataset.data_model == "NETCDF3_CLASSIC": + filetype = "netCDF3" + else: + raise NotImplementedError( + ".nc file does not appear to be NETCDF3 OR NETCDF4" + ) + elif file_extension == ".zarr": # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... 
raise NotImplementedError() diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 6fbee558..21dc54c0 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -3,6 +3,7 @@ import xarray as xr import xarray.testing as xrt +from virtualizarr.kerchunk import _automatically_determine_filetype from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs @@ -130,3 +131,17 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): # Assert equal to original dataset xrt.assert_equal(roundtrip, ds) + + +def test_automatically_determine_filetype_netcdf3_netcdf4(tmp_path): + """Check that netCDF3 and netCDF4 ``.nc`` files are told apart automatically.""" + + ds = xr.Dataset({"a": (["x"], [0, 1])}) + netcdf3_file_path = str(tmp_path / "netcdf3.nc") + netcdf4_file_path = str(tmp_path / "netcdf4.nc") + + # write one file per NetCDF version into pytest's per-test temporary directory (no shared /tmp paths) + ds.to_netcdf(netcdf3_file_path, engine="scipy", format="NETCDF3_CLASSIC") + ds.to_netcdf(netcdf4_file_path, format="NETCDF4") + assert _automatically_determine_filetype(netcdf3_file_path) == "netCDF3" + assert _automatically_determine_filetype(netcdf4_file_path) == "netCDF4"