diff --git a/ci/environment.yml b/ci/environment.yml
index 0bb5b366..70b5d430 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -27,7 +27,8 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
-  # for opening tiff files
+  # for opening and creating test tiff files
   - tifffile
+  - pillow
   # for opening FITS files
   - astropy
diff --git a/conftest.py b/conftest.py
index 55c07823..8514187b 100644
--- a/conftest.py
+++ b/conftest.py
@@ -120,3 +120,16 @@ def simple_netcdf4(tmpdir):
     ds.to_netcdf(filepath)

     return filepath
+
+
+@pytest.fixture
+def random_tiff(tmpdir):
+    from PIL import Image
+
+    array = np.random.randint(0, 255, (128, 128), dtype=np.uint8)
+    img = Image.fromarray(array)
+
+    filepath = tmpdir / "rand.tiff"
+    img.save(filepath)
+
+    return str(filepath)
diff --git a/pyproject.toml b/pyproject.toml
index 749afb94..ecfd7c13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,8 @@ test = [
     "ruff",
     "s3fs",
     "scipy",
+    "tifffile",
+    "pillow",
 ]
diff --git a/virtualizarr/__init__.py b/virtualizarr/__init__.py
index bd70f834..27376d3b 100644
--- a/virtualizarr/__init__.py
+++ b/virtualizarr/__init__.py
@@ -1,6 +1,6 @@
 from virtualizarr.manifests import ChunkManifest, ManifestArray  # type: ignore # noqa
 from virtualizarr.accessor import VirtualiZarrDatasetAccessor  # type: ignore # noqa
-from virtualizarr.backend import open_virtual_dataset  # noqa: F401
+from virtualizarr.backend import open_virtual_dataset, open_virtual_dataarray  # noqa: F401

 from importlib.metadata import version as _version

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index fab010c7..78fea0a2 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -7,7 +7,7 @@
     Optional,
 )

-from xarray import Dataset, Index
+from xarray import DataArray, Dataset, Index

 from virtualizarr.manifests import ManifestArray
 from virtualizarr.readers import (
@@ -198,3 +198,52 @@ def open_virtual_dataset(
     )

     return vds
+
+
+def open_virtual_dataarray(
+    filepath: str,
+    *,
+    filetype: FileType | None = None,
+    group: str | None = None,
+    drop_variables: Iterable[str] | None = None,
+    loadable_variables: Iterable[str] | None = None,
+    decode_times: bool | None = None,
+    indexes: Mapping[str, Index] | None = None,
+    virtual_array_class=ManifestArray,
+    reader_options: Optional[dict] = None,
+) -> DataArray:
+    drop_variables, loadable_variables = check_for_collisions(
+        drop_variables,
+        loadable_variables,
+    )
+
+    if virtual_array_class is not ManifestArray:
+        raise NotImplementedError()
+
+    if reader_options is None:
+        reader_options = {}
+
+    if filetype is not None:
+        # if filetype is user defined, convert to FileType
+        filetype = FileType(filetype)
+    else:
+        filetype = automatically_determine_filetype(
+            filepath=filepath, reader_options=reader_options
+        )
+
+    backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())
+
+    if backend_cls is None:
+        raise NotImplementedError(f"Unsupported file type: {filetype.name}")
+
+    vda = backend_cls.open_virtual_dataarray(
+        filepath,
+        group=group,
+        drop_variables=drop_variables,
+        loadable_variables=loadable_variables,
+        decode_times=decode_times,
+        indexes=indexes,
+        reader_options=reader_options,
+    )
+
+    return vda
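For context, `open_virtual_dataarray` is intended as the single-array counterpart of `open_virtual_dataset`: it dispatches to the backend's `open_virtual_dataarray` method and returns one virtual `DataArray`. A minimal usage sketch, based on the test added at the end of this diff (the filename here is illustrative):

    from virtualizarr import open_virtual_dataarray

    # no pixel data is read; the result wraps a ManifestArray of byte ranges
    vda = open_virtual_dataarray("image.tiff", indexes={})
    print(vda.sizes, vda.dtype)      # e.g. {'Y': 128, 'X': 128}, uint8
    print(vda.data.manifest.dict())  # chunk key -> {"path", "offset", "length"}
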
diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py
index 1ad24629..a546bc10 100644
--- a/virtualizarr/readers/common.py
+++ b/virtualizarr/readers/common.py
@@ -12,6 +12,7 @@

 from xarray import (
     Coordinates,
+    DataArray,
     Dataset,
     DataTree,
     Index,
@@ -93,11 +94,11 @@ def open_loadable_vars_and_indexes(


 def construct_virtual_dataset(
-    virtual_vars,
-    loadable_vars,
-    indexes,
-    coord_names,
-    attrs,
+    virtual_vars: Mapping[str, Variable],
+    loadable_vars: Mapping[str, Variable],
+    indexes: Mapping[str, Index],
+    coord_names: Iterable[str],
+    attrs: dict[str, str],
 ) -> Dataset:
     """Construct a virtual Datset from consistuent parts."""

@@ -117,6 +118,22 @@ def construct_virtual_dataset(
     return vds


+def construct_virtual_dataarray(
+    virtual_var: Variable,
+    loadable_vars: Mapping[str, Variable],
+    indexes: Mapping[str, Index],
+    coord_names: Iterable[str],
+    attrs: dict[str, str],
+) -> DataArray:
+    vda = DataArray(
+        data=virtual_var,
+        coords=coord_names,
+        # indexes={},  # TODO should be added in a later version of xarray
+        attrs=attrs,
+    )
+    return vda
+
+
 def separate_coords(
     vars: Mapping[str, Variable],
     indexes: MutableMapping[str, Index],
@@ -167,6 +184,18 @@ def separate_coords(


 class VirtualBackend(ABC):
+    @staticmethod
+    def open_virtual_dataarray(
+        filepath: str,
+        group: str | None = None,
+        drop_variables: Iterable[str] | None = None,
+        loadable_variables: Iterable[str] | None = None,
+        decode_times: bool | None = None,
+        indexes: Mapping[str, Index] | None = None,
+        reader_options: Optional[dict] = None,
+    ) -> DataArray:
+        raise NotImplementedError()
+
     @staticmethod
     def open_virtual_dataset(
         filepath: str,
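To make the new `construct_virtual_dataarray` signature concrete: the `virtual_var` it receives is an `xarray.Variable` whose data is a `ManifestArray` rather than a numpy array. A hand-rolled sketch of such a variable, using the same constructors the new `readers/zarr.py` further down relies on (the path, offset and length are illustrative, and it assumes `ZArray`'s remaining fields take their defaults):

    import numpy as np
    from xarray import Variable

    from virtualizarr.manifests import ChunkManifest, ManifestArray
    from virtualizarr.zarr import ZArray

    # a single 128x128 uint8 chunk stored as a byte range inside some file
    manifest = ChunkManifest(
        entries={"0.0": {"path": "/tmp/rand.tiff", "offset": 122, "length": 16384}}
    )
    zarray = ZArray(shape=(128, 128), chunks=(128, 128), dtype=np.dtype("uint8"))

    virtual_var = Variable(
        data=ManifestArray(zarray=zarray, chunkmanifest=manifest),
        dims=["Y", "X"],
    )
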
diff --git a/virtualizarr/readers/tiff.py b/virtualizarr/readers/tiff.py
index d9c440ba..2dafff51 100644
--- a/virtualizarr/readers/tiff.py
+++ b/virtualizarr/readers/tiff.py
@@ -1,24 +1,15 @@
-import warnings
 from typing import Iterable, Mapping, Optional

-from xarray import Dataset, Index
+import zarr
+from xarray import DataArray, Dataset, Index

-from virtualizarr.readers.common import (
-    VirtualBackend,
-    construct_virtual_dataset,
-    open_loadable_vars_and_indexes,
-)
-from virtualizarr.translators.kerchunk import (
-    extract_group,
-    virtual_vars_and_metadata_from_kerchunk_refs,
-)
-from virtualizarr.types.kerchunk import KerchunkStoreRefs
-from virtualizarr.utils import check_for_collisions
+from virtualizarr.readers.common import VirtualBackend
+from virtualizarr.readers.zarr import virtual_variable_from_zarr_array


 class TIFFVirtualBackend(VirtualBackend):
     @staticmethod
-    def open_virtual_dataset(
+    def open_virtual_dataarray(
         filepath: str,
         group: str | None = None,
         drop_variables: Iterable[str] | None = None,
@@ -27,46 +18,40 @@ def open_virtual_dataset(
         indexes: Mapping[str, Index] | None = None,
         reader_options: Optional[dict] = None,
     ) -> Dataset:
-        from kerchunk.tiff import tiff_to_zarr
+        from tifffile import imread
+
+        store = imread(filepath, aszarr=True)

-        drop_variables, loadable_variables = check_for_collisions(
-            drop_variables=drop_variables, loadable_variables=loadable_variables
-        )
+        # TODO exception handling for TIFF files with multiple arrays
+        za = zarr.open_array(store=store, mode="r")

-        if reader_options is None:
-            reader_options = {}
+        vv = virtual_variable_from_zarr_array(za)

-        reader_options.pop("storage_options", {})
-        warnings.warn(
-            "storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr",
-            UserWarning,
-        )
+        # TODO should we generate any pixel coordinate arrays like kerchunk seems to do?

-        # handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
-        refs = KerchunkStoreRefs({"refs": tiff_to_zarr(filepath, **reader_options)})
+        return DataArray(data=vv, dims=vv.dims, attrs=za.attrs)

-        refs = extract_group(refs, group)
+    @staticmethod
+    def open_virtual_dataset(
+        filepath: str,
+        group: str | None = None,
+        drop_variables: Iterable[str] | None = None,
+        loadable_variables: Iterable[str] | None = None,
+        decode_times: bool | None = None,
+        indexes: Mapping[str, Index] | None = None,
+        reader_options: Optional[dict] = None,
+    ) -> Dataset:
+        from tifffile import imread

-        virtual_vars, attrs, coord_names = virtual_vars_and_metadata_from_kerchunk_refs(
-            refs,
-            loadable_variables,
-            drop_variables,
-        )
+        store = imread(filepath, aszarr=True)

-        loadable_vars, indexes = open_loadable_vars_and_indexes(
-            filepath,
-            loadable_variables=loadable_variables,
-            reader_options=reader_options,
-            drop_variables=drop_variables,
-            indexes=indexes,
-            group=group,
-            decode_times=decode_times,
-        )
+        try:
+            zg = zarr.open_group(store, mode="r")
+        except zarr.errors.ContainsArrayError:
+            # TODO tidy this up
+            print(
+                "TIFF file contains only a single array, please use `open_virtual_dataarray` instead"
+            )
+            raise

-        return construct_virtual_dataset(
-            virtual_vars=virtual_vars,
-            loadable_vars=loadable_vars,
-            indexes=indexes,
-            coord_names=coord_names,
-            attrs=attrs,
-        )
+        raise NotImplementedError()
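The rewritten reader leans on tifffile exposing a TIFF as a zarr store, so the generic zarr machinery in the new `readers/zarr.py` below can do the rest. A quick sketch of that interface on its own, with an illustrative filename (the `_ARRAY_DIMENSIONS` attribute is what tifffile records for xarray compatibility, and it is what becomes the dims):

    import zarr
    from tifffile import imread

    store = imread("image.tiff", aszarr=True)    # ZarrTiffStore; no pixel data decoded yet
    za = zarr.open_array(store=store, mode="r")  # single-series TIFFs map onto one zarr array
    print(za.shape, za.chunks, za.dtype)
    print(dict(za.attrs))                        # includes "_ARRAY_DIMENSIONS", e.g. ["Y", "X"]
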
+ """ + + # TODO this only works with zarr-python v2 for now + + attrs = dict(za.attrs) + + # extract _ARRAY_DIMENSIONS and remove it from attrs + # TODO handle v3 DIMENSION_NAMES too + dims = attrs.pop("_ARRAY_DIMENSIONS") + + zarray = ZArray( + shape=za.shape, + chunks=za.chunks, + dtype=za.dtype, + fill_value=za.fill_value, + order=za.order, + compressor=za.compressor, + filters=za.filters, + # zarr_format=za.zarr_format, + ) + + manifest = chunkmanifest_from_zarr_array(za) + + ma = ManifestArray(chunkmanifest=manifest, zarray=zarray) + + return Variable(data=ma, dims=dims, attrs=attrs) + + +def chunkmanifest_from_zarr_array(za: zarr.Array) -> ChunkManifest: + import ujson + + of2 = io.StringIO() + + # TODO handle remote urls + za.store.write_fsspec(of2) # , url=url) + out = ujson.loads(of2.getvalue()) + + print(out) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 70f613ce..7a8dac25 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -37,6 +37,7 @@ def _importorskip( has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") +has_pillow, requires_pillow = _importorskip("PIL") def create_manifestarray( diff --git a/virtualizarr/tests/test_readers/test_tiff.py b/virtualizarr/tests/test_readers/test_tiff.py new file mode 100644 index 00000000..48c8e043 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_tiff.py @@ -0,0 +1,22 @@ +import numpy as np +from xarray import DataArray + +from virtualizarr import open_virtual_dataarray +from virtualizarr.manifests import ManifestArray +from virtualizarr.tests import requires_pillow + + +@requires_pillow +def test_random_tiff(random_tiff): + vda = open_virtual_dataarray(random_tiff, indexes={}) + + assert isinstance(vda, DataArray) + + assert vda.sizes == {"X": 128, "Y": 128} + assert vda.dtype == np.uint8 + + assert isinstance(vda.data, ManifestArray) + manifest = vda.data.manifest + assert manifest.dict() == { + "0.0": {"path": random_tiff, "offset": 122, "length": 16384} + }