Merge branch 'main' into unify_freq_strings
mathause committed Feb 15, 2024
2 parents 6670f28 + fffb03c commit f86355b
Showing 70 changed files with 864 additions and 754 deletions.
1 change: 1 addition & 0 deletions doc/ecosystem.rst
@@ -36,6 +36,7 @@ Geosciences
- `rioxarray <https://corteva.github.io/rioxarray>`_: geospatial xarray extension powered by rasterio
- `salem <https://salem.readthedocs.io>`_: Adds geolocalised subsetting, masking, and plotting operations to xarray's data structures via accessors.
- `SatPy <https://satpy.readthedocs.io/>`_ : Library for reading and manipulating meteorological remote sensing data and writing it to various image and data file formats.
+- `SARXarray <https://tudelftgeodesy.github.io/sarxarray/>`_: xarray extension for reading and processing large Synthetic Aperture Radar (SAR) data stacks.
- `Spyfit <https://spyfit.readthedocs.io/en/master/>`_: FTIR spectroscopy of the atmosphere
- `windspharm <https://ajdawson.github.io/windspharm/index.html>`_: Spherical
harmonic wind analysis in Python.
18 changes: 9 additions & 9 deletions doc/internals/chunked-arrays.rst
@@ -35,44 +35,44 @@ The implementation of these functions is specific to the type of arrays passed t
whereas :py:class:`cubed.Array` objects must be processed by :py:func:`cubed.map_blocks`.

In order to use the correct implementation of a core operation for the array type encountered, xarray dispatches to the
-corresponding subclass of :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint`,
+corresponding subclass of :py:class:`~xarray.namedarray.parallelcompat.ChunkManagerEntrypoint`,
also known as a "Chunk Manager". Therefore **a full list of the operations that need to be defined is set by the
-API of the** :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint` **abstract base class**. Note that chunked array
+API of the** :py:class:`~xarray.namedarray.parallelcompat.ChunkManagerEntrypoint` **abstract base class**. Note that chunked array
methods are also currently dispatched using this class.

Chunked array creation is also handled by this class. As chunked array objects have a one-to-one correspondence with
in-memory numpy arrays, it should be possible to create a chunked array from a numpy array by passing the desired
-chunking pattern to an implementation of :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint.from_array`.
+chunking pattern to an implementation of :py:class:`~xarray.namedarray.parallelcompat.ChunkManagerEntrypoint.from_array`.
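As a rough illustration, the pattern looks like this (a sketch only, assuming dask is installed so that its chunk manager is registered)::

    import numpy as np

    from xarray.namedarray.parallelcompat import guess_chunkmanager

    chunkmanager = guess_chunkmanager("dask")  # resolves to the registered DaskManager
    chunked = chunkmanager.from_array(np.arange(12).reshape(3, 4), chunks=(1, 2))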

.. note::

-The :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint` abstract base class is mostly just acting as a
+The :py:class:`~xarray.namedarray.parallelcompat.ChunkManagerEntrypoint` abstract base class is mostly just acting as a
namespace for containing the chunked-aware function primitives. Ideally in the future we would have an API standard
for chunked array types which codified this structure, making the entrypoint system unnecessary.

-.. currentmodule:: xarray.core.parallelcompat
+.. currentmodule:: xarray.namedarray.parallelcompat

-.. autoclass:: xarray.core.parallelcompat.ChunkManagerEntrypoint
+.. autoclass:: xarray.namedarray.parallelcompat.ChunkManagerEntrypoint
:members:

Registering a new ChunkManagerEntrypoint subclass
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Rather than hard-coding various chunk managers to deal with specific chunked array implementations, xarray uses an
entrypoint system to allow developers of new chunked array implementations to register their corresponding subclass of
-:py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint`.
+:py:class:`~xarray.namedarray.parallelcompat.ChunkManagerEntrypoint`.


To register a new entrypoint you need to add an entry to the ``setup.cfg`` like this::

[options.entry_points]
xarray.chunkmanagers =
-    dask = xarray.core.daskmanager:DaskManager
+    dask = xarray.namedarray.daskmanager:DaskManager

See also `cubed-xarray <https://github.com/xarray-contrib/cubed-xarray>`_ for another example.

To check that the entrypoint has worked correctly, you may find it useful to display the available chunkmanagers using
-the internal function :py:func:`~xarray.core.parallelcompat.list_chunkmanagers`.
+the internal function :py:func:`~xarray.namedarray.parallelcompat.list_chunkmanagers`.
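For example (a sketch; the exact repr of the output will differ)::

    from xarray.namedarray.parallelcompat import list_chunkmanagers

    list_chunkmanagers()
    # e.g. {'dask': <xarray.namedarray.daskmanager.DaskManager object at 0x...>}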

.. autofunction:: list_chunkmanagers

2 changes: 1 addition & 1 deletion doc/roadmap.rst
@@ -156,7 +156,7 @@ types would also be highly useful for xarray users.
By pursuing these improvements in NumPy we hope to extend the benefits
to the full scientific Python community, and avoid tight coupling
between xarray and specific third-party libraries (e.g., for
-implementing untis). This will allow xarray to maintain its domain
+implementing units). This will allow xarray to maintain its domain
agnostic strengths.

We expect that we may eventually add some minimal interfaces in xarray
12 changes: 12 additions & 0 deletions doc/whats-new.rst
@@ -76,6 +76,8 @@ Bug fixes
lead to integer overflow or unsafe conversion from floating point to integer
values (:issue:`8542`, :pull:`8575`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Raise an error when unstacking a MultiIndex that has duplicates as this would lead
to silent data loss (:issue:`7104`, :pull:`8737`). By `Mathias Hauser <https://github.com/mathause>`_.

Documentation
~~~~~~~~~~~~~
@@ -90,6 +92,16 @@ Internal Changes
when the data isn't datetime-like. (:issue:`8718`, :pull:`8724`)
By `Maximilian Roos <https://github.com/max-sixty>`_.

- Move ``parallelcompat`` and ``chunk managers`` modules from ``xarray/core`` to ``xarray/namedarray``. (:pull:`8319`)
By `Tom Nicholas <https://github.com/TomNicholas>`_ and `Anderson Banihirwe <https://github.com/andersy005>`_.

- Imports ``datatree`` repository and history into internal
location. (:pull:`8688`) By `Matt Savoie <https://github.com/flamingbear>`_
and `Justus Magin <https://github.com/keewis>`_.

- Adds :py:func:`open_datatree` into ``xarray/backends`` (:pull:`8697`) By `Matt
Savoie <https://github.com/flamingbear>`_.

.. _whats-new.2024.01.1:

v2024.01.1 (23 Jan, 2024)
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -54,7 +54,7 @@ issue-tracker = "https://github.com/pydata/xarray/issues"
source-code = "https://github.com/pydata/xarray"

[project.entry-points."xarray.chunkmanagers"]
dask = "xarray.core.daskmanager:DaskManager"
dask = "xarray.namedarray.daskmanager:DaskManager"

[build-system]
build-backend = "setuptools.build_meta"
@@ -96,6 +96,11 @@ warn_redundant_casts = true
warn_unused_configs = true
warn_unused_ignores = true

# Ignore mypy errors for modules imported from datatree_.
[[tool.mypy.overrides]]
module = "xarray.datatree_.*"
ignore_errors = true

# Much of the numerical computing stack doesn't have type annotations yet.
[[tool.mypy.overrides]]
ignore_missing_imports = true
33 changes: 31 additions & 2 deletions xarray/backends/api.py
@@ -34,13 +34,13 @@
_nested_combine,
combine_by_coords,
)
-from xarray.core.daskmanager import DaskManager
from xarray.core.dataarray import DataArray
from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
from xarray.core.indexes import Index
-from xarray.core.parallelcompat import guess_chunkmanager
from xarray.core.types import ZarrWriteModes
from xarray.core.utils import is_remote_uri
+from xarray.namedarray.daskmanager import DaskManager
+from xarray.namedarray.parallelcompat import guess_chunkmanager

if TYPE_CHECKING:
try:
@@ -69,6 +69,7 @@
T_NetcdfTypes = Literal[
"NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC"
]
from xarray.datatree_.datatree import DataTree

DATAARRAY_NAME = "__xarray_dataarray_name__"
DATAARRAY_VARIABLE = "__xarray_dataarray_variable__"
@@ -788,6 +789,34 @@ def open_dataarray(
return data_array


def open_datatree(
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
engine: T_Engine = None,
**kwargs,
) -> DataTree:
"""
Open and decode a DataTree from a file or file-like object, creating one tree node for each group in the file.
Parameters
----------
filename_or_obj : str, Path, file-like, or DataStore
Strings and Path objects are interpreted as a path to a netCDF file or Zarr store.
engine : str, optional
Xarray backend engine to use. Valid options include `{"netcdf4", "h5netcdf", "zarr"}`.
**kwargs : dict
Additional keyword arguments passed to :py:func:`~xarray.open_dataset` for each group.
Returns
-------
xarray.DataTree
"""
if engine is None:
engine = plugins.guess_engine(filename_or_obj)

backend = plugins.get_backend(engine)

return backend.open_datatree(filename_or_obj, **kwargs)


def open_mfdataset(
paths: str | NestedSequence[str | os.PathLike],
chunks: T_Chunks | None = None,
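For context, a minimal usage sketch of the new function (the file name, engine, and group name are illustrative assumptions; at this commit ``open_datatree`` lives in ``xarray.backends.api`` rather than the public top-level namespace)::

    from xarray.backends.api import open_datatree

    tree = open_datatree("example.nc", engine="netcdf4")
    print(tree)              # one DataTree node per netCDF group
    fine = tree["fine"].ds   # the Dataset stored at group "/fine"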
63 changes: 60 additions & 3 deletions xarray/backends/common.py
@@ -12,15 +12,19 @@

from xarray.conventions import cf_encoder
from xarray.core import indexing
-from xarray.core.parallelcompat import get_chunked_array_type
-from xarray.core.pycompat import is_chunked_array
from xarray.core.utils import FrozenDict, NdimSizeLenMixin, is_remote_uri
+from xarray.namedarray.parallelcompat import get_chunked_array_type
+from xarray.namedarray.pycompat import is_chunked_array

if TYPE_CHECKING:
from io import BufferedIOBase

from h5netcdf.legacyapi import Dataset as ncDatasetLegacyH5
from netCDF4 import Dataset as ncDataset

from xarray.core.dataset import Dataset
from xarray.core.types import NestedSequence
from xarray.datatree_.datatree import DataTree

# Create a logger object, but don't add any handlers. Leave that to user code.
logger = logging.getLogger(__name__)
@@ -127,6 +131,43 @@ def _decode_variable_name(name):
return name


def _open_datatree_netcdf(
ncDataset: ncDataset | ncDatasetLegacyH5,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
**kwargs,
) -> DataTree:
from xarray.backends.api import open_dataset
from xarray.datatree_.datatree import DataTree
from xarray.datatree_.datatree.treenode import NodePath

ds = open_dataset(filename_or_obj, **kwargs)
tree_root = DataTree.from_dict({"/": ds})
with ncDataset(filename_or_obj, mode="r") as ncds:
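        # open every nested subgroup as its own Dataset and graft it onto the tree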
for path in _iter_nc_groups(ncds):
subgroup_ds = open_dataset(filename_or_obj, group=path, **kwargs)

# TODO refactor to use __setitem__ once creation of new nodes by assigning Dataset works again
node_name = NodePath(path).name
new_node: DataTree = DataTree(name=node_name, data=subgroup_ds)
tree_root._set_item(
path,
new_node,
allow_overwrite=False,
new_nodes_along_path=True,
)
return tree_root


def _iter_nc_groups(root, parent="/"):
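    # pre-order depth-first walk of the netCDF group hierarchy, yielding slash-separated paths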
from xarray.datatree_.datatree.treenode import NodePath

parent = NodePath(parent)
for path, group in root.groups.items():
gpath = parent / path
yield str(gpath)
yield from _iter_nc_groups(group, parent=gpath)


def find_root_and_group(ds):
"""Find the root and group name of a netCDF4/h5netcdf dataset."""
hierarchy = ()
@@ -458,6 +499,11 @@ class BackendEntrypoint:
- ``guess_can_open`` method: it shall return ``True`` if the backend is able to open
``filename_or_obj``, ``False`` otherwise. The implementation of this
method is not mandatory.
- ``open_datatree`` method: it shall implement reading from file, variables
decoding and it returns an instance of :py:class:`~datatree.DataTree`.
It shall take in input at least ``filename_or_obj`` argument. The
implementation of this method is not mandatory. For more details see
<reference to open_datatree documentation>.
Attributes
----------
@@ -496,7 +542,7 @@ def open_dataset(
Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
"""

-raise NotImplementedError
+raise NotImplementedError()

def guess_can_open(
self,
@@ -508,6 +554,17 @@ def guess_can_open(

return False

def open_datatree(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
**kwargs: Any,
) -> DataTree:
"""
Backend open_datatree method used by Xarray in :py:func:`~xarray.open_datatree`.
"""

raise NotImplementedError()


# mapping of engine name to (module name, BackendEntrypoint Class)
BACKEND_ENTRYPOINTS: dict[str, tuple[str | None, type[BackendEntrypoint]]] = {}
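To make the new ``open_datatree`` hook concrete, here is a minimal sketch of a third-party backend that implements it (all names are illustrative; a real backend would decode its own format in ``open_dataset``)::

    from xarray.backends.common import BackendEntrypoint
    from xarray.datatree_.datatree import DataTree

    class MyBackendEntrypoint(BackendEntrypoint):
        def open_dataset(self, filename_or_obj, *, drop_variables=None, **kwargs):
            ...  # read the file and return an xarray.Dataset

        def open_datatree(self, filename_or_obj, **kwargs):
            # one Dataset per group, assembled into a tree keyed by path
            return DataTree.from_dict({"/": self.open_dataset(filename_or_obj, **kwargs)})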
11 changes: 11 additions & 0 deletions xarray/backends/h5netcdf_.py
@@ -11,6 +11,7 @@
BackendEntrypoint,
WritableCFDataStore,
_normalize_path,
_open_datatree_netcdf,
find_root_and_group,
)
from xarray.backends.file_manager import CachingFileManager, DummyFileManager
@@ -38,6 +39,7 @@

from xarray.backends.common import AbstractDataStore
from xarray.core.dataset import Dataset
from xarray.datatree_.datatree import DataTree


class H5NetCDFArrayWrapper(BaseNetCDF4Array):
@@ -423,5 +425,14 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
)
return ds

def open_datatree(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
**kwargs,
) -> DataTree:
from h5netcdf.legacyapi import Dataset as ncDataset

return _open_datatree_netcdf(ncDataset, filename_or_obj, **kwargs)


BACKEND_ENTRYPOINTS["h5netcdf"] = ("h5netcdf", H5netcdfBackendEntrypoint)
11 changes: 11 additions & 0 deletions xarray/backends/netCDF4_.py
@@ -16,6 +16,7 @@
BackendEntrypoint,
WritableCFDataStore,
_normalize_path,
_open_datatree_netcdf,
find_root_and_group,
robust_getitem,
)
@@ -44,6 +45,7 @@

from xarray.backends.common import AbstractDataStore
from xarray.core.dataset import Dataset
from xarray.datatree_.datatree import DataTree

# This lookup table maps from dtype.byteorder to a readable endian
# string used by netCDF4.
@@ -667,5 +669,14 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
)
return ds

def open_datatree(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
**kwargs,
) -> DataTree:
from netCDF4 import Dataset as ncDataset

return _open_datatree_netcdf(ncDataset, filename_or_obj, **kwargs)


BACKEND_ENTRYPOINTS["netcdf4"] = ("netCDF4", NetCDF4BackendEntrypoint)
2 changes: 1 addition & 1 deletion xarray/backends/pydap_.py
@@ -14,7 +14,6 @@
)
from xarray.backends.store import StoreBackendEntrypoint
from xarray.core import indexing
-from xarray.core.pycompat import integer_types
from xarray.core.utils import (
Frozen,
FrozenDict,
@@ -23,6 +22,7 @@
is_remote_uri,
)
from xarray.core.variable import Variable
+from xarray.namedarray.pycompat import integer_types

if TYPE_CHECKING:
import os