From 7390847190fd67df292172bbfb7ffc6c3e5399ae Mon Sep 17 00:00:00 2001 From: Kalle Westerling Date: Mon, 16 Oct 2023 14:25:44 +0100 Subject: [PATCH] Additional docstrings fixed --- deepsensor/data/loader.py | 85 +++-- deepsensor/data/processor.py | 125 ++++---- deepsensor/data/task.py | 242 ++++++++------ deepsensor/data/utils.py | 88 +++--- deepsensor/model/convnp.py | 592 +++++++++++++++++------------------ deepsensor/model/defaults.py | 54 ++-- deepsensor/model/model.py | 553 +++++++++++++++++--------------- deepsensor/model/nps.py | 257 ++++++++------- deepsensor/train/train.py | 87 ++--- tests/test_task_loader.py | 22 +- tests/utils.py | 62 ++-- 11 files changed, 1116 insertions(+), 1051 deletions(-) diff --git a/deepsensor/data/loader.py b/deepsensor/data/loader.py index 1bb0003a..71a362a0 100644 --- a/deepsensor/data/loader.py +++ b/deepsensor/data/loader.py @@ -317,18 +317,17 @@ def _cast_to_dtype( """ Cast context and target data to the default dtype. - Parameters - ---------- - var : ... - ... - - TODO unit test this by passing in a variety of data types and checking that they are - cast correctly. - - Returns - ------- - context : tuple. Tuple of context data with specified dtype. - target : tuple. Tuple of target data with specified dtype. + .. + TODO unit test this by passing in a variety of data types and + checking that they are cast correctly. + + Args: + var : ... + ... + + Returns: + tuple: Tuple of context data with specified dtype. + tuple: Tuple of target data with specified dtype. """ def cast_to_dtype(var): @@ -526,24 +525,21 @@ def _check_links( """ Check that the context-target links are valid. - Parameters - ---------- - links : Tuple[int, int] | List[Tuple[int, int]] - Specifies links between context and target data. Each link is a - tuple of two integers, where the first integer is the index of the - context data and the second integer is the index of the target - data. Can be a single tuple in the case of a single link. If None, - no links are specified. Default: None. - - Returns - ------- - links : Tuple[int, int] | List[Tuple[int, int]] - The input links, if valid. - - Raises - ------ - ValueError - If the links are not valid. + Args: + links (Tuple[int, int] | List[Tuple[int, int]]): + Specifies links between context and target data. Each link is a + tuple of two integers, where the first integer is the index of + the context data and the second integer is the index of the + target data. Can be a single tuple in the case of a single + link. If None, no links are specified. Default: None. + + Returns: + Tuple[int, int] | List[Tuple[int, int]] + The input links, if valid. + + Raises: + ValueError + If the links are not valid. """ if links is None: return None @@ -827,22 +823,19 @@ def time_slice_variable(self, var, date, delta_t=0): """ Slice a variable by a given time delta. - Parameters - ---------- - var : ... - Variable to slice. - delta_t : ... - Time delta to slice by. - - Returns - ------- - var : ... - Sliced variable. - - Raises - ------ - ValueError - If the variable is of an unknown type. + Args: + var (...): + Variable to slice. + delta_t (...): + Time delta to slice by. + + Returns: + var (...) + Sliced variable. + + Raises: + ValueError + If the variable is of an unknown type. """ # TODO: Does this work with instantaneous time? 
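# Hedged sketch (not part of the patch): the ``links`` structure that
# ``_check_links`` above is documented to validate. ``validate_links`` is a
# hypothetical stand-in, written only to illustrate the documented behaviour:
# a single (context_idx, target_idx) tuple is accepted, a list of such tuples
# is accepted, and None means "no links".
from typing import List, Optional, Tuple, Union

def validate_links(
    links: Union[Tuple[int, int], List[Tuple[int, int]], None]
) -> Optional[List[Tuple[int, int]]]:
    if links is None:
        return None
    if isinstance(links, tuple):
        links = [links]  # promote a single link to a one-element list
    for link in links:
        if len(link) != 2 or not all(isinstance(i, int) for i in link):
            raise ValueError(f"Invalid context-target link: {link}")
    return links

assert validate_links((0, 0)) == [(0, 0)]              # single link
assert validate_links([(0, 0), (1, 2)]) == [(0, 0), (1, 2)]
assert validate_links(None) is None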
delta_t = pd.Timedelta(delta_t, unit=self.time_freq) diff --git a/deepsensor/data/processor.py b/deepsensor/data/processor.py index 6e08034a..ef250b48 100644 --- a/deepsensor/data/processor.py +++ b/deepsensor/data/processor.py @@ -647,19 +647,17 @@ def unnormalise( """ Unnormalise data. - Parameters - ---------- - data : :class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame` | List[:class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame`] - Data to unnormalise. - add_offset : bool, optional - Whether to add the offset to the data when unnormalising. Set to - False to unnormalise uncertainty values (e.g. std dev). Defaults to - True. - - Returns - ------- - :class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame` | List[:class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame`] - Unnormalised data. + Args: + data (:class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame` | List[:class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame`]): + Data to unnormalise. + add_offset (bool, optional): + Whether to add the offset to the data when unnormalising. Set + to False to unnormalise uncertainty values (e.g. std dev). + Defaults to True. + + Returns: + :class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame` | List[:class:`xarray.DataArray` | :class:`xarray.Dataset` | :class:`pandas.DataFrame`]: + Unnormalised data. """ if isinstance(data, list): return [ @@ -675,25 +673,35 @@ def xarray_to_coord_array_normalised( """ Convert xarray to normalised coordinate array. - Parameters - ---------- - da : :class:`xarray.Dataset` | :class:`xarray.DataArray` - ... + Args: + da (:class:`xarray.Dataset` | :class:`xarray.DataArray`) + ... - Returns - ------- - :class:`numpy:numpy.ndarray` - A normalised coordinate array of shape ``(2, N)``. + Returns: + :class:`numpy:numpy.ndarray` + A normalised coordinate array of shape ``(2, N)``. """ x1, x2 = da["x1"].values, da["x2"].values X1, X2 = np.meshgrid(x1, x2, indexing="ij") return np.stack([X1.ravel(), X2.ravel()], axis=0) -def process_X_mask_for_X(X_mask: xr.DataArray, X: xr.DataArray): +def process_X_mask_for_X( + X_mask: xr.DataArray, X: xr.DataArray +) -> xr.DataArray: """Process X_mask by interpolating to X and converting to boolean. Both X_mask and X are xarray DataArrays with the same spatial coords. + + Args: + X_mask (:class:`xarray.DataArray`): + ... + X (:class:`xarray.DataArray`): + ... + + Returns: + :class:`xarray.DataArray` + ... """ X_mask = X_mask.astype(float).interp_like( X, method="nearest", kwargs={"fill_value": 0} @@ -705,24 +713,23 @@ def process_X_mask_for_X(X_mask: xr.DataArray, X: xr.DataArray): def mask_coord_array_normalised( coord_arr: np.ndarray, mask_da: Union[xr.DataArray, xr.Dataset, None] -): +) -> np.ndarray: """ - Remove points from (2, N) numpy array that are outside gridded xarray boolean mask. - - If `coord_arr` is shape `(2, N)`, then `mask_da` is a shape `(N,)` boolean array - (True if point is inside mask, False if outside). - - Parameters - ---------- - coord_arr : ... - ... - mask_da : ... - ... - - Returns - ------- - ... - ... + Remove points from (2, N) numpy array that are outside gridded xarray + boolean mask. + + If `coord_arr` is shape `(2, N)`, then `mask_da` is a shape `(N,)` boolean + array (True if point is inside mask, False if outside). + + Args: + coord_arr (:class:`numpy:numpy.ndarray`): + ... 
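# Minimal numpy sketch of the flattening performed by
# ``xarray_to_coord_array_normalised`` above: gridded x1/x2 coordinates are
# expanded with a meshgrid and stacked into a (2, N) array, N = len(x1) * len(x2).
# The toy coordinate values are illustrative only.
import numpy as np

x1 = np.array([0.0, 0.5, 1.0])            # 3 normalised x1 coordinates
x2 = np.array([0.0, 1.0])                 # 2 normalised x2 coordinates
X1, X2 = np.meshgrid(x1, x2, indexing="ij")
coord_arr = np.stack([X1.ravel(), X2.ravel()], axis=0)
print(coord_arr.shape)                     # (2, 6)
print(coord_arr[:, 0], coord_arr[:, -1])   # [0. 0.] ... [1. 1.]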
+ mask_da (:class:`xarray.Dataset` | :class:`xarray.DataArray`): + ... + + Returns: + :class:`numpy:numpy.ndarray` + ... """ if mask_da is None: return coord_arr @@ -739,17 +746,15 @@ def da1_da2_same_grid(da1: xr.DataArray, da2: xr.DataArray) -> bool: .. note:: ``da1`` and ``da2`` are assumed normalised by ``DataProcessor``. - Parameters - ---------- - da1 : :class:`xarray.DataArray` - ... - da2 : :class:`xarray.DataArray` - ... - - Returns - ------- - bool - Whether ``da1`` and ``da2`` are on the same grid. + Args: + da1 (:class:`xarray.DataArray`): + ... + da2 (:class:`xarray.DataArray`): + ... + + Returns: + bool + Whether ``da1`` and ``da2`` are on the same grid. """ x1equal = np.array_equal(da1["x1"].values, da2["x1"].values) x2equal = np.array_equal(da1["x2"].values, da2["x2"].values) @@ -763,16 +768,14 @@ def interp_da1_to_da2(da1: xr.DataArray, da2: xr.DataArray) -> xr.DataArray: .. note:: ``da1`` and ``da2`` are assumed normalised by ``DataProcessor``. - Parameters - ---------- - da1 : :class:`xarray.DataArray` - ... - da2 : :class:`xarray.DataArray` - ... - - Returns - ------- - :class:`xarray.DataArray` - Interpolated xarray. + Args: + da1 (:class:`xarray.DataArray`): + ... + da2 (:class:`xarray.DataArray`): + ... + + Returns: + :class:`xarray.DataArray` + Interpolated xarray. """ return da1.interp(x1=da2["x1"], x2=da2["x2"], method="nearest") diff --git a/deepsensor/data/task.py b/deepsensor/data/task.py index 1b5d53f5..21998c70 100644 --- a/deepsensor/data/task.py +++ b/deepsensor/data/task.py @@ -1,6 +1,6 @@ import deepsensor -from typing import Union, Tuple, List +from typing import Callable, Union, Tuple, List, Optional import numpy as np import lab as B import plum @@ -21,10 +21,9 @@ def __init__(self, task_dict: dict) -> None: """ Initialise a Task object. - Parameters - ---------- - task_dict : dict - Dictionary containing the task. + Args: + task_dict (dict): + Dictionary containing the task. """ super().__init__(task_dict) @@ -45,7 +44,22 @@ def summarise_str(cls, k, v): return v @classmethod - def summarise_repr(cls, k, v): + def summarise_repr(cls, k, v) -> str: + """ + Summarise the task in a representation that can be printed. + + Args: + cls (:class:`deepsensor.data.task.Task`:): + Task class. + k (str): + Key of the task dictionary. + v (object): + Value of the task dictionary. + + Returns: + str: + String representation of the task. + """ if plum.isinstance(v, B.Numeric): return f"{type(v).__name__}/{v.dtype}/{v.shape}" if plum.isinstance(v, deepsensor.backend.nps.mask.Masked): @@ -58,7 +72,7 @@ def summarise_repr(cls, k, v): else: return f"{type(v).__name__}/{v}" - def __str__(self): + def __str__(self) -> str: """ Print a convenient summary of the task dictionary. @@ -69,7 +83,7 @@ def __str__(self): s += f"{k}: {Task.summarise_str(k, v)}\n" return s - def __repr__(self): + def __repr__(self) -> str: """ Print a convenient summary of the task dictionary. @@ -81,26 +95,23 @@ def __repr__(self): s += f"{k}: {Task.summarise_repr(k, v)}\n" return s - def op(self, f, op_flag=None): - """Apply function f to the array elements of a task dictionary. + def op(self, f: Callable, op_flag: Optional[str] = None): + """ + Apply function f to the array elements of a task dictionary. Useful for recasting to a different dtype or reshaping (e.g. adding a batch dimension). - Parameters - ---------- - f : function - Function to apply to the array elements of the task. - task : dict - Task dictionary. - op_flag : str - Flag to set in the task dictionary's `ops` key. 
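# Hedged xarray sketch of the grid check and nearest-neighbour regridding
# described for ``da1_da2_same_grid`` and ``interp_da1_to_da2`` above. The
# small example grids are made up for illustration and are not from the patch.
import numpy as np
import xarray as xr

da1 = xr.DataArray(
    np.arange(9.0).reshape(3, 3),
    coords={"x1": [0.0, 0.5, 1.0], "x2": [0.0, 0.5, 1.0]},
    dims=["x1", "x2"],
)
da2 = xr.DataArray(
    np.zeros((2, 2)),
    coords={"x1": [0.0, 1.0], "x2": [0.0, 1.0]},
    dims=["x1", "x2"],
)

# Same kind of check as ``da1_da2_same_grid``: are the coordinate vectors equal?
same_grid = np.array_equal(da1["x1"], da2["x1"]) and np.array_equal(
    da1["x2"], da2["x2"]
)
print(same_grid)  # False

# Same operation as ``interp_da1_to_da2``: regrid da1 onto da2's coordinates.
da1_on_da2 = da1.interp(x1=da2["x1"], x2=da2["x2"], method="nearest")
print(da1_on_da2.shape)  # (2, 2)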
- - Returns - ------- - task : dict. - Task dictionary with f applied to the array elements and - op_flag set in the ``ops`` key. + Args: + f (callable): + Function to apply to the array elements of the task. + op_flag (str): + Flag to set in the task dictionary's `ops` key. + + Returns: + :class:`deepsensor.data.task.Task`: + Task with f applied to the array elements and op_flag set in + the ``ops`` key. """ def recurse(k, v): @@ -109,7 +120,8 @@ def recurse(k, v): elif type(v) is tuple: return (recurse(k, v[0]), recurse(k, v[1])) elif isinstance( - v, (np.ndarray, np.ma.MaskedArray, deepsensor.backend.nps.Masked) + v, + (np.ndarray, np.ma.MaskedArray, deepsensor.backend.nps.Masked), ): return f(v) else: @@ -123,25 +135,33 @@ def recurse(k, v): return self # altered by reference, but return anyway def add_batch_dim(self): - """Add a batch dimension to the arrays in the task dictionary. + """ + Add a batch dimension to the arrays in the task dictionary. - Returns - ------- - task : dict. Task dictionary with batch dimension added to the array elements. + Returns: + :class:`deepsensor.data.task.Task`: + Task with batch dimension added to the array elements. """ return self.op(lambda x: x[None, ...], op_flag="batch_dim") def cast_to_float32(self): - """Cast the arrays in the task dictionary to float32. + """ + Cast the arrays in the task dictionary to float32. - Returns - ------- - task : dict. Task dictionary with arrays cast to float32. + Returns: + :class:`deepsensor.data.task.Task`: + Task with arrays cast to float32. """ return self.op(lambda x: x.astype(np.float32), op_flag="float32") def remove_any_nans_from_Y_t(self): - """If NaNs are present in task["Y_t"], remove them (and corresponding task["X_t"])""" + """ + If NaNs are present in task["Y_t"], remove them (and corresponding task["X_t"]) + + Returns: + :class:`deepsensor.data.task.Task`: + ... + """ if "batch_dim" in self["ops"]: raise ValueError( "Cannot remove NaNs from task if a batch dim has been added." @@ -179,14 +199,19 @@ def remove_any_nans_from_Y_t(self): return self def mask_nans_numpy(self): - """Replace NaNs with zeroes and set a mask to indicate where the NaNs were. + """ + Replace NaNs with zeroes and set a mask to indicate where the NaNs + were. - Returns - ------- - task : dict. Task with NaNs set to zeros and a mask indicating where the missing values are. + Returns: + :class:`deepsensor.data.task.Task`: + Task with NaNs set to zeros and a mask indicating where the + missing values are. """ if "batch_dim" not in self["ops"]: - raise ValueError("Must call `add_batch_dim` before `mask_nans_numpy`") + raise ValueError( + "Must call `add_batch_dim` before `mask_nans_numpy`" + ) def f(arr): if isinstance(arr, deepsensor.backend.nps.Masked): @@ -207,10 +232,21 @@ def f(arr): return self.op(lambda x: f(x), op_flag="numpy_mask") def mask_nans_nps(self): + """ + ... + + Returns: + :class:`deepsensor.data.task.Task`: + ... 
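# Hedged numpy sketch (not deepsensor code) of what the ``op`` helpers above do
# to each array in a task: ``add_batch_dim`` maps x -> x[None, ...] and
# ``cast_to_float32`` maps x -> x.astype(np.float32). A toy array stands in for
# one context coordinate array, task["X_c"][0].
import numpy as np

X_c = np.random.rand(2, 50)               # (2, N) off-grid context coordinates
X_c = X_c[None, ...]                       # add_batch_dim   -> shape (1, 2, 50)
X_c = X_c.astype(np.float32)               # cast_to_float32 -> dtype float32
print(X_c.shape, X_c.dtype)                # (1, 2, 50) float32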
+ """ if "batch_dim" not in self["ops"]: - raise ValueError("Must call `add_batch_dim` before `mask_nans_nps`") + raise ValueError( + "Must call `add_batch_dim` before `mask_nans_nps`" + ) if "numpy_mask" not in self["ops"]: - raise ValueError("Must call `mask_nans_numpy` before `mask_nans_nps`") + raise ValueError( + "Must call `mask_nans_numpy` before `mask_nans_nps`" + ) def f(arr): if isinstance(arr, np.ma.MaskedArray): @@ -223,10 +259,12 @@ def f(arr): return self.op(lambda x: f(x), op_flag="nps_mask") def convert_to_tensor(self): - """Convert to tensor object based on deep learning backend + """ + Convert to tensor object based on deep learning backend. - Returns - task: dict. Task dictionary with arrays converted to deep learning tensor objects + Returns: + :class:`deepsensor.data.task.Task`: + Task with arrays converted to deep learning tensor objects. """ def f(arr): @@ -257,6 +295,20 @@ def append_obs_to_task( .. TODO: for speed during active learning algs, consider a shallow copy option plus ability to remove observations. + + Args: + task (:class:`deepsensor.data.task.Task`:): + The task to modify. + X_new (array-like): + New observation coordinates. + Y_new (array-like): + New observation values. + context_set_idx (int): + Index of the context set to append to. + + Returns: + :class:`deepsensor.data.task.Task`: + Task with new observation appended to the context set. """ if not 0 <= context_set_idx <= len(task["X_c"]) - 1: raise TaskSetIndexError(context_set_idx, len(task["X_c"]), "context") @@ -289,19 +341,19 @@ def append_obs_to_task( return task_with_new -def flatten_X(X: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]) -> np.ndarray: +def flatten_X( + X: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]] +) -> np.ndarray: """ Convert tuple of gridded coords to (2, N) array if necessary. - Parameters - ---------- - X : :class:`numpy:numpy.ndarray` | Tuple[:class:`numpy:numpy.ndarray`, :class:`numpy:numpy.ndarray`] - ... + Args: + X (:class:`numpy:numpy.ndarray` | Tuple[:class:`numpy:numpy.ndarray`, :class:`numpy:numpy.ndarray`]): + ... - Returns - ---------- - :class:`numpy:numpy.ndarray` - ... + Returns: + :class:`numpy:numpy.ndarray` + ... """ if type(X) is tuple: X1, X2 = np.meshgrid(X[0], X[1], indexing="ij") @@ -309,20 +361,20 @@ def flatten_X(X: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]) -> np.ndarray return X -def flatten_Y(Y: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]) -> np.ndarray: +def flatten_Y( + Y: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]] +) -> np.ndarray: """ Convert gridded data of shape (N_dim, N_x1, N_x2) to (N_dim, N_x1 * N_x2) array if necessary. - Parameters - ---------- - Y : :class:`numpy:numpy.ndarray` | Tuple[:class:`numpy:numpy.ndarray`, :class:`numpy:numpy.ndarray`] - ... + Args: + Y (:class:`numpy:numpy.ndarray` | Tuple[:class:`numpy:numpy.ndarray`, :class:`numpy:numpy.ndarray`]): + ... - Returns - ------- - :class:`numpy:numpy.ndarray` - ... + Returns: + :class:`numpy:numpy.ndarray` + ... """ if Y.ndim == 3: Y = Y.reshape(*Y.shape[:-2], -1) @@ -335,15 +387,13 @@ def flatten_gridded_data_in_task(task: Task) -> Task: Necessary for AR sampling, which doesn't yet permit gridded context sets. - Parameters - ---------- - task : :class:`~.data.task.Task` - ... + Args: + task : :class:`~.data.task.Task` + ... - Returns - ------- - Task - ... + Returns: + :class:`deepsensor.data.task.Task`: + ... 
""" task_flattened = copy.deepcopy(task) @@ -365,31 +415,29 @@ def concat_tasks(tasks: List[Task], multiple: int = 1) -> Task: functionality. - Raise error if ``aux_t`` values passed (not supported I don't think) - Parameters - ---------- - tasks : List[Task] - List of tasks to concatenate into a single task. - multiple : int, optional - Contexts are padded to the smallest multiple of this number that is - greater than the number of contexts in each task. Defaults to 1 - (padded to the largest number of contexts in the tasks). Setting to a - larger number will increase the amount of padding but decrease the - range of tensor shapes presented to the model, which simplifies the - computational graph in graph mode. - - Returns - ------- - merged_task : :class:`~.data.task.Task` - Task containing multiple batches. - - Raises - ------ - ValueError - If the tasks have different numbers of target sets. - ValueError - If the tasks have different numbers of targets. - ValueError - If the tasks have different types of target sets (gridded/non-gridded). + Args: + tasks (List[:class:`deepsensor.data.task.Task`:]): + List of tasks to concatenate into a single task. + multiple (int, optional): + Contexts are padded to the smallest multiple of this number that is + greater than the number of contexts in each task. Defaults to 1 + (padded to the largest number of contexts in the tasks). Setting + to a larger number will increase the amount of padding but decrease + the range of tensor shapes presented to the model, which simplifies + the computational graph in graph mode. + + Returns: + :class:`~.data.task.Task` + Task containing multiple batches. + + Raises: + ValueError + If the tasks have different numbers of target sets. + ValueError + If the tasks have different numbers of targets. + ValueError + If the tasks have different types of target sets (gridded/ + non-gridded). """ if len(tasks) == 1: return tasks[0] @@ -466,7 +514,9 @@ def concat_tasks(tasks: List[Task], multiple: int = 1) -> Task: ) else: # Target set is off-the-grid with tensor for `X_t` - merged_task["X_t"][i] = B.concat(*[t["X_t"][i] for t in tasks], axis=0) + merged_task["X_t"][i] = B.concat( + *[t["X_t"][i] for t in tasks], axis=0 + ) merged_task["Y_t"][i] = B.concat(*[t["Y_t"][i] for t in tasks], axis=0) merged_task["time"] = [t["time"] for t in tasks] diff --git a/deepsensor/data/utils.py b/deepsensor/data/utils.py index bd869561..08ac27a7 100644 --- a/deepsensor/data/utils.py +++ b/deepsensor/data/utils.py @@ -12,15 +12,13 @@ def construct_x1x2_ds(gridded_ds): a 2D gridded channel whose values contain the x_1 and x_2 coordinate values, respectively. - Parameters - ---------- - gridded_ds : :class:`xarray.Dataset` - ... - - Returns - ------- - :class:`xarray.Dataset` - ... + Args: + gridded_ds (:class:`xarray.Dataset`): + ... + + Returns: + :class:`xarray.Dataset` + ... """ X1, X2 = np.meshgrid(gridded_ds.x1, gridded_ds.x2, indexing="ij") ds = xr.Dataset( @@ -40,17 +38,15 @@ def construct_circ_time_ds(dates, freq): - ``'D'``: cycles once per year at daily intervals - ``'M'``: cycles once per year at monthly intervals - Parameters - ---------- - dates: ... - ... - freq : ... - ... - - Returns - ------- - :class:`xarray.Dataset` - ... + Args: + dates (...): + ... + freq (...): + ... + + Returns: + :class:`xarray.Dataset` + ... 
""" if freq == "D": time_var = dates.dayofyear @@ -79,7 +75,9 @@ def construct_circ_time_ds(dates, freq): return ds -def compute_xarray_data_resolution(ds: Union[xr.DataArray, xr.Dataset]) -> float: +def compute_xarray_data_resolution( + ds: Union[xr.DataArray, xr.Dataset] +) -> float: """ Computes the resolution of an xarray object with coordinates x1 and x2. @@ -88,15 +86,12 @@ def compute_xarray_data_resolution(ds: Union[xr.DataArray, xr.Dataset]) -> float resolution of 0.2 degrees, the data resolution returned will be 0.1 degrees. - Parameters - ---------- - ds : :class:`xarray.DataArray` | :class:`xarray.Dataset` - Xarray object with coordinates x1 and x2. + Args: + ds (:class:`xarray.DataArray` | :class:`xarray.Dataset`): + Xarray object with coordinates x1 and x2. - Returns - ------- - data_resolution : float - Resolution of the data (in spatial units, e.g. 0.1 degrees). + Returns: + float: Resolution of the data (in spatial units, e.g. 0.1 degrees). """ x1_res = np.abs(np.mean(np.diff(ds["x1"]))) x2_res = np.abs(np.mean(np.diff(ds["x2"]))) @@ -119,21 +114,18 @@ def compute_pandas_data_resolution( than 1000) and to use the 5th percentile. This means that the resolution is the distance between the closest 5% of neighbouring observations. - Parameters - ---------- - df : :class:`pandas.DataFrame` | :class:`pandas.Series` - Dataframe or series with indexes time, x1, and x2. - n_times : int, optional - Number of dates to sample. Defaults to 1000. If "all", all dates are - used. - percentile : int, optional - Percentile of pairwise distances for computing the resolution. - Defaults to 5. - - Returns - ------- - data_resolution : float - Resolution of the data (in spatial units, e.g. 0.1 degrees). + Args: + df (:class:`pandas.DataFrame` | :class:`pandas.Series`): + Dataframe or series with indexes time, x1, and x2. + n_times (int, optional): + Number of dates to sample. Defaults to 1000. If "all", all dates + are used. + percentile (int, optional): + Percentile of pairwise distances for computing the resolution. + Defaults to 5. + + Returns: + float: Resolution of the data (in spatial units, e.g. 0.1 degrees). """ dates = df.index.get_level_values("time").unique() @@ -149,10 +141,14 @@ def compute_pandas_data_resolution( if X.shape[0] < 2: # Skip this time if there are fewer than 2 stationS continue - X_unique = np.unique(X, axis=0) # (N_unique, 2) array of unique coordinates + X_unique = np.unique( + X, axis=0 + ) # (N_unique, 2) array of unique coordinates pairwise_distances = scipy.spatial.distance.cdist(X_unique, X_unique) - percentile_distances_without_self = np.ma.masked_equal(pairwise_distances, 0) + percentile_distances_without_self = np.ma.masked_equal( + pairwise_distances, 0 + ) # Compute the closest distance from each station to each other station closest_distances_t = np.min(percentile_distances_without_self, axis=1) diff --git a/deepsensor/model/convnp.py b/deepsensor/model/convnp.py index 9e4bf5a9..f191347f 100644 --- a/deepsensor/model/convnp.py +++ b/deepsensor/model/convnp.py @@ -62,75 +62,74 @@ class ConvNP(DeepSensorModel): customise the model, which will override any defaults inferred from a ``TaskLoader``. - Parameters - ---------- - points_per_unit : int, optional - Density of the internal discretisation. Defaults to 100. - likelihood : str, optional - Likelihood. Must be one of ``"cnp"`` (equivalently ``"het"``), - ``"gnp"`` (equivalently ``"lowrank"``), or ``"cnp-spikes-beta"`` - (equivalently ``"spikes-beta"``). Defaults to ``"cnp"``. 
- dim_x : int, optional - Dimensionality of the inputs. Defaults to 1. - dim_y : int, optional - Dimensionality of the outputs. Defaults to 1. - dim_yc : int or tuple[int], optional - Dimensionality of the outputs of the context set. You should set this - if the dimensionality of the outputs of the context set is not equal - to the dimensionality of the outputs of the target set. You should - also set this if you want to use multiple context sets. In that case, - set this equal to a tuple of integers indicating the respective output - dimensionalities. - dim_yt : int, optional - Dimensionality of the outputs of the target set. You should set this - if the dimensionality of the outputs of the target set is not equal to - the dimensionality of the outputs of the context set. - dim_aux_t : int, optional - Dimensionality of target-specific auxiliary variables. - conv_arch : str, optional - Convolutional architecture to use. Must be one of - ``"unet[-res][-sep]"`` or ``"conv[-res][-sep]"``. Defaults to - ``"unet"``. - unet_channels : tuple[int], optional - Channels of every layer of the UNet. Defaults to six layers each with - 64 channels. - unet_kernels : int or tuple[int], optional - Sizes of the kernels in the UNet. Defaults to 5. - unet_resize_convs : bool, optional - Use resize convolutions rather than transposed convolutions in the - UNet. Defaults to ``False``. - unet_resize_conv_interp_method : str, optional - Interpolation method for the resize convolutions in the UNet. Can be - set to ``"bilinear"``. Defaults to "bilinear". - num_basis_functions : int, optional - Number of basis functions for the low-rank likelihood. Defaults to - 64. - dim_lv : int, optional - Dimensionality of the latent variable. Setting to >0 constructs a - latent neural process. Defaults to 0. - encoder_scales : float or tuple[float], optional - Initial value for the length scales of the set convolutions for the - context sets embeddings. Set to a tuple equal to the number of context - sets to use different values for each set. Set to a single value to use - the same value for all context sets. Defaults to - ``1 / points_per_unit``. - encoder_scales_learnable : bool, optional - Whether the encoder SetConv length scale(s) are learnable. Defaults to - ``False``. - decoder_scale : float, optional - Initial value for the length scale of the set convolution in the - decoder. Defaults to ``1 / points_per_unit``. - decoder_scale_learnable : bool, optional - Whether the decoder SetConv length scale(s) are learnable. Defaults to - ``False``. - aux_t_mlp_layers : tuple[int], optional - Widths of the layers of the MLP for the target-specific auxiliary - variable. Defaults to three layers of width 128. - epsilon : float, optional - Epsilon added by the set convolutions before dividing by the density - channel. Defaults to ``1e-2``. - dtype : dtype, optional - Data type. + Args: + points_per_unit (int, optional): + Density of the internal discretisation. Defaults to 100. + likelihood (str, optional): + Likelihood. Must be one of ``"cnp"`` (equivalently ``"het"``), + ``"gnp"`` (equivalently ``"lowrank"``), or ``"cnp-spikes-beta"`` + (equivalently ``"spikes-beta"``). Defaults to ``"cnp"``. + dim_x (int, optional): + Dimensionality of the inputs. Defaults to 1. + dim_y (int, optional): + Dimensionality of the outputs. Defaults to 1. + dim_yc (int or tuple[int], optional): + Dimensionality of the outputs of the context set. 
You should set this + if the dimensionality of the outputs of the context set is not equal + to the dimensionality of the outputs of the target set. You should + also set this if you want to use multiple context sets. In that case, + set this equal to a tuple of integers indicating the respective output + dimensionalities. + dim_yt (int, optional): + Dimensionality of the outputs of the target set. You should set this + if the dimensionality of the outputs of the target set is not equal to + the dimensionality of the outputs of the context set. + dim_aux_t (int, optional): + Dimensionality of target-specific auxiliary variables. + conv_arch (str, optional): + Convolutional architecture to use. Must be one of + ``"unet[-res][-sep]"`` or ``"conv[-res][-sep]"``. Defaults to + ``"unet"``. + unet_channels (tuple[int], optional): + Channels of every layer of the UNet. Defaults to six layers each with + 64 channels. + unet_kernels (int or tuple[int], optional): + Sizes of the kernels in the UNet. Defaults to 5. + unet_resize_convs (bool, optional): + Use resize convolutions rather than transposed convolutions in the + UNet. Defaults to ``False``. + unet_resize_conv_interp_method (str, optional): + Interpolation method for the resize convolutions in the UNet. Can be + set to ``"bilinear"``. Defaults to "bilinear". + num_basis_functions (int, optional): + Number of basis functions for the low-rank likelihood. Defaults to + 64. + dim_lv (int, optional): + Dimensionality of the latent variable. Setting to >0 constructs a + latent neural process. Defaults to 0. + encoder_scales (float or tuple[float], optional): + Initial value for the length scales of the set convolutions for the + context sets embeddings. Set to a tuple equal to the number of context + sets to use different values for each set. Set to a single value to use + the same value for all context sets. Defaults to + ``1 / points_per_unit``. + encoder_scales_learnable (bool, optional): + Whether the encoder SetConv length scale(s) are learnable. Defaults to + ``False``. + decoder_scale (float, optional): + Initial value for the length scale of the set convolution in the + decoder. Defaults to ``1 / points_per_unit``. + decoder_scale_learnable (bool, optional): + Whether the decoder SetConv length scale(s) are learnable. Defaults to + ``False``. + aux_t_mlp_layers (tuple[int], optional): + Widths of the layers of the MLP for the target-specific auxiliary + variable. Defaults to three layers of width 128. + epsilon (float, optional): + Epsilon added by the set convolutions before dividing by the density + channel. Defaults to ``1e-2``. + dtype (dtype, optional): + Data type. """ @dispatch @@ -162,14 +161,13 @@ def __init__( Instantiate model from TaskLoader, using data to infer model parameters (unless overridden). - Parameters - ---------- - data_processor : :class:`~.data.processor.DataProcessor` - DataProcessor object. - task_loader : :class:`~.data.loader.TaskLoader` - TaskLoader object. - verbose : bool, optional - Whether to print inferred model parameters, by default True. + Args: + data_processor (:class:`~.data.processor.DataProcessor`): + DataProcessor object. + task_loader (:class:`~.data.loader.TaskLoader`): + TaskLoader object. + verbose (bool, optional): + Whether to print inferred model parameters, by default True. 
""" super().__init__(data_processor, task_loader) @@ -191,21 +189,29 @@ def __init__( if "aux_t_mlp_layers" not in kwargs and kwargs["dim_aux_t"] > 0: kwargs["aux_t_mlp_layers"] = (64,) * 3 if verbose: - print(f"Setting aux_t_mlp_layers: {kwargs['aux_t_mlp_layers']}") + print( + f"Setting aux_t_mlp_layers: {kwargs['aux_t_mlp_layers']}" + ) if "points_per_unit" not in kwargs: ppu = gen_ppu(task_loader) if verbose: print(f"points_per_unit inferred from TaskLoader: {ppu}") kwargs["points_per_unit"] = ppu if "encoder_scales" not in kwargs: - encoder_scales = gen_encoder_scales(kwargs["points_per_unit"], task_loader) + encoder_scales = gen_encoder_scales( + kwargs["points_per_unit"], task_loader + ) if verbose: - print(f"encoder_scales inferred from TaskLoader: {encoder_scales}") + print( + f"encoder_scales inferred from TaskLoader: {encoder_scales}" + ) kwargs["encoder_scales"] = encoder_scales if "decoder_scale" not in kwargs: decoder_scale = gen_decoder_scale(kwargs["points_per_unit"]) if verbose: - print(f"decoder_scale inferred from TaskLoader: {decoder_scale}") + print( + f"decoder_scale inferred from TaskLoader: {decoder_scale}" + ) kwargs["decoder_scale"] = decoder_scale self.model, self.config = construct_neural_process(*args, **kwargs) @@ -220,14 +226,13 @@ def __init__( """ Instantiate with a pre-defined neural process model. - Parameters - ---------- - data_processor : :class:`~.data.processor.DataProcessor` - DataProcessor object. - task_loader : :class:`~.data.loader.TaskLoader` - TaskLoader object. - neural_process : TFModel | TorchModel - Pre-defined neural process model. + Args: + data_processor (:class:`~.data.processor.DataProcessor`): + DataProcessor object. + task_loader (:class:`~.data.loader.TaskLoader`): + TaskLoader object. + neural_process (TFModel | TorchModel): + Pre-defined neural process model. """ super().__init__(data_processor, task_loader) @@ -253,13 +258,24 @@ def __init__( self.load(model_ID) def save(self, model_ID: str): - """Save the model weights and config to a folder.""" + """ + Save the model weights and config to a folder. + + Args: + model_ID (str): + Folder to save the model to. + + Returns: + None. + """ os.makedirs(model_ID, exist_ok=True) if backend.str == "torch": import torch - torch.save(self.model.state_dict(), os.path.join(model_ID, "model.pt")) + torch.save( + self.model.state_dict(), os.path.join(model_ID, "model.pt") + ) elif backend.str == "tf": self.model.save_weights(os.path.join(model_ID, "model")) else: @@ -270,7 +286,16 @@ def save(self, model_ID: str): json.dump(self.config, f, indent=4, sort_keys=False) def load(self, model_ID: str): - """Load a model from a folder containing model weights and config.""" + """ + Load a model from a folder containing model weights and config. + + Args: + model_ID (str): + Folder to load the model from. + + Returns: + None. + """ config_fpath = os.path.join(model_ID, "model_config.json") with open(config_fpath, "r") as f: self.config = json.load(f) @@ -280,7 +305,9 @@ def load(self, model_ID: str): if backend.str == "torch": import torch - self.model.load_state_dict(torch.load(os.path.join(model_ID, "model.pt"))) + self.model.load_state_dict( + torch.load(os.path.join(model_ID, "model.pt")) + ) elif backend.str == "tf": self.model.load_weights(os.path.join(model_ID, "model")) else: @@ -292,15 +319,12 @@ def modify_task(cls, task: Task): Cast numpy arrays to TensorFlow or PyTorch tensors, add batch dim, and mask NaNs. - Parameters - ---------- - task : :class:`~.data.task.Task` - ... 
+ Args: + task (:class:`~.data.task.Task`): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ if "batch_dim" not in task["ops"]: @@ -320,19 +344,16 @@ def __call__(self, task, n_samples=10, requires_grad=False): """ Compute ConvNP distribution. - Parameters - ---------- - task : :class:`~.data.task.Task` - ... - n_samples : int, optional - Number of samples to draw from the distribution, by default 10. - requires_grad : bool, optional - Whether to compute gradients, by default False. - - Returns - ------- - ... - The ConvNP distribution. + Args: + task (:class:`~.data.task.Task`): + ... + n_samples (int, optional): + Number of samples to draw from the distribution, by default 10. + requires_grad (bool, optional): + Whether to compute gradients, by default False. + + Returns: + ...: The ConvNP distribution. """ task = ConvNP.modify_task(task) dist = run_nps_model(self.model, task, n_samples, requires_grad) @@ -343,15 +364,12 @@ def mean(self, dist: AbstractMultiOutputDistribution): """ ... - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - ... + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ mean = dist.mean if isinstance(mean, backend.nps.Aggregate): @@ -364,15 +382,12 @@ def mean(self, task: Task): """ ... - Parameters - ---------- - task : :class:`~.data.task.Task` - ... + Args: + task (:class:`~.data.task.Task`): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ dist = self(task) return self.mean(dist) @@ -382,15 +397,12 @@ def variance(self, dist: AbstractMultiOutputDistribution): """ ... - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - ... + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ variance = dist.var if isinstance(variance, backend.nps.Aggregate): @@ -403,15 +415,12 @@ def variance(self, task: Task): """ ... - Parameters - ---------- - task : :class:`~.data.task.Task` - ... + Args: + task (:class:`~.data.task.Task`): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ dist = self(task) return self.variance(dist) @@ -421,15 +430,12 @@ def stddev(self, dist: AbstractMultiOutputDistribution): """ ... - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - ... + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ variance = self.variance(dist) if isinstance(variance, (list, tuple)): @@ -442,15 +448,12 @@ def stddev(self, task: Task): """ ... - Parameters - ---------- - task : :class:`~.data.task.Task` - ... + Args: + task (:class:`~.data.task.Task`): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ dist = self(task) return self.stddev(dist) @@ -460,15 +463,12 @@ def covariance(self, dist: AbstractMultiOutputDistribution): """ ... - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - ... + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + ... - Returns - ------- - ... - ... + Returns: + ...: ... """ return B.to_numpy(B.dense(dist.vectorised_normal.var))[0, 0] @@ -477,15 +477,12 @@ def covariance(self, task: Task): """ ... - Parameters - ---------- - task : :class:`~.data.task.Task` - ... + Args: + task (:class:`~.data.task.Task`): + ... - Returns - ------- - ... - ... + Returns: + ...: ... 
""" dist = self(task) return self.covariance(dist) @@ -500,19 +497,19 @@ def sample( """ Create samples from a ConvNP distribution. - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - The distribution to sample from. - n_samples : int, optional - The number of samples to draw from the distribution, by default 1. - noiseless : bool, optional - Whether to sample from the noiseless distribution, by default True. + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + The distribution to sample from. + n_samples (int, optional): + The number of samples to draw from the distribution, by + default 1. + noiseless (bool, optional): + Whether to sample from the noiseless distribution, by default + True. - Returns - ------- - :class:`numpy:numpy.ndarray` | List[:class:`numpy:numpy.ndarray`] - The samples as an array or list of arrays. + Returns: + :class:`numpy:numpy.ndarray` | List[:class:`numpy:numpy.ndarray`]: + The samples as an array or list of arrays. """ if noiseless: samples = dist.noiseless.sample(n_samples) @@ -529,19 +526,19 @@ def sample(self, task: Task, n_samples: int = 1, noiseless: bool = True): """ Create samples from a ConvNP distribution. - Parameters - ---------- - task : :class:`~.data.task.Task` - The task to sample from. - n_samples : int, optional - The number of samples to draw from the distribution, by default 1. - noiseless : bool, optional - Whether to sample from the noiseless distribution, by default True. + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + The distribution to sample from. + n_samples (int, optional): + The number of samples to draw from the distribution, by + default 1. + noiseless (bool, optional): + Whether to sample from the noiseless distribution, by default + True. - Returns - ------- - :class:`numpy:numpy.ndarray` | List[:class:`numpy:numpy.ndarray`] - The samples as an array or list of arrays. + Returns: + :class:`numpy:numpy.ndarray` | List[:class:`numpy:numpy.ndarray`]: + The samples as an array or list of arrays. """ dist = self(task) return self.sample(dist, n_samples, noiseless) @@ -551,15 +548,12 @@ def slice_diag(self, task: Task): """ Slice out the ConvCNP part of the ConvNP distribution. - Parameters - ---------- - task : :class:`~.data.task.Task` - The task to slice. + Args: + task (:class:`~.data.task.Task`): + The task to slice. - Returns - ------- - ... - ... + Returns: + ...: ... """ dist = self(task) dist_diag = backend.nps.MultiOutputNormal( @@ -575,15 +569,12 @@ def slice_diag(self, dist: AbstractMultiOutputDistribution): """ Slice out the ConvCNP part of the ConvNP distribution. - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - The distribution to slice. + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + The distribution to slice. - Returns - ------- - ... - ... + Returns: + ...: ... """ dist_diag = backend.nps.MultiOutputNormal( dist._mean, @@ -598,15 +589,12 @@ def mean_marginal_entropy(self, dist: AbstractMultiOutputDistribution): """ Mean marginal entropy over target points given context points. - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - The distribution to compute the entropy of. + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + The distribution to compute the entropy of. - Returns - ------- - float - The mean marginal entropy. + Returns: + float: The mean marginal entropy. 
""" dist_diag = self.slice_diag(dist) return B.mean(B.to_numpy(dist_diag.entropy())[0, 0]) @@ -616,15 +604,12 @@ def mean_marginal_entropy(self, task: Task): """ Mean marginal entropy over target points given context points. - Parameters - ---------- - task : :class:`~.data.task.Task` - The task to compute the entropy of. + Args: + task (:class:`~.data.task.Task`): + The task to compute the entropy of. - Returns - ------- - float - The mean marginal entropy. + Returns: + float: The mean marginal entropy. """ dist_diag = self.slice_diag(task) return B.mean(B.to_numpy(dist_diag.entropy())[0, 0]) @@ -634,15 +619,12 @@ def joint_entropy(self, dist: AbstractMultiOutputDistribution): """ Model entropy over target points given context points. - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - The distribution to compute the entropy of. + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + The distribution to compute the entropy of. - Returns - ------- - float - The model entropy. + Returns: + float: The model entropy. """ return B.to_numpy(dist.entropy())[0, 0] @@ -651,15 +633,12 @@ def joint_entropy(self, task: Task): """ Model entropy over target points given context points. - Parameters - ---------- - task : :class:`~.data.task.Task` - The task to compute the entropy of. + Args: + task (:class:`~.data.task.Task`): + The task to compute the entropy of. - Returns - ------- - float - The model entropy. + Returns: + float: The model entropy. """ return B.to_numpy(self(task).entropy())[0, 0] @@ -669,17 +648,14 @@ def logpdf(self, dist: AbstractMultiOutputDistribution, task: Task): Model outputs joint distribution over all targets: Concat targets along observation dimension. - Parameters - ---------- - dist : neuralprocesses.dist.AbstractMultiOutputDistribution - The distribution to compute the logpdf of. - task : :class:`~.data.task.Task` - The task to compute the logpdf of. + Args: + dist (neuralprocesses.dist.AbstractMultiOutputDistribution): + The distribution to compute the logpdf of. + task (:class:`~.data.task.Task`): + The task to compute the logpdf of. - Returns - ------- - float - The logpdf. + Returns: + float: The logpdf. """ Y_t = B.concat(*task["Y_t"], axis=-1) return B.to_numpy(dist.logpdf(Y_t)).mean() @@ -690,15 +666,12 @@ def logpdf(self, task: Task): Model outputs joint distribution over all targets: Concat targets along observation dimension. - Parameters - ---------- - task : :class:`~.data.task.Task` - The task to compute the logpdf of. + Args: + task (:class:`~.data.task.Task`): + The task to compute the logpdf of. - Returns - ------- - float - The logpdf. + Returns: + float: The logpdf. """ dist = self(task) return self.logpdf(dist, task) @@ -713,24 +686,21 @@ def loss_fn( """ Compute the loss of a task. - Parameters - ---------- - task : :class:`~.data.task.Task` - The task to compute the loss of. - fix_noise : ... - Whether to fix the noise to the value specified in the model - config. - num_lv_samples : int, optional - If latent variable model, number of lv samples for evaluating the - loss, by default 8. - normalise : bool, optional - Whether to normalise the loss by the number of target points, by - default False. - - Returns - ------- - float - The loss. + Args: + task (:class:`~.data.task.Task`): + The task to compute the loss of. + fix_noise (...): + Whether to fix the noise to the value specified in the model + config. 
+ num_lv_samples (int, optional): + If latent variable model, number of lv samples for evaluating + the loss, by default 8. + normalise (bool, optional): + Whether to normalise the loss by the number of target points, + by default False. + + Returns: + float: The loss. """ task = ConvNP.modify_task(task) @@ -769,25 +739,25 @@ def ar_sample( .. note:: AR sampling only works for 0th context/target set - Parameters - ---------- - task : :class:`~.data.task.Task` - The task to sample from. - n_samples : int, optional - The number of samples to draw from the distribution, by default 1. - X_target_AR : :class:`numpy:numpy.ndarray`, optional - Locations to draw AR samples over. If None, AR samples will be - drawn over the target locations in the task. Defaults to None. - ar_subsample_factor : int, optional - Subsample target locations to draw AR samples over. Defaults to 1. - fill_type : Literal["mean", "sample"], optional - How to infill the rest of the sample. Must be one of "mean" or - "sample". Defaults to "mean". - - Returns - ------- - :class:`numpy:numpy.ndarray` - The samples. + Args: + task (:class:`~.data.task.Task`): + The task to sample from. + n_samples (int, optional): + The number of samples to draw from the distribution, by + default 1. + X_target_AR (:class:`numpy:numpy.ndarray`, optional): + Locations to draw AR samples over. If None, AR samples will be + drawn over the target locations in the task. Defaults to None. + ar_subsample_factor (int, optional): + Subsample target locations to draw AR samples over. Defaults + to 1. + fill_type (Literal["mean", "sample"], optional): + How to infill the rest of the sample. Must be one of "mean" or + "sample". Defaults to "mean". + + Returns: + :class:`numpy:numpy.ndarray` + The samples. """ # AR sampling requires gridded data to be flattened, not coordinate tuples @@ -828,14 +798,18 @@ def ar_sample( variance, noiseless_samples, noisy_samples, - ) = run_nps_model_ar(self.model, task_arsample, num_samples=n_samples) + ) = run_nps_model_ar( + self.model, task_arsample, num_samples=n_samples + ) else: ( mean, variance, noiseless_samples, noisy_samples, - ) = run_nps_model_ar(self.model, task_arsample, num_samples=n_samples) + ) = run_nps_model_ar( + self.model, task_arsample, num_samples=n_samples + ) # Slice out first (and assumed only) target entry in nps.Aggregate object noiseless_samples = B.to_numpy(noiseless_samples) @@ -849,7 +823,9 @@ def ar_sample( task_with_sample["X_c"][0] = B.concat( task["X_c"][0], task_arsample["X_t"][0], axis=-1 ) - task_with_sample["Y_c"][0] = B.concat(task["Y_c"][0], sample, axis=-1) + task_with_sample["Y_c"][0] = B.concat( + task["Y_c"][0], sample, axis=-1 + ) if fill_type == "mean": # Compute the mean conditioned on the AR samples diff --git a/deepsensor/model/defaults.py b/deepsensor/model/defaults.py index cacf54af..47993829 100644 --- a/deepsensor/model/defaults.py +++ b/deepsensor/model/defaults.py @@ -21,16 +21,14 @@ def gen_ppu(task_loader: TaskLoader) -> int: computes the data resolution for each. The model ppu is then set to the maximum data ppu. - Parameters - ---------- - task_loader : :class:`~.data.loader.TaskLoader` - TaskLoader object containing context and target sets. - - Returns - ------- - model_ppu : int - Model ppu (points per unit), i.e. the number of points per unit of - input space. + Args: + task_loader (:class:`~.data.loader.TaskLoader`): + TaskLoader object containing context and target sets. + + Returns: + int: + Model ppu (points per unit), i.e. 
the number of points per unit of + input space. """ # List of data resolutions for each context/target variable (in points-per-unit) data_ppus = [] @@ -63,16 +61,13 @@ def gen_decoder_scale(model_ppu: int) -> float: internal grid. The value chosen is 1 / model_ppu (i.e. the length scale is equal to the model's internal grid spacing). - Parameters - ---------- - model_ppu : int - Model ppu (points per unit), i.e. the number of points per unit of - input space. + Args: + model_ppu (int): + Model ppu (points per unit), i.e. the number of points per unit of + input space. - Returns - ------- - decoder_scale : float - Decoder scale. + Returns: + float: Decoder scale. """ return 1 / model_ppu @@ -95,18 +90,15 @@ def gen_encoder_scales(model_ppu: int, task_loader: TaskLoader) -> List[float]: points) for each context variable. The encoder scale is then set to 0.5 * data_resolution. - Parameters - ---------- - model_ppu : int - Model ppu (points per unit), i.e. the number of points per unit of - input space. - task_loader : :class:`~.data.loader.TaskLoader` - TaskLoader object containing context and target sets. - - Returns - ------- - encoder_scales : list[float] - List of encoder scales for each context set. + Args: + model_ppu (int): + Model ppu (points per unit), i.e. the number of points per unit of + input space. + task_loader (:class:`~.data.loader.TaskLoader`): + TaskLoader object containing context and target sets. + + Returns: + list[float]: List of encoder scales for each context set. """ encoder_scales = [] for var in task_loader.context: diff --git a/deepsensor/model/model.py b/deepsensor/model/model.py index 7187cafb..80a06e7d 100644 --- a/deepsensor/model/model.py +++ b/deepsensor/model/model.py @@ -34,36 +34,33 @@ def create_empty_spatiotemporal_xarray( """ ... - Parameters - ---------- - X : :class:`xarray.Dataset` | :class:`xarray.DataArray` - ... - dates : List[...] - ... - coord_names : dict, optional - ..., by default {"x1": "x1", "x2": "x2"} - data_vars : List[str], optional - ..., by default ["var"] - prepend_dims : List[str], optional - ..., by default None - prepend_coords : dict, optional - ..., by default None - - Returns - ------- - ... + Args: + X (:class:`xarray.Dataset` | :class:`xarray.DataArray`): + ... + dates (List[...]): + ... + coord_names (dict, optional): + ..., by default {"x1": "x1", "x2": "x2"} + data_vars (List[str], optional): + ..., by default ["var"] + prepend_dims (List[str], optional): + ..., by default None + prepend_coords (dict, optional): + ..., by default None + + Returns: ... + ... - Raises - ------ - ValueError - If ``data_vars`` contains duplicate values. - ValueError - If ``coord_names["x1"]`` is not uniformly spaced. - ValueError - If ``coord_names["x2"]`` is not uniformly spaced. - ValueError - If ``prepend_dims`` and ``prepend_coords`` are not the same length. + Raises: + ValueError + If ``data_vars`` contains duplicate values. + ValueError + If ``coord_names["x1"]`` is not uniformly spaced. + ValueError + If ``coord_names["x2"]`` is not uniformly spaced. + ValueError + If ``prepend_dims`` and ``prepend_coords`` are not the same length. """ if prepend_dims is None: prepend_dims = [] @@ -82,9 +79,13 @@ def create_empty_spatiotemporal_xarray( # Assert uniform spacing if not np.allclose(np.diff(x1_predict), np.diff(x1_predict)[0]): - raise ValueError(f"Coordinate {coord_names['x1']} must be uniformly spaced.") + raise ValueError( + f"Coordinate {coord_names['x1']} must be uniformly spaced." 
+ ) if not np.allclose(np.diff(x2_predict), np.diff(x2_predict)[0]): - raise ValueError(f"Coordinate {coord_names['x2']} must be uniformly spaced.") + raise ValueError( + f"Coordinate {coord_names['x2']} must be uniformly spaced." + ) if len(prepend_dims) != len(set(prepend_dims)): # TODO unit test @@ -102,7 +103,10 @@ def create_empty_spatiotemporal_xarray( } pred_ds = xr.Dataset( - {data_var: xr.DataArray(dims=dims, coords=coords) for data_var in data_vars} + { + data_var: xr.DataArray(dims=dims, coords=coords) + for data_var in data_vars + } ).astype("float32") # Convert time coord to pandas timestamps @@ -126,26 +130,28 @@ def increase_spatial_resolution( .. # TODO wasteful to interpolate X_t_normalised - Parameters - ---------- - X_t_normalised : ... - ... - resolution_factor : ... - ... - coord_names : dict, optional - ..., by default {"x1": "x1", "x2": "x2"} + Args: + X_t_normalised (...): + ... + resolution_factor (...): + ... + coord_names (dict, optional): + ..., by default {"x1": "x1", "x2": "x2"} - Returns - ------- - ... + Returns: ... + ... """ assert isinstance(resolution_factor, (float, int)) assert isinstance(X_t_normalised, (xr.DataArray, xr.Dataset)) x1_name, x2_name = coord_names["x1"], coord_names["x2"] x1, x2 = X_t_normalised.coords[x1_name], X_t_normalised.coords[x2_name] - x1 = np.linspace(x1[0], x1[-1], int(x1.size * resolution_factor), dtype="float64") - x2 = np.linspace(x2[0], x2[-1], int(x2.size * resolution_factor), dtype="float64") + x1 = np.linspace( + x1[0], x1[-1], int(x1.size * resolution_factor), dtype="float64" + ) + x2 = np.linspace( + x2[0], x2[-1], int(x2.size * resolution_factor), dtype="float64" + ) X_t_normalised = X_t_normalised.interp( **{x1_name: x1, x2_name: x2}, method="nearest" ) @@ -164,20 +170,16 @@ def mean(self, task: Task, *args, **kwargs): Computes the model mean prediction over target points based on given context data. - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. - Returns - ------- - mean : :class:`numpy:numpy.ndarray` - Should return mean prediction over target points. + Returns: + :class:`numpy:numpy.ndarray`: Mean prediction over target points. - Raises - ------ - NotImplementedError - If not implemented by child class. + Raises: + NotImplementedError + If not implemented by child class. """ raise NotImplementedError() @@ -186,20 +188,16 @@ def variance(self, task: Task, *args, **kwargs): Model marginal variance over target points given context points. Shape (N,). - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. - Returns - ------- - var : :class:`numpy:numpy.ndarray` - Should return marginal variance over target points. + Returns: + :class:`numpy:numpy.ndarray`: Marginal variance over target points. - Raises - ------ - NotImplementedError - If not implemented by child class. + Raises: + NotImplementedError + If not implemented by child class. """ raise NotImplementedError() @@ -208,15 +206,12 @@ def stddev(self, task: Task): Model marginal standard deviation over target points given context points. Shape (N,). - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. 
- Returns - ------- - std : :class:`numpy:numpy.ndarray` - Should return marginal standard deviation over target points. + Returns: + :class:`numpy:numpy.ndarray`: Marginal standard deviation over target points. """ var = self.variance(task) return var**0.5 @@ -226,20 +221,16 @@ def covariance(self, task: Task, *args, **kwargs): Computes the model covariance matrix over target points based on given context data. Shape (N, N). - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. - Returns - ------- - cov : :class:`numpy:numpy.ndarray` - Should return covariance matrix over target points. + Returns: + :class:`numpy:numpy.ndarray`: Covariance matrix over target points. - Raises - ------ - NotImplementedError - If not implemented by child class. + Raises: + NotImplementedError + If not implemented by child class. """ raise NotImplementedError() @@ -251,20 +242,17 @@ def mean_marginal_entropy(self, task: Task, *args, **kwargs): .. note:: Note: Getting a vector of marginal entropies would be useful too. - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. - Returns - ------- - mean_marginal_entropy : float - Should return mean marginal entropy over target points. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. - Raises - ------ - NotImplementedError - If not implemented by child class. + Returns: + float: Mean marginal entropy over target points. + + Raises: + NotImplementedError + If not implemented by child class. """ raise NotImplementedError() @@ -273,20 +261,17 @@ def joint_entropy(self, task: Task, *args, **kwargs): Computes the model joint entropy over target points based on given context data. - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. - Returns - ------- - joint_entropy : float - Should return joint entropy over target points. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. + + Returns: + float: Joint entropy over target points. - Raises - ------ - NotImplementedError - If not implemented by child class. + Raises: + NotImplementedError + If not implemented by child class. """ raise NotImplementedError() @@ -295,20 +280,16 @@ def logpdf(self, task: Task, *args, **kwargs): Computes the joint model logpdf over target points based on given context data. - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. - Returns - ------- - logpdf : float - Should return joint logpdf over target points. + Returns: + float: Joint logpdf over target points. - Raises - ------ - NotImplementedError - If not implemented by child class. + Raises: + NotImplementedError + If not implemented by child class. """ raise NotImplementedError() @@ -316,20 +297,16 @@ def loss(self, task: Task, *args, **kwargs): """ Computes the model loss over target points based on given context data. - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. + Args: + task (:class:`~.data.task.Task`): + Task containing context data. - Returns - ------- - loss : float - Should return loss over target points. + Returns: + float: Loss over target points. - Raises - ------ - NotImplementedError - If not implemented by child class. + Raises: + NotImplementedError + If not implemented by child class. 
""" raise NotImplementedError() @@ -338,22 +315,19 @@ def sample(self, task: Task, n_samples=1, *args, **kwargs): Draws ``n_samples`` joint samples over target points based on given context data. Returned shape is ``(n_samples, n_target)``. - Parameters - ---------- - task : :class:`~.data.task.Task` - Task containing context data. - n_samples : int - Number of samples to draw. - - Returns - ------- - samples : Tuple[:class:`numpy:numpy.ndarray`, :class:`numpy:numpy.ndarray`] - Should return joint samples over target points. - - Raises - ------ - NotImplementedError - If not implemented by child class. + + Args: + task (:class:`~.data.task.Task`): + Task containing context data. + n_samples (int, optional): + Number of samples to draw. Defaults to 1. + + Returns: + tuple[:class:`numpy:numpy.ndarray`]: Joint samples over target points. + + Raises: + NotImplementedError + If not implemented by child class. """ raise NotImplementedError() @@ -373,13 +347,12 @@ def __init__( """ Initialise DeepSensorModel. - Parameters - ---------- - data_processor : :class:`~.data.processor.DataProcessor` - DataProcessor object, used to unnormalise predictions. - task_loader : :class:`~.data.loader.TaskLoader` - TaskLoader object, used to determine target variables for - unnormalising. + Args: + data_processor (:class:`~.data.processor.DataProcessor`): + DataProcessor object, used to unnormalise predictions. + task_loader (:class:`~.data.loader.TaskLoader`): + TaskLoader object, used to determine target variables for + unnormalising. """ self.task_loader = task_loader self.data_processor = data_processor @@ -416,73 +389,77 @@ def predict( TODO: - Test with multiple targets model - Parameters - ---------- - tasks : List[Task] | Task - List of tasks containing context data. - X_t : :class:`xarray.Dataset` | :class:`xarray.DataArray` | :class:`pandas.DataFrame` | :class:`pandas.Series` | :class:`pandas.Index` | :class:`numpy:numpy.ndarray` - Target locations to predict at. Can be an xarray object containing - on-grid locations or a pandas object containing off-grid locations. - X_t_mask: :class:`xarray.Dataset` | :class:`xarray.DataArray` - Optional 2D mask to apply to X_t (zero/False will be NaNs). Will be interpolated - to the same grid as X_t. Default None (no mask). - X_t_is_normalised : bool - Whether the ``X_t`` coords are normalised. If False, will normalise - the coords before passing to model. Default ``False``. - aux_at_targets_override : :class:`xarray.Dataset` | :class:`xarray.DataArray` - Optional auxiliary xarray data to override from the task_loader. - aux_at_targets_override_is_normalised : bool - Whether the `aux_at_targets_override` coords are normalised. - If False, the DataProcessor will normalise the coords before passing to model. - Default False. - resolution_factor : float - Optional factor to increase the resolution of the target grid by. - E.g. 2 will double the target resolution, 0.5 will halve it. - Applies to on-grid predictions only. Default 1. - n_samples : int - Number of joint samples to draw from the model. If 0, will not - draw samples. Default 0. - ar_sample : bool - Whether to use autoregressive sampling. Default ``False``. - unnormalise : bool - Whether to unnormalise the predictions. Only works if ``self`` has - a ``data_processor`` and ``task_loader`` attribute. Default - ``True``. - seed : int - Random seed for deterministic sampling. Default 0. - append_indexes : dict - Dictionary of index metadata to append to pandas indexes in the - off-grid case. Default ``None``. 
-        progress_bar : int
-            Whether to display a progress bar over tasks. Default 0.
-        verbose : bool
-            Whether to print time taken for prediction. Default ``False``.
-
-        Returns
-        -------
-        predictions : :class:`xarray.Dataset` | :class:`xarray.DataArray` | :class:`pandas.DataFrame` | :class:`pandas.Series` | :class:`pandas.Index`
-            If ``X_t`` is a pandas object, returns pandas objects containing
-            off-grid predictions.
-
-            If ``X_t`` is an xarray object, returns xarray object containing
-            on-grid predictions.
-
-            If ``n_samples`` == 0, returns only mean and std predictions.
-
-            If ``n_samples`` > 0, returns mean, std and samples predictions.
-
-        Raises
-        ------
-        ValueError
-            If ``X_t`` is not an xarray object and
-            ``resolution_factor`` is not 1 or ``ar_subsample_factor`` is not 1.
-        ValueError
-            If ``X_t`` is not a pandas object and ``append_indexes`` is not
-            ``None``.
-        ValueError
-            If ``X_t`` is not an xarray, pandas or numpy object.
-        ValueError
-            If ``append_indexes`` are not all the same length as ``X_t``.
+        Args:
+            tasks (List[Task] | Task):
+                List of tasks containing context data.
+            X_t (:class:`xarray.Dataset` | :class:`xarray.DataArray` | :class:`pandas.DataFrame` | :class:`pandas.Series` | :class:`pandas.Index` | :class:`numpy:numpy.ndarray`):
+                Target locations to predict at. Can be an xarray object
+                containing on-grid locations or a pandas object containing
+                off-grid locations.
+            X_t_mask (:class:`xarray.Dataset` | :class:`xarray.DataArray`):
+                Optional 2D mask to apply to X_t (zero/False will be NaNs).
+                Will be interpolated to the same grid as X_t. Default None (no
+                mask).
+            X_t_is_normalised (bool):
+                Whether the ``X_t`` coords are normalised. If False, will
+                normalise the coords before passing to model. Default
+                ``False``.
+            aux_at_targets_override (:class:`xarray.Dataset` | :class:`xarray.DataArray`):
+                Optional auxiliary xarray data to override from the
+                task_loader.
+            aux_at_targets_override_is_normalised (bool):
+                Whether the `aux_at_targets_override` coords are normalised.
+                If False, the DataProcessor will normalise the coords before
+                passing to model.
+                Default False.
+            resolution_factor (float):
+                Optional factor to increase the resolution of the target grid
+                by. E.g. 2 will double the target resolution, 0.5 will halve
+                it. Applies to on-grid predictions only. Default 1.
+            n_samples (int):
+                Number of joint samples to draw from the model. If 0, will not
+                draw samples. Default 0.
+            ar_sample (bool):
+                Whether to use autoregressive sampling. Default ``False``.
+            unnormalise (bool):
+                Whether to unnormalise the predictions. Only works if ``self``
+                has a ``data_processor`` and ``task_loader`` attribute. Default
+                ``True``.
+            seed (int):
+                Random seed for deterministic sampling. Default 0.
+            append_indexes (dict):
+                Dictionary of index metadata to append to pandas indexes in the
+                off-grid case. Default ``None``.
+            progress_bar (int):
+                Whether to display a progress bar over tasks. Default 0.
+            verbose (bool):
+                Whether to print time taken for prediction. Default ``False``.
+
+        Returns:
+            :class:`xarray.Dataset` | :class:`xarray.DataArray` | :class:`pandas.DataFrame` | :class:`pandas.Series` | :class:`pandas.Index`
+                If ``X_t`` is a pandas object, returns pandas objects
+                containing off-grid predictions.
+
+                If ``X_t`` is an xarray object, returns xarray object
+                containing on-grid predictions.
+
+                If ``n_samples`` == 0, returns only mean and std predictions.
+
+                If ``n_samples`` > 0, returns mean, std and samples
+                predictions.
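
To make the ``predict`` signature documented above concrete, here is a hedged usage sketch. The variable names (``task_loader``, ``val_dates``, ``raw_ds``, ``station_df``) are illustrative assumptions, and the tuple unpacking follows the mean/std/samples return behaviour described in the Returns section:

# Illustrative only; `model` is a trained DeepSensorModel subclass (e.g. ConvNP)
# constructed with a DataProcessor and TaskLoader as documented above.
test_tasks = [task_loader(date, "all", "all") for date in val_dates]

# On-grid: X_t is an xarray object, so xarray mean/std objects come back.
mean_ds, std_ds = model.predict(test_tasks, X_t=raw_ds, resolution_factor=2)

# Off-grid with joint samples: X_t is a pandas object, so pandas objects come back.
mean_df, std_df, samples_df = model.predict(
    test_tasks, X_t=station_df, n_samples=10, seed=0
)
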
+ + Raises: + ValueError + If ``X_t`` is not an xarray object and + ``resolution_factor`` is not 1 or ``ar_subsample_factor`` is + not 1. + ValueError + If ``X_t`` is not a pandas object and ``append_indexes`` is not + ``None``. + ValueError + If ``X_t`` is not an xarray, pandas or numpy object. + ValueError + If ``append_indexes`` are not all the same length as ``X_t``. """ tic = time.time() @@ -495,7 +472,9 @@ def predict( raise ValueError( "ar_subsample_factor can only be used with on-grid predictions." ) - if not isinstance(X_t, (pd.DataFrame, pd.Series, pd.Index, np.ndarray)): + if not isinstance( + X_t, (pd.DataFrame, pd.Series, pd.Index, np.ndarray) + ): if append_indexes is not None: raise ValueError( "append_indexes can only be used with off-grid predictions." @@ -512,7 +491,9 @@ def predict( if mode == "off-grid" and X_t_mask is not None: # TODO: Unit test this - raise ValueError("X_t_mask can only be used with on-grid predictions.") + raise ValueError( + "X_t_mask can only be used with on-grid predictions." + ) if type(tasks) is Task: tasks = [tasks] @@ -571,9 +552,14 @@ def predict( X_t_mask_normalised = self.data_processor.map_coords(X_t_mask) X_t_arr = xarray_to_coord_array_normalised(X_t_normalised) # Remove points that lie outside the mask - X_t_arr = mask_coord_array_normalised(X_t_arr, X_t_mask_normalised) + X_t_arr = mask_coord_array_normalised( + X_t_arr, X_t_mask_normalised + ) else: - X_t_arr = (X_t_normalised["x1"].values, X_t_normalised["x2"].values) + X_t_arr = ( + X_t_normalised["x1"].values, + X_t_normalised["x2"].values, + ) elif mode == "off-grid": X_t_arr = X_t_normalised.reset_index()[["x1", "x2"]].values.T @@ -617,7 +603,9 @@ def predict( elif mode == "off-grid": # Repeat target locs for each date to create multiindex idxs = [(date, *idxs) for date in dates for idxs in X_t.index] - index = pd.MultiIndex.from_tuples(idxs, names=["time", *X_t.index.names]) + index = pd.MultiIndex.from_tuples( + idxs, names=["time", *X_t.index.names] + ) mean = pd.DataFrame(index=index, columns=target_var_IDs) std = pd.DataFrame(index=index, columns=target_var_IDs) if n_samples >= 1: @@ -630,7 +618,9 @@ def predict( index_samples = pd.MultiIndex.from_tuples( idxs_samples, names=["sample", "time", *X_t.index.names] ) - samples = pd.DataFrame(index=index_samples, columns=target_var_IDs) + samples = pd.DataFrame( + index=index_samples, columns=target_var_IDs + ) def unnormalise_pred_array(arr, **kwargs): var_IDs_flattened = [ @@ -663,7 +653,9 @@ def unnormalise_pred_array(arr, **kwargs): else: aux_at_targets = self.task_loader.aux_at_targets - for task in tqdm(tasks, position=0, disable=progress_bar < 1, leave=True): + for task in tqdm( + tasks, position=0, disable=progress_bar < 1, leave=True + ): task["X_t"] = [X_t_arr for _ in range(len(task["X_t"]))] # If passing auxiliary data, need to sample it at target locations @@ -692,7 +684,9 @@ def unnormalise_pred_array(arr, **kwargs): n_samples=n_samples, ar_subsample_factor=ar_subsample_factor, ) - samples_arr = samples_arr.reshape((n_samples, *mean_arr.shape)) + samples_arr = samples_arr.reshape( + (n_samples, *mean_arr.shape) + ) else: samples_arr = self.sample(dist, n_samples=n_samples) else: @@ -708,7 +702,9 @@ def unnormalise_pred_array(arr, **kwargs): n_samples=n_samples, ar_subsample_factor=ar_subsample_factor, ) - samples_arr = samples_arr.reshape((n_samples, *mean_arr.shape)) + samples_arr = samples_arr.reshape( + (n_samples, *mean_arr.shape) + ) else: samples_arr = self.sample(task, n_samples=n_samples) @@ -734,12 +730,16 
@@ def unnormalise_pred_array(arr, **kwargs): std.loc[:, task["time"], :, :] = std_arr if n_samples >= 1: for sample_i in range(n_samples): - samples.loc[:, sample_i, task["time"], :, :] = samples_arr[ - sample_i - ] + samples.loc[ + :, sample_i, task["time"], :, : + ] = samples_arr[sample_i] else: - mean.loc[:, task["time"], :, :].data[:, X_t_mask.data] = mean_arr - std.loc[:, task["time"], :, :].data[:, X_t_mask.data] = std_arr + mean.loc[:, task["time"], :, :].data[ + :, X_t_mask.data + ] = mean_arr + std.loc[:, task["time"], :, :].data[ + :, X_t_mask.data + ] = std_arr if n_samples >= 1: for sample_i in range(n_samples): samples.loc[:, sample_i, task["time"], :, :].data[ @@ -751,7 +751,9 @@ def unnormalise_pred_array(arr, **kwargs): std.loc[task["time"]] = std_arr.T if n_samples >= 1: for sample_i in range(n_samples): - samples.loc[sample_i, task["time"]] = samples_arr[sample_i].T + samples.loc[sample_i, task["time"]] = samples_arr[ + sample_i + ].T if mode == "on-grid": mean = mean.to_dataset(dim="data_var") @@ -777,6 +779,36 @@ def create_empty_spatiotemporal_xarray( prepend_dims: List[str] = None, prepend_coords: dict = None, ): + """ + ... + + Args: + X (xr.Dataset | xr.DataArray): + _description_ + dates (List): + _description_ + coord_names (..., optional): + _description_, by default {"x1": "x1", "x2": "x2"} + data_vars (List, optional): + _description_, by default ["var"] + prepend_dims (List[str], optional): + _description_, by default None + prepend_coords (dict, optional): + _description_, by default None + + Returns: + ...: ... + + Raises: + ValueError + ... + ValueError + ... + ValueError + ... + ValueError + ... + """ if prepend_dims is None: prepend_dims = [] if prepend_coords is None: @@ -794,9 +826,13 @@ def create_empty_spatiotemporal_xarray( # Assert uniform spacing if not np.allclose(np.diff(x1_predict), np.diff(x1_predict)[0]): - raise ValueError(f"Coordinate {coord_names['x1']} must be uniformly spaced.") + raise ValueError( + f"Coordinate {coord_names['x1']} must be uniformly spaced." + ) if not np.allclose(np.diff(x2_predict), np.diff(x2_predict)[0]): - raise ValueError(f"Coordinate {coord_names['x2']} must be uniformly spaced.") + raise ValueError( + f"Coordinate {coord_names['x2']} must be uniformly spaced." + ) if len(prepend_dims) != len(set(prepend_dims)): # TODO unit test @@ -814,7 +850,10 @@ def create_empty_spatiotemporal_xarray( } pred_ds = xr.Dataset( - {data_var: xr.DataArray(dims=dims, coords=coords) for data_var in data_vars} + { + data_var: xr.DataArray(dims=dims, coords=coords) + for data_var in data_vars + } ).astype("float32") # Convert time coord to pandas timestamps @@ -828,15 +867,35 @@ def create_empty_spatiotemporal_xarray( def increase_spatial_resolution( - X_t_normalised, resolution_factor, coord_names: dict = {"x1": "x1", "x2": "x2"} + X_t_normalised, + resolution_factor, + coord_names: dict = {"x1": "x1", "x2": "x2"}, ): + """ + ... + + Args: + X_t_normalised (...): + ... + resolution_factor (...): + ... + coord_names (..., optional): + ..., by default {"x1": "x1", "x2": "x2"} + + Returns: + ...: ... 
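
For the ``create_empty_spatiotemporal_xarray`` helper a few hunks above, whose docstring still carries placeholder descriptions, the construction it wraps boils down to the pattern below. The coordinate values and variable names are toy examples, mirroring the function body shown in the diff:

import numpy as np
import pandas as pd
import xarray as xr

dates = pd.date_range("2020-01-01", periods=3)
x1 = np.linspace(0.0, 1.0, 4)
x2 = np.linspace(0.0, 1.0, 5)

dims = ["time", "x1", "x2"]
coords = {"time": dates, "x1": x1, "x2": x2}
# One all-NaN float32 variable per requested data_var, as in the function body above.
pred_ds = xr.Dataset(
    {var: xr.DataArray(dims=dims, coords=coords) for var in ["mean", "std"]}
).astype("float32")
print(pred_ds["mean"].shape)  # (3, 4, 5)
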
+ """ # TODO wasteful to interpolate X_t_normalised assert isinstance(resolution_factor, (float, int)) assert isinstance(X_t_normalised, (xr.DataArray, xr.Dataset)) x1_name, x2_name = coord_names["x1"], coord_names["x2"] x1, x2 = X_t_normalised.coords[x1_name], X_t_normalised.coords[x2_name] - x1 = np.linspace(x1[0], x1[-1], int(x1.size * resolution_factor), dtype="float64") - x2 = np.linspace(x2[0], x2[-1], int(x2.size * resolution_factor), dtype="float64") + x1 = np.linspace( + x1[0], x1[-1], int(x1.size * resolution_factor), dtype="float64" + ) + x2 = np.linspace( + x2[0], x2[-1], int(x2.size * resolution_factor), dtype="float64" + ) X_t_normalised = X_t_normalised.interp( **{x1_name: x1, x2_name: x2}, method="nearest" ) diff --git a/deepsensor/model/nps.py b/deepsensor/model/nps.py index 2fb993e8..f99dc3c2 100644 --- a/deepsensor/model/nps.py +++ b/deepsensor/model/nps.py @@ -12,15 +12,13 @@ def convert_task_to_nps_args(task: Task): .. TODO move to ConvNP class? - Parameters - ---------- - task : :class:`~.data.task.Task` - Task object containing context and target sets. - - Returns - ------- - ... - ... + Args: + task (:class:`~.data.task.Task`): + Task object containing context and target sets. + + Returns: + tuple[list[tuple[numpy.ndarray, numpy.ndarray]], numpy.ndarray, numpy.ndarray, dict]: + ... """ context_data = list(zip(task["X_c"], task["Y_c"])) @@ -31,7 +29,9 @@ def convert_task_to_nps_args(task: Task): yt = task["Y_t"][0] elif len(task["X_t"]) > 1 and len(task["Y_t"]) > 1: # Multiple target sets, different target locations - xt = backend.nps.AggregateInput(*[(xt, i) for i, xt in enumerate(task["X_t"])]) + xt = backend.nps.AggregateInput( + *[(xt, i) for i, xt in enumerate(task["X_t"])] + ) yt = backend.nps.Aggregate(*task["Y_t"]) elif len(task["X_t"]) == 1 and len(task["Y_t"]) > 1: # Multiple target sets, same target locations @@ -58,22 +58,20 @@ def run_nps_model( """ Run ``neuralprocesses`` model. - Parameters - ---------- - neural_process : neuralprocesses.Model - Neural process model. - task : :class:`~.data.task.Task` - Task object containing context and target sets. - n_samples : int, optional - Number of samples to draw from the model. Defaults to ``None`` (single - sample). - requires_grad : bool, optional - Whether to require gradients. Defaults to ``False``. - - Returns - ------- - dist : neuralprocesses.distributions.Distribution - Distribution object containing the model's predictions. + Args: + neural_process (neuralprocesses.Model): + Neural process model. + task (:class:`~.data.task.Task`): + Task object containing context and target sets. + n_samples (int, optional): + Number of samples to draw from the model. Defaults to ``None`` + (single sample). + requires_grad (bool, optional): + Whether to require gradients. Defaults to ``False``. + + Returns: + neuralprocesses.distributions.Distribution: + Distribution object containing the model's predictions. """ context_data, xt, _, model_kwargs = convert_task_to_nps_args(task) if backend.str == "torch" and not requires_grad: @@ -85,7 +83,9 @@ def run_nps_model( context_data, xt, **model_kwargs, num_samples=n_samples ) else: - dist = neural_process(context_data, xt, **model_kwargs, num_samples=n_samples) + dist = neural_process( + context_data, xt, **model_kwargs, num_samples=n_samples + ) return dist @@ -93,19 +93,17 @@ def run_nps_model_ar(neural_process, task: Task, num_samples: int = 1): """ Run ``neural_process`` in AR mode. 
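
Returning to ``convert_task_to_nps_args`` above, the pairing of context sets into ``neuralprocesses``-style ``(x, y)`` tuples can be illustrated with a toy task dict (a real Task carries more fields; the arrays below are arbitrary):

import numpy as np

# Toy stand-in for a Task with two context sets and one target set.
task = {
    "X_c": [np.random.rand(2, 10), np.random.rand(2, 50)],
    "Y_c": [np.random.rand(1, 10), np.random.rand(1, 50)],
    "X_t": [np.random.rand(2, 20)],
    "Y_t": [np.random.rand(1, 20)],
}

# Each context set becomes an (x, y) pair, as in the function body above.
context_data = list(zip(task["X_c"], task["Y_c"]))
xt, yt = task["X_t"][0], task["Y_t"][0]  # single target set case
print(len(context_data), xt.shape, yt.shape)  # 2 (2, 20) (1, 20)
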
- Parameters - ---------- - neural_process : neuralprocesses.Model - Neural process model. - task : :class:`~.data.task.Task` - Task object containing context and target sets. - num_samples : int, optional - Number of samples to draw from the model. Defaults to 1. - - Returns - ------- - Tuple[..., ..., ..., ...] - Tuple of mean, variance, noiseless samples, and noisy samples. + Args: + neural_process (neuralprocesses.Model): + Neural process model. + task (:class:`~.data.task.Task`): + Task object containing context and target sets. + num_samples (int, optional): + Number of samples to draw from the model. Defaults to 1. + + Returns: + tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]: + Tuple of mean, variance, noiseless samples, and noisy samples. """ context_data, xt, _, _ = convert_task_to_nps_args(task) @@ -150,83 +148,80 @@ def construct_neural_process( needed, they must be explicitly passed to ``neuralprocesses`` constructor (not currently safe to use `**kwargs` here). - Parameters - ---------- - dim_x : int, optional - Dimensionality of the inputs. Defaults to 1. - dim_y : int, optional - Dimensionality of the outputs. Defaults to 1. - dim_yc : int or tuple[int], optional - Dimensionality of the outputs of the context set. You should set this - if the dimensionality of the outputs of the context set is not equal to - the dimensionality of the outputs of the target set. You should also - set this if you want to use multiple context sets. In that case, set - this equal to a tuple of integers indicating the respective output - dimensionalities. - dim_yt : int, optional - Dimensionality of the outputs of the target set. You should set this if - the dimensionality of the outputs of the target set is not equal to the - dimensionality of the outputs of the context set. - dim_aux_t : int, optional - Dimensionality of target-specific auxiliary variables. - points_per_unit : int, optional - Density of the internal discretisation. Defaults to 100. - likelihood : str, optional - Likelihood. Must be one of ``"cnp"`` (equivalently ``"het"``), - ``"gnp"`` (equivalently ``"lowrank"``), or ``"cnp-spikes-beta"`` - (equivalently ``"spikes-beta"``). Defaults to ``"cnp"``. - conv_arch : str, optional - Convolutional architecture to use. Must be one of - ``"unet[-res][-sep]"`` or ``"conv[-res][-sep]"``. Defaults to - ``"unet"``. - unet_channels: tuple[int], optional - Channels of every layer of the UNet. Defaults to six layers each with - 64 channels. - unet_kernels : int or tuple[int], optional - Sizes of the kernels in the UNet. Defaults to 5. - unet_resize_convs : bool, optional - Use resize convolutions rather than transposed convolutions in the - UNet. Defaults to ``False``. - unet_resize_conv_interp_method : str, optional - Interpolation method for the resize convolutions in the UNet. Can be - set to ``"bilinear"``. Defaults to "bilinear". - num_basis_functions : int, optional - Number of basis functions for the low-rank likelihood. Defaults to - 64. - dim_lv : int, optional - Dimensionality of the latent variable. Setting to >0 constructs a - latent neural process. Defaults to 0. - encoder_scales : float or tuple[float], optional - Initial value for the length scales of the set convolutions for the - context sets embeddings. Set to a tuple equal to the number of context - sets to use different values for each set. Set to a single value to use - the same value for all context sets. Defaults to - ``1 / points_per_unit``. 
- encoder_scales_learnable : bool, optional - Whether the encoder SetConv length scale(s) are learnable. Defaults to - ``False``. - decoder_scale : float, optional - Initial value for the length scale of the set convolution in the - decoder. Defaults to ``1 / points_per_unit``. - decoder_scale_learnable : bool, optional - Whether the decoder SetConv length scale(s) are learnable. Defaults to - ``False``. - aux_t_mlp_layers : tuple[int], optional - Widths of the layers of the MLP for the target-specific auxiliary - variable. Defaults to three layers of width 128. - epsilon : float, optional - Epsilon added by the set convolutions before dividing by the density - channel. Defaults to ``1e-2``. - - Returns - ------- - :class:`.model.Model`: - ConvNP model. - - Raises - ------ - NotImplementedError - If specified backend has no default dtype. + Args: + dim_x (int, optional): + Dimensionality of the inputs. Defaults to 1. + dim_y (int, optional): + Dimensionality of the outputs. Defaults to 1. + dim_yc (int or tuple[int], optional): + Dimensionality of the outputs of the context set. You should set + this if the dimensionality of the outputs of the context set is not + equal to the dimensionality of the outputs of the target set. You + should also set this if you want to use multiple context sets. In + that case, set this equal to a tuple of integers indicating the + respective output dimensionalities. + dim_yt (int, optional): + Dimensionality of the outputs of the target set. You should set + this if the dimensionality of the outputs of the target set is not + equal to the dimensionality of the outputs of the context set. + dim_aux_t (int, optional): + Dimensionality of target-specific auxiliary variables. + points_per_unit (int, optional): + Density of the internal discretisation. Defaults to 100. + likelihood (str, optional): + Likelihood. Must be one of ``"cnp"`` (equivalently ``"het"``), + ``"gnp"`` (equivalently ``"lowrank"``), or ``"cnp-spikes-beta"`` + (equivalently ``"spikes-beta"``). Defaults to ``"cnp"``. + conv_arch (str, optional): + Convolutional architecture to use. Must be one of + ``"unet[-res][-sep]"`` or ``"conv[-res][-sep]"``. Defaults to + ``"unet"``. + unet_channels (tuple[int], optional): + Channels of every layer of the UNet. Defaults to six layers each + with 64 channels. + unet_kernels (int or tuple[int], optional): + Sizes of the kernels in the UNet. Defaults to 5. + unet_resize_convs (bool, optional): + Use resize convolutions rather than transposed convolutions in the + UNet. Defaults to ``False``. + unet_resize_conv_interp_method (str, optional): + Interpolation method for the resize convolutions in the UNet. Can + be set to ``"bilinear"``. Defaults to "bilinear". + num_basis_functions (int, optional): + Number of basis functions for the low-rank likelihood. Defaults to + 64. + dim_lv (int, optional): + Dimensionality of the latent variable. Setting to >0 constructs a + latent neural process. Defaults to 0. + encoder_scales (float or tuple[float], optional): + Initial value for the length scales of the set convolutions for the + context sets embeddings. Set to a tuple equal to the number of + context sets to use different values for each set. Set to a single + value to use the same value for all context sets. Defaults to + ``1 / points_per_unit``. + encoder_scales_learnable (bool, optional): + Whether the encoder SetConv length scale(s) are learnable. + Defaults to ``False``. 
+ decoder_scale (float, optional): + Initial value for the length scale of the set convolution in the + decoder. Defaults to ``1 / points_per_unit``. + decoder_scale_learnable (bool, optional): + Whether the decoder SetConv length scale(s) are learnable. Defaults + to ``False``. + aux_t_mlp_layers (tuple[int], optional): + Widths of the layers of the MLP for the target-specific auxiliary + variable. Defaults to three layers of width 128. + epsilon (float, optional): + Epsilon added by the set convolutions before dividing by the + density channel. Defaults to ``1e-2``. + + Returns: + :class:`.model.Model`: + ConvNP model. + + Raises: + NotImplementedError + If specified backend has no default dtype. """ if likelihood == "cnp": likelihood = "het" @@ -247,7 +242,9 @@ def construct_neural_process( dtype = tf.float32 else: - raise NotImplementedError(f"Backend {backend.str} has no default dtype.") + raise NotImplementedError( + f"Backend {backend.str} has no default dtype." + ) neural_process = backend.nps.construct_convgnp( dim_x=dim_x, @@ -281,19 +278,19 @@ def compute_encoding_tensor(model, task: Task): """ Compute the encoding tensor for a given task. - Parameters - ---------- - model : ... - Model object. - task : :class:`~.data.task.Task` - Task object containing context and target sets. - - Returns - ------- - encoding : :class:`numpy:numpy.ndarray` - Encoding tensor? #TODO + Args: + model (...): + Model object. + task (:class:`~.data.task.Task`): + Task object containing context and target sets. + + Returns: + encoding : :class:`numpy:numpy.ndarray` + Encoding tensor? #TODO """ - neural_process_encoder = backend.nps.Model(model.model.encoder, lambda x: x) + neural_process_encoder = backend.nps.Model( + model.model.encoder, lambda x: x + ) task = model.modify_task(task) encoding = B.to_numpy(run_nps_model(neural_process_encoder, task)) return encoding diff --git a/deepsensor/train/train.py b/deepsensor/train/train.py index 366ec87a..aa60bbb5 100644 --- a/deepsensor/train/train.py +++ b/deepsensor/train/train.py @@ -13,18 +13,16 @@ def set_gpu_default_device() -> None: """ Set default GPU device for the backend. - Raises - ------ - RuntimeError - If no GPU is available. - RuntimeError - If backend is not supported. - NotImplementedError - If backend is not supported. - - Returns - ------- - None. + Raises: + RuntimeError + If no GPU is available. + RuntimeError + If backend is not supported. + NotImplementedError + If backend is not supported. + + Returns: + None. """ if deepsensor.backend.str == "torch": # Run on GPU if available @@ -35,7 +33,9 @@ def set_gpu_default_device() -> None: torch.set_default_device("cuda") B.set_global_device("cuda:0") else: - raise RuntimeError("No GPU available: torch.cuda.is_available() == False") + raise RuntimeError( + "No GPU available: torch.cuda.is_available() == False" + ) elif deepsensor.backend.str == "tf": # Run on GPU if available import tensorflow as tf @@ -47,10 +47,14 @@ def set_gpu_default_device() -> None: ) B.set_global_device("GPU:0") else: - raise RuntimeError("No GPU available: tf.test.is_gpu_available() == False") + raise RuntimeError( + "No GPU available: tf.test.is_gpu_available() == False" + ) else: - raise NotImplementedError(f"Backend {deepsensor.backend.str} not implemented") + raise NotImplementedError( + f"Backend {deepsensor.backend.str} not implemented" + ) def train_epoch( @@ -65,28 +69,25 @@ def train_epoch( """ Train model for one epoch. - Parameters - ---------- - model : :class:`~.model.convnp.ConvNP` - Model to train. 
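
Pulling the ``construct_neural_process`` and ``set_gpu_default_device`` hunks above together, a hedged end-to-end sketch follows. The argument values are illustrative, and it assumes the PyTorch backend is installed (``deepsensor.tensorflow`` works analogously) so that a default dtype exists:

import deepsensor.torch  # selects the backend and default dtype; or deepsensor.tensorflow
from deepsensor.model.nps import construct_neural_process
from deepsensor.train.train import set_gpu_default_device

set_gpu_default_device()  # raises RuntimeError if no GPU is available, as documented

# Two context sets (1 and 3 channels) and a single-channel target set.
neural_process = construct_neural_process(
    dim_x=2,
    dim_yc=(1, 3),
    dim_yt=1,
    points_per_unit=100,
    likelihood="cnp",
    unet_channels=(64, 64, 64, 64, 64, 64),
)
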
- tasks : List[:class:`~.data.task.Task`] - List of tasks to train on. - lr : float, optional - Learning rate, by default 5e-5. - batch_size : int, optional - Batch size. Defaults to None. If None, no batching is performed. - opt : Optimizer, optional - TF or Torch optimizer. Defaults to None. If None, - :class:`tensorflow:tensorflow.keras.optimizer.Adam` is used. - progress_bar : bool, optional - Whether to display a progress bar. Defaults to False. - tqdm_notebook : bool, optional - Whether to use a notebook progress bar. Defaults to False. - - Returns - ------- - List[float] - List of losses for each task/batch. + Args: + model (:class:`~.model.convnp.ConvNP`): + Model to train. + tasks (List[:class:`~.data.task.Task`]): + List of tasks to train on. + lr (float, optional): + Learning rate, by default 5e-5. + batch_size (int, optional): + Batch size. Defaults to None. If None, no batching is performed. + opt (Optimizer, optional): + TF or Torch optimizer. Defaults to None. If None, + :class:`tensorflow:tensorflow.keras.optimizer.Adam` is used. + progress_bar (bool, optional): + Whether to display a progress bar. Defaults to False. + tqdm_notebook (bool, optional): + Whether to use a notebook progress bar. Defaults to False. + + Returns: + List[float]: List of losses for each task/batch. """ if deepsensor.backend.str == "tf": import tensorflow as tf @@ -102,7 +103,9 @@ def train_step(tasks): for task in tasks: task_losses.append(model.loss_fn(task, normalise=True)) mean_batch_loss = B.mean(B.stack(*task_losses)) - grads = tape.gradient(mean_batch_loss, model.model.trainable_weights) + grads = tape.gradient( + mean_batch_loss, model.model.trainable_weights + ) opt.apply_gradients(zip(grads, model.model.trainable_weights)) return mean_batch_loss @@ -125,12 +128,16 @@ def train_step(tasks): return mean_batch_loss.detach().cpu().numpy() else: - raise NotImplementedError(f"Backend {deepsensor.backend.str} not implemented") + raise NotImplementedError( + f"Backend {deepsensor.backend.str} not implemented" + ) tasks = np.random.permutation(tasks) if batch_size is not None: - n_batches = len(tasks) // batch_size # Note that this will drop the remainder + n_batches = ( + len(tasks) // batch_size + ) # Note that this will drop the remainder else: n_batches = len(tasks) diff --git a/tests/test_task_loader.py b/tests/test_task_loader.py index c329b206..53142345 100644 --- a/tests/test_task_loader.py +++ b/tests/test_task_loader.py @@ -74,7 +74,9 @@ def _gen_task_loader_call_args(self, n_context_sets, n_target_sets): "all", np.zeros((2, 1)), ]: - yield [sampling_method] * n_context_sets, [sampling_method] * n_target_sets + yield [sampling_method] * n_context_sets, [ + sampling_method + ] * n_target_sets def test_load_dask(self): """Test loading dask data""" @@ -107,16 +109,6 @@ def data_type_ID_to_data(set_list): E.g. ["xr", "pd", "xr"] -> [self.da, self.df, self.da] E.g. "xr" -> self.da - - Parameters - ---------- - set_list : list[str] | str - List of data type IDs or single data type ID. - - Returns - ------- - list[xr.DataArray] | list[pd.DataFrame] | xr.DataArray | pd.DataFrame - List of data objects or single data object. 
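
Stepping back to ``train_epoch`` documented above, a hedged sketch of how it is typically driven (the task list, model, and epoch count are assumed to exist and are illustrative):

import numpy as np
from deepsensor.train.train import train_epoch

# `model` is a ConvNP-style model and `train_tasks` a list of Task objects.
epoch_losses = []
for epoch in range(10):
    batch_losses = train_epoch(model, train_tasks, lr=5e-5, batch_size=16)
    epoch_losses.append(np.mean(batch_losses))
    print(f"Epoch {epoch}: mean loss {epoch_losses[-1]:.3f}")
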
""" if set_list == "xr": return self.da @@ -198,7 +190,9 @@ def test_invalid_sampling_strat(self): target=self.df, ), ]: - for invalid_sampling_strategy in invalid_context_sampling_strategies: + for ( + invalid_sampling_strategy + ) in invalid_context_sampling_strategies: with self.assertRaises(InvalidSamplingStrategyError): task = tl("2020-01-01", invalid_sampling_strategy) @@ -212,7 +206,9 @@ def test_links_gapfill_da(self) -> None: da_with_nans = copy.deepcopy(self.da) nan_idxs = np.random.randint(0, da_with_nans.size, size=10_000) da_with_nans.data.ravel()[nan_idxs] = np.nan - tl = TaskLoader(context=da_with_nans, target=da_with_nans, links=[(0, 0)]) + tl = TaskLoader( + context=da_with_nans, target=da_with_nans, links=[(0, 0)] + ) # This should not raise an error task = tl("2020-01-01", "gapfill", "gapfill") diff --git a/tests/utils.py b/tests/utils.py index bdc653b0..99fcb470 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,23 +11,21 @@ def gen_random_data_xr( """ Generate random xarray data. - Parameters - ---------- - coords : dict - Coordinates of the data. - dims : list, optional - Dimensions of the data. Defaults to None. If None, dims is inferred - from coords. This arg can be used to change the order of the - dimensions. - data_vars : list, optional - Data variables. Defaults to None. If None, variable is an - :class:`xarray.DataArray`. If not None, variable is an - :class:`xarray.Dataset` containing the data_vars. + Args: + coords (dict): + Coordinates of the data. + dims (list, optional): + Dimensions of the data. Defaults to None. If None, dims is inferred + from coords. This arg can be used to change the order of the + dimensions. + data_vars (list, optional): + Data variables. Defaults to None. If None, variable is an + :class:`xarray.DataArray`. If not None, variable is an + :class:`xarray.Dataset` containing the data_vars. - Returns - ------- - da : :class:`xarray.DataArray` | :class:`xarray.Dataset` - Random xarray data. + Returns: + da (:class:`xarray.DataArray` | :class:`xarray.Dataset`): + Random xarray data. """ if dims is None: shape = tuple([len(coords[dim]) for dim in coords]) @@ -47,24 +45,22 @@ def gen_random_data_pandas(coords: dict, dims: list = None, cols: list = None): """ Generate random pandas data. - Parameters - ---------- - coords : dict - Coordinates of the data. This will be used to construct a MultiIndex - using pandas.MultiIndex.from_product. - dims : list, optional - Dimensions of the data. Defaults to None. If None, dims is inferred - from coords. This arg can be used to change the order of the - MultiIndex. - cols : list, optional - Columns of the data. Defaults to None. If None, generate a - :class:`pandas.Series` with an arbitrary name. If not None, cols is - used to construct a :class:`pandas.DataFrame`. + Args: + coords (dict): + Coordinates of the data. This will be used to construct a + MultiIndex using pandas.MultiIndex.from_product. + dims (list, optional): + Dimensions of the data. Defaults to None. If None, dims is inferred + from coords. This arg can be used to change the order of the + MultiIndex. + cols (list, optional): + Columns of the data. Defaults to None. If None, generate a + :class:`pandas.Series` with an arbitrary name. If not None, cols is + used to construct a :class:`pandas.DataFrame`. - Returns - ------- - df : :class:`pandas.Series` | :class:`pandas.DataFrame` - Random pandas data. + Returns: + :class:`pandas.Series` | :class:`pandas.DataFrame` + Random pandas data. 
""" if dims is None: dims = list(coords.keys())