diff --git a/notebooks/working_notebook.ipynb b/notebooks/working_notebook.ipynb new file mode 100644 index 0000000..e40434e --- /dev/null +++ b/notebooks/working_notebook.ipynb @@ -0,0 +1,1397 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### First implementation of RGDR's dbscan-based clustering & dimensionality reduction\n", + "This notebook outlines the current status of the RGDR implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we will load in some example data, and resample them using the `AdventCalendar`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import s2spy.time\n", + "import s2spy.rgdr\n", + "\n", + "file_path = '../tests/test_rgdr/test_data'\n", + "field = xr.open_dataset(f'{file_path}/sst_daily_1979-2018_5deg_Pacific_175_240E_25_50N.nc')\n", + "target = xr.open_dataset(f'{file_path}/tf5_nc5_dendo_80d77.nc')\n", + "\n", + "cal = s2spy.time.AdventCalendar((8, 31), freq = \"30d\")\n", + "field_resampled = cal.resample(field)\n", + "target_resampled = cal.resample(target)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The target timeseries comes from already pre-clustered land surface temperature data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "target_timeseries = target_resampled.sel(cluster=3).ts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The precursor field consists of sea surface temperature data, with latitude and longitude dimensions:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With `_rgdr.correlation` we can determine the correlation coefficients and p-values" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "field_resampled['corr'], field_resampled['p_val'] = (\n", + " s2spy.rgdr._rgdr.correlation(field_resampled.sst,\n", + " target_timeseries.sel(i_interval=0),\n", + " corr_dim='anchor_year')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:      (anchor_year: 39, i_interval: 12, latitude: 5, longitude: 13)\n",
+       "Coordinates:\n",
+       "    index        (anchor_year, i_interval) int64 0 1 2 3 4 ... 464 465 466 467\n",
+       "    interval     (anchor_year, i_interval) object (1980-08-01, 1980-08-31] .....\n",
+       "  * latitude     (latitude) float64 47.5 42.5 37.5 32.5 27.5\n",
+       "  * longitude    (longitude) float64 177.5 182.5 187.5 ... 227.5 232.5 237.5\n",
+       "  * anchor_year  (anchor_year) int64 1980 1981 1982 1983 ... 2015 2016 2017 2018\n",
+       "  * i_interval   (i_interval) int64 0 1 2 3 4 5 6 7 8 9 10 11\n",
+       "    target       (i_interval) bool True False False False ... False False False\n",
+       "    tfreq        int64 5\n",
+       "    n_clusters   int64 6\n",
+       "    cluster      int64 3\n",
+       "Data variables:\n",
+       "    sst          (anchor_year, i_interval, latitude, longitude) float64 284.2...\n",
+       "    corr         (i_interval, latitude, longitude) float64 -0.08036 ... -0.00...\n",
+       "    p_val        (i_interval, latitude, longitude) float64 0.6267 ... 0.9864
" + ], + "text/plain": [ + "\n", + "Dimensions: (anchor_year: 39, i_interval: 12, latitude: 5, longitude: 13)\n", + "Coordinates:\n", + " index (anchor_year, i_interval) int64 0 1 2 3 4 ... 464 465 466 467\n", + " interval (anchor_year, i_interval) object (1980-08-01, 1980-08-31] .....\n", + " * latitude (latitude) float64 47.5 42.5 37.5 32.5 27.5\n", + " * longitude (longitude) float64 177.5 182.5 187.5 ... 227.5 232.5 237.5\n", + " * anchor_year (anchor_year) int64 1980 1981 1982 1983 ... 2015 2016 2017 2018\n", + " * i_interval (i_interval) int64 0 1 2 3 4 5 6 7 8 9 10 11\n", + " target (i_interval) bool True False False False ... False False False\n", + " tfreq int64 5\n", + " n_clusters int64 6\n", + " cluster int64 3\n", + "Data variables:\n", + " sst (anchor_year, i_interval, latitude, longitude) float64 284.2...\n", + " corr (i_interval, latitude, longitude) float64 -0.08036 ... -0.00...\n", + " p_val (i_interval, latitude, longitude) float64 0.6267 ... 0.9864" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "field_resampled" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot `lag 1` correlation map:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "field_resampled.corr.sel(i_interval=1).plot(cmap='viridis', size=3, aspect=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we can cluster the remaining grid points with `DBSCAN`.\n", + "\n", + "Compare these results with the [prototype-notebook](https://github.com/AI4S2S/s2spy/blob/6da3233168bc7b083b084a14f7afe5618b8c82c9/notebooks/prototype_RGDR.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import s2spy.rgdr\n", + "\n", + "clusters = s2spy.rgdr._rgdr.masked_spherical_dbscan(\n", + " field_resampled.sel(i_interval=1),\n", + " eps_km=600,\n", + " alpha=0.05,\n", + " min_area_km2=3000**2\n", + ")\n", + "\n", + "clusters.cluster_labels.plot(cmap='viridis', size=3, aspect=3)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above plot showing clusters will probably be useful for users to understand what is going on,\n", + "and will be required to fine-tune the parameters `eps_km` and `min_area_km2`. \n", + "\n", + "(Note: negative labels denote clusters with a negative correlation, positive labels clusters\n", + "with a positive correlation, and `0` labelled data is not within a cluster.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`_rgdr.reduce_to_clusters` will both perform the clustering and return means per cluster:\n", + "\n", + "(and thus does not provide direct feedback on which geographical regions the clusters represent)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:         (anchor_year: 39, cluster_labels: 3)\n",
+       "Coordinates:\n",
+       "    index           (anchor_year) int64 1 13 25 37 49 61 ... 409 421 433 445 457\n",
+       "    interval        (anchor_year) object (1980-07-02, 1980-08-01] ... (2018-0...\n",
+       "  * anchor_year     (anchor_year) int64 1980 1981 1982 1983 ... 2016 2017 2018\n",
+       "    i_interval      int64 1\n",
+       "    target          bool False\n",
+       "    tfreq           int64 5\n",
+       "    n_clusters      int64 6\n",
+       "    cluster         int64 3\n",
+       "  * cluster_labels  (cluster_labels) float64 -1.0 0.0 1.0\n",
+       "Data variables:\n",
+       "    sst             (cluster_labels, anchor_year) float64 290.8 291.0 ... 298.2\n",
+       "    corr            (cluster_labels) float64 -0.3901 -0.06555 0.3602\n",
+       "    p_val           (cluster_labels) float64 0.01541 0.4659 0.02706\n",
+       "    area            (cluster_labels) float64 3.961e+06 3.869e+06 4.301e+06
" + ], + "text/plain": [ + "\n", + "Dimensions: (anchor_year: 39, cluster_labels: 3)\n", + "Coordinates:\n", + " index (anchor_year) int64 1 13 25 37 49 61 ... 409 421 433 445 457\n", + " interval (anchor_year) object (1980-07-02, 1980-08-01] ... (2018-0...\n", + " * anchor_year (anchor_year) int64 1980 1981 1982 1983 ... 2016 2017 2018\n", + " i_interval int64 1\n", + " target bool False\n", + " tfreq int64 5\n", + " n_clusters int64 6\n", + " cluster int64 3\n", + " * cluster_labels (cluster_labels) float64 -1.0 0.0 1.0\n", + "Data variables:\n", + " sst (cluster_labels, anchor_year) float64 290.8 291.0 ... 298.2\n", + " corr (cluster_labels) float64 -0.3901 -0.06555 0.3602\n", + " p_val (cluster_labels) float64 0.01541 0.4659 0.02706\n", + " area (cluster_labels) float64 3.961e+06 3.869e+06 4.301e+06" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clustered_data = s2spy.rgdr._rgdr.reduce_to_clusters(\n", + " field_resampled.sel(i_interval=1),\n", + " eps_km=600, \n", + " alpha=0.04\n", + ")\n", + "clustered_data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "clustered_data['sst'] -= clustered_data['sst'].mean(dim='anchor_year')\n", + "\n", + "clustered_data['sst'].sel(cluster_labels=1).plot.line(x='anchor_year')\n", + "clustered_data['sst'].sel(cluster_labels=-1).plot.line(x='anchor_year')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Selecting different intervals for the precursor data (i.e. lags) will have a significant impact on the clusters that will be found:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[array([-1., 0.]),\n", + " array([-2., -1., 0., 1.]),\n", + " array([-1., 0., 1.]),\n", + " array([-1., 0., 1.]),\n", + " array([-2., -1., 0.]),\n", + " array([-2., -1., 0.]),\n", + " array([-3., -2., -1., 0.]),\n", + " array([-1., 0.]),\n", + " array([-1., 0.]),\n", + " array([-1., 0.]),\n", + " array([0.]),\n", + " array([-1., 0.])]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_intervals = field_resampled.i_interval.size\n", + "cdata = [0]*n_intervals\n", + "for interval in range(n_intervals):\n", + " cdata[interval] = s2spy.rgdr._rgdr.reduce_to_clusters(\n", + " field_resampled.sel(i_interval=interval), eps_km=600, alpha=0.05)\n", + "\n", + "[c.cluster_labels.values for c in cdata]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.5 ('ai4s2s')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "orig_nbformat": 4, + 
"vscode": { + "interpreter": { + "hash": "a87ac8a6238ac8fffcd070769ea0ffc634a0ba702c52b706c3554dda7777b3fd" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/s2spy/rgdr/__init__.py b/s2spy/rgdr/__init__.py index e8e3421..76e2a89 100644 --- a/s2spy/rgdr/__init__.py +++ b/s2spy/rgdr/__init__.py @@ -1,4 +1,3 @@ """Response Guided Dimensionality Reduction.""" -from . import _map_analysis -from . import _map_regions +from . import _rgdr from ._rgdr import RGDR diff --git a/s2spy/rgdr/_map_analysis.py b/s2spy/rgdr/_map_analysis.py deleted file mode 100644 index f20194b..0000000 --- a/s2spy/rgdr/_map_analysis.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Spatial-temporal data analysis utils. - -A toolbox for spatial-temporal data analysis, including regression, -correlation, auto-correlation and relevant utilities functions. -""" -import numpy as np -import xarray as xr -from scipy.stats import pearsonr as _pearsonr - - -def _pearsonr_nan(x: np.ndarray, y: np.ndarray): - """NaN friendly implementation of scipy.stats.pearsonr. Calculates the correlation - coefficient between two arrays, as well as the p-value of this correlation. - - Args: - x: 1-D array - y: 1-D array - Returns: - r_coefficient - p_value - - """ - if np.any(np.isnan(x)) or np.any(np.isnan(y)): - return np.nan, np.nan - return _pearsonr(x, y) - - -def correlation(field: xr.DataArray, target: xr.DataArray, corr_dim: str = "time"): - """Calculate correlation maps. - - Args: - field: Spatial data with a dimension named `corr_dim`, over which each - location should have the Pearson correlation coefficient calculated with the - target data. - target: Data which has to be correlated with the spatial data. Requires a - dimension named `corr_dim`. - - Returns: - r_coefficient: DataArray filled with the correlation coefficient for each - non-`corr_dim` coordinate. - p_value: DataArray filled with the two-tailed p-values for each computed - correlation coefficient. 
- """ - assert ( - corr_dim in target.dims - ), f"input target does not have contain the '{corr_dim}' dimension" - assert ( - corr_dim in field.dims - ), f"input field does not have contain the '{corr_dim}' dimension" - assert np.all( - [dim in field.dims for dim in target.dims] - ), "Field and target dims do not match" - - return xr.apply_ufunc( - _pearsonr_nan, - field, - target, - input_core_dims=[[corr_dim], [corr_dim]], - vectorize=True, - output_core_dims=[[], []], - ) - - -def partial_correlation(field, target, z): - """Calculate partial correlation maps.""" - raise NotImplementedError - - -def regression(field, target): - """Regression analysis on entire maps. - - Methods include Linear, Ridge, Lasso. - """ - raise NotImplementedError - - -def save_map(): - """Save calculated coefficients. - - Store calculated coefficients and significance values, and - save them as netcdf files. - """ - raise NotImplementedError - - -def load_map(): - """Load coefficients from saved netcdf maps.""" - raise NotImplementedError diff --git a/s2spy/rgdr/_map_regions.py b/s2spy/rgdr/_map_regions.py deleted file mode 100644 index 791b101..0000000 --- a/s2spy/rgdr/_map_regions.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Regions clustering utils. - -A module for clustering regions based on the given correlation -between spatial fields and target timeseries. -""" - -def spatial_mean(): - """Calculate 1-d timeseries for each precursor region. Precursor regions - are integer label masks within the np.ndarray labels. - """ - raise NotImplementedError - -def cluster_dbscan(): - """Perform DBSCAN clustering from vector array or distance matrix. - - Density-Based Spatial Clustering of Applications with Noise (DBSCAN). - Clusters gridcells together which are of the same sign and in proximity to - each other using DBSCAN. - - The input data will be processed in this function to ensure that the distance - is free of the impact from spherical curvature. 
The actual Euclidean distance - will be obtained and passed to the DBSCAN clustering function. - """ - raise NotImplementedError diff --git a/s2spy/rgdr/_rgdr.py b/s2spy/rgdr/_rgdr.py index 0aa8121..ff89f25 100644 --- a/s2spy/rgdr/_rgdr.py +++ b/s2spy/rgdr/_rgdr.py @@ -1,23 +1,22 @@ """Response Guided Dimensionality Reduction.""" +from typing import Tuple +import numpy as np +import xarray as xr +from scipy.stats import pearsonr as _pearsonr +from sklearn.cluster import DBSCAN class RGDR: """Response Guided Dimensionality Reduction.""" + def __init__(self, timeseries, lag_shift: int = 0): """Instantiate an RGDR operator.""" self.timeseries = timeseries self.lag_shift = lag_shift - def lag_shifting(self): - """loop through data with lag shift upto the given `lag_shift` value.""" - # To do: loop through lags - # To do: call `map_analysis` and `map_regions` and manage - # inputs and outputs. - raise NotImplementedError - def _map_analysis(self): """Perform map analysis. - + Use chosen method from `map_analysis` and perform map analysis. """ raise NotImplementedError @@ -33,3 +32,264 @@ def _clustering_regions(self): def fit(self, data): """Perform RGDR calculations with given data.""" raise NotImplementedError + + def transform(self, data): + """Apply RGDR on data, based on the fit model""" + raise NotImplementedError + + +radius_earth_km = 6371 +surface_area_earth_km2 = 5.1e8 + + +def spherical_area(latitude: float, dlat: float, dlon: float = None) -> float: + """Approximate the area of a square grid cell on a spherical (!) earth. + Returns the area in square kilometers of earth surface. + + Args: + latitude (float): Latitude at the center of the grid cell (deg) + dlat (float): Latitude grid resolution (deg) + dlon (float): Longitude grid resolution (deg), optional in case of a square grid. 
+ + Returns: + float: Area of the grid cell (km^2) + """ + if dlon is None: + dlon = dlat + dlon = np.radians(dlon) + dlat = np.radians(dlat) + + lat = np.radians(latitude) + h = np.sin(lat + dlat / 2) - np.sin(lat - dlat / 2) + spherical_area = h * dlon / np.pi * 4 + + return spherical_area * surface_area_earth_km2 + + +def cluster_area(ds: xr.Dataset, cluster_label: float) -> float: + """Determines the total area of a cluster. Requires the input dataset to have the + variables `area` and `cluster_labels`. + + Args: + ds (xr.Dataset): Dataset containing the variables `area` and `cluster_labels`. + cluster_label (float): The label for which the area should be calculated. + + Returns: + float: Area of the cluster `cluster_label`. + """ + return ( + ds["area"].where(ds["cluster_labels"] == cluster_label).sum(skipna=True).values + ) + + +def remove_small_area_clusters(ds: xr.Dataset, min_area_km2: float) -> xr.Dataset: + """Removes the clusters where the area is under the input threshold. + + Args: + ds (xr.Dataset): Dataset containing `cluster_labels` and `area`. + min_area_km2 (float): The minimum allowed area of each cluster + + Returns: + xr.Dataset: The input dataset with the labels of the clusters set to 0 when the + area of the cluster is under the `min_area_km2` threshold. + """ + clusters = np.unique(ds["cluster_labels"]) + areas = [cluster_area(ds, c) for c in clusters] + valid_clusters = np.array([c for c, a in zip(clusters, areas) if a > min_area_km2]) + + ds["cluster_labels"] = ds["cluster_labels"].where( + np.isin(ds["cluster_labels"], valid_clusters), 0 + ) + + return ds + + +def weighted_groupby(ds: xr.Dataset, groupby: str, weight: str, method: str = "mean") -> xr.Dataset: + """Apply a weighted reduction after a groupby call. xarray does not currently support + combining `weighted` and `groupby`. An open PR adds supports for this functionality + (https://github.com/pydata/xarray/pull/5480), but this branch was never merged. 
+ + Args: + ds (xr.Dataset): Dataset containing the coordinates or variables specified in + the `groupby` and `weight` kwargs. + groupby (str): Coordinate which should be used to make the groups. + weight (str): Variable in the Dataset containing the weights that should be used. + method (str): Method that should be used to reduce the dataset, by default + 'mean'. Supports any of xarray's builtin methods, e.g. median, min, max. + + Returns: + xr.Dataset: Dataset reduced using the `groupby` coordinate, using weights = + based on `ds[weight]`. + """ + groups = ds.groupby(groupby) + + # find stacked dim name: + group_dims = list(groups)[0][1].dims # Get ds of first group + stacked_dims = [dim for dim in group_dims.keys() if "stacked_" in dim] + + reduced_data = [ + getattr(g.weighted(g[weight]), method)(dim=stacked_dims) for _, g in groups + ] + return xr.concat(reduced_data, dim=groupby) + + +def masked_spherical_dbscan( + ds: xr.Dataset, alpha: float = 0.05, eps_km: float = 600, min_area_km2: float = None +) -> xr.Dataset: + """Determines the clusters based on sklearn's DBSCAN implementation. Alpha determines + the mask based on the minimum p_value. Grouping can be adjusted using the `eps_km` + kwarg. Cluster labels are negative for areas with a negative correlation coefficient + and positive for areas with a positive correlation coefficient. Areas without any + significant correlation are put in the cluster labelled '0'. + + Args: + ds (xr.Dataset): Dataset containing 'latitude' and 'longitude' dimensions in + degrees. Must also contain 'p_val' and 'corr' to base the groups on. + alpha (float): Value below which the correlation is significant enough to be + considered + eps_km (float): The maximum distance (in km) between two samples for one to be + considered as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most important + DBSCAN parameter to choose appropriately. 
+ min_area_km2 (float): The minimum area of a cluster. Clusters smaller than this + minimum area will be discarded. + + Returns: + xr.Dataset: Dataset grouped by the DBSCAN clusters. + """ + ds = ds.stack(coord=["latitude", "longitude"]) + coords = np.asarray(list(ds["coord"].values)) # turn array of tuples to 2d-array + coords = np.radians(coords) + + # Prepare labels, default value is 0 (not in cluster) + labels = np.zeros(len(coords)) + + for sign, sign_mask in zip([1, -1], [ds["corr"] >= 0, ds["corr"] < 0]): + mask = np.logical_and(ds["p_val"] < alpha, sign_mask) + + if np.sum(mask) > 0: # Check if the mask contains any points to cluster + db = DBSCAN( + eps=eps_km / radius_earth_km, + min_samples=1, + algorithm="auto", + metric="haversine", + ).fit(coords[mask]) + + labels[mask] = sign * (db.labels_ + 1) + + ds["cluster_labels"] = ("coord", labels) + + ds = ds.unstack(("coord")) + + dlat = np.abs(ds.latitude.values[1] - ds.latitude.values[0]) + dlon = np.abs(ds.longitude.values[1] - ds.longitude.values[0]) + ds["area"] = spherical_area(ds.latitude, dlat, dlon) + + if min_area_km2: + ds = remove_small_area_clusters(ds, min_area_km2) + return ds + + +def reduce_to_clusters( + ds: xr.Dataset, alpha: float = 0.05, eps_km: float = 600, min_area_km2: float = None +) -> xr.Dataset: + """Perform DBSCAN clustering on a prepared Dataset, and then group the data by their + determined clusters, taking the weighted mean. The weight is based on the area of + each grid cell. + + Density-Based Spatial Clustering of Applications with Noise (DBSCAN). + Clusters gridcells together which are of the same sign and in proximity to + each other using DBSCAN. + + The input data will be processed in this function to ensure that the distance + is free of the impact from spherical curvature. The actual geodesic distance + will be obtained and passed to the DBSCAN clustering function. + + Args: + ds: Dataset prepared to have p_val and corr. 
+ alpha (float): Value below which the correlation is significant enough to be + considered + eps_km (float): The maximum distance (in km) between two samples for one to be + considered as in the neighborhood of the other. This is not a maximum bound + on the distances of points within a cluster. This is the most important + DBSCAN parameter to choose appropriately. + + """ + ds = masked_spherical_dbscan( + ds, alpha=alpha, eps_km=eps_km, min_area_km2=min_area_km2 + ) + + ds = weighted_groupby(ds, groupby="cluster_labels", weight="area") + + return ds + + +def _pearsonr_nan(x: np.ndarray, y: np.ndarray) -> Tuple[float, float]: + """NaN friendly implementation of scipy.stats.pearsonr. Calculates the correlation + coefficient between two arrays, as well as the p-value of this correlation. However, + instead of raising an error when encountering NaN values, this function will return + both the correlation coefficient and the p-value as NaN. + + Args: + x: 1-D array + y: 1-D array + Returns: + r_coefficient + p_value + + """ + if np.any(np.isnan(x)) or np.any(np.isnan(y)): + return np.nan, np.nan + return _pearsonr(x, y) + + +def correlation( + field: xr.DataArray, target: xr.DataArray, corr_dim: str = "time" +) -> Tuple[xr.DataArray, xr.DataArray]: + """Calculate correlation maps. + + Args: + field: Spatial data with a dimension named `corr_dim`, over which each + location should have the Pearson correlation coefficient calculated with the + target data. + target: Data which has to be correlated with the spatial data. Requires a + dimension named `corr_dim`. + corr_dim: Dimension over which the correlation coefficient should be calculated. + + Returns: + r_coefficient: DataArray filled with the correlation coefficient for each + non-`corr_dim` coordinate. + p_value: DataArray filled with the two-tailed p-values for each computed + correlation coefficient. 
+ """ + assert ( + corr_dim in target.dims + ), f"input target does not have contain the '{corr_dim}' dimension" + assert ( + corr_dim in field.dims + ), f"input field does not have contain the '{corr_dim}' dimension" + assert np.all( + [dim in field.dims for dim in target.dims] + ), "Field and target dims do not match" + + return xr.apply_ufunc( + _pearsonr_nan, + field, + target, + input_core_dims=[[corr_dim], [corr_dim]], + vectorize=True, + output_core_dims=[[], []], + ) + + +def partial_correlation(field, target, z): + """Calculate partial correlation maps.""" + raise NotImplementedError + + +def regression(field, target): + """Regression analysis on entire maps. + + Methods include Linear, Ridge, Lasso. + """ + raise NotImplementedError diff --git a/setup.cfg b/setup.cfg index f932077..6db6c40 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,6 +32,7 @@ zip_safe = False include_package_data = True packages = find: install_requires = + netcdf4 numpy pandas scikit-learn diff --git a/tests/test_rgdr/test_data/sst_daily_1979-2018_5deg_Pacific_175_240E_25_50N.nc b/tests/test_rgdr/test_data/sst_daily_1979-2018_5deg_Pacific_175_240E_25_50N.nc new file mode 100644 index 0000000..9f02596 Binary files /dev/null and b/tests/test_rgdr/test_data/sst_daily_1979-2018_5deg_Pacific_175_240E_25_50N.nc differ diff --git a/tests/test_rgdr/test_data/tf5_nc5_dendo_80d77.nc b/tests/test_rgdr/test_data/tf5_nc5_dendo_80d77.nc new file mode 100644 index 0000000..36a6a2d Binary files /dev/null and b/tests/test_rgdr/test_data/tf5_nc5_dendo_80d77.nc differ diff --git a/tests/test_rgdr/test_map_analysis.py b/tests/test_rgdr/test_map_analysis.py deleted file mode 100644 index 9525dbd..0000000 --- a/tests/test_rgdr/test_map_analysis.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Tests for the s2s._RGDR.map_analysis module.""" -import numpy as np -import pandas as pd -import pytest -import xarray as xr -from s2spy.rgdr import _map_analysis - - -# pylint: disable=protected-access - - -class 
TestMapAnalysis: - @pytest.fixture(autouse=True) - def dummy_dataarray(self): - time = pd.date_range("20161001", "20211001", freq="60d") - - da = xr.DataArray( - np.tile(np.arange(len(time)), (2, 2, 1)), - dims=["lat", "lon", "time"], - coords={ - "time": time, - "lat": np.arange(0, 2), - "lon": np.arange(0, 2), - }, - ) - return da - - @pytest.fixture(autouse=True) - def dummy_timeseries(self, dummy_dataarray): - return dummy_dataarray.isel(lat=0, lon=0).drop_vars(["lat", "lon"]) - - def test_pearsonr(self): - result = _map_analysis._pearsonr_nan([0, 0, 1], [0, 1, 1]) - expected = (0.5, 0.66666667) - np.testing.assert_array_almost_equal(result, expected) - - def test_pearsonr_nan(self): - result = _map_analysis._pearsonr_nan([np.nan, 0, 1], [0, 1, 1]) - expected = (np.nan, np.nan) - np.testing.assert_array_almost_equal(result, expected) - - def test_correlation(self, dummy_dataarray, dummy_timeseries): - c_val, p_val = _map_analysis.correlation(dummy_dataarray, dummy_timeseries) - - np.testing.assert_equal(c_val.values, 1) - np.testing.assert_equal(p_val.values, 0) - - def test_correlation_dim_name(self, dummy_dataarray, dummy_timeseries): - da = dummy_dataarray.rename({"time": "i_interval"}) - ts = dummy_timeseries.rename({"time": "i_interval"}) - c_val, p_val = _map_analysis.correlation( - da, ts, corr_dim="i_interval" - ) - - np.testing.assert_equal(c_val.values, 1) - np.testing.assert_equal(p_val.values, 0) - - def test_correlation_wrong_target_dim_name(self, dummy_dataarray, dummy_timeseries): - ts = dummy_timeseries.rename({"time": "dummy"}) - with pytest.raises(AssertionError): - _map_analysis.correlation(dummy_dataarray, ts) - - def test_correlation_wrong_field_dim_name(self, dummy_dataarray, dummy_timeseries): - da = dummy_dataarray.rename({"time": "dummy"}) - with pytest.raises(AssertionError): - _map_analysis.correlation(da, dummy_timeseries) - - def test_correlation_wrong_target_dims(self, dummy_dataarray): - ts = dummy_dataarray.rename({'lat': 
'latitude'}) - with pytest.raises(AssertionError): - _map_analysis.correlation(dummy_dataarray, ts) - - def test_partial_correlation(self): - with pytest.raises(NotImplementedError): - _map_analysis.partial_correlation("field", "target", "z") - - def test_regression(self): - with pytest.raises(NotImplementedError): - _map_analysis.regression("field", "target") - - def test_save_map(self): - with pytest.raises(NotImplementedError): - _map_analysis.save_map() - - def test_load_map(self): - with pytest.raises(NotImplementedError): - _map_analysis.load_map() diff --git a/tests/test_rgdr/test_map_regions.py b/tests/test_rgdr/test_map_regions.py deleted file mode 100644 index d25252a..0000000 --- a/tests/test_rgdr/test_map_regions.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Tests for the s2s._RGDR.map_regions module.""" -import pytest -from s2spy.rgdr import _map_regions - - -# pylint: disable=protected-access - - -class TestMapRegions: - def test_spatial_mean(self): - with pytest.raises(NotImplementedError): - _map_regions.spatial_mean() - - def test_cluster_dbscan(self): - with pytest.raises(NotImplementedError): - _map_regions.cluster_dbscan() diff --git a/tests/test_rgdr/test_rgdr.py b/tests/test_rgdr/test_rgdr.py index 248b4c3..bf646ba 100644 --- a/tests/test_rgdr/test_rgdr.py +++ b/tests/test_rgdr/test_rgdr.py @@ -1,6 +1,11 @@ """Tests for the s2s._RGDR.rgdr module.""" +import numpy as np +import pandas as pd import pytest +import xarray as xr from s2spy import RGDR +from s2spy.rgdr import _rgdr +from s2spy.time import AdventCalendar # pylint: disable=protected-access @@ -8,21 +13,152 @@ class TestRGDR: """Test RGDR methods.""" + @pytest.fixture(autouse=True) def dummy_rgdr(self): - rgdr = RGDR('timeseries') - return rgdr + return RGDR("timeseries") def test_init(self): - rgdr = RGDR('timeseries') + rgdr = RGDR("timeseries") assert isinstance(rgdr, RGDR) - def test_lag_shifting(self, dummy_rgdr): - with pytest.raises(NotImplementedError): - dummy_rgdr.lag_shifting() - 
def test_fit(self, dummy_rgdr): with pytest.raises(NotImplementedError): dummy_rgdr.fit("data") - + def test_transform(self, dummy_rgdr): + with pytest.raises(NotImplementedError): + dummy_rgdr.transform("data") + + +class TestCorrelation: + @pytest.fixture(autouse=True) + def dummy_dataarray(self): + time = pd.date_range("20161001", "20211001", freq="60d") + + return xr.DataArray( + np.tile(np.arange(len(time)), (2, 2, 1)), + dims=["lat", "lon", "time"], + coords={ + "time": time, + "lat": np.arange(0, 2), + "lon": np.arange(0, 2), + }, + ) + + @pytest.fixture(autouse=True) + def dummy_timeseries(self, dummy_dataarray): + return dummy_dataarray.isel(lat=0, lon=0).drop_vars(["lat", "lon"]) + + def test_pearsonr(self): + result = _rgdr._pearsonr_nan([0, 0, 1], [0, 1, 1]) + expected = (0.5, 0.66666667) + np.testing.assert_array_almost_equal(result, expected) + + def test_pearsonr_nan(self): + result = _rgdr._pearsonr_nan([np.nan, 0, 1], [0, 1, 1]) + expected = (np.nan, np.nan) + np.testing.assert_array_almost_equal(result, expected) + + def test_correlation(self, dummy_dataarray, dummy_timeseries): + c_val, p_val = _rgdr.correlation(dummy_dataarray, dummy_timeseries) + + np.testing.assert_equal(c_val.values, 1) + np.testing.assert_equal(p_val.values, 0) + + def test_correlation_dim_name(self, dummy_dataarray, dummy_timeseries): + da = dummy_dataarray.rename({"time": "i_interval"}) + ts = dummy_timeseries.rename({"time": "i_interval"}) + c_val, p_val = _rgdr.correlation(da, ts, corr_dim="i_interval") + + np.testing.assert_equal(c_val.values, 1) + np.testing.assert_equal(p_val.values, 0) + + def test_correlation_wrong_target_dim_name(self, dummy_dataarray, dummy_timeseries): + ts = dummy_timeseries.rename({"time": "dummy"}) + with pytest.raises(AssertionError): + _rgdr.correlation(dummy_dataarray, ts) + + def test_correlation_wrong_field_dim_name(self, dummy_dataarray, dummy_timeseries): + da = dummy_dataarray.rename({"time": "dummy"}) + with 
pytest.raises(AssertionError): + _rgdr.correlation(da, dummy_timeseries) + + def test_correlation_wrong_target_dims(self, dummy_dataarray): + ts = dummy_dataarray.rename({"lat": "latitude"}) + with pytest.raises(AssertionError): + _rgdr.correlation(dummy_dataarray, ts) + + def test_partial_correlation(self): + with pytest.raises(NotImplementedError): + _rgdr.partial_correlation("field", "target", "z") + + def test_regression(self): + with pytest.raises(NotImplementedError): + _rgdr.regression("field", "target") + + +test_file_path = "./tests/test_rgdr/test_data" + + +class TestMapRegions: + @pytest.fixture(autouse=True) + def dummy_calendar(self): + return AdventCalendar(anchor_date=(8, 31), freq="30d") + + @pytest.fixture(autouse=True) + def example_target(self): + return xr.open_dataset(f"{test_file_path}/tf5_nc5_dendo_80d77.nc").sel( + cluster=3 + ) + + @pytest.fixture(autouse=True) + def example_field(self): + return xr.open_dataset( + f"{test_file_path}/sst_daily_1979-2018_5deg_Pacific_175_240E_25_50N.nc" + ) + + @pytest.fixture(autouse=True) + def example_corr(self, dummy_calendar, example_target, example_field): + field = dummy_calendar.resample(example_field) + target = dummy_calendar.resample(example_target) + + field["corr"], field["p_val"] = _rgdr.correlation( + field.sst, target.ts.sel(i_interval=0), corr_dim="anchor_year" + ) + + return field + + @pytest.fixture(autouse=True) + def expected_labels(self): + return np.array( + [ + [0.0, 0.0, 0.0, 0.0, -1.0, -1.0, 0.0, -2.0, -2.0, -2.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, -2.0, -2.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, -2.0, -2.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, -2.0, -2.0, -2.0, -2.0], + ] + ) + + def test_dbscan(self, example_corr, expected_labels): + clusters = _rgdr.masked_spherical_dbscan(example_corr.sel(i_interval=1)) + + 
np.testing.assert_array_equal(clusters["cluster_labels"], expected_labels) + + def test_dbscan_min_area(self, example_corr, expected_labels): + clusters = _rgdr.masked_spherical_dbscan( + example_corr.sel(i_interval=1), min_area_km2=3000**2 + ) + expected_labels[expected_labels == -1] = 0 # Small -1 cluster is missing + + np.testing.assert_array_equal(clusters["cluster_labels"], expected_labels) + + def test_cluster(self, example_corr): + clustered_data = _rgdr.reduce_to_clusters( + example_corr.sel(i_interval=1), eps_km=600, alpha=0.04 + ) + + np.testing.assert_array_equal(clustered_data["cluster_labels"], [-1, 0, 1]) + + assert set(clustered_data.sst.dims) == {"anchor_year", "cluster_labels"}