From 6eefa20fb7a2b4ce0f3f4bded03535919bd266ef Mon Sep 17 00:00:00 2001 From: Michele Pau Date: Wed, 4 Dec 2024 13:47:05 +0000 Subject: [PATCH] added support for ordered categoricals in kendall and spearman correlation --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 24 +++++++ pandas/core/series.py | 6 ++ pandas/tests/frame/methods/test_cov_corr.py | 29 ++++++++ pandas/tests/series/methods/test_cov_corr.py | 76 +++++++++++++++----- 5 files changed, 117 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bb9f48d17b2e1..adbcaf8343cc5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -56,6 +56,7 @@ Other enhancements - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) +- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` 
parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 33a419925f70c..8d6a7f1d76cf3 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -11034,6 +11034,10 @@ def corr(
         data = self._get_numeric_data() if numeric_only else self
         cols = data.columns
         idx = cols.copy()
+
+        if method in ("spearman", "kendall"):
+            data = data._convert_ordered_cat_to_code()
+
         mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
 
         if method == "pearson":
@@ -11321,6 +11325,8 @@ def corrwith(
             correl = num / dom
 
         elif method in ["kendall", "spearman"] or callable(method):
+            left = left._convert_ordered_cat_to_code()
+            right = right._convert_ordered_cat_to_code()
 
             def c(x):
                 return nanops.nancorr(x[0], x[1], method=method)
@@ -11352,6 +11358,24 @@ def c(x):
 
         return correl
 
+    def _convert_ordered_cat_to_code(self) -> DataFrame:
+        """
+        Return a frame whose ordered categorical columns are replaced by their
+        integer codes (code -1, i.e. missing, becomes NaN); otherwise return self.
+        """
+        categ = self.select_dtypes("category")
+        if len(categ.columns) == 0:
+            return self
+
+        cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns
+        if len(cols_convert) == 0:
+            return self  # only unordered categoricals: nothing to convert
+        data = self.copy(deep=False)
+        data[cols_convert] = data[cols_convert].transform(
+            lambda x: x.cat.codes.replace(-1, np.nan)
+        )
+        return data
+
     # ----------------------------------------------------------------------
     # ndarray-like stats methods
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4fa8b86fa4c16..f991b78e455f0 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2680,6 +2680,12 @@ def corr(
         if len(this) == 0:
             return np.nan
 
+        if method in ("spearman", "kendall"):
+            if this.dtype == "category" and this.cat.ordered:
+                this = this.cat.codes.replace(-1, np.nan)
+            if other.dtype == "category" and other.cat.ordered:
+                other = other.cat.codes.replace(-1, np.nan)
+
         this_values = this.to_numpy(dtype=float,
na_value=np.nan, copy=False) other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index c15952339ef18..b96035cab54a1 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -7,6 +7,7 @@ import pandas as pd from pandas import ( + Categorical, DataFrame, Index, Series, @@ -16,6 +17,19 @@ import pandas._testing as tm +@pytest.fixture +def categorical_frame(): + frame = DataFrame( + { + "A": Categorical(list("abcde") * 6, list("bacde"), ordered=True), + "B": Categorical(list("123") * 10, list("321"), ordered=True), + } + ) + frame.loc[frame.index[:5], "A"] = np.nan + frame.loc[frame.index[3:6], "B"] = np.nan + return frame + + class TestDataFrameCov: def test_cov(self, float_frame, float_string_frame): # min_periods no NAs (corner case) @@ -116,6 +130,13 @@ def test_corr_scipy_method(self, float_frame, method): expected = float_frame["A"].corr(float_frame["C"], method=method) tm.assert_almost_equal(correls["A"]["C"], expected) + @pytest.mark.parametrize("method", ["kendall", "spearman"]) + def test_corr_scipy_method_category(self, method, categorical_frame): + pytest.importorskip("scipy") + correls = categorical_frame.corr(method=method) + expected = categorical_frame["A"].corr(categorical_frame["B"], method=method) + tm.assert_almost_equal(correls["A"]["B"], expected) + # --------------------------------------------------------------------- def test_corr_non_numeric(self, float_string_frame): @@ -303,6 +324,14 @@ def test_corrwith(self, datetime_frame, dtype): dropped = a.corrwith(b, axis=1, drop=True) assert a.index[-1] not in dropped.index + @pytest.mark.parametrize("method", ["spearman", "kendall"]) + def test_corrwith_categorical(self, categorical_frame, method): + other = categorical_frame["B"] + result = categorical_frame.corrwith(other, method=method) + expected = 
categorical_frame.agg(lambda x: x.corr(other, method=method)) + tm.assert_almost_equal(result["A"], expected["A"]) + tm.assert_almost_equal(result["B"], expected["B"]) + def test_corrwith_non_timeseries_data(self): index = ["a", "b", "c", "d", "e"] columns = ["one", "two", "three", "four"] diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 7a4d48fb76940..dda4f7500f7ca 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import ( + Categorical, Series, date_range, isna, @@ -12,6 +13,35 @@ import pandas._testing as tm +@pytest.fixture +def A(): + return Series( + np.concatenate([np.arange(5, dtype=np.float64)] * 2), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + + +@pytest.fixture +def B(): + return Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + + +@pytest.fixture +def C(): + s = Series( + data=Categorical(list("12345") * 2, categories=list("54321"), ordered=True), + index=date_range("2020-01-01", periods=10), + name="categorical", + ) + s["2020-01-03"] = np.nan + return s + + class TestSeriesCov: def test_cov(self, datetime_series): # full overlap @@ -56,7 +86,7 @@ def test_cov_ddof(self, test_ddof, dtype): class TestSeriesCorr: - def test_corr(self, datetime_series, any_float_dtype): + def test_corr(self, B, datetime_series, any_float_dtype): stats = pytest.importorskip("scipy.stats") datetime_series = datetime_series.astype(any_float_dtype) @@ -81,29 +111,14 @@ def test_corr(self, datetime_series, any_float_dtype): cp[:] = np.nan assert isna(cp.corr(cp)) - A = Series( - np.arange(10, dtype=np.float64), - index=date_range("2020-01-01", periods=10), - name="ts", - ) - result = A.corr(A) - expected, _ = stats.pearsonr(A, A) + result = B.corr(B) + expected, _ = stats.pearsonr(B, B) tm.assert_almost_equal(result, expected) - def 
test_corr_rank(self):
+    def test_corr_rank(self, A, B):
         stats = pytest.importorskip("scipy.stats")
 
         # kendall and spearman
-        B = Series(
-            np.arange(10, dtype=np.float64),
-            index=date_range("2020-01-01", periods=10),
-            name="ts",
-        )
-        A = Series(
-            np.concatenate([np.arange(5, dtype=np.float64)] * 2),
-            index=date_range("2020-01-01", periods=10),
-            name="ts",
-        )
         result = A.corr(B, method="kendall")
         expected = stats.kendalltau(A, B)[0]
         tm.assert_almost_equal(result, expected)
@@ -146,6 +161,29 @@ def test_corr_rank(self):
         tm.assert_almost_equal(A.corr(B, method="kendall"), kexp)
         tm.assert_almost_equal(A.corr(B, method="spearman"), sexp)
 
+    def test_corr_category(self, A, C):
+        stats = pytest.importorskip("scipy.stats")
+
+        def get_codes(s: Series) -> Series:
+            return s.cat.codes.replace(-1, np.nan)  # code -1 marks a missing value
+
+        result = A.corr(C, method="pearson")
+        expected = stats.pearsonr(A[C.notna()], C.dropna().astype("float"))[0]
+        tm.assert_almost_equal(result, expected)
+        tm.assert_almost_equal(result, 1)
+
+        result = A.corr(C, method="spearman")
+        expected = stats.spearmanr(A, get_codes(C), nan_policy="omit")[0]
+        expected_pearson = stats.pearsonr(A[C.notna()], get_codes(C).dropna())[0]
+
+        tm.assert_almost_equal(result, expected)
+        tm.assert_almost_equal(result, expected_pearson)
+        tm.assert_almost_equal(result, -1)
+
+        result = A.corr(C, method="kendall")
+        expected = stats.kendalltau(A, get_codes(C), nan_policy="omit")[0]
+        tm.assert_almost_equal(result, expected)
+
     def test_corr_invalid_method(self):
         # GH PR #22298
         s1 = Series(np.random.default_rng(2).standard_normal(10))