Skip to content

Commit

Permalink
added support for ordered categoricals in kendall and spearman correl…
Browse files Browse the repository at this point in the history
…ation
  • Loading branch information
Michele Pau committed Dec 4, 2024
1 parent cfd0d3f commit 6eefa20
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 19 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Other enhancements
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
Expand Down
24 changes: 24 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11034,6 +11034,10 @@ def corr(
data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()

if method in ("spearman", "kendall"):
data = data._convert_ordered_cat_to_code()

mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
Expand Down Expand Up @@ -11321,6 +11325,8 @@ def corrwith(
correl = num / dom

elif method in ["kendall", "spearman"] or callable(method):
left = left._convert_ordered_cat_to_code()
right = right._convert_ordered_cat_to_code()

def c(x):
return nanops.nancorr(x[0], x[1], method=method)
Expand Down Expand Up @@ -11352,6 +11358,24 @@ def c(x):

return correl

def _convert_ordered_cat_to_code(self) -> DataFrame:
"""
Converts all category columns to their codes wherever possible
(i.e. wherever they are ordered) otherwise leaves shape unchanged
"""
categ = self.select_dtypes("category")
if len(categ.columns) == 0:
return self

cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns
if len(cols_convert) > 0:
data = self.copy(deep=False)
data[cols_convert] = data[cols_convert].transform(
lambda x: x.cat.codes.replace(-1, np.nan)
)

return data

# ----------------------------------------------------------------------
# ndarray-like stats methods

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2680,6 +2680,12 @@ def corr(
if len(this) == 0:
return np.nan

if method in ("spearman", "kendall"):
if this.dtype == "category" and this.cat.ordered:
this = this.cat.codes.replace(-1, np.nan)
if other.dtype == "category" and other.cat.ordered:
other = other.cat.codes.replace(-1, np.nan)

this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
Expand All @@ -16,6 +17,19 @@
import pandas._testing as tm


@pytest.fixture
def categorical_frame():
    """
    Build a 30-row frame with two ordered categorical columns.

    Column "A" cycles through the letters a-e and column "B" through the
    digits 1-3, each with an explicit category order. A few leading rows
    are set to missing in each column.
    """
    letters = Categorical(list("abcde") * 6, list("bacde"), ordered=True)
    digits = Categorical(list("123") * 10, list("321"), ordered=True)
    frame = DataFrame({"A": letters, "B": digits})

    # Missing values: first five rows of "A", rows 3-5 of "B".
    rows = frame.index
    frame.loc[rows[:5], "A"] = np.nan
    frame.loc[rows[3:6], "B"] = np.nan
    return frame


class TestDataFrameCov:
def test_cov(self, float_frame, float_string_frame):
# min_periods no NAs (corner case)
Expand Down Expand Up @@ -116,6 +130,13 @@ def test_corr_scipy_method(self, float_frame, method):
expected = float_frame["A"].corr(float_frame["C"], method=method)
tm.assert_almost_equal(correls["A"]["C"], expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_scipy_method_category(self, method, categorical_frame):
    """Frame-level rank correlation of A and B matches the Series-level one."""
    pytest.importorskip("scipy")
    col_a = categorical_frame["A"]
    col_b = categorical_frame["B"]
    pairwise = col_a.corr(col_b, method=method)
    matrix = categorical_frame.corr(method=method)
    tm.assert_almost_equal(matrix["A"]["B"], pairwise)

# ---------------------------------------------------------------------

def test_corr_non_numeric(self, float_string_frame):
Expand Down Expand Up @@ -303,6 +324,14 @@ def test_corrwith(self, datetime_frame, dtype):
dropped = a.corrwith(b, axis=1, drop=True)
assert a.index[-1] not in dropped.index

@pytest.mark.parametrize("method", ["spearman", "kendall"])
def test_corrwith_categorical(self, categorical_frame, method):
    """corrwith against column B agrees with per-column Series.corr."""
    other = categorical_frame["B"]
    result = categorical_frame.corrwith(other, method=method)
    expected = categorical_frame.agg(lambda x: x.corr(other, method=method))
    for column in ("A", "B"):
        tm.assert_almost_equal(result[column], expected[column])

def test_corrwith_non_timeseries_data(self):
index = ["a", "b", "c", "d", "e"]
columns = ["one", "two", "three", "four"]
Expand Down
76 changes: 57 additions & 19 deletions pandas/tests/series/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,43 @@

import pandas as pd
from pandas import (
Categorical,
Series,
date_range,
isna,
)
import pandas._testing as tm


@pytest.fixture
def A():
    """Float series repeating 0..4 twice over ten daily timestamps."""
    cycle = np.arange(5, dtype=np.float64)
    values = np.concatenate([cycle, cycle])
    days = date_range("2020-01-01", periods=10)
    return Series(values, index=days, name="ts")


@pytest.fixture
def B():
    """Float series counting 0..9 over ten daily timestamps."""
    values = np.arange(10, dtype=np.float64)
    days = date_range("2020-01-01", periods=10)
    return Series(values, index=days, name="ts")


@pytest.fixture
def C():
    """
    Ordered categorical series of the labels "1".."5" repeated twice.

    The category order is reversed ("5" lowest, "1" highest) and the entry
    for 2020-01-03 is set to missing.
    """
    labels = Categorical(list("12345") * 2, categories=list("54321"), ordered=True)
    days = date_range("2020-01-01", periods=10)
    ser = Series(data=labels, index=days, name="categorical")
    ser["2020-01-03"] = np.nan
    return ser


class TestSeriesCov:
def test_cov(self, datetime_series):
# full overlap
Expand Down Expand Up @@ -56,7 +86,7 @@ def test_cov_ddof(self, test_ddof, dtype):


class TestSeriesCorr:
def test_corr(self, datetime_series, any_float_dtype):
def test_corr(self, B, datetime_series, any_float_dtype):
stats = pytest.importorskip("scipy.stats")

datetime_series = datetime_series.astype(any_float_dtype)
Expand All @@ -81,29 +111,14 @@ def test_corr(self, datetime_series, any_float_dtype):
cp[:] = np.nan
assert isna(cp.corr(cp))

A = Series(
np.arange(10, dtype=np.float64),
index=date_range("2020-01-01", periods=10),
name="ts",
)
result = A.corr(A)
expected, _ = stats.pearsonr(A, A)
result = B.corr(B)
expected, _ = stats.pearsonr(B, B)
tm.assert_almost_equal(result, expected)

def test_corr_rank(self):
def test_corr_rank(self, A, B):
stats = pytest.importorskip("scipy.stats")

# kendall and spearman
B = Series(
np.arange(10, dtype=np.float64),
index=date_range("2020-01-01", periods=10),
name="ts",
)
A = Series(
np.concatenate([np.arange(5, dtype=np.float64)] * 2),
index=date_range("2020-01-01", periods=10),
name="ts",
)
result = A.corr(B, method="kendall")
expected = stats.kendalltau(A, B)[0]
tm.assert_almost_equal(result, expected)
Expand Down Expand Up @@ -146,6 +161,29 @@ def test_corr_rank(self):
tm.assert_almost_equal(A.corr(B, method="kendall"), kexp)
tm.assert_almost_equal(A.corr(B, method="spearman"), sexp)

def test_corr_category(self, A, C):
    """
    Check Series.corr between a float series and an ordered categorical.

    Pearson is compared with scipy on the categorical cast to float.
    Spearman and Kendall are compared with scipy run on the category
    codes, with missing entries omitted.
    """
    stats = pytest.importorskip("scipy.stats")

    def get_codes(s: Series) -> Series:
        # Use the argument rather than closing over ``C`` so the helper
        # converts whichever series it is given. -1 is the code for
        # missing values and becomes NaN.
        return s.cat.codes.replace(-1, np.nan)

    result = A.corr(C, method="pearson")
    expected = stats.pearsonr(A[C.notna()], C.dropna().astype("float"))[0]
    tm.assert_almost_equal(result, expected)
    tm.assert_almost_equal(result, 1)

    result = A.corr(C, method="spearman")
    expected = stats.spearmanr(A, get_codes(C), nan_policy="omit")[0]
    expected_pearson = stats.pearsonr(A[C.notna()], get_codes(C).dropna())[0]

    tm.assert_almost_equal(result, expected)
    tm.assert_almost_equal(result, expected_pearson)
    # The category order is the reverse of the label order, hence -1.
    tm.assert_almost_equal(result, -1)

    result = A.corr(C, method="kendall")
    expected = stats.kendalltau(A, get_codes(C), nan_policy="omit")[0]
    tm.assert_almost_equal(result, expected)

def test_corr_invalid_method(self):
# GH PR #22298
s1 = Series(np.random.default_rng(2).standard_normal(10))
Expand Down

0 comments on commit 6eefa20

Please sign in to comment.