Skip to content

Commit

Permalink
Merge branch 'main' into merge_asof
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Dec 11, 2023
2 parents d8ccef3 + b4c9df8 commit 56101e3
Show file tree
Hide file tree
Showing 16 changed files with 210 additions and 52 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ Other enhancements

- :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
- :func:`get_dummies` now returns extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`)
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`)
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,7 +932,10 @@ def value_counts_internal(
idx = Index(keys)
if idx.dtype == bool and keys.dtype == object:
idx = idx.astype(object)
elif idx.dtype != keys.dtype:
elif (
idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714
and idx.dtype != "string[pyarrow_numpy]"
):
warnings.warn(
# GH#56161
"The behavior of value_counts with object-dtype is deprecated. "
Expand Down
24 changes: 23 additions & 1 deletion pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,14 @@
is_object_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
)

from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.arrays.string_ import StringDtype
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
Index,
Expand Down Expand Up @@ -244,8 +249,25 @@ def _get_dummies_1d(
# Series avoids inconsistent NaN handling
codes, levels = factorize_from_iterable(Series(data, copy=False))

if dtype is None:
if dtype is None and hasattr(data, "dtype"):
input_dtype = data.dtype
if isinstance(input_dtype, CategoricalDtype):
input_dtype = input_dtype.categories.dtype

if isinstance(input_dtype, ArrowDtype):
import pyarrow as pa

dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment]
elif (
isinstance(input_dtype, StringDtype)
and input_dtype.storage != "pyarrow_numpy"
):
dtype = pandas_dtype("boolean") # type: ignore[assignment]
else:
dtype = np.dtype(bool)
elif dtype is None:
dtype = np.dtype(bool)

_dtype = pandas_dtype(dtype)

if is_object_dtype(_dtype):
Expand Down
16 changes: 13 additions & 3 deletions pandas/tests/arrays/boolean/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname):
# -----------------------------------------------------------------------------


def test_error_invalid_values(data, all_arithmetic_operators):
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
# invalid ops

if using_infer_string:
import pyarrow as pa

err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
err = TypeError

op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
Expand All @@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
[
r"unsupported operand type\(s\) for",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(err, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
Expand All @@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators):
r"unsupported operand type\(s\) for",
"can only concatenate str",
"not all arguments converted during string formatting",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(err, match=msg):
ops(pd.Series("foo", index=s.index))
2 changes: 1 addition & 1 deletion pandas/tests/arrays/categorical/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_astype(self, ordered):
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)

msg = r"Cannot cast object dtype to float64"
msg = r"Cannot cast object|string dtype to float64"
with pytest.raises(ValueError, match=msg):
cat.astype(float)

Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -447,6 +449,7 @@ def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")

@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
def test_constructor_np_strs(self):
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/arrays/categorical/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_comparisons(self, factor):
cat > cat_unordered

# comparison (in both directions) with Series will raise
s = Series(["b", "b", "b"])
s = Series(["b", "b", "b"], dtype=object)
msg = (
"Cannot compare a Categorical for op __gt__ with type "
r"<class 'numpy\.ndarray'>"
Expand All @@ -108,7 +108,7 @@ def test_comparisons(self, factor):

# comparison with numpy.array will raise in both direction, but only on
# newer numpy versions
a = np.array(["b", "b", "b"])
a = np.array(["b", "b", "b"], dtype=object)
with pytest.raises(TypeError, match=msg):
cat > a
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base):
cat_base = Series(
Categorical(base, categories=cat.cat.categories, ordered=True)
)
s = Series(base)
s = Series(base, dtype=object if base == list("bbb") else None)
a = np.array(base)

# comparisons need to take categories ordering into account
Expand Down
30 changes: 22 additions & 8 deletions pandas/tests/arrays/categorical/test_repr.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
Index,
Series,
date_range,
option_context,
Expand All @@ -13,11 +17,17 @@


class TestCategoricalReprWithFactor:
def test_print(self, factor):
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
def test_print(self, factor, using_infer_string):
if using_infer_string:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, string): [a < b < c]",
]
else:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
Expand All @@ -26,7 +36,7 @@ def test_print(self, factor):
class TestCategoricalRepr:
def test_big_print(self):
codes = np.array([0, 1, 2, 0, 1, 2] * 100)
dtype = CategoricalDtype(categories=["a", "b", "c"])
dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
factor = Categorical.from_codes(codes, dtype=dtype)
expected = [
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
Expand All @@ -40,13 +50,13 @@ def test_big_print(self):
assert actual == expected

def test_empty_print(self):
factor = Categorical([], ["a", "b", "c"])
factor = Categorical([], Index(["a", "b", "c"], dtype=object))
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected

assert expected == actual
factor = Categorical([], ["a", "b", "c"], ordered=True)
factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual
Expand All @@ -66,6 +76,10 @@ def test_print_none_width(self):
with option_context("display.width", None):
assert exp == repr(a)

@pytest.mark.skipif(
using_pyarrow_string_dtype(),
reason="Change once infer_string is set to True by default",
)
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
Expand Down
21 changes: 16 additions & 5 deletions pandas/tests/arrays/floating/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------


def test_error_invalid_values(data, all_arithmetic_operators):
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)

if using_infer_string:
import pyarrow as pa

errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
errs = TypeError

# invalid scalars
msg = "|".join(
[
Expand All @@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators):
"ufunc '.*' not supported for the input types, and the inputs could not",
"ufunc '.*' did not contain a loop with signature matching types",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops("foo")
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Series("foo", index=s.index))

msg = "|".join(
Expand All @@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
),
r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))


Expand Down
33 changes: 24 additions & 9 deletions pandas/tests/arrays/integer/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------


def test_error_invalid_values(data, all_arithmetic_operators):
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)

if using_infer_string:
import pyarrow as pa

errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
errs = TypeError

# invalid scalars
msg = "|".join(
[
Expand All @@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators):
"ufunc '.*' not supported for the input types, and the inputs could not",
"ufunc '.*' did not contain a loop with signature matching types",
"Addition/subtraction of integers and integer-arrays with Timestamp",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops("foo")
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
str_ser = pd.Series("foo", index=s.index)
# with pytest.raises(TypeError, match=msg):
if all_arithmetic_operators in [
"__mul__",
"__rmul__",
]: # (data[~data.isna()] >= 0).all():
if (
all_arithmetic_operators
in [
"__mul__",
"__rmul__",
]
and not using_infer_string
): # (data[~data.isna()] >= 0).all():
res = ops(str_ser)
expected = pd.Series(["foo" * x for x in data], index=s.index)
expected = expected.fillna(np.nan)
Expand All @@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
# more-correct than np.nan here.
tm.assert_series_equal(res, expected)
else:
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(str_ser)

msg = "|".join(
Expand All @@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
r"can only concatenate str \(not \"int\"\) to str",
"not all arguments converted during string",
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/arrays/integer/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected):
["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
],
)
def test_mixed_reductions(op, expected):
def test_mixed_reductions(op, expected, using_infer_string):
if op in ["any", "all"] and using_infer_string:
expected = expected.astype("bool")
df = DataFrame(
{
"A": ["a", "b", "b"],
Expand Down
Loading

0 comments on commit 56101e3

Please sign in to comment.