Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: cut with non-nano #56101

Merged
merged 12 commits into from
Dec 1, 2023
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -595,13 +595,15 @@ Styler
Other
^^^^^
- Bug in :func:`DataFrame.describe` where the percentile 99.999% was rounded to 100% when formatting percentiles in the result (:issue:`55765`)
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
- Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`)
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
- Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame` (:issue:`55683`)
- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
- Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`)
- Bug in the error message when assigning an empty dataframe to a column (:issue:`55956`)
-

.. ***DO NOT USE THIS SECTION***

Expand Down
55 changes: 29 additions & 26 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,9 @@
Categorical,
Index,
IntervalIndex,
to_datetime,
to_timedelta,
)
import pandas.core.algorithms as algos
from pandas.core.arrays.datetimelike import dtype_to_unit

if TYPE_CHECKING:
from pandas._typing import (
Expand Down Expand Up @@ -364,38 +363,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
rng = (x_idx.min(), x_idx.max())
mn, mx = rng

is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
x_idx.dtype, DatetimeTZDtype
)

if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
# GH#24314
raise ValueError(
"cannot specify integer `bins` when input data contains infinity"
)

if mn == mx: # adjust end points before binning
if is_dt_or_td:
if _is_dt_or_td(x_idx.dtype):
# using seconds=1 is pretty arbitrary here
td = Timedelta(seconds=1)
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
td = Timedelta(seconds=1).as_unit(unit)
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn - td, end=mx + td, periods=nbins + 1, freq=None
start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit
)
else:
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001

bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
else: # adjust end points after binning
if is_dt_or_td:
if _is_dt_or_td(x_idx.dtype):
# Use DatetimeArray/TimedeltaArray method instead of linspace

# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn, end=mx, periods=nbins + 1, freq=None
start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit
)
else:
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
Expand Down Expand Up @@ -519,14 +521,8 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
"""
dtype: DtypeObj | None = None

if isinstance(x.dtype, DatetimeTZDtype):
if _is_dt_or_td(x.dtype):
dtype = x.dtype
elif lib.is_np_dtype(x.dtype, "M"):
x = to_datetime(x).astype("datetime64[ns]", copy=False)
dtype = np.dtype("datetime64[ns]")
elif lib.is_np_dtype(x.dtype, "m"):
x = to_timedelta(x)
dtype = np.dtype("timedelta64[ns]")
elif is_bool_dtype(x.dtype):
# GH 20303
x = x.astype(np.int64)
Expand All @@ -541,6 +537,12 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
return Index(x), dtype


def _is_dt_or_td(dtype: DtypeObj) -> bool:
# Return True if ``dtype`` is a DatetimeTZDtype or a NumPy datetime64/timedelta64
# dtype (NumPy kind "M" or "m").
# Note: the dtype here comes from an Index.dtype, so we know that any
# dt64/td64 dtype is of a supported unit.
return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM")


def _format_labels(
bins: Index,
precision: int,
Expand All @@ -552,15 +554,12 @@ def _format_labels(

formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

if isinstance(bins.dtype, DatetimeTZDtype):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(bins.dtype, "M"):
if _is_dt_or_td(bins.dtype):
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(bins.dtype) # type: ignore[arg-type]
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(bins.dtype, "m"):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit)
else:
precision = _infer_precision(precision, bins)
formatter = lambda x: _round_frac(x, precision)
Expand All @@ -571,6 +570,10 @@ def _format_labels(
# adjust lhs of first interval by precision to account for being right closed
breaks[0] = adjust(breaks[0])

if _is_dt_or_td(bins.dtype):
# error: "Index" has no attribute "as_unit"
breaks = type(bins)(breaks).as_unit(unit) # type: ignore[attr-defined]

return IntervalIndex.from_breaks(breaks, closed=closed)


Expand Down
92 changes: 52 additions & 40 deletions pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,46 +452,42 @@ def test_datetime_bin(conv):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"data",
[
to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
[
np.datetime64("2013-01-01"),
np.datetime64("2013-01-02"),
np.datetime64("2013-01-03"),
],
np.array(
[
np.datetime64("2013-01-01"),
np.datetime64("2013-01-02"),
np.datetime64("2013-01-03"),
]
),
DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]),
],
)
def test_datetime_cut(data):
@pytest.mark.parametrize("box", [Series, Index, np.array, list])
def test_datetime_cut(unit, box):
# see gh-14714
#
# Testing time data when it comes in various collection types.
data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
data = box(data)
result, _ = cut(data, 3, retbins=True)
expected = Series(
IntervalIndex(

if box is list:
# We don't (yet) do inference on these, so get nanos
unit = "ns"

if unit == "s":
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
# for why we round to 8 seconds instead of 7
left = DatetimeIndex(
["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
dtype=f"M8[{unit}]",
)
else:
left = DatetimeIndex(
[
Interval(
Timestamp("2012-12-31 23:57:07.200000"),
Timestamp("2013-01-01 16:00:00"),
),
Interval(
Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00")
),
Interval(
Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00")
),
]
"2012-12-31 23:57:07.200000",
"2013-01-01 16:00:00",
"2013-01-02 08:00:00",
],
dtype=f"M8[{unit}]",
)
).astype(CategoricalDtype(ordered=True))
right = DatetimeIndex(
["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
dtype=f"M8[{unit}]",
)

exp_intervals = IntervalIndex.from_arrays(left, right)
expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(Series(result), expected)


Expand Down Expand Up @@ -576,17 +572,33 @@ def test_datetime_nan_mask():


@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_datetime_cut_roundtrip(tz):
def test_datetime_cut_roundtrip(tz, unit):
# see gh-19891
ser = Series(date_range("20180101", periods=3, tz=tz))
ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
result, result_bins = cut(ser, 2, retbins=True)

expected = cut(ser, result_bins)
tm.assert_series_equal(result, expected)

expected_bins = DatetimeIndex(
["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"]
)
if unit == "s":
# TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
# the first entry here raises in array_to_datetime. Should truncate
# instead of raising?
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
# for why we round to 8 seconds instead of 7
expected_bins = DatetimeIndex(
["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
dtype=f"M8[{unit}]",
)
else:
expected_bins = DatetimeIndex(
[
"2017-12-31 23:57:07.200000",
"2018-01-02 00:00:00",
"2018-01-03 00:00:00",
],
dtype=f"M8[{unit}]",
)
expected_bins = expected_bins.tz_localize(tz)
tm.assert_index_equal(result_bins, expected_bins)

Expand Down Expand Up @@ -759,7 +771,7 @@ def test_cut_bins_datetime_intervalindex():
# https://github.com/pandas-dev/pandas/issues/46218
bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
# passing Series instead of list is important to trigger bug
result = cut(Series([Timestamp("2022-02-26")]), bins=bins)
result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins)
expected = Categorical.from_codes([0], bins, ordered=True)
tm.assert_categorical_equal(result.array, expected)

Expand Down
17 changes: 9 additions & 8 deletions pandas/tests/reshape/test_qcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
IntervalIndex,
NaT,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
cut,
Expand All @@ -22,10 +23,7 @@
import pandas._testing as tm
from pandas.api.types import CategoricalDtype

from pandas.tseries.offsets import (
Day,
Nano,
)
from pandas.tseries.offsets import Day


def test_qcut():
Expand Down Expand Up @@ -216,11 +214,14 @@ def test_single_quantile(data, start, end, length, labels):
],
ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser):
def test_qcut_nat(ser, unit):
# see gh-19768
intervals = IntervalIndex.from_tuples(
[(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])]
)
ser = ser.dt.as_unit(unit)
td = Timedelta(1, unit=unit).as_unit(unit)

left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
intervals = IntervalIndex.from_arrays(left, right)
expected = Series(Categorical(intervals, ordered=True))

result = qcut(ser, 2)
Expand Down
Loading