Skip to content

Commit

Permalink
BUG: cut with non-nano (#56101)
Browse files Browse the repository at this point in the history
* BUG: IntervalIndex.factorize with non-nano

* GH ref

* BUG: cut with non-nano

* GH ref

* mypy fixup

* mypy fixup

* Update comment

* simplify
  • Loading branch information
jbrockmendel authored Dec 1, 2023
1 parent 147d68a commit 65af776
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 74 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -598,13 +598,15 @@ Styler
Other
^^^^^
- Bug in :func:`DataFrame.describe` where, when formatting percentiles in the result, the percentile 99.999% was rounded to 100% (:issue:`55765`)
- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
- Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`)
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
- Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame` (:issue:`55683`)
- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
- Bug in rendering a :class:`Series` with a :class:`MultiIndex` where a level whose name is 0 did not have that name displayed (:issue:`55415`)
- Bug in the error message when assigning an empty dataframe to a column (:issue:`55956`)
-

.. ***DO NOT USE THIS SECTION***
Expand Down
55 changes: 29 additions & 26 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,9 @@
Categorical,
Index,
IntervalIndex,
to_datetime,
to_timedelta,
)
import pandas.core.algorithms as algos
from pandas.core.arrays.datetimelike import dtype_to_unit

if TYPE_CHECKING:
from pandas._typing import (
Expand Down Expand Up @@ -364,38 +363,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
rng = (x_idx.min(), x_idx.max())
mn, mx = rng

is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance(
x_idx.dtype, DatetimeTZDtype
)

if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)):
# GH#24314
raise ValueError(
"cannot specify integer `bins` when input data contains infinity"
)

if mn == mx: # adjust end points before binning
if is_dt_or_td:
if _is_dt_or_td(x_idx.dtype):
# using seconds=1 is pretty arbitrary here
td = Timedelta(seconds=1)
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
td = Timedelta(seconds=1).as_unit(unit)
# Use DatetimeArray/TimedeltaArray method instead of linspace
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn - td, end=mx + td, periods=nbins + 1, freq=None
start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit
)
else:
mn -= 0.001 * abs(mn) if mn != 0 else 0.001
mx += 0.001 * abs(mx) if mx != 0 else 0.001

bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
else: # adjust end points after binning
if is_dt_or_td:
if _is_dt_or_td(x_idx.dtype):
# Use DatetimeArray/TimedeltaArray method instead of linspace

# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type]
# error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
# has no attribute "_generate_range"
bins = x_idx._values._generate_range( # type: ignore[union-attr]
start=mn, end=mx, periods=nbins + 1, freq=None
start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit
)
else:
bins = np.linspace(mn, mx, nbins + 1, endpoint=True)
Expand Down Expand Up @@ -519,14 +521,8 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
"""
dtype: DtypeObj | None = None

if isinstance(x.dtype, DatetimeTZDtype):
if _is_dt_or_td(x.dtype):
dtype = x.dtype
elif lib.is_np_dtype(x.dtype, "M"):
x = to_datetime(x).astype("datetime64[ns]", copy=False)
dtype = np.dtype("datetime64[ns]")
elif lib.is_np_dtype(x.dtype, "m"):
x = to_timedelta(x)
dtype = np.dtype("timedelta64[ns]")
elif is_bool_dtype(x.dtype):
# GH 20303
x = x.astype(np.int64)
Expand All @@ -541,6 +537,12 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
return Index(x), dtype


def _is_dt_or_td(dtype: DtypeObj) -> bool:
    """
    Check whether ``dtype`` is a datetime-like or timedelta-like dtype.

    Parameters
    ----------
    dtype : DtypeObj
        The dtype to inspect.

    Returns
    -------
    bool
        True if ``dtype`` is a :class:`DatetimeTZDtype` or if
        ``lib.is_np_dtype(dtype, "mM")`` holds (NumPy dt64/td64 kinds),
        otherwise False.
    """
    # Note: the dtype here comes from an Index.dtype, so we know that any
    # dt64/td64 dtype is of a supported unit.
    return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM")


def _format_labels(
bins: Index,
precision: int,
Expand All @@ -552,15 +554,12 @@ def _format_labels(

formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]

if isinstance(bins.dtype, DatetimeTZDtype):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(bins.dtype, "M"):
if _is_dt_or_td(bins.dtype):
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]"
unit = dtype_to_unit(bins.dtype) # type: ignore[arg-type]
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
elif lib.is_np_dtype(bins.dtype, "m"):
formatter = lambda x: x
adjust = lambda x: x - Timedelta("1ns")
adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit)
else:
precision = _infer_precision(precision, bins)
formatter = lambda x: _round_frac(x, precision)
Expand All @@ -571,6 +570,10 @@ def _format_labels(
# adjust lhs of first interval by precision to account for being right closed
breaks[0] = adjust(breaks[0])

if _is_dt_or_td(bins.dtype):
# error: "Index" has no attribute "as_unit"
breaks = type(bins)(breaks).as_unit(unit) # type: ignore[attr-defined]

return IntervalIndex.from_breaks(breaks, closed=closed)


Expand Down
92 changes: 52 additions & 40 deletions pandas/tests/reshape/test_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,46 +452,42 @@ def test_datetime_bin(conv):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"data",
[
to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])),
[
np.datetime64("2013-01-01"),
np.datetime64("2013-01-02"),
np.datetime64("2013-01-03"),
],
np.array(
[
np.datetime64("2013-01-01"),
np.datetime64("2013-01-02"),
np.datetime64("2013-01-03"),
]
),
DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]),
],
)
def test_datetime_cut(data):
@pytest.mark.parametrize("box", [Series, Index, np.array, list])
def test_datetime_cut(unit, box):
# see gh-14714
#
# Testing time data when it comes in various collection types.
data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
data = box(data)
result, _ = cut(data, 3, retbins=True)
expected = Series(
IntervalIndex(

if box is list:
# We don't (yet) do inference on these, so get nanos
unit = "ns"

if unit == "s":
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
# for why we round to 8 seconds instead of 7
left = DatetimeIndex(
["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
dtype=f"M8[{unit}]",
)
else:
left = DatetimeIndex(
[
Interval(
Timestamp("2012-12-31 23:57:07.200000"),
Timestamp("2013-01-01 16:00:00"),
),
Interval(
Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00")
),
Interval(
Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00")
),
]
"2012-12-31 23:57:07.200000",
"2013-01-01 16:00:00",
"2013-01-02 08:00:00",
],
dtype=f"M8[{unit}]",
)
).astype(CategoricalDtype(ordered=True))
right = DatetimeIndex(
["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
dtype=f"M8[{unit}]",
)

exp_intervals = IntervalIndex.from_arrays(left, right)
expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
tm.assert_series_equal(Series(result), expected)


Expand Down Expand Up @@ -576,17 +572,33 @@ def test_datetime_nan_mask():


@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_datetime_cut_roundtrip(tz):
def test_datetime_cut_roundtrip(tz, unit):
# see gh-19891
ser = Series(date_range("20180101", periods=3, tz=tz))
ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
result, result_bins = cut(ser, 2, retbins=True)

expected = cut(ser, result_bins)
tm.assert_series_equal(result, expected)

expected_bins = DatetimeIndex(
["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"]
)
if unit == "s":
# TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
# the first entry here raises in array_to_datetime. Should truncate
# instead of raising?
# See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
# for why we round to 8 seconds instead of 7
expected_bins = DatetimeIndex(
["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
dtype=f"M8[{unit}]",
)
else:
expected_bins = DatetimeIndex(
[
"2017-12-31 23:57:07.200000",
"2018-01-02 00:00:00",
"2018-01-03 00:00:00",
],
dtype=f"M8[{unit}]",
)
expected_bins = expected_bins.tz_localize(tz)
tm.assert_index_equal(result_bins, expected_bins)

Expand Down Expand Up @@ -759,7 +771,7 @@ def test_cut_bins_datetime_intervalindex():
# https://github.com/pandas-dev/pandas/issues/46218
bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
# passing Series instead of list is important to trigger bug
result = cut(Series([Timestamp("2022-02-26")]), bins=bins)
result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins)
expected = Categorical.from_codes([0], bins, ordered=True)
tm.assert_categorical_equal(result.array, expected)

Expand Down
17 changes: 9 additions & 8 deletions pandas/tests/reshape/test_qcut.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
IntervalIndex,
NaT,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
cut,
Expand All @@ -22,10 +23,7 @@
import pandas._testing as tm
from pandas.api.types import CategoricalDtype

from pandas.tseries.offsets import (
Day,
Nano,
)
from pandas.tseries.offsets import Day


def test_qcut():
Expand Down Expand Up @@ -216,11 +214,14 @@ def test_single_quantile(data, start, end, length, labels):
],
ids=lambda x: str(x.dtype),
)
def test_qcut_nat(ser):
def test_qcut_nat(ser, unit):
# see gh-19768
intervals = IntervalIndex.from_tuples(
[(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])]
)
ser = ser.dt.as_unit(unit)
td = Timedelta(1, unit=unit).as_unit(unit)

left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
intervals = IntervalIndex.from_arrays(left, right)
expected = Series(Categorical(intervals, ordered=True))

result = qcut(ser, 2)
Expand Down

0 comments on commit 65af776

Please sign in to comment.