From 8614088df2338c1db170bcd0961348220669e082 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 9 Dec 2023 10:38:17 -0800 Subject: [PATCH] BUG: fillna with mixed-resolution dt64/td64 (#56413) * BUG: fillna with mixed-resolution dt64/td64 * mypy fixup * troubleshoot docbuild * typo fixup in whatsnew --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/arrays/_mixins.py | 6 ++ pandas/core/arrays/datetimelike.py | 10 +- pandas/tests/series/methods/test_fillna.py | 111 +++++++++++++++++---- 4 files changed, 109 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e51a347dec46c..6006fcbcdbf20 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -537,6 +537,7 @@ Datetimelike - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) +- Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) - Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetim64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) @@ -550,7 +551,6 @@ Datetimelike - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) - Bug in the results of :func:`to_datetime` with an floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) -- Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index d6f4dbfe7f549..8d1f5262e7911 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -430,6 +430,12 @@ def _where(self: Self, mask: npt.NDArray[np.bool_], value) -> Self: value = self._validate_setitem_value(value) res_values = np.where(mask, self._ndarray, value) + if res_values.dtype != self._ndarray.dtype: + raise AssertionError( + # GH#56410 + "Something has gone wrong, please report a bug at " + "github.com/pandas-dev/pandas/" + ) return self._from_backing_data(res_values) # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 879f477106ae9..8928c72de750c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -646,6 +646,9 @@ def _validation_error_message(self, value, allow_listlike: bool = False) -> str: def _validate_listlike(self, value, allow_object: bool = False): if isinstance(value, type(self)): + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] return value if isinstance(value, list) and len(value) == 0: @@ -694,6 +697,9 @@ def _validate_listlike(self, value, allow_object: bool = False): msg = self._validation_error_message(value, True) raise TypeError(msg) + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] return value def _validate_setitem_value(self, value): @@ -2138,12 +2144,12 @@ def unit(self) -> str: # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] - def as_unit(self, unit: str) -> Self: + def as_unit(self, unit: str, round_ok: bool = True) -> Self: if unit not in ["s", "ms", "us", "ns"]: raise ValueError("Supported units are 's', 'ms', 'us', 'ns'") dtype = np.dtype(f"{self.dtype.kind}8[{unit}]") - new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=True) + new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=round_ok) if isinstance(self.dtype, np.dtype): new_dtype = new_values.dtype diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index a5170898b1720..acc5805578f22 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -19,6 +19,7 @@ Timestamp, date_range, isna, + timedelta_range, ) import pandas._testing as tm from pandas.core.arrays import period_array @@ -239,7 +240,7 @@ def test_fillna_downcast_infer_objects_to_numeric(self): expected = Series([0, 1, 2.5, 4, 4], dtype=np.float64) tm.assert_series_equal(res, expected) - def test_timedelta_fillna(self, frame_or_series): + def test_timedelta_fillna(self, frame_or_series, unit): # GH#3371 ser = Series( [ @@ -247,7 +248,8 @@ def test_timedelta_fillna(self, frame_or_series): Timestamp("20130101"), Timestamp("20130102"), Timestamp("20130103 9:01:01"), - ] + ], + dtype=f"M8[{unit}]", ) td = ser.diff() obj = frame_or_series(td).copy() @@ -260,7 +262,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -279,7 +282,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -291,7 +295,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -303,7 +308,8 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -316,7 +322,7 @@ def test_timedelta_fillna(self, frame_or_series): timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ], - dtype="m8[ns]", + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -375,6 +381,72 @@ def test_datetime64_fillna(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_datetime64_fillna_mismatched_reso_no_rounding(self, tz, scalar): + # GH#56410 + dti = date_range("2016-01-01", periods=3, unit="s", tz=tz) + item = Timestamp("2016-02-03 04:05:06.789", tz=tz) + vec = date_range(item, periods=3, unit="ms") + + exp_dtype = "M8[ms]" if tz is None else "M8[ms, UTC]" + expected = Series([item, dti[1], dti[2]], dtype=exp_dtype) + + ser = Series(dti) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + def test_timedelta64_fillna_mismatched_reso_no_rounding(self, scalar): + # GH#56410 + tdi = date_range("2016-01-01", periods=3, unit="s") - Timestamp("1970-01-01") + item = Timestamp("2016-02-03 04:05:06.789") - Timestamp("1970-01-01") + vec = timedelta_range(item, periods=3, unit="ms") + + expected = Series([item, tdi[1], tdi[2]], dtype="m8[ms]") + + ser = Series(tdi) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + def test_datetime64_fillna_backfill(self): # GH#6587 # make sure that we are treating as integer when filling @@ -392,7 +464,7 @@ def test_datetime64_fillna_backfill(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) - def test_datetime64_tz_fillna(self, tz): + def test_datetime64_tz_fillna(self, tz, unit): # DatetimeLikeBlock ser = Series( [ @@ -400,7 +472,8 @@ def test_datetime64_tz_fillna(self, tz): NaT, Timestamp("2011-01-03 10:00"), NaT, - ] + ], + dtype=f"M8[{unit}]", ) null_loc = Series([False, True, False, True]) @@ -411,7 +484,8 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00"), - ] + ], + dtype=f"M8[{unit}]", ) tm.assert_series_equal(expected, result) # check s is not changed @@ -468,15 +542,18 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-04 10:00"), - ] + ], + dtype=f"M8[{unit}]", ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) # DatetimeTZBlock - idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz) + idx = DatetimeIndex( + ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz + ).as_unit(unit) ser = Series(idx) - assert ser.dtype == f"datetime64[ns, {tz}]" + assert ser.dtype == f"datetime64[{unit}, {tz}]" tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna(Timestamp("2011-01-02 10:00")) @@ -500,7 +577,7 @@ def test_datetime64_tz_fillna(self, tz): "2011-01-02 10:00", ], tz=tz, - ) + ).as_unit(unit) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -514,7 +591,7 @@ def test_datetime64_tz_fillna(self, tz): "2011-01-02 10:00", ], tz=tz, - ) + ).as_unit(unit) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -562,7 +639,7 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2011-01-04 10:00", tz=tz), ] - ) + ).dt.as_unit(unit) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -589,7 +666,7 @@ def test_datetime64_tz_fillna(self, tz): Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2013-01-01", tz="US/Pacific").tz_convert(tz), ] - ) + ).dt.as_unit(unit) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc)