From 4b249748880780f1cc3825ac28b0d5f3179c3c5e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 31 Oct 2023 05:09:02 -1000 Subject: [PATCH 1/4] CI: Fix concurrency group for new CoW warn build (#55770) * CI: Fix concurrency group for new CoW warn build * Formatting --- .github/workflows/unit-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d00dfc87a0584..0b6843c5a7033 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -69,7 +69,7 @@ jobs: env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" - - name: "Copy-on-Write (warnings)" + - name: "Copy-on-Write 3.11 (warnings)" env_file: actions-311.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "warn" @@ -98,7 +98,7 @@ jobs: PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }} cancel-in-progress: true services: From 386a1eb1d2e9d6109fcaf305752d2e4bb29cc3ee Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Oct 2023 09:31:43 -0700 Subject: [PATCH 2/4] BUG: OutOfBoundsDatetime with non-nano dt64tz dtype (#55768) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/tslib.pyi | 2 +- pandas/_libs/tslib.pyx | 4 ++-- pandas/core/arrays/datetimes.py | 19 +++++++++++-------- .../indexes/datetimes/test_constructors.py | 11 +++++++---- pandas/tests/series/methods/test_astype.py | 11 +++++++---- 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d9909b0dbfad8..99b6310a80f83 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -330,6 +330,7 @@ Datetimelike - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) - diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 7f95bfc717633..a803ea0692c74 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -29,5 +29,5 @@ def array_to_datetime( # returned ndarray may be object dtype or datetime64[ns] def array_to_datetime_with_tz( - values: npt.NDArray[np.object_], tz: tzinfo + values: npt.NDArray[np.object_], tz: tzinfo, creso: int ) -> npt.NDArray[np.int64]: ... diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d2eeea78ee7e8..bb96a89f7d1fe 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -671,7 +671,7 @@ cdef _array_to_datetime_object( return oresult_nd, None -def array_to_datetime_with_tz(ndarray values, tzinfo tz): +def array_to_datetime_with_tz(ndarray values, tzinfo tz, NPY_DATETIMEUNIT creso): """ Vectorized analogue to pd.Timestamp(value, tz=tz) @@ -707,7 +707,7 @@ def array_to_datetime_with_tz(ndarray values, tzinfo tz): else: # datetime64, tznaive pydatetime, int, float ts = ts.tz_localize(tz) - ts = ts.as_unit("ns") + ts = (<_Timestamp>ts)._as_creso(creso) ival = ts._value # Analogous to: result[i] = ival diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 3fff5cb2aa0c7..968b64e2c3de3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -355,7 +355,7 @@ def _from_sequence_not_strict( # DatetimeTZDtype unit = dtype.unit - subarr, tz, inferred_freq = _sequence_to_dt64ns( + subarr, tz, inferred_freq = _sequence_to_dt64( data, copy=copy, tz=tz, @@ -2179,7 +2179,7 @@ def std( # Constructor Helpers -def _sequence_to_dt64ns( +def _sequence_to_dt64( data, *, copy: bool = False, @@ -2205,7 +2205,8 @@ def _sequence_to_dt64ns( Returns ------- result : numpy.ndarray - The sequence converted to a numpy array with dtype ``datetime64[ns]``. + The sequence converted to a numpy array with dtype ``datetime64[unit]``. + Where `unit` is "ns" unless specified otherwise by `out_unit`. tz : tzinfo or None Either the user-provided tzinfo or one inferred from the data. inferred_freq : Tick or None @@ -2228,9 +2229,9 @@ def _sequence_to_dt64ns( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - out_dtype = DT64NS_DTYPE - if out_unit is not None: - out_dtype = np.dtype(f"M8[{out_unit}]") + if out_unit is None: + out_unit = "ns" + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, @@ -2241,8 +2242,10 @@ def _sequence_to_dt64ns( elif tz is not None and ambiguous == "raise": # TODO: yearfirst/dayfirst/etc? obj_data = np.asarray(data, dtype=object) - i8data = tslib.array_to_datetime_with_tz(obj_data, tz) - return i8data.view(DT64NS_DTYPE), tz, None + i8data = tslib.array_to_datetime_with_tz( + obj_data, tz, abbrev_to_npy_unit(out_unit) + ) + return i8data.view(out_dtype), tz, None else: # data comes back here as either i8 to denote UTC timestamps # or M8[ns] to denote wall times diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index ef86e7800dbb7..90ddc9b5f618a 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1013,16 +1013,19 @@ def test_dti_convert_datetime_list(self, tzstr): dr2 = DatetimeIndex(list(dr), name="foo", freq="D") tm.assert_index_equal(dr, dr2) - def test_dti_constructor_with_non_nano_dtype(self): - # GH#55756 + @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) + def test_dti_constructor_with_non_nano_dtype(self, tz): + # GH#55756, GH#54620 ts = Timestamp("2999-01-01") dtype = "M8[us]" + if tz is not None: + dtype = f"M8[us, {tz}]" # NB: the 2500 is interpreted as nanoseconds and rounded *down* # to 2 microseconds vals = [ts, "2999-01-02 03:04:05.678910", 2500] result = DatetimeIndex(vals, dtype=dtype) - exp_arr = np.array([ts.asm8, vals[1], 2], dtype=dtype) - expected = DatetimeIndex(exp_arr, dtype=dtype) + exp_arr = np.array([ts.asm8, vals[1], 2], dtype="M8[us]") + expected = DatetimeIndex(exp_arr, dtype="M8[us]").tz_localize(tz) tm.assert_index_equal(result, expected) result2 = DatetimeIndex(np.array(vals, dtype=object), dtype=dtype) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index edd3062f7d472..2434290616618 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -107,18 +107,21 @@ def test_astype_dict_like(self, dtype_class): class TestAstype: - def test_astype_object_to_dt64_non_nano(self): - # GH#55756 + @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) + def test_astype_object_to_dt64_non_nano(self, tz): + # GH#55756, GH#54620 ts = Timestamp("2999-01-01") dtype = "M8[us]" + if tz is not None: + dtype = f"M8[us, {tz}]" # NB: the 2500 is interpreted as nanoseconds and rounded *down* # to 2 microseconds vals = [ts, "2999-01-02 03:04:05.678910", 2500] ser = Series(vals, dtype=object) result = ser.astype(dtype) - exp_arr = np.array([ts.asm8, vals[1], 2], dtype=dtype) - expected = Series(exp_arr, dtype=dtype) + exp_arr = np.array([ts.asm8, vals[1], 2], dtype="M8[us]") + expected = Series(exp_arr, dtype="M8[us]").dt.tz_localize(tz) tm.assert_series_equal(result, expected) def test_astype_mixed_object_to_dt64tz(self): From f04da3c1c9f5fd81ce7a04e55a965e4021de2ae9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 31 Oct 2023 12:33:32 -0400 Subject: [PATCH 3/4] BUG: DatetimeIndex.diff raising TypeError (#55761) * BUG: DatetimeIndex.diff raising TypeError * add test and whatsnew * typing * use Index --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/indexes/base.py | 4 ++-- pandas/tests/indexes/test_datetimelike.py | 8 ++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 1359123ef153e..3b1cd1c152baa 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -21,7 +21,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 761f5df3bb4e0..0301830df483b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6991,7 +6991,7 @@ def infer_objects(self, copy: bool = True) -> Index: return result @final - def diff(self, periods: int = 1) -> Self: + def diff(self, periods: int = 1) -> Index: """ Computes the difference between consecutive values in the Index object. @@ -7017,7 +7017,7 @@ def diff(self, periods: int = 1) -> Self: Index([nan, 10.0, 10.0, 10.0, 10.0], dtype='float64') """ - return self._constructor(self.to_series().diff(periods)) + return Index(self.to_series().diff(periods)) @final def round(self, decimals: int = 0) -> Self: diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 71cc7f29c62bc..5ad2e9b2f717e 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -159,3 +159,11 @@ def test_where_cast_str(self, simple_index): result = index.where(mask, ["foo"]) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_diff(self, unit): + # GH 55080 + dti = pd.to_datetime([10, 20, 30], unit=unit).as_unit(unit) + result = dti.diff(1) + expected = pd.TimedeltaIndex([pd.NaT, 10, 10], unit=unit).as_unit(unit) + tm.assert_index_equal(result, expected) From b0a0c6851571016daf8f2fba24c0228bb224a712 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 31 Oct 2023 13:45:32 -0700 Subject: [PATCH 4/4] BUG: to_datetime with empty string (#55771) --- pandas/_libs/tslib.pyx | 28 ++++++++++++++------------ pandas/tests/tools/test_to_datetime.py | 11 ++++++++-- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index bb96a89f7d1fe..94a984c9db594 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -519,7 +519,12 @@ cpdef array_to_datetime( iresult[i] = _ts.value tz = _ts.tzinfo - if tz is not None: + if _ts.value == NPY_NAT: + # e.g. "NaT" string or empty string, we do not consider + # this as either tzaware or tznaive. See + # test_to_datetime_with_empty_str_utc_false_format_mixed + pass + elif tz is not None: # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() @@ -610,7 +615,6 @@ cdef _array_to_datetime_object( # 1) NaT or NaT-like values # 2) datetime strings, which we return as datetime.datetime # 3) special strings - "now" & "today" - unique_timezones = set() for i in range(n): # Analogous to: val = values[i] val = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] @@ -640,7 +644,6 @@ cdef _array_to_datetime_object( tzinfo=tsobj.tzinfo, fold=tsobj.fold, ) - unique_timezones.add(tsobj.tzinfo) except (ValueError, OverflowError) as ex: ex.args = (f"{ex}, at position {i}", ) @@ -658,16 +661,15 @@ cdef _array_to_datetime_object( cnp.PyArray_MultiIter_NEXT(mi) - if len(unique_timezones) > 1: - warnings.warn( - "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise an error unless `utc=True`. " - "Please specify `utc=True` to opt in to the new behaviour " - "and silence this warning. To create a `Series` with mixed offsets and " - "`object` dtype, please use `apply` and `datetime.datetime.strptime`", - FutureWarning, - stacklevel=find_stack_level(), - ) + warnings.warn( + "In a future version of pandas, parsing datetimes with mixed time " + "zones will raise an error unless `utc=True`. " + "Please specify `utc=True` to opt in to the new behaviour " + "and silence this warning. To create a `Series` with mixed offsets and " + "`object` dtype, please use `apply` and `datetime.datetime.strptime`", + FutureWarning, + stacklevel=find_stack_level(), + ) return oresult_nd, None diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b41257b19686d..ac58d312619fe 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3675,10 +3675,17 @@ def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 - result = to_datetime(["2020-01-01 00:00+00:00", ""], format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype=object) + vals = ["2020-01-01 00:00+00:00", ""] + result = to_datetime(vals, format="mixed") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") tm.assert_index_equal(result, expected) + # Check that a couple of other similar paths work the same way + alt = to_datetime(vals) + tm.assert_index_equal(alt, expected) + alt2 = DatetimeIndex(vals) + tm.assert_index_equal(alt2, expected) + def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): # GH 50887