From 0cea20cb674223e0cd4365852f164bf7088bf143 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Nov 2023 09:30:07 -0700 Subject: [PATCH 1/4] BUG: mixed-type mixed-timezone/awareness --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/tslib.pyx | 25 +++++ pandas/_libs/tslibs/strptime.pxd | 2 + pandas/_libs/tslibs/strptime.pyx | 2 + pandas/tests/tools/test_to_datetime.py | 145 +++++++++++++++++++++++++ 5 files changed, 175 insertions(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 16d279bb0d52c..91977642eb510 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -322,6 +322,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`) +- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 94a984c9db594..da2470c7365d8 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -71,6 +71,7 @@ from pandas._libs.tslibs.nattype cimport ( c_nat_strings as nat_strings, ) from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs import ( Resolution, @@ -487,9 +488,11 @@ cpdef array_to_datetime( elif PyDate_Check(val): iresult[i] = pydate_to_dt64(val, 
&dts, reso=creso) check_dts_bounds(&dts, creso) + state.found_other = True elif is_datetime64_object(val): iresult[i] = get_datetime64_nanos(val, creso) + state.found_other = True elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition @@ -499,6 +502,7 @@ cpdef array_to_datetime( else: # we now need to parse this as if unit='ns' iresult[i] = cast_from_unit(val, "ns", out_reso=creso) + state.found_other = True elif isinstance(val, str): # string @@ -534,6 +538,7 @@ cpdef array_to_datetime( # Add a marker for naive string, to track if we are # parsing mixed naive and aware strings out_tzoffset_vals.add("naive") + state.found_naive_str = True else: raise TypeError(f"{type(val)} is not convertible to datetime") @@ -557,9 +562,29 @@ cpdef array_to_datetime( is_same_offsets = len(out_tzoffset_vals) == 1 if not is_same_offsets: return _array_to_datetime_object(values, errors, dayfirst, yearfirst) + elif state.found_naive or state.found_other: + # e.g. test_to_datetime_mixed_awareness_mixed_types + raise ValueError("Cannot mix tz-aware with tz-naive values") + elif tz_out is not None: + # GH#55693 + tz_offset = out_tzoffset_vals.pop() + tz_out2 = timezone(timedelta(seconds=tz_offset)) + if not tz_compare(tz_out, tz_out2): + # e.g. test_to_datetime_mixed_tzs_mixed_types + raise ValueError( + "Mixed timezones detected. pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + # e.g. test_to_datetime_mixed_types_matching_tzs else: tz_offset = out_tzoffset_vals.pop() tz_out = timezone(timedelta(seconds=tz_offset)) + elif not utc_convert: + if tz_out and (state.found_other or state.found_naive_str): + # found_other indicates a tz-naive int, float, dt64, or date + # e.g. 
test_to_datetime_mixed_awareness_mixed_types + raise ValueError("Cannot mix tz-aware with tz-naive values") + return result, tz_out diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index 32a8edc9ee4a3..516ffd779221b 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -14,5 +14,7 @@ cdef class DatetimeParseState: cdef: bint found_tz bint found_naive + bint found_naive_str + bint found_other cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 69466511a67fd..a89a2c656adc5 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -235,6 +235,8 @@ cdef class DatetimeParseState: def __cinit__(self): self.found_tz = False self.found_naive = False + self.found_naive_str = False + self.found_other = False cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert): if dt.tzinfo is not None: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ac58d312619fe..233021f4c3e74 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3695,3 +3695,148 @@ def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): to_datetime( ["2020-01-01 00:00+00:00", "2020-01-01 00:00+02:00", ""], format="mixed" ) + + +def test_to_datetime_mixed_tzs_mixed_types(): + # GH#55693 mismatched tzs but one is str and other is datetime object + ts = Timestamp("2016-01-02 03:04:05", tz="US/Pacific") + dtstr = "2023-10-30 15:06+01" + arr = [ts, dtstr] + + msg = ( + "Mixed timezones detected. 
pass utc=True in to_datetime or tz='UTC' " + "in DatetimeIndex to convert to a common timezone" + ) + with pytest.raises(ValueError, match=msg): + to_datetime(arr) + with pytest.raises(ValueError, match=msg): + to_datetime(arr, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(arr) + + +def test_to_datetime_mixed_types_matching_tzs(): + dtstr = "2023-11-01 09:22:03-07:00" + ts = Timestamp(dtstr) + arr = [ts, dtstr] + res1 = to_datetime(arr) + res2 = to_datetime(arr[::-1])[::-1] + res3 = to_datetime(arr, format="mixed") + res4 = DatetimeIndex(arr) + + expected = DatetimeIndex([ts, ts]) + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + + +dtstr = "2020-01-01 00:00+00:00" +ts = Timestamp(dtstr) + + +@pytest.mark.filterwarnings("ignore:Could not infer format:UserWarning") +@pytest.mark.parametrize( + "aware_val", + [dtstr, Timestamp(dtstr)], + ids=lambda x: type(x).__name__, +) +@pytest.mark.parametrize( + "naive_val", + [dtstr[:-6], ts.tz_localize(None), ts.date(), ts.asm8, ts.value, float(ts.value)], + ids=lambda x: type(x).__name__, +) +@pytest.mark.parametrize("naive_first", [True, False]) +def test_to_datetime_mixed_awareness_mixed_types( + aware_val, naive_val, naive_first, request +): + # GH#55693 + # Empty string parses to NaT + vals = [aware_val, naive_val, ""] + + vec = vals + if naive_first: + # alas, the behavior is order-dependent, so we test both ways + vec = [naive_val, aware_val, ""] + + # both_strs-> paths that were previously already deprecated with warning + # issued in _array_to_datetime_object + both_strs = isinstance(aware_val, str) and isinstance(naive_val, str) + either_str = isinstance(aware_val, str) or isinstance(naive_val, str) + has_numeric = isinstance(naive_val, (int, float)) + + if has_numeric and either_str: + # np.array in ensure_arraylike_for_datetimelike incorrectly casts to + # string + mark = 
pytest.mark.xfail(reason="Regex pattern does not match") + request.applymarker(mark) + + depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones" + + first_non_null = next(x for x in vec if x != "") + # if first_non_null is not a string, _guess_datetime_format_for_array + # doesn't guess a format so we don't go through array_strptime + if not isinstance(first_non_null, str): + # (the str-first case instead goes through array_strptime, which has different behavior) + msg = "Cannot mix tz-aware with tz-naive values" + if naive_first and isinstance(aware_val, Timestamp): + if isinstance(naive_val, Timestamp): + msg = "Tz-aware datetime.datetime cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + else: + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + + # No warning/error with utc=True + to_datetime(vec, utc=True) + + elif has_numeric and vec.index(aware_val) < vec.index(naive_val): + msg = "time data .* doesn't match format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + elif both_strs and vec.index(aware_val) < vec.index(naive_val): + msg = r"time data \"2020-01-01 00:00\" doesn't match format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + elif both_strs and vec.index(naive_val) < vec.index(aware_val): + msg = "unconverted data remains when parsing with format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + else: + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + to_datetime(vec) + + # No warning/error with utc=True + to_datetime(vec, utc=True) + + if both_strs: + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + to_datetime(vec, format="mixed") + with
tm.assert_produces_warning(FutureWarning, match=depr_msg): + msg = "DatetimeIndex has mixed timezones" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(vec) + else: + msg = "Cannot mix tz-aware with tz-naive values" + if naive_first and isinstance(aware_val, Timestamp): + if isinstance(naive_val, Timestamp): + msg = "Tz-aware datetime.datetime cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + to_datetime(vec, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(vec) + else: + with pytest.raises(ValueError, match=msg): + to_datetime(vec, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(vec) From 7396939bf2fe5c32b2a800daaa5a0e622f1ed94e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Nov 2023 09:34:01 -0700 Subject: [PATCH 2/4] more GH refs --- pandas/tests/tools/test_to_datetime.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 233021f4c3e74..ec07ec7692b97 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3698,7 +3698,8 @@ def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): def test_to_datetime_mixed_tzs_mixed_types(): - # GH#55693 mismatched tzs but one is str and other is datetime object + # GH#55793, GH#55693 mismatched tzs but one is str and other is + # datetime object ts = Timestamp("2016-01-02 03:04:05", tz="US/Pacific") dtstr = "2023-10-30 15:06+01" arr = [ts, dtstr] @@ -3716,6 +3717,7 @@ def test_to_datetime_mixed_tzs_mixed_types(): def test_to_datetime_mixed_types_matching_tzs(): + # GH#55793 dtstr = "2023-11-01 09:22:03-07:00" ts = Timestamp(dtstr) arr = [ts, dtstr] @@ -3750,7 +3752,7 @@ def test_to_datetime_mixed_types_matching_tzs(): def test_to_datetime_mixed_awareness_mixed_types( aware_val, naive_val, naive_first, request ): - # GH#55693 + # GH#55793, GH#55693 # Empty string 
parses to NaT vals = [aware_val, naive_val, ""] From 60ec37434f6788410b6bbe72694adb8842da349f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 2 Nov 2023 19:37:26 -0700 Subject: [PATCH 3/4] un-xfail --- pandas/tests/tools/test_to_datetime.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7b103c5031d6f..fa6d20338149d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3777,9 +3777,7 @@ def test_to_datetime_mixed_types_matching_tzs(): ids=lambda x: type(x).__name__, ) @pytest.mark.parametrize("naive_first", [True, False]) -def test_to_datetime_mixed_awareness_mixed_types( - aware_val, naive_val, naive_first, request -): +def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_first): # GH#55793, GH#55693 # Empty string parses to NaT vals = [aware_val, naive_val, ""] @@ -3792,15 +3790,9 @@ def test_to_datetime_mixed_awareness_mixed_types( # both_strs-> paths that were previously already deprecated with warning # issued in _array_to_datetime_object both_strs = isinstance(aware_val, str) and isinstance(naive_val, str) - either_str = isinstance(aware_val, str) or isinstance(naive_val, str) + isinstance(aware_val, str) or isinstance(naive_val, str) has_numeric = isinstance(naive_val, (int, float)) - if has_numeric and either_str: - # np.array in ensure_arraylike_for_datetimelike incorrectly casts to - # string - mark = pytest.mark.xfail(reason="Regex pattern does not match") - request.applymarker(mark) - depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones" first_non_null = next(x for x in vec if x != "") From 0f2b27ccd6f9f54f13874e137623c2ff2a31a6dd Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Nov 2023 12:03:56 -0800 Subject: [PATCH 4/4] comments, test --- pandas/_libs/tslibs/strptime.pxd | 1 + pandas/_libs/tslibs/strptime.pyx | 4 ++++ 
pandas/tests/indexes/test_index_new.py | 12 ++++++++++++ pandas/tests/tools/test_to_datetime.py | 1 - 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index a1b5632067f25..dd8936f080b31 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -14,6 +14,7 @@ cdef bint parse_today_now( cdef class DatetimeParseState: cdef: + # See comments describing these attributes in the __cinit__ method bint found_tz bint found_naive bint found_naive_str diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index ccd77fe7f86ca..c8fd95be34cc0 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -242,8 +242,12 @@ cdef _get_format_regex(str fmt): cdef class DatetimeParseState: def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns): + # found_tz and found_naive are specifically about datetime/Timestamp + # objects with and without tzinfos attached. self.found_tz = False self.found_naive = False + # found_naive_str refers to a string that was parsed to a timezone-naive + # datetime. 
self.found_naive_str = False self.found_other = False diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index acb0f85027da2..4231f172fc8fb 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -4,12 +4,15 @@ from datetime import ( datetime, timedelta, + timezone, ) from decimal import Decimal import numpy as np import pytest +from pandas._libs.tslibs.timezones import maybe_get_tz + from pandas import ( NA, Categorical, @@ -183,6 +186,15 @@ def test_constructor_datetime_and_datetime64(self, swap_objs): tm.assert_index_equal(Index(data), expected) tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) + def test_constructor_datetimes_mixed_tzs(self): + # https://github.com/pandas-dev/pandas/pull/55793/files#r1383719998 + tz = maybe_get_tz("US/Central") + dt1 = datetime(2020, 1, 1, tzinfo=tz) + dt2 = datetime(2020, 1, 1, tzinfo=timezone.utc) + result = Index([dt1, dt2]) + expected = Index([dt1, dt2], dtype=object) + tm.assert_index_equal(result, expected) + class TestDtypeEnforced: # check we don't silently ignore the dtype keyword diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 60488ceda5b08..6c094d980126b 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3787,7 +3787,6 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir # both_strs-> paths that were previously already deprecated with warning # issued in _array_to_datetime_object both_strs = isinstance(aware_val, str) and isinstance(naive_val, str) - isinstance(aware_val, str) or isinstance(naive_val, str) has_numeric = isinstance(naive_val, (int, float)) depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones"