diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 38f39270d12c7..eb1c0924ff99b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -352,6 +352,7 @@ Datetimelike - Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`) - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) +- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ad043d1695eac..72faf1757a7b0 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -72,6 +72,7 @@ from pandas._libs.tslibs.nattype cimport ( c_nat_strings as nat_strings, ) from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs import ( Resolution, @@ -488,9 +489,11 @@ cpdef array_to_datetime( elif PyDate_Check(val): iresult[i] = pydate_to_dt64(val, &dts, reso=creso) check_dts_bounds(&dts, creso) + state.found_other = True elif is_datetime64_object(val): iresult[i] = get_datetime64_nanos(val, creso) + state.found_other = True elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition @@ -500,6 +503,7 @@ cpdef array_to_datetime( else: # we now need to parse this as if unit='ns' iresult[i] = cast_from_unit(val, "ns", out_reso=creso) + state.found_other = True elif isinstance(val, str): # string @@ -535,6 +539,7 @@ cpdef array_to_datetime( # Add a marker for naive string, to track if we are # parsing mixed naive and aware strings out_tzoffset_vals.add("naive") + state.found_naive_str = True else: raise TypeError(f"{type(val)} is not convertible to datetime") @@ -558,9 +563,29 @@ cpdef array_to_datetime( is_same_offsets = len(out_tzoffset_vals) == 1 if not is_same_offsets: return _array_to_datetime_object(values, errors, dayfirst, yearfirst) + elif state.found_naive or state.found_other: + # e.g. test_to_datetime_mixed_awareness_mixed_types + raise ValueError("Cannot mix tz-aware with tz-naive values") + elif tz_out is not None: + # GH#55693 + tz_offset = out_tzoffset_vals.pop() + tz_out2 = timezone(timedelta(seconds=tz_offset)) + if not tz_compare(tz_out, tz_out2): + # e.g. test_to_datetime_mixed_tzs_mixed_types + raise ValueError( + "Mixed timezones detected. pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + # e.g. test_to_datetime_mixed_types_matching_tzs else: tz_offset = out_tzoffset_vals.pop() tz_out = timezone(timedelta(seconds=tz_offset)) + elif not utc_convert: + if tz_out and (state.found_other or state.found_naive_str): + # found_other indicates a tz-naive int, float, dt64, or date + # e.g. test_to_datetime_mixed_awareness_mixed_types + raise ValueError("Cannot mix tz-aware with tz-naive values") + return result, tz_out diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index fd8cafeaefb27..dd8936f080b31 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -14,8 +14,11 @@ cdef bint parse_today_now( cdef class DatetimeParseState: cdef: + # See comments describing these attributes in the __cinit__ method bint found_tz bint found_naive + bint found_naive_str + bint found_other bint creso_ever_changed NPY_DATETIMEUNIT creso diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fc44d3aad4bed..c8fd95be34cc0 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -242,8 +242,15 @@ cdef _get_format_regex(str fmt): cdef class DatetimeParseState: def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns): + # found_tz and found_naive are specifically about datetime/Timestamp + # objects with and without tzinfos attached. self.found_tz = False self.found_naive = False + # found_naive_str refers to a string that was parsed to a timezone-naive + # datetime. + self.found_naive_str = False + self.found_other = False + self.creso = creso self.creso_ever_changed = False diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index acb0f85027da2..4231f172fc8fb 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -4,12 +4,15 @@ from datetime import ( datetime, timedelta, + timezone, ) from decimal import Decimal import numpy as np import pytest +from pandas._libs.tslibs.timezones import maybe_get_tz + from pandas import ( NA, Categorical, @@ -183,6 +186,15 @@ def test_constructor_datetime_and_datetime64(self, swap_objs): tm.assert_index_equal(Index(data), expected) tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) + def test_constructor_datetimes_mixed_tzs(self): + # https://github.com/pandas-dev/pandas/pull/55793/files#r1383719998 + tz = maybe_get_tz("US/Central") + dt1 = datetime(2020, 1, 1, tzinfo=tz) + dt2 = datetime(2020, 1, 1, tzinfo=timezone.utc) + result = Index([dt1, dt2]) + expected = Index([dt1, dt2], dtype=object) + tm.assert_index_equal(result, expected) + class TestDtypeEnforced: # check we don't silently ignore the dtype keyword diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ae07cb00211a2..6c094d980126b 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3720,3 +3720,141 @@ def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): to_datetime( ["2020-01-01 00:00+00:00", "2020-01-01 00:00+02:00", ""], format="mixed" ) + + +def test_to_datetime_mixed_tzs_mixed_types(): + # GH#55793, GH#55693 mismatched tzs but one is str and other is + # datetime object + ts = Timestamp("2016-01-02 03:04:05", tz="US/Pacific") + dtstr = "2023-10-30 15:06+01" + arr = [ts, dtstr] + + msg = ( + "Mixed timezones detected. pass utc=True in to_datetime or tz='UTC' " + "in DatetimeIndex to convert to a common timezone" + ) + with pytest.raises(ValueError, match=msg): + to_datetime(arr) + with pytest.raises(ValueError, match=msg): + to_datetime(arr, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(arr) + + +def test_to_datetime_mixed_types_matching_tzs(): + # GH#55793 + dtstr = "2023-11-01 09:22:03-07:00" + ts = Timestamp(dtstr) + arr = [ts, dtstr] + res1 = to_datetime(arr) + res2 = to_datetime(arr[::-1])[::-1] + res3 = to_datetime(arr, format="mixed") + res4 = DatetimeIndex(arr) + + expected = DatetimeIndex([ts, ts]) + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + + +dtstr = "2020-01-01 00:00+00:00" +ts = Timestamp(dtstr) + + +@pytest.mark.filterwarnings("ignore:Could not infer format:UserWarning") +@pytest.mark.parametrize( + "aware_val", + [dtstr, Timestamp(dtstr)], + ids=lambda x: type(x).__name__, +) +@pytest.mark.parametrize( + "naive_val", + [dtstr[:-6], ts.tz_localize(None), ts.date(), ts.asm8, ts.value, float(ts.value)], + ids=lambda x: type(x).__name__, +) +@pytest.mark.parametrize("naive_first", [True, False]) +def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_first): + # GH#55793, GH#55693 + # Empty string parses to NaT + vals = [aware_val, naive_val, ""] + + vec = vals + if naive_first: + # alas, the behavior is order-dependent, so we test both ways + vec = [naive_val, aware_val, ""] + + # both_strs-> paths that were previously already deprecated with warning + # issued in _array_to_datetime_object + both_strs = isinstance(aware_val, str) and isinstance(naive_val, str) + has_numeric = isinstance(naive_val, (int, float)) + + depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones" + + first_non_null = next(x for x in vec if x != "") + # if first_non_null is a not a string, _guess_datetime_format_for_array + # doesn't guess a format so we don't go through array_strptime + if not isinstance(first_non_null, str): + # that case goes through array_strptime which has different behavior + msg = "Cannot mix tz-aware with tz-naive values" + if naive_first and isinstance(aware_val, Timestamp): + if isinstance(naive_val, Timestamp): + msg = "Tz-aware datetime.datetime cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + else: + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + + # No warning/error with utc=True + to_datetime(vec, utc=True) + + elif has_numeric and vec.index(aware_val) < vec.index(naive_val): + msg = "time data .* doesn't match format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + elif both_strs and vec.index(aware_val) < vec.index(naive_val): + msg = r"time data \"2020-01-01 00:00\" doesn't match format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + elif both_strs and vec.index(naive_val) < vec.index(aware_val): + msg = "unconverted data remains when parsing with format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + else: + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + to_datetime(vec) + + # No warning/error with utc=True + to_datetime(vec, utc=True) + + if both_strs: + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + to_datetime(vec, format="mixed") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + msg = "DatetimeIndex has mixed timezones" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(vec) + else: + msg = "Cannot mix tz-aware with tz-naive values" + if naive_first and isinstance(aware_val, Timestamp): + if isinstance(naive_val, Timestamp): + msg = "Tz-aware datetime.datetime cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + to_datetime(vec, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(vec) + else: + with pytest.raises(ValueError, match=msg): + to_datetime(vec, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(vec)