Skip to content

Commit

Permalink
BUG: mixed-type mixed-timezone/awareness (#55793)
Browse files Browse the repository at this point in the history
* BUG: mixed-type mixed-timezone/awareness

* more GH refs

* un-xfail

* comments, test
  • Loading branch information
jbrockmendel authored Nov 9, 2023
1 parent 0d786bf commit 5cedf87
Show file tree
Hide file tree
Showing 6 changed files with 186 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ Datetimelike
- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`)
- Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
Expand Down
25 changes: 25 additions & 0 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ from pandas._libs.tslibs.nattype cimport (
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs.tslibs.timezones cimport tz_compare

from pandas._libs.tslibs import (
Resolution,
Expand Down Expand Up @@ -488,9 +489,11 @@ cpdef array_to_datetime(
elif PyDate_Check(val):
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)
state.found_other = True

elif is_datetime64_object(val):
iresult[i] = get_datetime64_nanos(val, creso)
state.found_other = True

elif is_integer_object(val) or is_float_object(val):
# these must be ns unit by-definition
Expand All @@ -500,6 +503,7 @@ cpdef array_to_datetime(
else:
# we now need to parse this as if unit='ns'
iresult[i] = cast_from_unit(val, "ns", out_reso=creso)
state.found_other = True

elif isinstance(val, str):
# string
Expand Down Expand Up @@ -535,6 +539,7 @@ cpdef array_to_datetime(
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add("naive")
state.found_naive_str = True

else:
raise TypeError(f"{type(val)} is not convertible to datetime")
Expand All @@ -558,9 +563,29 @@ cpdef array_to_datetime(
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
elif state.found_naive or state.found_other:
# e.g. test_to_datetime_mixed_awareness_mixed_types
raise ValueError("Cannot mix tz-aware with tz-naive values")
elif tz_out is not None:
# GH#55693
tz_offset = out_tzoffset_vals.pop()
tz_out2 = timezone(timedelta(seconds=tz_offset))
if not tz_compare(tz_out, tz_out2):
# e.g. test_to_datetime_mixed_tzs_mixed_types
raise ValueError(
"Mixed timezones detected. pass utc=True in to_datetime "
"or tz='UTC' in DatetimeIndex to convert to a common timezone."
)
# e.g. test_to_datetime_mixed_types_matching_tzs
else:
tz_offset = out_tzoffset_vals.pop()
tz_out = timezone(timedelta(seconds=tz_offset))
elif not utc_convert:
if tz_out and (state.found_other or state.found_naive_str):
# found_other indicates a tz-naive int, float, dt64, or date
# e.g. test_to_datetime_mixed_awareness_mixed_types
raise ValueError("Cannot mix tz-aware with tz-naive values")

return result, tz_out


Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ cdef bint parse_today_now(

cdef class DatetimeParseState:
cdef:
# See comments describing these attributes in the __cinit__ method
bint found_tz
bint found_naive
bint found_naive_str
bint found_other
bint creso_ever_changed
NPY_DATETIMEUNIT creso

Expand Down
7 changes: 7 additions & 0 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,15 @@ cdef _get_format_regex(str fmt):

cdef class DatetimeParseState:
def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns):
# found_tz and found_naive are specifically about datetime/Timestamp
# objects with and without tzinfos attached.
self.found_tz = False
self.found_naive = False
# found_naive_str refers to a string that was parsed to a timezone-naive
# datetime.
self.found_naive_str = False
self.found_other = False

self.creso = creso
self.creso_ever_changed = False

Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/indexes/test_index_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@
from datetime import (
datetime,
timedelta,
timezone,
)
from decimal import Decimal

import numpy as np
import pytest

from pandas._libs.tslibs.timezones import maybe_get_tz

from pandas import (
NA,
Categorical,
Expand Down Expand Up @@ -183,6 +186,15 @@ def test_constructor_datetime_and_datetime64(self, swap_objs):
tm.assert_index_equal(Index(data), expected)
tm.assert_index_equal(Index(np.array(data, dtype=object)), expected)

def test_constructor_datetimes_mixed_tzs(self):
# https://github.com/pandas-dev/pandas/pull/55793/files#r1383719998
tz = maybe_get_tz("US/Central")
dt1 = datetime(2020, 1, 1, tzinfo=tz)
dt2 = datetime(2020, 1, 1, tzinfo=timezone.utc)
result = Index([dt1, dt2])
expected = Index([dt1, dt2], dtype=object)
tm.assert_index_equal(result, expected)


class TestDtypeEnforced:
# check we don't silently ignore the dtype keyword
Expand Down
138 changes: 138 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3727,3 +3727,141 @@ def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed():
to_datetime(
["2020-01-01 00:00+00:00", "2020-01-01 00:00+02:00", ""], format="mixed"
)


def test_to_datetime_mixed_tzs_mixed_types():
# GH#55793, GH#55693 mismatched tzs but one is str and other is
# datetime object
ts = Timestamp("2016-01-02 03:04:05", tz="US/Pacific")
dtstr = "2023-10-30 15:06+01"
arr = [ts, dtstr]

msg = (
"Mixed timezones detected. pass utc=True in to_datetime or tz='UTC' "
"in DatetimeIndex to convert to a common timezone"
)
with pytest.raises(ValueError, match=msg):
to_datetime(arr)
with pytest.raises(ValueError, match=msg):
to_datetime(arr, format="mixed")
with pytest.raises(ValueError, match=msg):
DatetimeIndex(arr)


def test_to_datetime_mixed_types_matching_tzs():
# GH#55793
dtstr = "2023-11-01 09:22:03-07:00"
ts = Timestamp(dtstr)
arr = [ts, dtstr]
res1 = to_datetime(arr)
res2 = to_datetime(arr[::-1])[::-1]
res3 = to_datetime(arr, format="mixed")
res4 = DatetimeIndex(arr)

expected = DatetimeIndex([ts, ts])
tm.assert_index_equal(res1, expected)
tm.assert_index_equal(res2, expected)
tm.assert_index_equal(res3, expected)
tm.assert_index_equal(res4, expected)


dtstr = "2020-01-01 00:00+00:00"
ts = Timestamp(dtstr)


@pytest.mark.filterwarnings("ignore:Could not infer format:UserWarning")
@pytest.mark.parametrize(
"aware_val",
[dtstr, Timestamp(dtstr)],
ids=lambda x: type(x).__name__,
)
@pytest.mark.parametrize(
"naive_val",
[dtstr[:-6], ts.tz_localize(None), ts.date(), ts.asm8, ts.value, float(ts.value)],
ids=lambda x: type(x).__name__,
)
@pytest.mark.parametrize("naive_first", [True, False])
def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_first):
# GH#55793, GH#55693
# Empty string parses to NaT
vals = [aware_val, naive_val, ""]

vec = vals
if naive_first:
# alas, the behavior is order-dependent, so we test both ways
vec = [naive_val, aware_val, ""]

# both_strs-> paths that were previously already deprecated with warning
# issued in _array_to_datetime_object
both_strs = isinstance(aware_val, str) and isinstance(naive_val, str)
has_numeric = isinstance(naive_val, (int, float))

depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones"

first_non_null = next(x for x in vec if x != "")
# if first_non_null is a not a string, _guess_datetime_format_for_array
# doesn't guess a format so we don't go through array_strptime
if not isinstance(first_non_null, str):
# that case goes through array_strptime which has different behavior
msg = "Cannot mix tz-aware with tz-naive values"
if naive_first and isinstance(aware_val, Timestamp):
if isinstance(naive_val, Timestamp):
msg = "Tz-aware datetime.datetime cannot be converted to datetime64"
with pytest.raises(ValueError, match=msg):
to_datetime(vec)
else:
with pytest.raises(ValueError, match=msg):
to_datetime(vec)

# No warning/error with utc=True
to_datetime(vec, utc=True)

elif has_numeric and vec.index(aware_val) < vec.index(naive_val):
msg = "time data .* doesn't match format"
with pytest.raises(ValueError, match=msg):
to_datetime(vec)
with pytest.raises(ValueError, match=msg):
to_datetime(vec, utc=True)

elif both_strs and vec.index(aware_val) < vec.index(naive_val):
msg = r"time data \"2020-01-01 00:00\" doesn't match format"
with pytest.raises(ValueError, match=msg):
to_datetime(vec)
with pytest.raises(ValueError, match=msg):
to_datetime(vec, utc=True)

elif both_strs and vec.index(naive_val) < vec.index(aware_val):
msg = "unconverted data remains when parsing with format"
with pytest.raises(ValueError, match=msg):
to_datetime(vec)
with pytest.raises(ValueError, match=msg):
to_datetime(vec, utc=True)

else:
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
to_datetime(vec)

# No warning/error with utc=True
to_datetime(vec, utc=True)

if both_strs:
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
to_datetime(vec, format="mixed")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
msg = "DatetimeIndex has mixed timezones"
with pytest.raises(TypeError, match=msg):
DatetimeIndex(vec)
else:
msg = "Cannot mix tz-aware with tz-naive values"
if naive_first and isinstance(aware_val, Timestamp):
if isinstance(naive_val, Timestamp):
msg = "Tz-aware datetime.datetime cannot be converted to datetime64"
with pytest.raises(ValueError, match=msg):
to_datetime(vec, format="mixed")
with pytest.raises(ValueError, match=msg):
DatetimeIndex(vec)
else:
with pytest.raises(ValueError, match=msg):
to_datetime(vec, format="mixed")
with pytest.raises(ValueError, match=msg):
DatetimeIndex(vec)

0 comments on commit 5cedf87

Please sign in to comment.