Skip to content

Commit

Permalink
BUG: mixed-type mixed-timezone/awareness
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Nov 1, 2023
1 parent eedf0d5 commit 0cea20c
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ Categorical
Datetimelike
^^^^^^^^^^^^
- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
Expand Down
25 changes: 25 additions & 0 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ from pandas._libs.tslibs.nattype cimport (
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs.tslibs.timezones cimport tz_compare

from pandas._libs.tslibs import (
Resolution,
Expand Down Expand Up @@ -487,9 +488,11 @@ cpdef array_to_datetime(
elif PyDate_Check(val):
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)
state.found_other = True

elif is_datetime64_object(val):
iresult[i] = get_datetime64_nanos(val, creso)
state.found_other = True

elif is_integer_object(val) or is_float_object(val):
# these must be ns unit by-definition
Expand All @@ -499,6 +502,7 @@ cpdef array_to_datetime(
else:
# we now need to parse this as if unit='ns'
iresult[i] = cast_from_unit(val, "ns", out_reso=creso)
state.found_other = True

elif isinstance(val, str):
# string
Expand Down Expand Up @@ -534,6 +538,7 @@ cpdef array_to_datetime(
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add("naive")
state.found_naive_str = True

else:
raise TypeError(f"{type(val)} is not convertible to datetime")
Expand All @@ -557,9 +562,29 @@ cpdef array_to_datetime(
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
elif state.found_naive or state.found_other:
# e.g. test_to_datetime_mixed_awareness_mixed_types
raise ValueError("Cannot mix tz-aware with tz-naive values")
elif tz_out is not None:
# GH#55693
tz_offset = out_tzoffset_vals.pop()
tz_out2 = timezone(timedelta(seconds=tz_offset))
if not tz_compare(tz_out, tz_out2):
# e.g. test_to_datetime_mixed_tzs_mixed_types
raise ValueError(
"Mixed timezones detected. pass utc=True in to_datetime "
"or tz='UTC' in DatetimeIndex to convert to a common timezone."
)
# e.g. test_to_datetime_mixed_types_matching_tzs
else:
tz_offset = out_tzoffset_vals.pop()
tz_out = timezone(timedelta(seconds=tz_offset))
elif not utc_convert:
if tz_out and (state.found_other or state.found_naive_str):
# found_other indicates a tz-naive int, float, dt64, or date
# e.g. test_to_datetime_mixed_awareness_mixed_types
raise ValueError("Cannot mix tz-aware with tz-naive values")

return result, tz_out


Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,7 @@ cdef class DatetimeParseState:
cdef:
bint found_tz
bint found_naive
bint found_naive_str
bint found_other

cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert)
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ cdef class DatetimeParseState:
def __cinit__(self):
    # Every flag starts cleared; the parsing code sets them as values are seen.
    # NOTE(review): found_tz / found_naive are presumably updated by
    # process_datetime — its body is not visible in this hunk, confirm there.
    self.found_tz = False
    self.found_naive = False
    # Set by array_to_datetime when a string without a UTC offset is parsed.
    self.found_naive_str = False
    # Set by array_to_datetime for non-string, tz-naive inputs
    # (date, datetime64, int, float).
    self.found_other = False

cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert):
if dt.tzinfo is not None:
Expand Down
145 changes: 145 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -3695,3 +3695,148 @@ def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed():
to_datetime(
["2020-01-01 00:00+00:00", "2020-01-01 00:00+02:00", ""], format="mixed"
)


def test_to_datetime_mixed_tzs_mixed_types():
    """Mismatched timezones must raise even when the values differ in type."""
    # GH#55693: one value is a tz-aware Timestamp, the other a tz-aware string
    # carrying a different UTC offset.
    aware_ts = Timestamp("2016-01-02 03:04:05", tz="US/Pacific")
    aware_str = "2023-10-30 15:06+01"
    mixed = [aware_ts, aware_str]

    expected_err = (
        "Mixed timezones detected. pass utc=True in to_datetime or tz='UTC' "
        "in DatetimeIndex to convert to a common timezone"
    )

    # Default parsing path.
    with pytest.raises(ValueError, match=expected_err):
        to_datetime(mixed)

    # Explicit per-element format inference.
    with pytest.raises(ValueError, match=expected_err):
        to_datetime(mixed, format="mixed")

    # Index constructor.
    with pytest.raises(ValueError, match=expected_err):
        DatetimeIndex(mixed)


def test_to_datetime_mixed_types_matching_tzs():
    """A Timestamp and a string sharing one UTC offset parse to one tz-aware index."""
    aware_str = "2023-11-01 09:22:03-07:00"
    aware_ts = Timestamp(aware_str)
    pair = [aware_ts, aware_str]

    # Exercise both element orders plus the "mixed" format path and the
    # DatetimeIndex constructor; all of them must agree.
    results = [
        to_datetime(pair),
        to_datetime(pair[::-1])[::-1],
        to_datetime(pair, format="mixed"),
        DatetimeIndex(pair),
    ]

    expected = DatetimeIndex([aware_ts, aware_ts])
    for result in results:
        tm.assert_index_equal(result, expected)


# Module-level inputs used to build the parametrization of
# test_to_datetime_mixed_awareness_mixed_types: a string with an explicit
# +00:00 offset and the Timestamp constructed from that same string.
dtstr = "2020-01-01 00:00+00:00"
ts = Timestamp(dtstr)


@pytest.mark.filterwarnings("ignore:Could not infer format:UserWarning")
@pytest.mark.parametrize(
    "aware_val",
    [dtstr, Timestamp(dtstr)],
    ids=lambda x: type(x).__name__,
)
@pytest.mark.parametrize(
    "naive_val",
    [dtstr[:-6], ts.tz_localize(None), ts.date(), ts.asm8, ts.value, float(ts.value)],
    ids=lambda x: type(x).__name__,
)
@pytest.mark.parametrize("naive_first", [True, False])
def test_to_datetime_mixed_awareness_mixed_types(
    aware_val, naive_val, naive_first, request
):
    """
    Check to_datetime / DatetimeIndex on a tz-aware value mixed with a tz-naive one.

    ``aware_val`` is either an offset-bearing string or the Timestamp built from
    it; ``naive_val`` is one of several tz-naive representations derived from the
    same instant (string without offset, tz-naive Timestamp, date, ``asm8``,
    integer ``value``, float ``value``). ``naive_first`` selects which of the
    two comes first in the input, because the expected outcome depends on order.
    """
    # GH#55693
    # Empty string parses to NaT
    vals = [aware_val, naive_val, ""]

    vec = vals
    if naive_first:
        # alas, the behavior is order-dependent, so we test both ways
        vec = [naive_val, aware_val, ""]

    # both_strs -> paths that were previously already deprecated with warning
    #  issued in _array_to_datetime_object
    both_strs = isinstance(aware_val, str) and isinstance(naive_val, str)
    either_str = isinstance(aware_val, str) or isinstance(naive_val, str)
    has_numeric = isinstance(naive_val, (int, float))

    if has_numeric and either_str:
        # np.array in ensure_arraylike_for_datetimelike incorrectly casts to
        #  string
        mark = pytest.mark.xfail(reason="Regex pattern does not match")
        request.applymarker(mark)

    depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones"

    # The first non-empty element decides which parsing path is taken.
    first_non_null = next(x for x in vec if x != "")
    # If first_non_null is not a string, _guess_datetime_format_for_array
    #  doesn't guess a format, so we don't go through array_strptime
    #  (which has different behavior).
    if not isinstance(first_non_null, str):
        msg = "Cannot mix tz-aware with tz-naive values"
        if naive_first and isinstance(aware_val, Timestamp):
            if isinstance(naive_val, Timestamp):
                # Two datetime objects, naive one first: a different message
                # is expected.
                msg = "Tz-aware datetime.datetime cannot be converted to datetime64"
            with pytest.raises(ValueError, match=msg):
                to_datetime(vec)
        else:
            with pytest.raises(ValueError, match=msg):
                to_datetime(vec)

        # No warning/error with utc=True
        to_datetime(vec, utc=True)

    elif has_numeric and vec.index(aware_val) < vec.index(naive_val):
        # Aware string first, then a numeric value.
        msg = "time data .* doesn't match format"
        with pytest.raises(ValueError, match=msg):
            to_datetime(vec)
        with pytest.raises(ValueError, match=msg):
            to_datetime(vec, utc=True)

    elif both_strs and vec.index(aware_val) < vec.index(naive_val):
        # Aware string first, then the naive string.
        msg = r"time data \"2020-01-01 00:00\" doesn't match format"
        with pytest.raises(ValueError, match=msg):
            to_datetime(vec)
        with pytest.raises(ValueError, match=msg):
            to_datetime(vec, utc=True)

    elif both_strs and vec.index(naive_val) < vec.index(aware_val):
        # Naive string first, then the aware string.
        msg = "unconverted data remains when parsing with format"
        with pytest.raises(ValueError, match=msg):
            to_datetime(vec)
        with pytest.raises(ValueError, match=msg):
            to_datetime(vec, utc=True)

    else:
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            to_datetime(vec)

        # No warning/error with utc=True
        to_datetime(vec, utc=True)

    # format="mixed" and the DatetimeIndex constructor are checked separately
    # from the default to_datetime call above.
    if both_strs:
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            to_datetime(vec, format="mixed")
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            msg = "DatetimeIndex has mixed timezones"
            with pytest.raises(TypeError, match=msg):
                DatetimeIndex(vec)
    else:
        msg = "Cannot mix tz-aware with tz-naive values"
        if naive_first and isinstance(aware_val, Timestamp):
            if isinstance(naive_val, Timestamp):
                msg = "Tz-aware datetime.datetime cannot be converted to datetime64"
            with pytest.raises(ValueError, match=msg):
                to_datetime(vec, format="mixed")
            with pytest.raises(ValueError, match=msg):
                DatetimeIndex(vec)
        else:
            with pytest.raises(ValueError, match=msg):
                to_datetime(vec, format="mixed")
            with pytest.raises(ValueError, match=msg):
                DatetimeIndex(vec)

0 comments on commit 0cea20c

Please sign in to comment.