Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: disallow parsing datetimes with mixed time zones unless utc=True #57275

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def time_same_offset(self):
to_datetime(self.same_offset)

def time_different_offset(self):
to_datetime(self.diff_offset)
to_datetime(self.diff_offset, utc=True)


class ToDatetimeFormatQuarters:
Expand Down Expand Up @@ -231,9 +231,6 @@ def time_no_exact(self):
def time_same_offset(self):
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z")

def time_different_offset(self):
to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z")

def time_same_offset_to_utc(self):
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ Removal of prior version deprecations/changes
- All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`)
- All arguments in :meth:`Series.to_dict` are now keyword only (:issue:`56493`)
- Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`)
- Enforced deprecation disallowing parsing datetimes with mixed time zones unless the user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`)
- Enforced silent-downcasting deprecation for :ref:`all relevant methods <whatsnew_220.silent_downcasting>` (:issue:`54710`)
- In :meth:`DataFrame.stack`, the default value of ``future_stack`` is now ``True``; specifying ``False`` will raise a ``FutureWarning`` (:issue:`55448`)
- Removed ``DataFrame.applymap``, ``Styler.applymap`` and ``Styler.applymap_index`` (:issue:`52364`)
Expand Down
122 changes: 7 additions & 115 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ from datetime import timezone
from cpython.datetime cimport (
PyDate_Check,
PyDateTime_Check,
datetime,
import_datetime,
timedelta,
tzinfo,
Expand Down Expand Up @@ -590,15 +589,17 @@ cpdef array_to_datetime(
return values, None

if seen_datetime_offset and not utc_convert:
# GH#17697
# GH#17697, GH#57275
# 1) If all the offsets are equal, return one offset for
# the parsed dates to (maybe) pass to DatetimeIndex
# 2) If the offsets are different, then force the parsing down the
# object path where an array of datetimes
# (with individual dateutil.tzoffsets) are returned
# 2) If the offsets are different, do not fall back to the object
# path; raise a ValueError instead: "cannot parse datetimes with
# mixed time zones unless `utc=True`"
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
raise ValueError(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the comments above should also be revised cc @MarcoGorelli

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup, thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I missed this comment. I replaced it with:

# 2) If the offsets are different, then do not force the parsing
#    and raise a ValueError: "cannot parse datetimes with
#    mixed time zones unless `utc=True`" instead

"cannot parse datetimes with mixed time zones unless `utc=True`"
)
elif state.found_naive or state.found_other:
# e.g. test_to_datetime_mixed_awareness_mixed_types
raise ValueError("Cannot mix tz-aware with tz-naive values")
Expand Down Expand Up @@ -647,115 +648,6 @@ cpdef array_to_datetime(
return result, tz_out


@cython.wraparound(False)
@cython.boundscheck(False)
cdef _array_to_datetime_object(
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved
ndarray[object] values,
str errors,
bint dayfirst=False,
bint yearfirst=False,
):
"""
Fall back function for array_to_datetime

Attempts to parse datetime strings with dateutil to return an array
of datetime objects

Parameters
----------
values : ndarray[object]
date-like objects to convert
errors : str
error behavior when parsing
dayfirst : bool, default False
dayfirst parsing behavior when encountering datetime strings
yearfirst : bool, default False
yearfirst parsing behavior when encountering datetime strings

Returns
-------
np.ndarray[object]
Literal[None]
"""
cdef:
Py_ssize_t i, n = values.size
object val
bint is_coerce = errors == "coerce"
bint is_raise = errors == "raise"
ndarray oresult_nd
ndarray[object] oresult
npy_datetimestruct dts
cnp.broadcast mi
_TSObject tsobj

assert is_raise or is_coerce

oresult_nd = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
mi = cnp.PyArray_MultiIterNew2(oresult_nd, values)
oresult = oresult_nd.ravel()

# We return an object array and only attempt to parse:
# 1) NaT or NaT-like values
# 2) datetime strings, which we return as datetime.datetime
# 3) special strings - "now" & "today"
for i in range(n):
# Analogous to: val = values[i]
val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

if checknull_with_nat_and_na(val) or PyDateTime_Check(val):
# GH 25978. No need to parse NaT-like or datetime-like vals
oresult[i] = val
elif isinstance(val, str):
if type(val) is not str:
# GH#32264 np.str_ objects
val = str(val)

if len(val) == 0 or val in nat_strings:
oresult[i] = "NaT"
cnp.PyArray_MultiIter_NEXT(mi)
continue

try:
tsobj = convert_str_to_tsobject(
val, None, dayfirst=dayfirst, yearfirst=yearfirst
)
tsobj.ensure_reso(NPY_FR_ns, val)

dts = tsobj.dts
oresult[i] = datetime(
dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us,
tzinfo=tsobj.tzinfo,
fold=tsobj.fold,
)

except (ValueError, OverflowError) as ex:
ex.args = (f"{ex}, at position {i}", )
if is_coerce:
oresult[i] = <object>NaT
cnp.PyArray_MultiIter_NEXT(mi)
continue
if is_raise:
raise
return values, None
else:
if is_raise:
raise
return values, None

cnp.PyArray_MultiIter_NEXT(mi)

warnings.warn(
"In a future version of pandas, parsing datetimes with mixed time "
"zones will raise an error unless `utc=True`. "
"Please specify `utc=True` to opt in to the new behaviour "
"and silence this warning. To create a `Series` with mixed offsets and "
"`object` dtype, please use `apply` and `datetime.datetime.strptime`",
FutureWarning,
stacklevel=find_stack_level(),
)
return oresult_nd, None


def array_to_datetime_with_tz(
ndarray values, tzinfo tz, bint dayfirst, bint yearfirst, NPY_DATETIMEUNIT creso
):
Expand Down
167 changes: 7 additions & 160 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ from pandas._libs.tslibs.dtypes cimport (
)
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.np_datetime cimport (
Expand Down Expand Up @@ -503,20 +502,18 @@ def array_strptime(
if seen_datetime_offset and not utc:
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets or (state.found_naive or state.found_other):
result2 = _array_strptime_object_fallback(
values, fmt=fmt, exact=exact, errors=errors, utc=utc
raise ValueError(
"cannot parse datetimes with mixed time zones unless `utc=True`"
)
return result2, None
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved
elif tz_out is not None:
# GH#55693
tz_offset = out_tzoffset_vals.pop()
tz_out2 = timezone(timedelta(seconds=tz_offset))
if not tz_compare(tz_out, tz_out2):
# e.g. test_to_datetime_mixed_offsets_with_utc_false_deprecated
result2 = _array_strptime_object_fallback(
values, fmt=fmt, exact=exact, errors=errors, utc=utc
# e.g. test_to_datetime_mixed_offsets_with_utc_false_removed
raise ValueError(
"cannot parse datetimes with mixed time zones unless `utc=True`"
)
return result2, None
# e.g. test_guess_datetime_format_with_parseable_formats
else:
# e.g. test_to_datetime_iso8601_with_timezone_valid
Expand All @@ -525,10 +522,9 @@ def array_strptime(
elif not utc:
if tz_out and (state.found_other or state.found_naive_str):
# found_other indicates a tz-naive int, float, dt64, or date
result2 = _array_strptime_object_fallback(
values, fmt=fmt, exact=exact, errors=errors, utc=utc
raise ValueError(
"cannot parse datetimes with mixed time zones unless `utc=True`"
)
return result2, None
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved

if infer_reso:
if state.creso_ever_changed:
Expand Down Expand Up @@ -790,155 +786,6 @@ cdef tzinfo _parse_with_format(
return tz


def _array_strptime_object_fallback(
ndarray[object] values,
str fmt,
bint exact=True,
errors="raise",
bint utc=False,
):

cdef:
Py_ssize_t i, n = len(values)
npy_datetimestruct dts
int64_t iresult
object val
tzinfo tz
bint is_raise = errors=="raise"
bint is_coerce = errors=="coerce"
bint iso_format = format_is_iso(fmt)
NPY_DATETIMEUNIT creso, out_bestunit, item_reso
int out_local = 0, out_tzoffset = 0
bint string_to_dts_succeeded = 0

assert is_raise or is_coerce

item_reso = NPY_DATETIMEUNIT.NPY_FR_GENERIC
format_regex, locale_time = _get_format_regex(fmt)

result = np.empty(n, dtype=object)

dts.us = dts.ps = dts.as = 0

for i in range(n):
val = values[i]
try:
if isinstance(val, str):
if len(val) == 0 or val in nat_strings:
result[i] = NaT
continue
elif checknull_with_nat_and_na(val):
result[i] = NaT
continue
elif PyDateTime_Check(val):
result[i] = Timestamp(val)
continue
elif PyDate_Check(val):
result[i] = Timestamp(val)
continue
elif cnp.is_datetime64_object(val):
result[i] = Timestamp(val)
continue
elif (
(is_integer_object(val) or is_float_object(val))
and (val != val or val == NPY_NAT)
):
result[i] = NaT
continue
else:
val = str(val)

if fmt == "ISO8601":
string_to_dts_succeeded = not string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False, None, False
)
elif iso_format:
string_to_dts_succeeded = not string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False, fmt, exact
)
if string_to_dts_succeeded:
# No error reported by string_to_dts, pick back up
# where we left off
creso = get_supported_reso(out_bestunit)
try:
value = npy_datetimestruct_to_datetime(creso, &dts)
except OverflowError as err:
raise OutOfBoundsDatetime(
f"Out of bounds nanosecond timestamp: {val}"
) from err
if out_local == 1:
tz = timezone(timedelta(minutes=out_tzoffset))
value = tz_localize_to_utc_single(
value, tz, ambiguous="raise", nonexistent=None, creso=creso
)
else:
tz = None
ts = Timestamp._from_value_and_reso(value, creso, tz)
result[i] = ts
continue

if parse_today_now(val, &iresult, utc, NPY_FR_ns):
result[i] = Timestamp(val)
continue

# Some ISO formats can't be parsed by string_to_dts
# For example, 6-digit YYYYMD. So, if there's an error, and a format
# was specified, then try the string-matching code below. If the format
# specified was 'ISO8601', then we need to error, because
# only string_to_dts handles mixed ISO8601 formats.
if not string_to_dts_succeeded and fmt == "ISO8601":
raise ValueError(f"Time data {val} is not ISO8601 format")

tz = _parse_with_format(
val, fmt, exact, format_regex, locale_time, &dts, &item_reso
)
try:
iresult = npy_datetimestruct_to_datetime(item_reso, &dts)
except OverflowError as err:
raise OutOfBoundsDatetime(
f"Out of bounds nanosecond timestamp: {val}"
) from err
if tz is not None:
iresult = tz_localize_to_utc_single(
iresult, tz, ambiguous="raise", nonexistent=None, creso=item_reso
)
ts = Timestamp._from_value_and_reso(iresult, item_reso, tz)
result[i] = ts

except (ValueError, OutOfBoundsDatetime) as ex:
ex.args = (
f"{str(ex)}, at position {i}. You might want to try:\n"
" - passing `format` if your strings have a consistent format;\n"
" - passing `format='ISO8601'` if your strings are "
"all ISO8601 but not necessarily in exactly the same format;\n"
" - passing `format='mixed'`, and the format will be "
"inferred for each element individually. "
"You might want to use `dayfirst` alongside this.",
)
if is_coerce:
result[i] = NaT
continue
else:
raise

import warnings

from pandas.util._exceptions import find_stack_level
warnings.warn(
"In a future version of pandas, parsing datetimes with mixed time "
"zones will raise an error unless `utc=True`. Please specify `utc=True` "
"to opt in to the new behaviour and silence this warning. "
"To create a `Series` with mixed offsets and `object` dtype, "
"please use `apply` and `datetime.datetime.strptime`",
FutureWarning,
stacklevel=find_stack_level(),
)

return result


class TimeRE(_TimeRE):
"""
Handle conversion from format directives to regexes.
Expand Down
Loading