Skip to content

Commit

Permalink
DEPR: disallow parsing datetimes with mixed time zones unless `utc=True` (pandas-dev#57275)
Browse files Browse the repository at this point in the history

* correct def _array_to_datetime_object, _array_strptime_object_fallback, fix tests

* fix tests

* correct to_datetime docs, add a note to v3.0.0

* correct to_datetime docs

* fix failures in benchmarks/inference.py

* fix pre-commit error

* correct examples in to_datetime docs

* correct to_datetime docs

* delete time_different_offset from benchmarks/inference.py as redundant

* correct v3.0.0

* removed _array_to_datetime_object and _array_strptime_object_fallback

* correct to_datetime docstring, roll back changes in test_suppress_error_output

* fix pre-commit error

* correct test_to_datetime_mixed_awareness_mixed_types, and a comment in array_to_datetime
  • Loading branch information
natmokval authored and pmhatre1 committed May 7, 2024
1 parent e4d0796 commit 5914d8e
Show file tree
Hide file tree
Showing 11 changed files with 124 additions and 485 deletions.
5 changes: 1 addition & 4 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def time_same_offset(self):
to_datetime(self.same_offset)

def time_different_offset(self):
to_datetime(self.diff_offset)
to_datetime(self.diff_offset, utc=True)


class ToDatetimeFormatQuarters:
Expand Down Expand Up @@ -231,9 +231,6 @@ def time_no_exact(self):
def time_same_offset(self):
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z")

def time_different_offset(self):
to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z")

def time_same_offset_to_utc(self):
to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True)

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ Removal of prior version deprecations/changes
- All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`)
- All arguments in :meth:`Series.to_dict` are now keyword only (:issue:`56493`)
- Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`)
- Enforced deprecation disallowing parsing datetimes with mixed time zones unless user passes ``utc=True`` to :func:`to_datetime` (:issue:`57275`)
- Enforced silent-downcasting deprecation for :ref:`all relevant methods <whatsnew_220.silent_downcasting>` (:issue:`54710`)
- In :meth:`DataFrame.stack`, the default value of ``future_stack`` is now ``True``; specifying ``False`` will raise a ``FutureWarning`` (:issue:`55448`)
- Removed :meth:`DateOffset.is_anchored` and :meth:`offsets.Tick.is_anchored` (:issue:`56594`)
Expand Down
122 changes: 7 additions & 115 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ from datetime import timezone
from cpython.datetime cimport (
PyDate_Check,
PyDateTime_Check,
datetime,
import_datetime,
timedelta,
tzinfo,
Expand Down Expand Up @@ -590,15 +589,17 @@ cpdef array_to_datetime(
return values, None

if seen_datetime_offset and not utc_convert:
# GH#17697
# GH#17697, GH#57275
# 1) If all the offsets are equal, return one offset for
# the parsed dates to (maybe) pass to DatetimeIndex
# 2) If the offsets are different, then force the parsing down the
# object path where an array of datetimes
# (with individual dateutil.tzoffsets) are returned
# 2) If the offsets are different, then do not force the parsing
# and raise a ValueError: "cannot parse datetimes with
# mixed time zones unless `utc=True`" instead
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
raise ValueError(
"cannot parse datetimes with mixed time zones unless `utc=True`"
)
elif state.found_naive or state.found_other:
# e.g. test_to_datetime_mixed_awareness_mixed_types
raise ValueError("Cannot mix tz-aware with tz-naive values")
Expand Down Expand Up @@ -647,115 +648,6 @@ cpdef array_to_datetime(
return result, tz_out


@cython.wraparound(False)
@cython.boundscheck(False)
cdef _array_to_datetime_object(
ndarray[object] values,
str errors,
bint dayfirst=False,
bint yearfirst=False,
):
"""
Fall back function for array_to_datetime
Attempts to parse datetime strings with dateutil to return an array
of datetime objects
Parameters
----------
values : ndarray[object]
date-like objects to convert
errors : str
error behavior when parsing
dayfirst : bool, default False
dayfirst parsing behavior when encountering datetime strings
yearfirst : bool, default False
yearfirst parsing behavior when encountering datetime strings
Returns
-------
np.ndarray[object]
Literal[None]
"""
cdef:
Py_ssize_t i, n = values.size
object val
bint is_coerce = errors == "coerce"
bint is_raise = errors == "raise"
ndarray oresult_nd
ndarray[object] oresult
npy_datetimestruct dts
cnp.broadcast mi
_TSObject tsobj

assert is_raise or is_coerce

oresult_nd = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
mi = cnp.PyArray_MultiIterNew2(oresult_nd, values)
oresult = oresult_nd.ravel()

# We return an object array and only attempt to parse:
# 1) NaT or NaT-like values
# 2) datetime strings, which we return as datetime.datetime
# 3) special strings - "now" & "today"
for i in range(n):
# Analogous to: val = values[i]
val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]

if checknull_with_nat_and_na(val) or PyDateTime_Check(val):
# GH 25978. No need to parse NaT-like or datetime-like vals
oresult[i] = val
elif isinstance(val, str):
if type(val) is not str:
# GH#32264 np.str_ objects
val = str(val)

if len(val) == 0 or val in nat_strings:
oresult[i] = "NaT"
cnp.PyArray_MultiIter_NEXT(mi)
continue

try:
tsobj = convert_str_to_tsobject(
val, None, dayfirst=dayfirst, yearfirst=yearfirst
)
tsobj.ensure_reso(NPY_FR_ns, val)

dts = tsobj.dts
oresult[i] = datetime(
dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us,
tzinfo=tsobj.tzinfo,
fold=tsobj.fold,
)

except (ValueError, OverflowError) as ex:
ex.args = (f"{ex}, at position {i}", )
if is_coerce:
oresult[i] = <object>NaT
cnp.PyArray_MultiIter_NEXT(mi)
continue
if is_raise:
raise
return values, None
else:
if is_raise:
raise
return values, None

cnp.PyArray_MultiIter_NEXT(mi)

warnings.warn(
"In a future version of pandas, parsing datetimes with mixed time "
"zones will raise an error unless `utc=True`. "
"Please specify `utc=True` to opt in to the new behaviour "
"and silence this warning. To create a `Series` with mixed offsets and "
"`object` dtype, please use `apply` and `datetime.datetime.strptime`",
FutureWarning,
stacklevel=find_stack_level(),
)
return oresult_nd, None


def array_to_datetime_with_tz(
ndarray values, tzinfo tz, bint dayfirst, bint yearfirst, NPY_DATETIMEUNIT creso
):
Expand Down
167 changes: 7 additions & 160 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ from pandas._libs.tslibs.dtypes cimport (
)
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.np_datetime cimport (
Expand Down Expand Up @@ -503,20 +502,18 @@ def array_strptime(
if seen_datetime_offset and not utc:
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets or (state.found_naive or state.found_other):
result2 = _array_strptime_object_fallback(
values, fmt=fmt, exact=exact, errors=errors, utc=utc
raise ValueError(
"cannot parse datetimes with mixed time zones unless `utc=True`"
)
return result2, None
elif tz_out is not None:
# GH#55693
tz_offset = out_tzoffset_vals.pop()
tz_out2 = timezone(timedelta(seconds=tz_offset))
if not tz_compare(tz_out, tz_out2):
# e.g. test_to_datetime_mixed_offsets_with_utc_false_deprecated
result2 = _array_strptime_object_fallback(
values, fmt=fmt, exact=exact, errors=errors, utc=utc
# e.g. test_to_datetime_mixed_offsets_with_utc_false_removed
raise ValueError(
"cannot parse datetimes with mixed time zones unless `utc=True`"
)
return result2, None
# e.g. test_guess_datetime_format_with_parseable_formats
else:
# e.g. test_to_datetime_iso8601_with_timezone_valid
Expand All @@ -525,10 +522,9 @@ def array_strptime(
elif not utc:
if tz_out and (state.found_other or state.found_naive_str):
# found_other indicates a tz-naive int, float, dt64, or date
result2 = _array_strptime_object_fallback(
values, fmt=fmt, exact=exact, errors=errors, utc=utc
raise ValueError(
"cannot parse datetimes with mixed time zones unless `utc=True`"
)
return result2, None

if infer_reso:
if state.creso_ever_changed:
Expand Down Expand Up @@ -790,155 +786,6 @@ cdef tzinfo _parse_with_format(
return tz


def _array_strptime_object_fallback(
ndarray[object] values,
str fmt,
bint exact=True,
errors="raise",
bint utc=False,
):

cdef:
Py_ssize_t i, n = len(values)
npy_datetimestruct dts
int64_t iresult
object val
tzinfo tz
bint is_raise = errors=="raise"
bint is_coerce = errors=="coerce"
bint iso_format = format_is_iso(fmt)
NPY_DATETIMEUNIT creso, out_bestunit, item_reso
int out_local = 0, out_tzoffset = 0
bint string_to_dts_succeeded = 0

assert is_raise or is_coerce

item_reso = NPY_DATETIMEUNIT.NPY_FR_GENERIC
format_regex, locale_time = _get_format_regex(fmt)

result = np.empty(n, dtype=object)

dts.us = dts.ps = dts.as = 0

for i in range(n):
val = values[i]
try:
if isinstance(val, str):
if len(val) == 0 or val in nat_strings:
result[i] = NaT
continue
elif checknull_with_nat_and_na(val):
result[i] = NaT
continue
elif PyDateTime_Check(val):
result[i] = Timestamp(val)
continue
elif PyDate_Check(val):
result[i] = Timestamp(val)
continue
elif cnp.is_datetime64_object(val):
result[i] = Timestamp(val)
continue
elif (
(is_integer_object(val) or is_float_object(val))
and (val != val or val == NPY_NAT)
):
result[i] = NaT
continue
else:
val = str(val)

if fmt == "ISO8601":
string_to_dts_succeeded = not string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False, None, False
)
elif iso_format:
string_to_dts_succeeded = not string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False, fmt, exact
)
if string_to_dts_succeeded:
# No error reported by string_to_dts, pick back up
# where we left off
creso = get_supported_reso(out_bestunit)
try:
value = npy_datetimestruct_to_datetime(creso, &dts)
except OverflowError as err:
raise OutOfBoundsDatetime(
f"Out of bounds nanosecond timestamp: {val}"
) from err
if out_local == 1:
tz = timezone(timedelta(minutes=out_tzoffset))
value = tz_localize_to_utc_single(
value, tz, ambiguous="raise", nonexistent=None, creso=creso
)
else:
tz = None
ts = Timestamp._from_value_and_reso(value, creso, tz)
result[i] = ts
continue

if parse_today_now(val, &iresult, utc, NPY_FR_ns):
result[i] = Timestamp(val)
continue

# Some ISO formats can't be parsed by string_to_dts
# For example, 6-digit YYYYMD. So, if there's an error, and a format
# was specified, then try the string-matching code below. If the format
# specified was 'ISO8601', then we need to error, because
# only string_to_dts handles mixed ISO8601 formats.
if not string_to_dts_succeeded and fmt == "ISO8601":
raise ValueError(f"Time data {val} is not ISO8601 format")

tz = _parse_with_format(
val, fmt, exact, format_regex, locale_time, &dts, &item_reso
)
try:
iresult = npy_datetimestruct_to_datetime(item_reso, &dts)
except OverflowError as err:
raise OutOfBoundsDatetime(
f"Out of bounds nanosecond timestamp: {val}"
) from err
if tz is not None:
iresult = tz_localize_to_utc_single(
iresult, tz, ambiguous="raise", nonexistent=None, creso=item_reso
)
ts = Timestamp._from_value_and_reso(iresult, item_reso, tz)
result[i] = ts

except (ValueError, OutOfBoundsDatetime) as ex:
ex.args = (
f"{str(ex)}, at position {i}. You might want to try:\n"
" - passing `format` if your strings have a consistent format;\n"
" - passing `format='ISO8601'` if your strings are "
"all ISO8601 but not necessarily in exactly the same format;\n"
" - passing `format='mixed'`, and the format will be "
"inferred for each element individually. "
"You might want to use `dayfirst` alongside this.",
)
if is_coerce:
result[i] = NaT
continue
else:
raise

import warnings

from pandas.util._exceptions import find_stack_level
warnings.warn(
"In a future version of pandas, parsing datetimes with mixed time "
"zones will raise an error unless `utc=True`. Please specify `utc=True` "
"to opt in to the new behaviour and silence this warning. "
"To create a `Series` with mixed offsets and `object` dtype, "
"please use `apply` and `datetime.datetime.strptime`",
FutureWarning,
stacklevel=find_stack_level(),
)

return result


class TimeRE(_TimeRE):
"""
Handle conversion from format directives to regexes.
Expand Down
Loading

0 comments on commit 5914d8e

Please sign in to comment.