Skip to content

Commit

Permalink
Merge branch 'main' into enh-infer-array_to_datetime
Browse files · Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Nov 1, 2023
2 parents 6ec542f + d1b2c44 commit 17865ad
Show file tree
Hide file tree
Showing 12 changed files with 246 additions and 50 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ Timedelta
Timezones
^^^^^^^^^
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
- Bug in :class:`Timestamp` construction with an ambiguous value and a ``pytz`` timezone failing to raise ``pytz.AmbiguousTimeError`` (:issue:`55657`)
-

Numeric
Expand Down
28 changes: 15 additions & 13 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,12 @@ cpdef array_to_datetime(
iresult[i] = _ts.value

tz = _ts.tzinfo
if tz is not None:
if _ts.value == NPY_NAT:
# e.g. "NaT" string or empty string, we do not consider
# this as either tzaware or tznaive. See
# test_to_datetime_with_empty_str_utc_false_format_mixed
pass
elif tz is not None:
# dateutil timezone objects cannot be hashed, so
# store the UTC offsets in seconds instead
nsecs = tz.utcoffset(None).total_seconds()
Expand Down Expand Up @@ -667,7 +672,6 @@ cdef _array_to_datetime_object(
# 1) NaT or NaT-like values
# 2) datetime strings, which we return as datetime.datetime
# 3) special strings - "now" & "today"
unique_timezones = set()
for i in range(n):
# Analogous to: val = values[i]
val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
Expand Down Expand Up @@ -697,7 +701,6 @@ cdef _array_to_datetime_object(
tzinfo=tsobj.tzinfo,
fold=tsobj.fold,
)
unique_timezones.add(tsobj.tzinfo)

except (ValueError, OverflowError) as ex:
ex.args = (f"{ex}, at position {i}", )
Expand All @@ -715,16 +718,15 @@ cdef _array_to_datetime_object(

cnp.PyArray_MultiIter_NEXT(mi)

if len(unique_timezones) > 1:
warnings.warn(
"In a future version of pandas, parsing datetimes with mixed time "
"zones will raise an error unless `utc=True`. "
"Please specify `utc=True` to opt in to the new behaviour "
"and silence this warning. To create a `Series` with mixed offsets and "
"`object` dtype, please use `apply` and `datetime.datetime.strptime`",
FutureWarning,
stacklevel=find_stack_level(),
)
warnings.warn(
"In a future version of pandas, parsing datetimes with mixed time "
"zones will raise an error unless `utc=True`. "
"Please specify `utc=True` to opt in to the new behaviour "
"and silence this warning. To create a `Series` with mixed offsets and "
"`object` dtype, please use `apply` and `datetime.datetime.strptime`",
FutureWarning,
stacklevel=find_stack_level(),
)
return oresult_nd, None


Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,8 @@ cdef datetime _localize_pydatetime(datetime dt, tzinfo tz):
"""
try:
# datetime.replace with pytz may be incorrect result
return tz.localize(dt)
# TODO: try to respect `fold` attribute
return tz.localize(dt, is_dst=None)
except AttributeError:
return dt.replace(tzinfo=tz)

Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ cdef class DatetimeParseState:
cdef:
bint found_tz
bint found_naive
bint creso_changed
bint creso_ever_changed
NPY_DATETIMEUNIT creso

cdef bint update_creso(self, NPY_DATETIMEUNIT creso)
cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert)
cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept
99 changes: 80 additions & 19 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ from numpy cimport (

from pandas._libs.missing cimport checknull_with_nat_and_na
from pandas._libs.tslibs.conversion cimport get_datetime64_nanos
from pandas._libs.tslibs.dtypes cimport (
get_supported_reso,
npy_unit_to_abbrev,
)
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_nat_strings as nat_strings,
Expand All @@ -57,6 +61,7 @@ from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
check_dts_bounds,
get_datetime64_unit,
import_pandas_datetime,
npy_datetimestruct,
npy_datetimestruct_to_datetime,
Expand Down Expand Up @@ -232,19 +237,19 @@ cdef _get_format_regex(str fmt):


cdef class DatetimeParseState:
def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_GENERIC):
def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns):
self.found_tz = False
self.found_naive = False
self.creso = creso
self.creso_changed = False
self.creso_ever_changed = False

cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso):
# Return a bool to indicate we bumped to a higher resolution
cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept:
# Return a bool indicating whether we bumped to a higher resolution
if self.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
self.creso = item_reso
elif item_reso > self.creso:
self.creso = item_reso
self.creso_changed = True
self.creso_ever_changed = True
return True
return False

Expand Down Expand Up @@ -280,6 +285,7 @@ def array_strptime(
bint exact=True,
errors="raise",
bint utc=False,
NPY_DATETIMEUNIT creso=NPY_FR_ns,
):
"""
Calculates the datetime structs represented by the passed array of strings
Expand All @@ -290,6 +296,8 @@ def array_strptime(
fmt : string-like regex
exact : matches must be exact if True, search if False
errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
creso : NPY_DATETIMEUNIT, default NPY_FR_ns
Set to NPY_FR_GENERIC to infer a resolution.
"""

cdef:
Expand All @@ -303,17 +311,22 @@ def array_strptime(
bint is_coerce = errors=="coerce"
tzinfo tz_out = None
bint iso_format = format_is_iso(fmt)
NPY_DATETIMEUNIT out_bestunit
NPY_DATETIMEUNIT out_bestunit, item_reso
int out_local = 0, out_tzoffset = 0
bint string_to_dts_succeeded = 0
DatetimeParseState state = DatetimeParseState()
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
DatetimeParseState state = DatetimeParseState(creso)

assert is_raise or is_ignore or is_coerce

_validate_fmt(fmt)
format_regex, locale_time = _get_format_regex(fmt)

result = np.empty(n, dtype="M8[ns]")
if infer_reso:
abbrev = "ns"
else:
abbrev = npy_unit_to_abbrev(creso)
result = np.empty(n, dtype=f"M8[{abbrev}]")
iresult = result.view("i8")
result_timezone = np.empty(n, dtype="object")

Expand All @@ -330,20 +343,32 @@ def array_strptime(
iresult[i] = NPY_NAT
continue
elif PyDateTime_Check(val):
if isinstance(val, _Timestamp):
item_reso = val._creso
else:
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
tz_out = state.process_datetime(val, tz_out, utc)
if isinstance(val, _Timestamp):
iresult[i] = val.tz_localize(None).as_unit("ns")._value
val = (<_Timestamp>val)._as_creso(state.creso)
iresult[i] = val.tz_localize(None)._value
else:
iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts)
check_dts_bounds(&dts)
iresult[i] = pydatetime_to_dt64(
val.replace(tzinfo=None), &dts, reso=state.creso
)
check_dts_bounds(&dts, state.creso)
result_timezone[i] = val.tzinfo
continue
elif PyDate_Check(val):
iresult[i] = pydate_to_dt64(val, &dts)
check_dts_bounds(&dts)
item_reso = NPY_DATETIMEUNIT.NPY_FR_s
state.update_creso(item_reso)
iresult[i] = pydate_to_dt64(val, &dts, reso=state.creso)
check_dts_bounds(&dts, state.creso)
continue
elif is_datetime64_object(val):
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
item_reso = get_supported_reso(get_datetime64_unit(val))
state.update_creso(item_reso)
iresult[i] = get_datetime64_nanos(val, state.creso)
continue
elif (
(is_integer_object(val) or is_float_object(val))
Expand All @@ -367,7 +392,9 @@ def array_strptime(
if string_to_dts_succeeded:
# No error reported by string_to_dts, pick back up
# where we left off
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
item_reso = get_supported_reso(out_bestunit)
state.update_creso(item_reso)
value = npy_datetimestruct_to_datetime(state.creso, &dts)
if out_local == 1:
# Store the out_tzoffset in seconds
# since we store the total_seconds of
Expand All @@ -380,7 +407,9 @@ def array_strptime(
check_dts_bounds(&dts)
continue

if parse_today_now(val, &iresult[i], utc, NPY_FR_ns):
if parse_today_now(val, &iresult[i], utc, state.creso):
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
continue

# Some ISO formats can't be parsed by string_to_dts
Expand All @@ -392,9 +421,10 @@ def array_strptime(
raise ValueError(f"Time data {val} is not ISO8601 format")

tz = _parse_with_format(
val, fmt, exact, format_regex, locale_time, &dts
val, fmt, exact, format_regex, locale_time, &dts, &item_reso
)
iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
state.update_creso(item_reso)
iresult[i] = npy_datetimestruct_to_datetime(state.creso, &dts)
check_dts_bounds(&dts)
result_timezone[i] = tz

Expand All @@ -415,11 +445,34 @@ def array_strptime(
raise
return values, []

if infer_reso:
if state.creso_ever_changed:
# We encountered mismatched resolutions, need to re-parse with
# the correct one.
return array_strptime(
values,
fmt=fmt,
exact=exact,
errors=errors,
utc=utc,
creso=state.creso,
)

# Otherwise we can use the single reso that we encountered and avoid
# a second pass.
abbrev = npy_unit_to_abbrev(state.creso)
result = iresult.base.view(f"M8[{abbrev}]")
return result, result_timezone.base


cdef tzinfo _parse_with_format(
str val, str fmt, bint exact, format_regex, locale_time, npy_datetimestruct* dts
str val,
str fmt,
bint exact,
format_regex,
locale_time,
npy_datetimestruct* dts,
NPY_DATETIMEUNIT* item_reso,
):
# Based on https://github.com/python/cpython/blob/main/Lib/_strptime.py#L293
cdef:
Expand Down Expand Up @@ -453,6 +506,8 @@ cdef tzinfo _parse_with_format(
f"time data \"{val}\" doesn't match format \"{fmt}\""
)

item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_s

iso_year = -1
year = 1900
month = day = 1
Expand Down Expand Up @@ -539,6 +594,12 @@ cdef tzinfo _parse_with_format(
elif parse_code == 10:
# e.g. val='10:10:10.100'; fmt='%H:%M:%S.%f'
s = found_dict["f"]
if len(s) <= 3:
item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ms
elif len(s) <= 6:
item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us
else:
item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns
# Pad to always return nanoseconds
s += "0" * (9 - len(s))
us = long(s)
Expand Down
17 changes: 10 additions & 7 deletions pandas/_libs/tslibs/tzconversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -332,13 +332,16 @@ timedelta-like}
# Shift the delta_idx by if the UTC offset of
# the target tz is greater than 0 and we're moving forward
# or vice versa
first_delta = info.deltas[0]
if (shift_forward or shift_delta > 0) and first_delta > 0:
delta_idx_offset = 1
elif (shift_backward or shift_delta < 0) and first_delta < 0:
delta_idx_offset = 1
else:
delta_idx_offset = 0
# TODO: delta_idx_offset and info.deltas are needed for zoneinfo timezones,
# but are not applicable for all timezones. Setting the former to 0 and
# length checking the latter avoids UB, but this could use a larger refactor
delta_idx_offset = 0
if len(info.deltas):
first_delta = info.deltas[0]
if (shift_forward or shift_delta > 0) and first_delta > 0:
delta_idx_offset = 1
elif (shift_backward or shift_delta < 0) and first_delta < 0:
delta_idx_offset = 1

for i in range(n):
val = vals[i]
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,12 +325,12 @@ class Index(IndexOpsMixin, PandasObject):
Parameters
----------
data : array-like (1-dimensional)
dtype : NumPy dtype (default: object)
If dtype is None, we find the dtype that best fits the data.
If an actual dtype is provided, we coerce to that dtype if it's safe.
Otherwise, an error will be raised.
copy : bool
Make a copy of input ndarray.
dtype : str, numpy.dtype, or ExtensionDtype, optional
Data type for the output Index. If not specified, this will be
inferred from `data`.
See the :ref:`user guide <basics.dtypes>` for more usages.
copy : bool, default False
Copy input data.
name : object
Name to be stored in the index.
tupleize_cols : bool (default: True)
Expand Down
5 changes: 4 additions & 1 deletion pandas/plotting/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,10 @@ def __call__(self, *args, **kwargs):
return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs)

if kind not in self._all_kinds:
raise ValueError(f"{kind} is not a valid plot kind")
raise ValueError(
f"{kind} is not a valid plot kind "
f"Valid plot kinds: {self._all_kinds}"
)

# The original data structured can be transformed before passed to the
# backend. For example, for DataFrame is common to set the index as the
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/indexes/datetimes/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,36 @@ def test_dti_convert_datetime_list(self, tzstr):
dr2 = DatetimeIndex(list(dr), name="foo", freq="D")
tm.assert_index_equal(dr, dr2)

def test_dti_ambiguous_matches_timestamp(self):
    # GH#47471 check that we get the same raising behavior in the DTI
    # constructor and Timestamp constructor.
    # The wall time below falls in the repeated hour of the US/Eastern
    # DST fall-back on 2013-11-03, so localizing it is ambiguous and is
    # expected to raise pytz.AmbiguousTimeError for every construction path.
    dtstr = "2013-11-03 01:59:59.999999"
    dtobj = Timestamp(dtstr).to_pydatetime()

    # pytz timezone: string input, datetime input, and both via DatetimeIndex
    # must all raise consistently.
    tz = pytz.timezone("US/Eastern")
    with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
        Timestamp(dtstr, tz=tz)
    with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
        Timestamp(dtobj, tz=tz)
    with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
        DatetimeIndex([dtstr], tz=tz)
    with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
        DatetimeIndex([dtobj], tz=tz)

    # dateutil timezone (gettz): same expectations, except the known
    # inconsistency documented below.
    tz2 = gettz("US/Eastern")
    with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
        Timestamp(dtstr, tz=tz2)
    # FIXME: The Timestamp constructor here behaves differently than all
    #  the other cases bc with dateutil/zoneinfo tzinfos we implicitly
    #  get fold=0. Having this raise is not important, but having the
    #  behavior be consistent across cases is.
    # with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
    #     Timestamp(dtobj, tz=tz2)
    with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
        DatetimeIndex([dtstr], tz=tz2)
    with pytest.raises(pytz.AmbiguousTimeError, match=dtstr):
        DatetimeIndex([dtobj], tz=tz2)

@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
def test_dti_constructor_with_non_nano_dtype(self, tz):
# GH#55756, GH#54620
Expand Down
Loading

0 comments on commit 17865ad

Please sign in to comment.