Skip to content

Commit

Permalink
ENH/BUG: infer reso in array_strptime (#55805)
Browse files Browse the repository at this point in the history
* ENH/BUG: infer reso in array_strptime

* increase tolerance 1000x
  • Loading branch information
jbrockmendel authored Nov 3, 2023
1 parent dd7441b commit 3fd3756
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 13 deletions.
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ from numpy cimport int64_t
from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT


cdef bint parse_today_now(str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso)
cdef bint parse_today_now(
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso=*
)


cdef class DatetimeParseState:
Expand Down
40 changes: 28 additions & 12 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,16 @@ def _test_format_is_iso(f: str) -> bool:


cdef bint parse_today_now(
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso = False
):
# We delay this check for as long as possible
# because it catches relatively rare cases
cdef:
_Timestamp ts

if val == "now":
if infer_reso:
creso = NPY_DATETIMEUNIT.NPY_FR_us
if utc:
ts = <_Timestamp>Timestamp.utcnow()
iresult[0] = ts._as_creso(creso)._value
Expand All @@ -135,6 +137,8 @@ cdef bint parse_today_now(
iresult[0] = ts._as_creso(creso)._value
return True
elif val == "today":
if infer_reso:
creso = NPY_DATETIMEUNIT.NPY_FR_us
ts = <_Timestamp>Timestamp.today()
iresult[0] = ts._as_creso(creso)._value
return True
Expand Down Expand Up @@ -348,27 +352,33 @@ def array_strptime(
else:
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
tz_out = state.process_datetime(val, tz_out, utc)
if isinstance(val, _Timestamp):
val = (<_Timestamp>val)._as_creso(state.creso)
val = (<_Timestamp>val)._as_creso(creso)
iresult[i] = val.tz_localize(None)._value
else:
iresult[i] = pydatetime_to_dt64(
val.replace(tzinfo=None), &dts, reso=state.creso
val.replace(tzinfo=None), &dts, reso=creso
)
check_dts_bounds(&dts, state.creso)
check_dts_bounds(&dts, creso)
result_timezone[i] = val.tzinfo
continue
elif PyDate_Check(val):
item_reso = NPY_DATETIMEUNIT.NPY_FR_s
state.update_creso(item_reso)
iresult[i] = pydate_to_dt64(val, &dts, reso=state.creso)
check_dts_bounds(&dts, state.creso)
if infer_reso:
creso = state.creso
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)
continue
elif is_datetime64_object(val):
item_reso = get_supported_reso(get_datetime64_unit(val))
state.update_creso(item_reso)
iresult[i] = get_datetime64_nanos(val, state.creso)
if infer_reso:
creso = state.creso
iresult[i] = get_datetime64_nanos(val, creso)
continue
elif (
(is_integer_object(val) or is_float_object(val))
Expand All @@ -394,7 +404,9 @@ def array_strptime(
# where we left off
item_reso = get_supported_reso(out_bestunit)
state.update_creso(item_reso)
value = npy_datetimestruct_to_datetime(state.creso, &dts)
if infer_reso:
creso = state.creso
value = npy_datetimestruct_to_datetime(creso, &dts)
if out_local == 1:
# Store the out_tzoffset in seconds
# since we store the total_seconds of
Expand All @@ -404,12 +416,14 @@ def array_strptime(
out_local = 0
out_tzoffset = 0
iresult[i] = value
check_dts_bounds(&dts)
check_dts_bounds(&dts, creso)
continue

if parse_today_now(val, &iresult[i], utc, state.creso):
if parse_today_now(val, &iresult[i], utc, creso, infer_reso=infer_reso):
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
continue

# Some ISO formats can't be parsed by string_to_dts
Expand All @@ -424,8 +438,10 @@ def array_strptime(
val, fmt, exact, format_regex, locale_time, &dts, &item_reso
)
state.update_creso(item_reso)
iresult[i] = npy_datetimestruct_to_datetime(state.creso, &dts)
check_dts_bounds(&dts)
if infer_reso:
creso = state.creso
iresult[i] = npy_datetimestruct_to_datetime(creso, &dts)
check_dts_bounds(&dts, creso)
result_timezone[i] = tz

except (ValueError, OutOfBoundsDatetime) as ex:
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/tslibs/test_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,39 @@ def test_array_strptime_resolution_mixed(self, tz):
fmt = "ISO8601"
res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
tm.assert_numpy_array_equal(res, expected)

def test_array_strptime_resolution_todaynow(self):
    """Check resolution inference when "today" appears in the input.

    The same two values are parsed in both orders, so "today" is exercised
    both as the first and as the last element; in each case the result is
    expected to have dtype ``M8[us]``.
    """
    # specifically case where today/now is the *first* item
    vals = np.array(["today", np.datetime64("2017-01-01", "us")], dtype=object)
    parse_kwargs = {"fmt": "%Y-%m-%d", "utc": False, "creso": creso_infer}

    # Reference instant, captured before either parse so both can be compared
    # against it.
    reference = Timestamp("now").asm8
    forward, _ = array_strptime(vals, **parse_kwargs)
    backward, _ = array_strptime(vals[::-1], **parse_kwargs)

    # 1s is an arbitrary cutoff for call overhead; in local testing the
    # actual difference is about 250us
    cutoff = np.timedelta64(1, "s")
    known = vals[1]

    assert forward.dtype == "M8[us]"
    assert abs(forward[0] - reference) < cutoff
    assert forward[1] == known

    # The reversed parse runs second, so it is allowed twice the cutoff.
    assert backward.dtype == "M8[us]"
    assert abs(backward[1] - reference) < cutoff * 2
    assert backward[0] == known

def test_array_strptime_str_outside_nano_range(self):
    """Check that a string date in year 2401 parses to ``M8[s]`` when inferring.

    The same date is fed through an ISO8601 format and a non-ISO format,
    which take different paths, and both must produce the same array.
    """
    cases = [
        ("2401-09-15", "ISO8601"),
        # non-iso -> different path
        ("Sep 15, 2401", "%b %d, %Y"),
    ]
    for text, fmt in cases:
        vals = np.array([text], dtype=object)
        expected = np.array(["2401-09-15"], dtype="M8[s]")
        res, _ = array_strptime(vals, fmt=fmt, creso=creso_infer)
        tm.assert_numpy_array_equal(res, expected)

0 comments on commit 3fd3756

Please sign in to comment.