Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/BUG: infer reso in array_strptime #55805

Merged
merged 3 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/strptime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ from numpy cimport int64_t
from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT


cdef bint parse_today_now(str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso)
cdef bint parse_today_now(
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso=*
)


cdef class DatetimeParseState:
Expand Down
40 changes: 28 additions & 12 deletions pandas/_libs/tslibs/strptime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,16 @@ def _test_format_is_iso(f: str) -> bool:


cdef bint parse_today_now(
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso
str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso = False
):
# We delay this check for as long as possible
# because it catches relatively rare cases
cdef:
_Timestamp ts

if val == "now":
if infer_reso:
creso = NPY_DATETIMEUNIT.NPY_FR_us
if utc:
ts = <_Timestamp>Timestamp.utcnow()
iresult[0] = ts._as_creso(creso)._value
Expand All @@ -135,6 +137,8 @@ cdef bint parse_today_now(
iresult[0] = ts._as_creso(creso)._value
return True
elif val == "today":
if infer_reso:
creso = NPY_DATETIMEUNIT.NPY_FR_us
ts = <_Timestamp>Timestamp.today()
iresult[0] = ts._as_creso(creso)._value
return True
Expand Down Expand Up @@ -348,27 +352,33 @@ def array_strptime(
else:
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
tz_out = state.process_datetime(val, tz_out, utc)
if isinstance(val, _Timestamp):
val = (<_Timestamp>val)._as_creso(state.creso)
val = (<_Timestamp>val)._as_creso(creso)
iresult[i] = val.tz_localize(None)._value
else:
iresult[i] = pydatetime_to_dt64(
val.replace(tzinfo=None), &dts, reso=state.creso
val.replace(tzinfo=None), &dts, reso=creso
)
check_dts_bounds(&dts, state.creso)
check_dts_bounds(&dts, creso)
result_timezone[i] = val.tzinfo
continue
elif PyDate_Check(val):
item_reso = NPY_DATETIMEUNIT.NPY_FR_s
state.update_creso(item_reso)
iresult[i] = pydate_to_dt64(val, &dts, reso=state.creso)
check_dts_bounds(&dts, state.creso)
if infer_reso:
creso = state.creso
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)
continue
elif is_datetime64_object(val):
item_reso = get_supported_reso(get_datetime64_unit(val))
state.update_creso(item_reso)
iresult[i] = get_datetime64_nanos(val, state.creso)
if infer_reso:
creso = state.creso
iresult[i] = get_datetime64_nanos(val, creso)
continue
elif (
(is_integer_object(val) or is_float_object(val))
Expand All @@ -394,7 +404,9 @@ def array_strptime(
# where we left off
item_reso = get_supported_reso(out_bestunit)
state.update_creso(item_reso)
value = npy_datetimestruct_to_datetime(state.creso, &dts)
if infer_reso:
creso = state.creso
value = npy_datetimestruct_to_datetime(creso, &dts)
if out_local == 1:
# Store the out_tzoffset in seconds
# since we store the total_seconds of
Expand All @@ -404,12 +416,14 @@ def array_strptime(
out_local = 0
out_tzoffset = 0
iresult[i] = value
check_dts_bounds(&dts)
check_dts_bounds(&dts, creso)
continue

if parse_today_now(val, &iresult[i], utc, state.creso):
if parse_today_now(val, &iresult[i], utc, creso, infer_reso=infer_reso):
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
continue

# Some ISO formats can't be parsed by string_to_dts
Expand All @@ -424,8 +438,10 @@ def array_strptime(
val, fmt, exact, format_regex, locale_time, &dts, &item_reso
)
state.update_creso(item_reso)
iresult[i] = npy_datetimestruct_to_datetime(state.creso, &dts)
check_dts_bounds(&dts)
if infer_reso:
creso = state.creso
iresult[i] = npy_datetimestruct_to_datetime(creso, &dts)
check_dts_bounds(&dts, creso)
result_timezone[i] = tz

except (ValueError, OutOfBoundsDatetime) as ex:
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/tslibs/test_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,39 @@ def test_array_strptime_resolution_mixed(self, tz):
fmt = "ISO8601"
res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
tm.assert_numpy_array_equal(res, expected)

def test_array_strptime_resolution_todaynow(self):
# specifically case where today/now is the *first* item
vals = np.array(["today", np.datetime64("2017-01-01", "us")], dtype=object)

now = Timestamp("now").asm8
res, _ = array_strptime(vals, fmt="%Y-%m-%d", utc=False, creso=creso_infer)
res2, _ = array_strptime(
vals[::-1], fmt="%Y-%m-%d", utc=False, creso=creso_infer
)

# 1s is an arbitrary cutoff for call overhead; in local testing the
# actual difference is about 250us
tolerance = np.timedelta64(1, "s")

assert res.dtype == "M8[us]"
assert abs(res[0] - now) < tolerance
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be OK with skipping the comparison to a separate "now" to avoid occasional near-miss failures. I think passing "today" and "now" through array_strptime and checking that "us" resolution is returned is good enough.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you be happier with a bigger tolerance? I can imagine regressions here that cause this to miss by a huge amount, but it is hard to imagine regressions that miss by a small amount.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, a larger tolerance would give me more peace.

assert res[1] == vals[1]

assert res2.dtype == "M8[us]"
assert abs(res2[1] - now) < tolerance * 2
assert res2[0] == vals[1]

def test_array_strptime_str_outside_nano_range(self):
vals = np.array(["2401-09-15"], dtype=object)
expected = np.array(["2401-09-15"], dtype="M8[s]")
fmt = "ISO8601"
res, _ = array_strptime(vals, fmt=fmt, creso=creso_infer)
tm.assert_numpy_array_equal(res, expected)

# non-iso -> different path
vals2 = np.array(["Sep 15, 2401"], dtype=object)
expected2 = np.array(["2401-09-15"], dtype="M8[s]")
fmt2 = "%b %d, %Y"
res2, _ = array_strptime(vals2, fmt=fmt2, creso=creso_infer)
tm.assert_numpy_array_equal(res2, expected2)
Loading