From fdab5f69497ea67601853f35af3777c200bf6ad7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 2 Nov 2023 18:08:48 -0700 Subject: [PATCH 1/2] ENH/BUG: infer reso in array_strptime --- pandas/_libs/tslibs/strptime.pxd | 4 ++- pandas/_libs/tslibs/strptime.pyx | 40 +++++++++++++++++++--------- pandas/tests/tslibs/test_strptime.py | 36 +++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index 64db2b59dfcff..fd8cafeaefb27 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -7,7 +7,9 @@ from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT -cdef bint parse_today_now(str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso) +cdef bint parse_today_now( + str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso=* +) cdef class DatetimeParseState: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 6b744ce4c8cdb..fc44d3aad4bed 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -117,7 +117,7 @@ def _test_format_is_iso(f: str) -> bool: cdef bint parse_today_now( - str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso + str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso = False ): # We delay this check for as long as possible # because it catches relatively rare cases @@ -125,6 +125,8 @@ cdef bint parse_today_now( _Timestamp ts if val == "now": + if infer_reso: + creso = NPY_DATETIMEUNIT.NPY_FR_us if utc: ts = <_Timestamp>Timestamp.utcnow() iresult[0] = ts._as_creso(creso)._value @@ -135,6 +137,8 @@ cdef bint parse_today_now( iresult[0] = ts._as_creso(creso)._value return True elif val == "today": + if infer_reso: + creso = NPY_DATETIMEUNIT.NPY_FR_us ts = <_Timestamp>Timestamp.today() iresult[0] = ts._as_creso(creso)._value return True @@ -348,27 +352,33 @@ def array_strptime( else: item_reso = NPY_DATETIMEUNIT.NPY_FR_us state.update_creso(item_reso) + if infer_reso: + creso = state.creso tz_out = state.process_datetime(val, tz_out, utc) if isinstance(val, _Timestamp): - val = (<_Timestamp>val)._as_creso(state.creso) + val = (<_Timestamp>val)._as_creso(creso) iresult[i] = val.tz_localize(None)._value else: iresult[i] = pydatetime_to_dt64( - val.replace(tzinfo=None), &dts, reso=state.creso + val.replace(tzinfo=None), &dts, reso=creso ) - check_dts_bounds(&dts, state.creso) + check_dts_bounds(&dts, creso) result_timezone[i] = val.tzinfo continue elif PyDate_Check(val): item_reso = NPY_DATETIMEUNIT.NPY_FR_s state.update_creso(item_reso) - iresult[i] = pydate_to_dt64(val, &dts, reso=state.creso) - check_dts_bounds(&dts, state.creso) + if infer_reso: + creso = state.creso + iresult[i] = pydate_to_dt64(val, &dts, reso=creso) + check_dts_bounds(&dts, creso) continue elif is_datetime64_object(val): item_reso = get_supported_reso(get_datetime64_unit(val)) state.update_creso(item_reso) - iresult[i] = get_datetime64_nanos(val, state.creso) + if infer_reso: + creso = state.creso + iresult[i] = get_datetime64_nanos(val, creso) continue elif ( (is_integer_object(val) or is_float_object(val)) @@ -394,7 +404,9 @@ def array_strptime( # where we left off item_reso = get_supported_reso(out_bestunit) state.update_creso(item_reso) - value = npy_datetimestruct_to_datetime(state.creso, &dts) + if infer_reso: + creso = state.creso + value = npy_datetimestruct_to_datetime(creso, &dts) if out_local == 1: # Store the out_tzoffset in seconds # since we store the total_seconds of @@ -404,12 +416,14 @@ def array_strptime( out_local = 0 out_tzoffset = 0 iresult[i] = value - check_dts_bounds(&dts) + check_dts_bounds(&dts, creso) continue - if parse_today_now(val, &iresult[i], utc, state.creso): + if parse_today_now(val, &iresult[i], utc, creso, infer_reso=infer_reso): item_reso = NPY_DATETIMEUNIT.NPY_FR_us state.update_creso(item_reso) + if infer_reso: + creso = state.creso continue # Some ISO formats can't be parsed by string_to_dts @@ -424,8 +438,10 @@ def array_strptime( val, fmt, exact, format_regex, locale_time, &dts, &item_reso ) state.update_creso(item_reso) - iresult[i] = npy_datetimestruct_to_datetime(state.creso, &dts) - check_dts_bounds(&dts) + if infer_reso: + creso = state.creso + iresult[i] = npy_datetimestruct_to_datetime(creso, &dts) + check_dts_bounds(&dts, creso) result_timezone[i] = tz except (ValueError, OutOfBoundsDatetime) as ex: diff --git a/pandas/tests/tslibs/test_strptime.py b/pandas/tests/tslibs/test_strptime.py index 0992eecf0eedd..27bcf80ed53b6 100644 --- a/pandas/tests/tslibs/test_strptime.py +++ b/pandas/tests/tslibs/test_strptime.py @@ -59,3 +59,39 @@ def test_array_strptime_resolution_mixed(self, tz): fmt = "ISO8601" res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) tm.assert_numpy_array_equal(res, expected) + + def test_array_strptime_resolution_todaynow(self): + # specifically case where today/now is the *first* item + vals = np.array(["today", np.datetime64("2017-01-01", "us")], dtype=object) + + now = Timestamp("now").asm8 + res, _ = array_strptime(vals, fmt="%Y-%m-%d", utc=False, creso=creso_infer) + res2, _ = array_strptime( + vals[::-1], fmt="%Y-%m-%d", utc=False, creso=creso_infer + ) + + # 1ms is an arbitrary cutoff for call overhead; in local testing the + # actual difference is about 250us + tolerance = np.timedelta64(1, "ms") + + assert res.dtype == "M8[us]" + assert abs(res[0] - now) < tolerance + assert res[1] == vals[1] + + assert res2.dtype == "M8[us]" + assert abs(res2[1] - now) < tolerance * 2 + assert res2[0] == vals[1] + + def test_array_strptime_str_outside_nano_range(self): + vals = np.array(["2401-09-15"], dtype=object) + expected = np.array(["2401-09-15"], dtype="M8[s]") + fmt = "ISO8601" + res, _ = array_strptime(vals, fmt=fmt, creso=creso_infer) + tm.assert_numpy_array_equal(res, expected) + + # non-iso -> different path + vals2 = np.array(["Sep 15, 2401"], dtype=object) + expected2 = np.array(["2401-09-15"], dtype="M8[s]") + fmt2 = "%b %d, %Y" + res2, _ = array_strptime(vals2, fmt=fmt2, creso=creso_infer) + tm.assert_numpy_array_equal(res2, expected2) From 635d59404ad7ab734874b55372b5ffcc198a7f30 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 3 Nov 2023 13:44:45 -0700 Subject: [PATCH 2/2] increase tolerance 1000x --- pandas/tests/tslibs/test_strptime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/tslibs/test_strptime.py b/pandas/tests/tslibs/test_strptime.py index 27bcf80ed53b6..ce45bdd10b8e8 100644 --- a/pandas/tests/tslibs/test_strptime.py +++ b/pandas/tests/tslibs/test_strptime.py @@ -70,9 +70,9 @@ def test_array_strptime_resolution_todaynow(self): vals[::-1], fmt="%Y-%m-%d", utc=False, creso=creso_infer ) - # 1ms is an arbitrary cutoff for call overhead; in local testing the + # 1s is an arbitrary cutoff for call overhead; in local testing the # actual difference is about 250us - tolerance = np.timedelta64(1, "ms") + tolerance = np.timedelta64(1, "s") assert res.dtype == "M8[us]" assert abs(res[0] - now) < tolerance