Skip to content

Commit

Permalink
BUG: nanoseconds and reso in dateutil paths (pandas-dev#56051)
Browse files Browse the repository at this point in the history
* BUG: nanoseconds and reso in dateutil paths

* GH ref
  • Loading branch information
jbrockmendel authored and phofl committed Nov 21, 2023
1 parent f2cef02 commit 7c7b725
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 15 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@ Datetimelike
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`)
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`)
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
- Bug in parsing datetime strings that have nanosecond resolution and a non-ISO8601 format incorrectly truncating sub-microsecond components (:issue:`56051`)
- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
-

Timedelta
Expand Down
10 changes: 7 additions & 3 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
npy_datetimestruct dts
int out_local = 0, out_tzoffset = 0, string_to_dts_failed
datetime dt
int64_t ival
int64_t ival, nanos = 0
NPY_DATETIMEUNIT out_bestunit, reso
_TSObject obj

Expand Down Expand Up @@ -560,10 +560,14 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
return obj

dt = parse_datetime_string(
ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit
ts,
dayfirst=dayfirst,
yearfirst=yearfirst,
out_bestunit=&out_bestunit,
nanos=&nanos,
)
reso = get_supported_reso(out_bestunit)
return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso)
return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso)

return convert_datetime_to_tsobject(dt, tz)

Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/parsing.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from cpython.datetime cimport datetime
from numpy cimport int64_t

from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT

Expand All @@ -10,5 +11,6 @@ cdef datetime parse_datetime_string(
str date_string,
bint dayfirst,
bint yearfirst,
NPY_DATETIMEUNIT* out_bestunit
NPY_DATETIMEUNIT* out_bestunit,
int64_t* nanos,
)
58 changes: 47 additions & 11 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ from numpy cimport (
PyArray_IterNew,
flatiter,
float64_t,
int64_t,
)

cnp.import_array()
Expand Down Expand Up @@ -272,8 +273,11 @@ def py_parse_datetime_string(
# parse_datetime_string cpdef bc it has a pointer argument)
cdef:
NPY_DATETIMEUNIT out_bestunit
int64_t nanos

return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit)
return parse_datetime_string(
date_string, dayfirst, yearfirst, &out_bestunit, &nanos
)


cdef datetime parse_datetime_string(
Expand All @@ -283,7 +287,8 @@ cdef datetime parse_datetime_string(
str date_string,
bint dayfirst,
bint yearfirst,
NPY_DATETIMEUNIT* out_bestunit
NPY_DATETIMEUNIT* out_bestunit,
int64_t* nanos,
):
"""
Parse datetime string, only returns datetime.
Expand Down Expand Up @@ -311,7 +316,7 @@ cdef datetime parse_datetime_string(
default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
dt = dateutil_parse(date_string, default=default,
dayfirst=dayfirst, yearfirst=yearfirst,
ignoretz=False, out_bestunit=out_bestunit)
ignoretz=False, out_bestunit=out_bestunit, nanos=nanos)
return dt

dt = _parse_delimited_date(date_string, dayfirst, out_bestunit)
Expand All @@ -330,7 +335,7 @@ cdef datetime parse_datetime_string(

dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
dayfirst=dayfirst, yearfirst=yearfirst,
ignoretz=False, out_bestunit=out_bestunit)
ignoretz=False, out_bestunit=out_bestunit, nanos=nanos)
return dt


Expand Down Expand Up @@ -436,7 +441,7 @@ def parse_datetime_string_with_reso(

parsed = dateutil_parse(date_string, _DEFAULT_DATETIME,
dayfirst=dayfirst, yearfirst=yearfirst,
ignoretz=False, out_bestunit=&out_bestunit)
ignoretz=False, out_bestunit=&out_bestunit, nanos=NULL)
reso = npy_unit_to_attrname[out_bestunit]
return parsed, reso

Expand Down Expand Up @@ -639,7 +644,8 @@ cdef datetime dateutil_parse(
bint ignoretz,
bint dayfirst,
bint yearfirst,
NPY_DATETIMEUNIT* out_bestunit
NPY_DATETIMEUNIT* out_bestunit,
int64_t* nanos,
):
""" lifted from dateutil to get resolution"""

Expand Down Expand Up @@ -671,11 +677,8 @@ cdef datetime dateutil_parse(
if reso is None:
raise DateParseError(f"Unable to parse datetime string: {timestr}")

if reso == "microsecond":
if repl["microsecond"] == 0:
reso = "second"
elif repl["microsecond"] % 1000 == 0:
reso = "millisecond"
if reso == "microsecond" and repl["microsecond"] % 1000 == 0:
reso = _find_subsecond_reso(timestr, nanos=nanos)

try:
ret = default.replace(**repl)
Expand Down Expand Up @@ -745,6 +748,38 @@ cdef datetime dateutil_parse(
return ret


cdef object _reso_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P<frac>\d+)")


cdef _find_subsecond_reso(str timestr, int64_t* nanos):
    """
    Infer the sub-second resolution written in ``timestr`` (GH#55737).

    The resolution is decided by how many fractional-second digits appear
    in a H:M:S.f pattern, so trailing zeros still count toward it.

    Parameters
    ----------
    timestr : str
        The datetime string being parsed.
    nanos : int64_t*
        Optional out-parameter; may be NULL. When it is non-NULL and the
        digits past the sixth fractional digit are not all zero, digits
        seven through nine (right-padded with zeros) are stored in
        ``nanos[0]``. In every other case it is left untouched.

    Returns
    -------
    str
        "second" when no H:M:S.f pattern is found, "millisecond" for up to
        three fractional digits, "microsecond" for four to six, and
        "nanosecond" for more than six.
    """
    found = _reso_pattern.search(timestr)
    if found is None:
        return "second"

    digits = found.group("frac")
    ndigits = len(digits)
    if ndigits <= 3:
        return "millisecond"
    if ndigits <= 6:
        return "microsecond"

    # Everything past the sixth digit is finer than a microsecond.
    sub_micro = digits[6:]
    if nanos is not NULL and sub_micro.strip("0"):
        # Only non-zero sub-microsecond digits are reported; an all-zero
        # tail is the corner case where no data has been lost.
        # Keep at most three digits (nanoseconds), padding short input.
        # TODO: should we warn/raise in higher-than-nano cases?
        nanos[0] = int(sub_micro[:3].ljust(3, "0"))
    return "nanosecond"


# ----------------------------------------------------------------------
# Parsing for type-inference

Expand Down Expand Up @@ -916,6 +951,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
yearfirst=False,
ignoretz=False,
out_bestunit=&out_bestunit,
nanos=NULL,
)
except (ValueError, OverflowError, InvalidOperation):
# In case the datetime can't be parsed, its format cannot be guessed
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/scalar/timestamp/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,18 @@ def test_constructor_str_infer_reso(self):
ts = Timestamp("300 June 1:30:01.300")
assert ts.unit == "ms"

# dateutil path -> don't drop trailing zeros
ts = Timestamp("01-01-2013T00:00:00.000000000+0000")
assert ts.unit == "ns"

ts = Timestamp("2016/01/02 03:04:05.001000 UTC")
assert ts.unit == "us"

# higher-than-nanosecond -> we drop the trailing bits
ts = Timestamp("01-01-2013T00:00:00.000000002100+0000")
assert ts == Timestamp("01-01-2013T00:00:00.000000002+0000")
assert ts.unit == "ns"


class TestTimestampConstructors:
def test_weekday_but_no_day_raises(self):
Expand Down

0 comments on commit 7c7b725

Please sign in to comment.