
ENH: non-nano datetime64s for read_sas #56127

Merged (13 commits) on Dec 5, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
@@ -188,6 +188,7 @@ Other enhancements
 - :meth:`to_sql` with method parameter set to ``multi`` works with Oracle on the backend
 - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
+- :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`)
 - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
 - :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
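To make the new behavior concrete, here is a minimal sketch (the file name is hypothetical; SAS stores dates as float days and datetimes as float seconds since 1960-01-01):

    import pandas as pd

    # Hypothetical file with a date column and a datetime column.
    df = pd.read_sas("example.sas7bdat")

    # Previously both columns came back as datetime64[ns], or as object
    # dtype (datetime.datetime) once values overflowed the ns bounds.
    # Now dates load as datetime64[s] and datetimes as datetime64[ms],
    # so values out to 9999-12-31 fit without overflow.
    print(df.dtypes)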
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/conversion.pyi
@@ -9,4 +9,6 @@ DT64NS_DTYPE: np.dtype
 TD64NS_DTYPE: np.dtype

 def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
-def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ...
+def cast_from_unit_vectorized(
+    values: np.ndarray, unit: str, out_unit: str = ...
+) -> np.ndarray: ...
5 changes: 3 additions & 2 deletions pandas/_libs/tslibs/conversion.pyx
@@ -97,6 +97,7 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
 def cast_from_unit_vectorized(
     ndarray values,
     str unit,
+    str out_unit="ns",
 ):
     """
     Vectorized analogue to cast_from_unit.
@@ -122,11 +123,11 @@ def cast_from_unit_vectorized(
         # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
         # and 150 we'd get 2120-01-01 09:00:00
         values = values.astype(f"M8[{unit}]")
-        dtype = np.dtype("M8[ns]")
+        dtype = np.dtype(f"M8[{out_unit}]")
         return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8")

     in_reso = abbrev_to_npy_unit(unit)
-    out_reso = abbrev_to_npy_unit("ns")
+    out_reso = abbrev_to_npy_unit(out_unit)
     m, p = precision_from_unit(in_reso, out_reso)

     cdef:
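As a usage sketch of the extended signature (this is a pandas-internal helper, not public API, so the import path may change between versions):

    import numpy as np
    from pandas._libs.tslibs.conversion import cast_from_unit_vectorized

    # Float seconds -> int64 milliseconds; the fractional part survives
    # because the value is scaled before truncation to integer.
    vals = np.array([0.0, 1.5, 86400.0])
    out = cast_from_unit_vectorized(vals, unit="s", out_unit="ms")
    print(out)                 # -> 0, 1500, 86400000
    print(out.view("M8[ms]"))  # the i8 result views cleanly as datetime64[ms]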
32 changes: 18 additions & 14 deletions pandas/io/sas/sas7bdat.py
@@ -21,10 +21,7 @@
     timedelta,
 )
 import sys
-from typing import (
-    TYPE_CHECKING,
-    cast,
-)
+from typing import TYPE_CHECKING

 import numpy as np

@@ -39,14 +36,13 @@
     Parser,
     get_subheader_index,
 )
-from pandas.errors import (
-    EmptyDataError,
-    OutOfBoundsDatetime,
-)
+from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
+from pandas.errors import EmptyDataError

 import pandas as pd
 from pandas import (
     DataFrame,
+    Timestamp,
     isna,
 )

@@ -62,6 +58,10 @@
     )


+_unix_origin = Timestamp("1970-01-01")

    [Review comment, Member] Will these be reused in the future? Otherwise could just inline the 10-year timedelta in _convert_datetimes.

    [Reply, Member/Author] Either way. I think this is nicely clear about where the 10-year difference stems from (also can't do Timedelta(years=10)).

+_sas_origin = Timestamp("1960-01-01")
+
+
 def _parse_datetime(sas_datetime: float, unit: str):
     if isna(sas_datetime):
         return pd.NaT
@@ -94,12 +94,16 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     Series
         Series of datetime64 dtype or datetime.datetime.
     """
-    try:
-        return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
-    except OutOfBoundsDatetime:
-        s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
-        s_series = cast(pd.Series, s_series)
-        return s_series
+    td = (_sas_origin - _unix_origin).as_unit("s")
+    if unit == "s":
+        millis = cast_from_unit_vectorized(
+            sas_datetimes._values, unit="s", out_unit="ms"
+        )
+        dt64ms = millis.view("M8[ms]") + td
+        return pd.Series(dt64ms, index=sas_datetimes.index)
+    else:
+        vals = np.array(sas_datetimes, dtype="M8[D]") + td
+        return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index)


 class _Column:
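A short sketch of what the two origin constants buy in _convert_datetimes (sample values are made up; the pattern mirrors the conversion above):

    import numpy as np
    import pandas as pd

    # SAS counts seconds from 1960-01-01; numpy's datetime64 epoch is
    # 1970-01-01, so shifting by the 1960-1970 difference lines them up.
    sas_seconds = np.array([0.0, 86400.0])  # made-up: midnight Jan 1/2, 1960
    td = (pd.Timestamp("1960-01-01") - pd.Timestamp("1970-01-01")).as_unit("s")
    print(td)  # -3653 days +00:00:00 (ten years, including three leap days)
    result = sas_seconds.astype("M8[s]") + td
    print(result)  # ['1960-01-01T00:00:00' '1960-01-02T00:00:00']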
127 changes: 68 additions & 59 deletions pandas/tests/io/sas/test_sas7bdat.py
@@ -4,10 +4,10 @@
 import os
 from pathlib import Path

-import dateutil.parser
 import numpy as np
 import pytest

+from pandas.compat import IS64
 from pandas.errors import EmptyDataError
 import pandas.util._test_decorators as td

@@ -27,9 +27,9 @@ def data_test_ix(request, dirpath):
     df = pd.read_csv(fname)
     epoch = datetime(1960, 1, 1)
     t1 = pd.to_timedelta(df["Column4"], unit="d")
-    df["Column4"] = epoch + t1
+    df["Column4"] = (epoch + t1).astype("M8[s]")
     t2 = pd.to_timedelta(df["Column12"], unit="d")
-    df["Column12"] = epoch + t2
+    df["Column12"] = (epoch + t2).astype("M8[s]")
     for k in range(df.shape[1]):
         col = df.iloc[:, k]
         if col.dtype == np.int64:
@@ -59,7 +59,7 @@ def test_from_buffer(self, dirpath, data_test_ix):
             buf, format="sas7bdat", iterator=True, encoding="utf-8"
         ) as rdr:
             df = rdr.read()
-        tm.assert_frame_equal(df, df0, check_exact=False)
+        tm.assert_frame_equal(df, df0)

     @pytest.mark.slow
     def test_from_iterator(self, dirpath, data_test_ix):
@@ -157,6 +157,8 @@ def test_productsales(datapath):
     df0 = pd.read_csv(fname, parse_dates=["MONTH"])
     vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
     df0[vn] = df0[vn].astype(np.float64)
+
+    df0["MONTH"] = df0["MONTH"].astype("M8[s]")
     tm.assert_frame_equal(df, df0)


@@ -175,7 +177,7 @@ def test_airline(datapath):
     fname = datapath("io", "sas", "data", "airline.csv")
     df0 = pd.read_csv(fname)
     df0 = df0.astype(np.float64)
-    tm.assert_frame_equal(df, df0, check_exact=False)
+    tm.assert_frame_equal(df, df0)


 def test_date_time(datapath):
@@ -191,14 +193,20 @@
     # access to SAS to read the sas7bdat file. We are really just testing
     # that we are "close". This only seems to be an issue near the
     # implementation bounds.
-    res = df.iloc[:, 3].dt.round("us").copy()
-
-    # the first and last elements are near the implementation bounds, where we
-    # would expect floating point error to occur.
-    res.iloc[0] -= pd.Timedelta(microseconds=1)
-    res.iloc[-1] += pd.Timedelta(microseconds=1)
-
-    df["DateTimeHi"] = res
+    df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
+    df0["Date1"] = df0["Date1"].astype("M8[s]")
+    df0["Date2"] = df0["Date2"].astype("M8[s]")
+    df0["DateTime"] = df0["DateTime"].astype("M8[ms]")
+    df0["Taiw"] = df0["Taiw"].astype("M8[s]")
+
+    res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms")
+    df0["DateTimeHi"] = res.astype("M8[ms]")
+
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms")
+        df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms")
     tm.assert_frame_equal(df, df0)
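The "close enough" rounding above exists because SAS stores datetimes as float64 seconds; a quick sanity check of the available precision near the max SAS date (not part of the PR):

    import numpy as np

    # 9999-12-29 23:59:59.999 is ~2.5e11 seconds after the SAS epoch.
    # float64 carries ~15-16 significant digits, so for values between
    # 2**37 and 2**38 the gap between adjacent representable values is
    # 2**37 * 2**-52 = 2**-15 s: tens of microseconds, so sub-millisecond
    # digits cannot be exact.
    x = np.float64(253717747199.999)
    print(np.spacing(x))  # ~3.0517578125e-05 seconds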


@@ -258,16 +266,6 @@ def test_corrupt_read(datapath):
         pd.read_sas(fname)


-def round_datetime_to_ms(ts):
-    if isinstance(ts, datetime):
-        return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000)
-    elif isinstance(ts, str):
-        _ts = dateutil.parser.parse(timestr=ts)
-        return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000)
-    else:
-        return ts
-
-
 def test_max_sas_date(datapath):
     # GH 20927
     # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
@@ -276,30 +274,33 @@
     fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
     df = pd.read_sas(fname, encoding="iso-8859-1")

     # SAS likes to left pad strings with spaces - lstrip before comparing
     df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x)
-    # GH 19732: Timestamps imported from sas will incur floating point errors
-    try:
-        df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
-    except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
-        df = df.map(round_datetime_to_ms)
-    except AttributeError:
-        df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
-    # if there are any date/times > pandas.Timestamp.max then ALL in that chunk
-    # are returned as datetime.datetime
     expected = pd.DataFrame(
         {
             "text": ["max", "normal"],
             "dt_as_float": [253717747199.999, 1880323199.999],
-            "dt_as_dt": [
-                datetime(9999, 12, 29, 23, 59, 59, 999000),
-                datetime(2019, 8, 1, 23, 59, 59, 999000),
-            ],
+            "dt_as_dt": np.array(
+                [
+                    datetime(9999, 12, 29, 23, 59, 59, 999000),
+                    datetime(2019, 8, 1, 23, 59, 59, 999000),
+                ],
+                dtype="M8[ms]",
+            ),
             "date_as_float": [2936547.0, 21762.0],
-            "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)],
+            "date_as_date": np.array(
+                [
+                    datetime(9999, 12, 29),
+                    datetime(2019, 8, 1),
+                ],
+                dtype="M8[s]",
+            ),
         },
         columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
     )
+
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms")
+
     tm.assert_frame_equal(df, expected)


Expand All @@ -312,41 +313,40 @@ def test_max_sas_date_iterator(datapath):
     fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
     results = []
     for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
         # SAS likes to left pad strings with spaces - lstrip before comparing
         df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x)
-        # GH 19732: Timestamps imported from sas will incur floating point errors
-        try:
-            df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
-        except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
-            df = df.map(round_datetime_to_ms)
-        except AttributeError:
-            df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
         df.reset_index(inplace=True, drop=True)
         results.append(df)
     expected = [
         pd.DataFrame(
             {
                 "text": ["max"],
                 "dt_as_float": [253717747199.999],
-                "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)],
+                "dt_as_dt": np.array(
+                    [datetime(9999, 12, 29, 23, 59, 59, 999000)], dtype="M8[ms]"
+                ),
                 "date_as_float": [2936547.0],
-                "date_as_date": [datetime(9999, 12, 29)],
+                "date_as_date": np.array([datetime(9999, 12, 29)], dtype="M8[s]"),
             },
             columns=col_order,
         ),
         pd.DataFrame(
             {
                 "text": ["normal"],
                 "dt_as_float": [1880323199.999],
-                "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")],
+                "dt_as_dt": np.array(["2019-08-01 23:59:59.999"], dtype="M8[ms]"),
                 "date_as_float": [21762.0],
-                "date_as_date": [np.datetime64("2019-08-01")],
+                "date_as_date": np.array(["2019-08-01"], dtype="M8[s]"),
             },
             columns=col_order,
         ),
     ]
-    for result, expected in zip(results, expected):
-        tm.assert_frame_equal(result, expected)
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
+        expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms")
+
+    tm.assert_frame_equal(results[0], expected[0])
+    tm.assert_frame_equal(results[1], expected[1])


 def test_null_date(datapath):
Expand All @@ -355,16 +355,25 @@ def test_null_date(datapath):

     expected = pd.DataFrame(
         {
-            "datecol": [
-                datetime(9999, 12, 29),
-                pd.NaT,
-            ],
-            "datetimecol": [
-                datetime(9999, 12, 29, 23, 59, 59, 998993),
-                pd.NaT,
-            ],
+            "datecol": np.array(
+                [
+                    datetime(9999, 12, 29),
+                    np.datetime64("NaT"),
+                ],
+                dtype="M8[s]",
+            ),
+            "datetimecol": np.array(
+                [
+                    datetime(9999, 12, 29, 23, 59, 59, 999000),
+                    np.datetime64("NaT"),
+                ],
+                dtype="M8[ms]",
+            ),
         },
     )
+    if not IS64:
+        # No good reason for this, just what we get on the CI
+        expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms")
     tm.assert_frame_equal(df, expected)

