Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: dt64 units #56261

Merged
merged 6 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
if isinstance(dtype, ExtensionDtype):
return dtype.na_value
elif dtype.kind in "mM":
return dtype.type("NaT", "ns")
unit = np.datetime_data(dtype)[0]
return dtype.type("NaT", unit)
elif dtype.kind == "f":
return np.nan
elif dtype.kind in "iu":
Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/frame/methods/test_reset_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,11 +664,8 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype):

def test_reset_index_empty_frame_with_datetime64_multiindex():
# https://github.com/pandas-dev/pandas/issues/35606
idx = MultiIndex(
levels=[[Timestamp("2020-07-20 00:00:00")], [3, 4]],
codes=[[], []],
names=["a", "b"],
)
dti = pd.DatetimeIndex(["2020-07-20 00:00:00"], dtype="M8[ns]")
idx = MultiIndex.from_product([dti, [3, 4]], names=["a", "b"])[:0]
df = DataFrame(index=idx, columns=["c", "d"])
result = df.reset_index()
expected = DataFrame(
Expand All @@ -681,7 +678,8 @@ def test_reset_index_empty_frame_with_datetime64_multiindex():

def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby():
# https://github.com/pandas-dev/pandas/issues/35657
df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": pd.to_datetime("2020-01-01")})
dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]")
df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti})
df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum()
result = df.reset_index()
expected = DataFrame(
Expand Down
15 changes: 6 additions & 9 deletions pandas/tests/indexes/interval/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from pandas import (
DataFrame,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
Expand Down Expand Up @@ -100,18 +101,14 @@ def test_get_values_for_csv(self, tuples, closed, expected_data):
expected = np.array(expected_data)
tm.assert_numpy_array_equal(result, expected)

def test_timestamp_with_timezone(self):
def test_timestamp_with_timezone(self, unit):
# GH 55035
index = IntervalIndex(
[
Interval(
Timestamp("2020-01-01", tz="UTC"), Timestamp("2020-01-02", tz="UTC")
)
]
)
left = DatetimeIndex(["2020-01-01"], dtype=f"M8[{unit}, UTC]")
right = DatetimeIndex(["2020-01-02"], dtype=f"M8[{unit}, UTC]")
index = IntervalIndex.from_arrays(left, right)
result = repr(index)
expected = (
"IntervalIndex([(2020-01-01 00:00:00+00:00, 2020-01-02 00:00:00+00:00]], "
"dtype='interval[datetime64[ns, UTC], right]')"
f"dtype='interval[datetime64[{unit}, UTC], right]')"
)
assert result == expected
78 changes: 55 additions & 23 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from datetime import (
datetime,
time,
Expand Down Expand Up @@ -130,8 +132,15 @@ def df_ref(datapath):
return df_ref


def adjust_expected(expected: DataFrame, read_ext: str) -> None:
def get_exp_unit(read_ext: str, engine: str | None) -> str:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So your ultimate vision here is different engines will return different bases? Or is this just considered a temporary solution?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In #55901 this becomes

def get_exp_unit(read_ext: str, engine: str | None) -> str:
    unit = "us"
    if (read_ext == ".ods") ^ (engine == "calamine"):
        unit = "s"
    return unit

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What makes ods files or the calamine reader different here? My first thought is that it is surprising behavior to have those be the only ones that return seconds

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no idea. id be OK with coercing them all to micros

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the largest representable date for microsecond precision?

From what I see Excel can only display milliseconds, but doesn't offer first-class formula support for them. The ODS specification point 18.3.14 links its datetime format to an XML Schema Part 2

https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/#dateTime

Which only mentions "fractional seconds" but without too much detail.

So seems like a wild west of implementation possibilities. Excel has an upper limit on dates at December 31, 9999 so maybe we just try to cover that?

https://support.microsoft.com/en-us/office/excel-specifications-and-limits-1672b34d-7043-467e-8e27-269d656771c3

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

December 31, 9999 23:59:59.999999 is also the highest value supported by the stdlib datetime. microseconds can go further than that, but most of the time when we get microseconds it is because we got a stdlib datetime objects

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(btw i definitely want to have this discussion but would prefer it to happen in #55901, this PR is just aimed at diff-trimming to make that one easier)

return "ns"


def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None:
    """Normalize an expected DataFrame, in place, to match read_excel output.

    Clears the index name and casts the index to the datetime64 unit that
    ``get_exp_unit`` reports for this file extension / engine combination,
    so comparisons against the round-tripped frame use the right resolution.

    Parameters
    ----------
    expected : DataFrame
        The frame to mutate; assumed to have a datetime-like index
        (it must support ``as_unit``) — TODO confirm at call sites.
    read_ext : str
        File extension of the Excel fixture being read (e.g. ".xlsx").
    engine : str
        Name of the reader engine in use (may be None upstream).
    """
    expected.index.name = None
    unit = get_exp_unit(read_ext, engine)
    # mypy doesn't know the index is datetime-like here, hence the ignore:
    # error: "Index" has no attribute "as_unit"
    expected.index = expected.index.as_unit(unit)  # type: ignore[attr-defined]


def xfail_datetimes_with_pyxlsb(engine, request):
Expand Down Expand Up @@ -225,7 +234,7 @@ def test_usecols_list(self, request, engine, read_ext, df_ref):
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref[["B", "C"]]
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

df1 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3]
Expand All @@ -246,7 +255,7 @@ def test_usecols_str(self, request, engine, read_ext, df_ref):
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref[["A", "B", "C"]]
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

df2 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D"
Expand All @@ -264,7 +273,7 @@ def test_usecols_str(self, request, engine, read_ext, df_ref):
tm.assert_frame_equal(df3, expected)

expected = df_ref[["B", "C"]]
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

df2 = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D"
Expand Down Expand Up @@ -302,7 +311,7 @@ def test_usecols_diff_positional_int_columns_order(
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref[["A", "C"]]
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

result = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols
Expand All @@ -321,7 +330,7 @@ def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref):
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
tm.assert_frame_equal(result, expected)
Expand All @@ -330,7 +339,7 @@ def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref):
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref[["C", "D"]]
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

result = pd.read_excel(
"test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E"
Expand Down Expand Up @@ -428,7 +437,7 @@ def test_excel_table(self, request, engine, read_ext, df_ref):
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
df2 = pd.read_excel(
Expand All @@ -446,20 +455,24 @@ def test_excel_table(self, request, engine, read_ext, df_ref):
def test_reader_special_dtypes(self, request, engine, read_ext):
xfail_datetimes_with_pyxlsb(engine, request)

unit = get_exp_unit(read_ext, engine)
expected = DataFrame.from_dict(
{
"IntCol": [1, 2, -3, 4, 0],
"FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
"BoolCol": [True, False, True, True, False],
"StrCol": [1, 2, 3, 4, 5],
"Str2Col": ["a", 3, "c", "d", "e"],
"DateCol": [
datetime(2013, 10, 30),
datetime(2013, 10, 31),
datetime(1905, 1, 1),
datetime(2013, 12, 14),
datetime(2015, 3, 14),
],
"DateCol": Index(
[
datetime(2013, 10, 30),
datetime(2013, 10, 31),
datetime(1905, 1, 1),
datetime(2013, 12, 14),
datetime(2015, 3, 14),
],
dtype=f"M8[{unit}]",
),
},
)
basename = "test_types"
Expand Down Expand Up @@ -578,7 +591,7 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
actual = pd.read_excel(basename + read_ext, dtype=dtype)
tm.assert_frame_equal(actual, expected)

def test_dtype_backend(self, read_ext, dtype_backend):
def test_dtype_backend(self, read_ext, dtype_backend, engine):
# GH#36712
if read_ext in (".xlsb", ".xls"):
pytest.skip(f"No engine for filetype: '{read_ext}'")
Expand Down Expand Up @@ -621,6 +634,9 @@ def test_dtype_backend(self, read_ext, dtype_backend):
expected["j"] = ArrowExtensionArray(pa.array([None, None]))
else:
expected = df
unit = get_exp_unit(read_ext, engine)
expected["i"] = expected["i"].astype(f"M8[{unit}]")

tm.assert_frame_equal(result, expected)

def test_dtype_backend_and_dtype(self, read_ext):
Expand Down Expand Up @@ -812,7 +828,7 @@ def test_sheet_name(self, request, read_ext, engine, df_ref):
sheet_name = "Sheet1"

expected = df_ref
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

df1 = pd.read_excel(
filename + read_ext, sheet_name=sheet_name, index_col=0
Expand Down Expand Up @@ -1014,6 +1030,8 @@ def test_read_excel_multiindex(self, request, engine, read_ext):
if engine == "calamine" and read_ext == ".ods":
request.applymarker(pytest.mark.xfail(reason="Last test fails in calamine"))

unit = get_exp_unit(read_ext, engine)

mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
mi_file = "testmultiindex" + read_ext

Expand All @@ -1027,6 +1045,7 @@ def test_read_excel_multiindex(self, request, engine, read_ext):
],
columns=mi,
)
expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]")

actual = pd.read_excel(
mi_file, sheet_name="mi_column", header=[0, 1], index_col=0
Expand Down Expand Up @@ -1106,6 +1125,9 @@ def test_read_excel_multiindex_blank_after_name(

mi_file = "testmultiindex" + read_ext
mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"])

unit = get_exp_unit(read_ext, engine)

expected = DataFrame(
[
[1, 2.5, pd.Timestamp("2015-01-01"), True],
Expand All @@ -1119,6 +1141,7 @@ def test_read_excel_multiindex_blank_after_name(
names=["ilvl1", "ilvl2"],
),
)
expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]")
result = pd.read_excel(
mi_file,
sheet_name=sheet_name,
Expand Down Expand Up @@ -1222,6 +1245,8 @@ def test_read_excel_skiprows(self, request, engine, read_ext):
# GH 4903
xfail_datetimes_with_pyxlsb(engine, request)

unit = get_exp_unit(read_ext, engine)

actual = pd.read_excel(
"testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2]
)
Expand All @@ -1234,6 +1259,7 @@ def test_read_excel_skiprows(self, request, engine, read_ext):
],
columns=["a", "b", "c", "d"],
)
expected["c"] = expected["c"].astype(f"M8[{unit}]")
tm.assert_frame_equal(actual, expected)

actual = pd.read_excel(
Expand Down Expand Up @@ -1266,11 +1292,13 @@ def test_read_excel_skiprows(self, request, engine, read_ext):
],
columns=["a", "b", "c", "d"],
)
expected["c"] = expected["c"].astype(f"M8[{unit}]")
tm.assert_frame_equal(actual, expected)

def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext):
# GH 4903
xfail_datetimes_with_pyxlsb(engine, request)
unit = get_exp_unit(read_ext, engine)

actual = pd.read_excel(
"testskiprows" + read_ext,
Expand All @@ -1286,6 +1314,7 @@ def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext):
],
columns=["a", "b", "c", "d"],
)
expected["c"] = expected["c"].astype(f"M8[{unit}]")
tm.assert_frame_equal(actual, expected)

def test_read_excel_nrows(self, read_ext):
Expand Down Expand Up @@ -1542,7 +1571,7 @@ def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref):
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

with pd.ExcelFile("test1" + read_ext) as excel:
df1 = pd.read_excel(excel, sheet_name=0, index_col=0)
Expand All @@ -1569,7 +1598,7 @@ def test_sheet_name(self, request, engine, read_ext, df_ref):
xfail_datetimes_with_pyxlsb(engine, request)

expected = df_ref
adjust_expected(expected, read_ext)
adjust_expected(expected, read_ext, engine)

filename = "test1"
sheet_name = "Sheet1"
Expand Down Expand Up @@ -1661,11 +1690,14 @@ def test_read_datetime_multiindex(self, request, engine, read_ext):
f = "test_datetime_mi" + read_ext
with pd.ExcelFile(f) as excel:
actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine)
expected_column_index = MultiIndex.from_tuples(
[(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))],

unit = get_exp_unit(read_ext, engine)
dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]")
expected_column_index = MultiIndex.from_arrays(
[dti[:1], dti[1:]],
names=[
pd.to_datetime("02/29/2020").to_pydatetime(),
pd.to_datetime("03/01/2020").to_pydatetime(),
dti[0].to_pydatetime(),
dti[1].to_pydatetime(),
],
)
expected = DataFrame([], index=[], columns=expected_column_index)
Expand Down
Loading
Loading