Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: read_csv not respecting object dtype when option is set #56047

Merged
merged 14 commits into from
Dec 9, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Bug fixes
- Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`)
- Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`)
- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`)
- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
Expand Down
14 changes: 2 additions & 12 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,18 +296,8 @@ def read(self) -> DataFrame:
dtype_mapping[pa.null()] = pd.Int64Dtype()
frame = table.to_pandas(types_mapper=dtype_mapping.get)
elif using_pyarrow_string_dtype():

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These mappers don't work here: pyarrow's `types_mapper` maps arrow type -> pandas type, not column -> type

def types_mapper(dtype):
dtype_dict = self.kwds["dtype"]
if dtype_dict is not None and dtype_dict.get(dtype, None) is not None:
return dtype_dict.get(dtype)
return arrow_string_types_mapper()(dtype)

frame = table.to_pandas(types_mapper=types_mapper)
frame = table.to_pandas(types_mapper=arrow_string_types_mapper())

else:
if isinstance(self.kwds.get("dtype"), dict):
frame = table.to_pandas(types_mapper=self.kwds["dtype"].get)
else:
frame = table.to_pandas()
frame = table.to_pandas()
return self._finalize_pandas_output(frame)
44 changes: 42 additions & 2 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
"""
from __future__ import annotations

from collections import abc
from collections import (
abc,
defaultdict,
)
import csv
import sys
from textwrap import fill
Expand All @@ -23,6 +26,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import lib
from pandas._libs.parsers import STR_NA_VALUES
from pandas.errors import (
Expand All @@ -38,8 +43,10 @@
is_float,
is_integer,
is_list_like,
pandas_dtype,
)

from pandas import Series
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import RangeIndex
from pandas.core.shared_docs import _shared_docs
Expand Down Expand Up @@ -1846,7 +1853,40 @@ def read(self, nrows: int | None = None) -> DataFrame:
else:
new_rows = len(index)

df = DataFrame(col_dict, columns=columns, index=index)
if hasattr(self, "orig_options"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we do something more explicit than a hasattr check?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, you can subclass the reader, so we don't have any control over it

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does anybody actually do this? i judge those people, their ethics, and their hygiene.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's something I can't answer, we might want to deprecate maybe, but we are stuck with hasattr here until then

dtype_arg = self.orig_options.get("dtype", None)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the dtype option normally applied in _engine.read? Just curious why it needs to be done here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the DataFrame constructor infers object to string again if the option is set, which would discard the original dtype

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK makes sense.

Could we defer looping over col_dict if dtype isn't specified to be object-like?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, only doing this now if we have a dict or object dtype

else:
dtype_arg = None

if isinstance(dtype_arg, dict):
dtype = defaultdict(lambda: None) # type: ignore[var-annotated]
dtype.update(dtype_arg)
elif dtype_arg is not None and pandas_dtype(dtype_arg) in (
np.str_,
np.object_,
):
dtype = defaultdict(lambda: dtype_arg)
else:
dtype = None

if dtype is not None:
new_col_dict = {}
for k, v in col_dict.items():
d = (
dtype[k]
if pandas_dtype(dtype[k]) in (np.str_, np.object_)
else None
)
new_col_dict[k] = Series(v, index=index, dtype=d, copy=False)
else:
new_col_dict = col_dict

df = DataFrame(
new_col_dict,
columns=columns,
index=index,
copy=not using_copy_on_write(),
)

self._currow += new_rows
return df
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,41 @@ def test_string_inference(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
def test_string_inference_object_dtype(all_parsers, dtype):
    # GH#56047: with future.infer_string enabled, an explicit object-like
    # dtype passed to read_csv must be honored instead of the values being
    # re-inferred to pyarrow-backed strings.
    pytest.importorskip("pyarrow")

    data = """a,b
x,a
y,a
z,a"""
    parser = all_parsers
    string_columns = pd.Index(["a", "b"], dtype="string[pyarrow_numpy]")

    def make_expected(b_dtype):
        # Column "a" always keeps the requested object dtype; column "b"
        # depends on whether dtype was applied globally or per-column.
        return DataFrame(
            {
                "a": pd.Series(["x", "y", "z"], dtype=object),
                "b": pd.Series(["a", "a", "a"], dtype=b_dtype),
            },
            columns=string_columns,
        )

    cases = [
        # dtype applied to every column -> both stay object
        (dtype, make_expected(object)),
        # dtype applied only to "a" -> "b" is still string-inferred
        ({"a": dtype}, make_expected("string[pyarrow_numpy]")),
    ]
    for dtype_arg, expected in cases:
        with pd.option_context("future.infer_string", True):
            result = parser.read_csv(StringIO(data), dtype=dtype_arg)
        tm.assert_frame_equal(result, expected)


def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
Expand Down
Loading