BUG: read_csv not respecting object dtype when option is set (#56047)
* BUG: read_csv not respecting object dtype when option is set

* Update readers.py

* Cover str too

* Adjust

* Fixup

* Fixup

* Update readers.py
phofl authored Dec 9, 2023
1 parent ce4169a commit fb05cc7
Showing 4 changed files with 80 additions and 14 deletions.
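
For reference, a minimal sketch of the behavior this commit restores (assuming pandas 2.1.4+ with pyarrow installed; the CSV contents are illustrative):

from io import StringIO

import pandas as pd

data = "a,b\nx,1\ny,2\n"

with pd.option_context("future.infer_string", True):
    # Before this fix, an explicit object/str dtype was overridden and the
    # columns came back as string[pyarrow_numpy]; with it, object is respected.
    df = pd.read_csv(StringIO(data), dtype=object)

print(df.dtypes)  # both columns are object, not string[pyarrow_numpy]
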
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.4.rst
@@ -24,6 +24,7 @@ Bug fixes
 - Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`)
 - Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`)
 - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
+- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`)
 - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
 - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
14 changes: 2 additions & 12 deletions pandas/io/parsers/arrow_parser_wrapper.py
@@ -296,18 +296,8 @@ def read(self) -> DataFrame:
             dtype_mapping[pa.null()] = pd.Int64Dtype()
             frame = table.to_pandas(types_mapper=dtype_mapping.get)
         elif using_pyarrow_string_dtype():
-
-            def types_mapper(dtype):
-                dtype_dict = self.kwds["dtype"]
-                if dtype_dict is not None and dtype_dict.get(dtype, None) is not None:
-                    return dtype_dict.get(dtype)
-                return arrow_string_types_mapper()(dtype)
-
-            frame = table.to_pandas(types_mapper=types_mapper)
+            frame = table.to_pandas(types_mapper=arrow_string_types_mapper())

         else:
-            if isinstance(self.kwds.get("dtype"), dict):
-                frame = table.to_pandas(types_mapper=self.kwds["dtype"].get)
-            else:
-                frame = table.to_pandas()
+            frame = table.to_pandas()
         return self._finalize_pandas_output(frame)
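
For context, a small sketch of the types_mapper hook that the simplified branch above relies on (assuming pandas 2.1+ and pyarrow; the table contents are illustrative). Per-column dtype overrides are no longer applied here; they now happen in readers.py:

import pyarrow as pa

import pandas as pd

table = pa.table({"a": ["x", "y"], "b": [1, 2]})


def types_mapper(pa_type):
    # Map pyarrow string columns to the numpy-backed pyarrow string dtype;
    # returning None falls back to the default conversion for other types.
    if pa.types.is_string(pa_type):
        return pd.StringDtype(storage="pyarrow_numpy")
    return None


frame = table.to_pandas(types_mapper=types_mapper)
print(frame.dtypes)  # "a" -> string[pyarrow_numpy], "b" -> int64
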
44 changes: 42 additions & 2 deletions pandas/io/parsers/readers.py
@@ -5,7 +5,10 @@
 """
 from __future__ import annotations

-from collections import abc
+from collections import (
+    abc,
+    defaultdict,
+)
 import csv
 import sys
 from textwrap import fill
@@ -23,6 +26,8 @@

 import numpy as np

+from pandas._config import using_copy_on_write
+
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas.errors import (
@@ -38,8 +43,10 @@
     is_float,
     is_integer,
     is_list_like,
+    pandas_dtype,
 )

+from pandas import Series
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.api import RangeIndex
 from pandas.core.shared_docs import _shared_docs
@@ -1846,7 +1853,40 @@ def read(self, nrows: int | None = None) -> DataFrame:
             else:
                 new_rows = len(index)

-            df = DataFrame(col_dict, columns=columns, index=index)
+            if hasattr(self, "orig_options"):
+                dtype_arg = self.orig_options.get("dtype", None)
+            else:
+                dtype_arg = None
+
+            if isinstance(dtype_arg, dict):
+                dtype = defaultdict(lambda: None)  # type: ignore[var-annotated]
+                dtype.update(dtype_arg)
+            elif dtype_arg is not None and pandas_dtype(dtype_arg) in (
+                np.str_,
+                np.object_,
+            ):
+                dtype = defaultdict(lambda: dtype_arg)
+            else:
+                dtype = None
+
+            if dtype is not None:
+                new_col_dict = {}
+                for k, v in col_dict.items():
+                    d = (
+                        dtype[k]
+                        if pandas_dtype(dtype[k]) in (np.str_, np.object_)
+                        else None
+                    )
+                    new_col_dict[k] = Series(v, index=index, dtype=d, copy=False)
+            else:
+                new_col_dict = col_dict
+
+            df = DataFrame(
+                new_col_dict,
+                columns=columns,
+                index=index,
+                copy=not using_copy_on_write(),
+            )

             self._currow += new_rows
         return df
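
A standalone sketch of the dtype normalization introduced above (helper names like normalize_dtype_arg and build_columns are illustrative, not pandas API): a dict dtype becomes a per-column lookup that defaults to None, a scalar str/object dtype applies to every column, and only str/object requests are forced when the column Series are built.

from collections import defaultdict

import numpy as np
import pandas as pd
from pandas.api.types import pandas_dtype


def normalize_dtype_arg(dtype_arg):
    """Return a per-column dtype lookup, or None if nothing needs forcing."""
    if isinstance(dtype_arg, dict):
        lookup = defaultdict(lambda: None)  # columns not listed get no override
        lookup.update(dtype_arg)
        return lookup
    if dtype_arg is not None and pandas_dtype(dtype_arg) in (np.str_, np.object_):
        return defaultdict(lambda: dtype_arg)  # global str/object request
    return None


def build_columns(col_dict, index, dtype_arg):
    lookup = normalize_dtype_arg(dtype_arg)
    if lookup is None:
        return col_dict
    out = {}
    for name, values in col_dict.items():
        requested = lookup[name]
        # Only str/object requests are forced here; other dtypes were already
        # applied by the parser itself.
        force = (
            requested
            if requested is not None
            and pandas_dtype(requested) in (np.str_, np.object_)
            else None
        )
        out[name] = pd.Series(values, index=index, dtype=force, copy=False)
    return out


# Example: force only column "a" to object while "b" keeps its parsed dtype.
cols = build_columns(
    {"a": np.array(["x", "y"]), "b": np.array([1, 2])},
    index=pd.RangeIndex(2),
    dtype_arg={"a": object},
)
print({k: v.dtype for k, v in cols.items()})  # a -> object, b -> int64
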
35 changes: 35 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -574,6 +574,41 @@ def test_string_inference(all_parsers):
     tm.assert_frame_equal(result, expected)


+@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
+def test_string_inference_object_dtype(all_parsers, dtype):
+    # GH#56047
+    pytest.importorskip("pyarrow")
+
+    data = """a,b
+x,a
+y,a
+z,a"""
+    parser = all_parsers
+    with pd.option_context("future.infer_string", True):
+        result = parser.read_csv(StringIO(data), dtype=dtype)
+
+    expected = DataFrame(
+        {
+            "a": pd.Series(["x", "y", "z"], dtype=object),
+            "b": pd.Series(["a", "a", "a"], dtype=object),
+        },
+        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+    with pd.option_context("future.infer_string", True):
+        result = parser.read_csv(StringIO(data), dtype={"a": dtype})
+
+    expected = DataFrame(
+        {
+            "a": pd.Series(["x", "y", "z"], dtype=object),
+            "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
+        },
+        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_accurate_parsing_of_large_integers(all_parsers):
     # GH#52505
     data = """SYMBOL,MOMENT,ID,ID_DEAL
