BUG: read_csv not respecting object dtype when option is set (#56047)
* BUG: read_csv not respecting object dtype when option is set

* Update readers.py

* Cover str too

* Adjust

* Fixup

* Fixup

* Update readers.py
phofl authored Dec 9, 2023
1 parent ce4169a commit fb05cc7
Showing 4 changed files with 80 additions and 14 deletions.
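
For reference, a minimal sketch of the behavior this commit restores (assuming pandas 2.1.4+ with pyarrow installed; the CSV contents are illustrative):

from io import StringIO

import pandas as pd

data = "a,b\nx,1\ny,2\n"

with pd.option_context("future.infer_string", True):
    # Before this fix, an explicit object/str dtype was overridden and the
    # columns came back as string[pyarrow_numpy]; with it, object is respected.
    df = pd.read_csv(StringIO(data), dtype=object)

print(df.dtypes)  # both columns are object, not string[pyarrow_numpy]
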
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.4.rst
@@ -24,6 +24,7 @@ Bug fixes
 - Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`)
 - Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`)
 - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
+- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`)
 - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
 - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`)
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
14 changes: 2 additions & 12 deletions pandas/io/parsers/arrow_parser_wrapper.py
@@ -296,18 +296,8 @@ def read(self) -> DataFrame:
             dtype_mapping[pa.null()] = pd.Int64Dtype()
             frame = table.to_pandas(types_mapper=dtype_mapping.get)
         elif using_pyarrow_string_dtype():
-
-            def types_mapper(dtype):
-                dtype_dict = self.kwds["dtype"]
-                if dtype_dict is not None and dtype_dict.get(dtype, None) is not None:
-                    return dtype_dict.get(dtype)
-                return arrow_string_types_mapper()(dtype)
-
-            frame = table.to_pandas(types_mapper=types_mapper)
+            frame = table.to_pandas(types_mapper=arrow_string_types_mapper())

         else:
-            if isinstance(self.kwds.get("dtype"), dict):
-                frame = table.to_pandas(types_mapper=self.kwds["dtype"].get)
-            else:
-                frame = table.to_pandas()
+            frame = table.to_pandas()
         return self._finalize_pandas_output(frame)
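
For context, a small sketch of the types_mapper hook that the simplified branch above relies on (assuming pandas 2.1+ and pyarrow; the table contents are illustrative). Per-column dtype overrides are no longer applied here; they now happen in readers.py:

import pyarrow as pa

import pandas as pd

table = pa.table({"a": ["x", "y"], "b": [1, 2]})


def types_mapper(pa_type):
    # Map pyarrow string columns to the numpy-backed pyarrow string dtype;
    # returning None falls back to the default conversion for other types.
    if pa.types.is_string(pa_type):
        return pd.StringDtype(storage="pyarrow_numpy")
    return None


frame = table.to_pandas(types_mapper=types_mapper)
print(frame.dtypes)  # "a" -> string[pyarrow_numpy], "b" -> int64
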
44 changes: 42 additions & 2 deletions pandas/io/parsers/readers.py
@@ -5,7 +5,10 @@
 """
 from __future__ import annotations

-from collections import abc
+from collections import (
+    abc,
+    defaultdict,
+)
 import csv
 import sys
 from textwrap import fill
@@ -23,6 +26,8 @@

 import numpy as np

+from pandas._config import using_copy_on_write
+
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas.errors import (
@@ -38,8 +43,10 @@
     is_float,
     is_integer,
     is_list_like,
+    pandas_dtype,
 )

+from pandas import Series
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.api import RangeIndex
 from pandas.core.shared_docs import _shared_docs
@@ -1846,7 +1853,40 @@ def read(self, nrows: int | None = None) -> DataFrame:
             else:
                 new_rows = len(index)

-            df = DataFrame(col_dict, columns=columns, index=index)
+            if hasattr(self, "orig_options"):
+                dtype_arg = self.orig_options.get("dtype", None)
+            else:
+                dtype_arg = None
+
+            if isinstance(dtype_arg, dict):
+                dtype = defaultdict(lambda: None)  # type: ignore[var-annotated]
+                dtype.update(dtype_arg)
+            elif dtype_arg is not None and pandas_dtype(dtype_arg) in (
+                np.str_,
+                np.object_,
+            ):
+                dtype = defaultdict(lambda: dtype_arg)
+            else:
+                dtype = None
+
+            if dtype is not None:
+                new_col_dict = {}
+                for k, v in col_dict.items():
+                    d = (
+                        dtype[k]
+                        if pandas_dtype(dtype[k]) in (np.str_, np.object_)
+                        else None
+                    )
+                    new_col_dict[k] = Series(v, index=index, dtype=d, copy=False)
+            else:
+                new_col_dict = col_dict
+
+            df = DataFrame(
+                new_col_dict,
+                columns=columns,
+                index=index,
+                copy=not using_copy_on_write(),
+            )

             self._currow += new_rows
         return df
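
A standalone sketch of the dtype normalization introduced above (helper names like normalize_dtype_arg and build_columns are illustrative, not pandas API): a dict dtype becomes a per-column lookup that defaults to None, a scalar str/object dtype applies to every column, and only str/object requests are forced when the column Series are built.

from collections import defaultdict

import numpy as np
import pandas as pd
from pandas.api.types import pandas_dtype


def normalize_dtype_arg(dtype_arg):
    """Return a per-column dtype lookup, or None if nothing needs forcing."""
    if isinstance(dtype_arg, dict):
        lookup = defaultdict(lambda: None)  # columns not listed get no override
        lookup.update(dtype_arg)
        return lookup
    if dtype_arg is not None and pandas_dtype(dtype_arg) in (np.str_, np.object_):
        return defaultdict(lambda: dtype_arg)  # global str/object request
    return None


def build_columns(col_dict, index, dtype_arg):
    lookup = normalize_dtype_arg(dtype_arg)
    if lookup is None:
        return col_dict
    out = {}
    for name, values in col_dict.items():
        requested = lookup[name]
        # Only str/object requests are forced here; other dtypes were already
        # applied by the parser itself.
        force = (
            requested
            if requested is not None
            and pandas_dtype(requested) in (np.str_, np.object_)
            else None
        )
        out[name] = pd.Series(values, index=index, dtype=force, copy=False)
    return out


# Example: force only column "a" to object while "b" keeps its parsed dtype.
cols = build_columns(
    {"a": np.array(["x", "y"]), "b": np.array([1, 2])},
    index=pd.RangeIndex(2),
    dtype_arg={"a": object},
)
print({k: v.dtype for k, v in cols.items()})  # a -> object, b -> int64
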
35 changes: 35 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -574,6 +574,41 @@ def test_string_inference(all_parsers):
     tm.assert_frame_equal(result, expected)


+@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
+def test_string_inference_object_dtype(all_parsers, dtype):
+    # GH#56047
+    pytest.importorskip("pyarrow")
+
+    data = """a,b
+x,a
+y,a
+z,a"""
+    parser = all_parsers
+    with pd.option_context("future.infer_string", True):
+        result = parser.read_csv(StringIO(data), dtype=dtype)
+
+    expected = DataFrame(
+        {
+            "a": pd.Series(["x", "y", "z"], dtype=object),
+            "b": pd.Series(["a", "a", "a"], dtype=object),
+        },
+        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+    with pd.option_context("future.infer_string", True):
+        result = parser.read_csv(StringIO(data), dtype={"a": dtype})
+
+    expected = DataFrame(
+        {
+            "a": pd.Series(["x", "y", "z"], dtype=object),
+            "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
+        },
+        columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_accurate_parsing_of_large_integers(all_parsers):
     # GH#52505
     data = """SYMBOL,MOMENT,ID,ID_DEAL
