diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index d61714d9473c6..39cd6b39309cf 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) +- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow-backed strings when the ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow-backed strings when the ``infer_string`` option is set (:issue:`55638`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5903187769f08..39f0ddf172e33 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -49,6 +49,7 @@ from pandas.core.dtypes.common import ( is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -548,6 +549,10 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype + object_index = False + if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: + object_index = True + # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -601,6 +606,13 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + if ( + object_index + and using_pyarrow_string_dtype() + and is_string_dtype(subarr) + ): + # Avoid inference when string option is set + subarr = data elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 85b68c1bc2ec7..aaea5a58723a1 100644 --- 
a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,6 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, + using_pyarrow_string_dtype, ) from pandas._libs import ( @@ -6948,7 +6949,14 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - return Index._with_infer(new_values, name=self.name) + idx = Index._with_infer(new_values, name=self.name) + if ( + using_pyarrow_string_dtype() + and is_string_dtype(idx.dtype) + and new_values.dtype == object + ): + idx = idx.astype(new_values.dtype) + return idx def drop( self, diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index de8df15a9d747..51b0a0a13d90b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1913,7 +1913,7 @@ def test_add_new_column_infer_string(): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype=object), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ccc1249088f9a..fc2e817b1600e 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -753,6 +753,15 @@ def test_setitem_frame_overwrite_with_ea_dtype(self, any_numeric_ea_dtype): ) tm.assert_frame_equal(df, expected) + def test_setitem_string_option_object_index(self): + # GH#55638 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}) + with pd.option_context("future.infer_string", True): + df["b"] = Index(["a", "b"], dtype=object) + expected = DataFrame({"a": [1, 2], "b": Series(["a", "b"], dtype=object)}) + tm.assert_frame_equal(df, expected) + def test_setitem_frame_midx_columns(self): # GH#49121 df = DataFrame({("a", "b"): [10]})