Backport PR pandas-dev#57402: BUG: wrong future Warning on string assignment in certain condition
meeseeksmachine · Feb 16, 2024 · 5db0ed7 · 5db0ed7
1 parent 11818ad
commit 5db0ed7
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 9 deletions.
diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst
@@ -24,6 +24,7 @@ Fixed regressions
 - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`)
 - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`)
 - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`)
+- Fixed regression in :meth:`DataFrame.loc` unnecessarily emitting an "incompatible dtype" warning when expanding with a partial row indexer and multiple columns (see `PDEP6 <https://pandas.pydata.org/pdeps/0006-ban-upcasting.html>`_) (:issue:`56503`)
 - Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`)
 - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`)
 - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -647,6 +647,20 @@ def infer_fill_value(val):
     return np.nan
 
 
+def construct_1d_array_from_inferred_fill_value(
+    value: object, length: int
+) -> ArrayLike:
+    # Build a 1-D array with `length` entries whose dtype is inferred from `value`:
+    #  construct an array from `value` (1-entry index), then .take -1 for each entry
+    from pandas.core.algorithms import take_nd
+    from pandas.core.construction import sanitize_array
+    from pandas.core.indexes.base import Index
+
+    arr = sanitize_array(value, Index(range(1)), copy=False)  # dtype comes from here
+    taker = -1 * np.ones(length, dtype=np.intp)  # `length` indexers, every one -1
+    return take_nd(arr, taker)  # NOTE(review): -1 presumably means NA fill; confirm
+
+
 def maybe_fill(arr: np.ndarray) -> np.ndarray:
     """
     Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype.

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -57,6 +57,7 @@
     ABCSeries,
 )
 from pandas.core.dtypes.missing import (
+    construct_1d_array_from_inferred_fill_value,
     infer_fill_value,
     is_valid_na_for_dtype,
     isna,
@@ -68,7 +69,6 @@
 from pandas.core.construction import (
     array as pd_array,
     extract_array,
-    sanitize_array,
 )
 from pandas.core.indexers import (
     check_array_indexer,
@@ -844,7 +844,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None:
         if self.ndim != 2:
             return
 
-        orig_key = key
         if isinstance(key, tuple) and len(key) > 1:
             # key may be a tuple if we are .loc
             # if length of key is > 1 set key to column part
@@ -862,7 +861,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None:
             keys = self.obj.columns.union(key, sort=False)
             diff = Index(key).difference(self.obj.columns, sort=False)
 
-            if len(diff) and com.is_null_slice(orig_key[0]):
+            if len(diff):
                 # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B"
                 #  is a new column, add the new columns with dtype=np.void
                 #  so that later when we go through setitem_single_column
@@ -1878,12 +1877,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
 
                             self.obj[key] = empty_value
                         elif not is_list_like(value):
-                            # Find our empty_value dtype by constructing an array
-                            #  from our value and doing a .take on it
-                            arr = sanitize_array(value, Index(range(1)), copy=False)
-                            taker = -1 * np.ones(len(self.obj), dtype=np.intp)
-                            empty_value = algos.take_nd(arr, taker)
-                            self.obj[key] = empty_value
+                            self.obj[key] = construct_1d_array_from_inferred_fill_value(
+                                value, len(self.obj)
+                            )
                         else:
                             # FIXME: GH#42099#issuecomment-864326014
                             self.obj[key] = infer_fill_value(value)
@@ -2165,6 +2161,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None:
         else:
             # set value into the column (first attempting to operate inplace, then
             #  falling back to casting if necessary)
+            dtype = self.obj.dtypes.iloc[loc]
+            if dtype == np.void:
+                # This means we're expanding, with multiple columns, e.g.
+                #     df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]})
+                #     df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc')
+                # Columns F and G will initially be set to np.void.
+                # Here, we replace those temporary `np.void` columns with
+                # columns of the appropriate dtype, based on `value`.
+                self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value(
+                    value, len(self.obj)
+                )
             self.obj._mgr.column_setitem(loc, plane_indexer, value)
 
         self.obj._clear_item_cache()

diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -1401,3 +1401,19 @@ def test_full_setter_loc_incompatible_dtype():
     df.loc[:, "a"] = {0: 3, 1: 4}
     expected = DataFrame({"a": [3, 4]})
     tm.assert_frame_equal(df, expected)
+
+
+def test_setitem_partial_row_multiple_columns():
+    # https://github.com/pandas-dev/pandas/issues/56503
+    df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]})
+    # Expanding with a partial-row mask and two new columns should not warn
+    df.loc[df.index <= 1, ["F", "G"]] = (1, "abc")  # mask selects rows 0 and 1 only
+    expected = DataFrame(
+        {
+            "A": [1, 2, 3],
+            "B": [4.0, 5, 6],
+            "F": [1.0, 1, float("nan")],  # row 2 is outside the mask -> NaN
+            "G": ["abc", "abc", float("nan")],  # row 2 is outside the mask -> NaN
+        }
+    )
+    tm.assert_frame_equal(df, expected)  # NOTE(review): no-warning is not asserted here