Merge remote-tracking branch 'upstream/main' into redepr-pct-change
Charlie-XIAO committed Oct 15, 2023
2 parents e7c6296 + 7b8c6f6 commit 80e190f
Showing 4 changed files with 105 additions and 56 deletions.
130 changes: 75 additions & 55 deletions doc/source/user_guide/copy_on_write.rst
@@ -7,8 +7,8 @@ Copy-on-Write (CoW)
*******************

Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the
optimizations that become possible through CoW are implemented and supported. A complete list
can be found at :ref:`Copy-on-Write optimizations <copy_on_write.optimizations>`.
All possible optimizations are supported starting from pandas 2.1.

We expect that CoW will be enabled by default in version 3.0.

@@ -154,66 +154,86 @@ With copy on write this can be done by using ``loc``.
    df.loc[df["bar"] > 5, "foo"] = 100

Read-only NumPy arrays
----------------------

Accessing the underlying NumPy array of a DataFrame will return a read-only array if the array
shares data with the initial DataFrame:

The array is a copy if the initial DataFrame consists of more than one array:


.. ipython:: python

    df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
    df.to_numpy()

The array shares data with the DataFrame if the DataFrame consists of only one NumPy array:

.. ipython:: python

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df.to_numpy()

This array is read-only, which means that it can't be modified in place:

.. ipython:: python
    :okexcept:

    arr = df.to_numpy()
    arr[0, 0] = 100

The same holds true for a Series, since a Series always consists of a single array.
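That the single-block DataFrame and its array really share one buffer can be checked with ``numpy.shares_memory`` (a quick sketch, not part of the original page):

```python
import numpy as np
import pandas as pd

# one homogeneous int64 block -> to_numpy() can return a view
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
arr = df.to_numpy()
print(np.shares_memory(arr, df["a"].to_numpy()))

# mixed dtypes -> the values must be consolidated into a fresh array
df2 = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
print(np.shares_memory(df2.to_numpy(), df2["a"].to_numpy()))
```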

There are two potential solutions to this:

- Trigger a copy manually if you want to avoid updating DataFrames that share memory with your array.
- Make the array writeable. This is a more performant solution but circumvents Copy-on-Write rules, so
it should be used with caution.

.. ipython:: python

    arr = df.to_numpy()
    arr.flags.writeable = True
    arr[0, 0] = 100
    arr

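The first solution, triggering the copy manually, can be sketched as follows (a minimal example; the copy is always writeable and detached from ``df``):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# copy() detaches the array from the DataFrame, so the result is
# writeable and mutating it cannot leak back into df
arr = df.to_numpy().copy()
arr[0, 0] = 100
print(df.iloc[0, 0])  # df is unchanged
```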
Patterns to avoid
-----------------

No up-front defensive copy is performed when two objects share the same data;
instead, the copy is deferred until one of the objects is modified in place.

.. ipython:: python

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df2 = df.reset_index()
    df2.iloc[0, 0] = 100

This creates two objects that share data and thus the setitem operation will trigger a
copy. This is not necessary if the initial object ``df`` isn't needed anymore.
Simply reassigning to the same variable will invalidate the reference that is
held by the object.

.. ipython:: python

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = df.reset_index()
    df.iloc[0, 0] = 100

No copy is necessary in this example.
Creating multiple references keeps unnecessary references alive
and thus will hurt performance with Copy-on-Write.

.. _copy_on_write.optimizations:

Copy-on-Write optimizations
---------------------------

A new lazy copy mechanism defers the copy until the object in question is modified,
and only if that object shares data with another object. This mechanism was added to
the following methods:

- :meth:`DataFrame.reset_index` / :meth:`Series.reset_index`
- :meth:`DataFrame.set_index`
- :meth:`DataFrame.set_axis` / :meth:`Series.set_axis`
- :meth:`DataFrame.set_flags` / :meth:`Series.set_flags`
- :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis`
- :meth:`DataFrame.reindex` / :meth:`Series.reindex`
- :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like`
- :meth:`DataFrame.assign`
- :meth:`DataFrame.drop`
- :meth:`DataFrame.dropna` / :meth:`Series.dropna`
- :meth:`DataFrame.select_dtypes`
- :meth:`DataFrame.align` / :meth:`Series.align`
- :meth:`Series.to_frame`
- :meth:`DataFrame.rename` / :meth:`Series.rename`
- :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix`
- :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix`
- :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates`
- :meth:`DataFrame.droplevel` / :meth:`Series.droplevel`
- :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels`
- :meth:`DataFrame.between_time` / :meth:`Series.between_time`
- :meth:`DataFrame.filter` / :meth:`Series.filter`
- :meth:`DataFrame.head` / :meth:`Series.head`
- :meth:`DataFrame.tail` / :meth:`Series.tail`
- :meth:`DataFrame.isetitem`
- :meth:`DataFrame.pipe` / :meth:`Series.pipe`
- :meth:`DataFrame.pop` / :meth:`Series.pop`
- :meth:`DataFrame.replace` / :meth:`Series.replace`
- :meth:`DataFrame.shift` / :meth:`Series.shift`
- :meth:`DataFrame.sort_index` / :meth:`Series.sort_index`
- :meth:`DataFrame.sort_values` / :meth:`Series.sort_values`
- :meth:`DataFrame.squeeze` / :meth:`Series.squeeze`
- :meth:`DataFrame.swapaxes`
- :meth:`DataFrame.swaplevel` / :meth:`Series.swaplevel`
- :meth:`DataFrame.take` / :meth:`Series.take`
- :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp`
- :meth:`DataFrame.to_period` / :meth:`Series.to_period`
- :meth:`DataFrame.truncate`
- :meth:`DataFrame.iterrows`
- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
- :meth:`DataFrame.fillna` / :meth:`Series.fillna`
- :meth:`DataFrame.interpolate` / :meth:`Series.interpolate`
- :meth:`DataFrame.ffill` / :meth:`Series.ffill`
- :meth:`DataFrame.bfill` / :meth:`Series.bfill`
- :meth:`DataFrame.where` / :meth:`Series.where`
- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
- :meth:`DataFrame.astype` / :meth:`Series.astype`
- :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes`
- :meth:`DataFrame.join`
- :meth:`DataFrame.eval`
- :func:`concat`
- :func:`merge`
This mechanism was added to methods that don't require a copy of the underlying data.
Popular examples are :meth:`DataFrame.drop` for ``axis=1`` and :meth:`DataFrame.rename`.

These methods return views when Copy-on-Write is enabled, which provides a significant
performance improvement compared to the regular execution.
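The view-returning behavior can be observed with ``numpy.shares_memory`` (a sketch: the option call below exists in pandas 2.x and is unnecessary once CoW is the default, hence the guard):

```python
import numpy as np
import pandas as pd

try:
    pd.set_option("mode.copy_on_write", True)
except Exception:
    # on versions where CoW is always on, the option may no longer exist
    pass

df = pd.DataFrame({"a": [1, 2, 3], "b": [4.5, 5.5, 6.5]})
dropped = df.drop(columns="a")  # axis=1 drop: no data needs to be copied
# the surviving column still points at the same buffer until one side is written
print(np.shares_memory(df["b"].to_numpy(), dropped["b"].to_numpy()))
```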
11 changes: 10 additions & 1 deletion pandas/core/arrays/arrow/array.py
@@ -1627,6 +1627,15 @@ def _reduce(
        ------
        TypeError : subclass does not define reductions
        """
        result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
        if isinstance(result, pa.Array):
            return type(self)(result)
        else:
            return result

    def _reduce_calc(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)

        if keepdims:
@@ -1637,7 +1646,7 @@ def _reduce(
                [pa_result],
                type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]),
            )
            return type(self)(result)
            return result

        if pc.is_null(pa_result).as_py():
            return self.dtype.na_value
11 changes: 11 additions & 0 deletions pandas/core/arrays/string_arrow.py
@@ -502,6 +502,17 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
    def _convert_int_dtype(self, result):
        return Int64Dtype().__from_arrow__(result)

    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
        if name in ("argmin", "argmax") and isinstance(result, pa.Array):
            return self._convert_int_dtype(result)
        elif isinstance(result, pa.Array):
            return type(self)(result)
        else:
            return result
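The shape of this refactor — a shared ``_reduce_calc`` doing the raw computation, with ``_reduce`` overridden per class only to decide how to wrap the result — can be illustrated with plain classes (a toy sketch; the class names and list payload are invented, only the method split mirrors the diff):

```python
class ToyArrowArray:
    """Stand-in for ArrowExtensionArray: owns the raw reduction."""

    def __init__(self, values):
        self.values = list(values)

    def _reduce_calc(self, name, *, keepdims=False):
        funcs = {
            "max": max,
            "min": min,
            "argmax": lambda v: v.index(max(v)),
            "argmin": lambda v: v.index(min(v)),
        }
        value = funcs[name](self.values)
        # with keepdims=True the raw result stays array-like
        return [value] if keepdims else value

    def _reduce(self, name, *, keepdims=False):
        result = self._reduce_calc(name, keepdims=keepdims)
        if isinstance(result, list):
            return type(self)(result)  # re-wrap array-like results
        return result


class ToyStringArray(ToyArrowArray):
    """Stand-in for ArrowStringArray: same calc, different wrapping."""

    def _reduce(self, name, *, keepdims=False):
        result = self._reduce_calc(name, keepdims=keepdims)
        if name in ("argmin", "argmax") and isinstance(result, list):
            # pandas converts these to an integer dtype instead of re-wrapping
            return result[0]
        if isinstance(result, list):
            return type(self)(result)
        return result
```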

    def _rank(
        self,
        *,
9 changes: 9 additions & 0 deletions pandas/tests/frame/test_reductions.py
@@ -1073,6 +1073,15 @@ def test_idxmax_arrow_types(self):
        expected = Series([2, 1], index=["a", "b"])
        tm.assert_series_equal(result, expected)

        df = DataFrame({"a": ["b", "c", "a"]}, dtype="string[pyarrow]")
        result = df.idxmax(numeric_only=False)
        expected = Series([1], index=["a"])
        tm.assert_series_equal(result, expected)

        result = df.idxmin(numeric_only=False)
        expected = Series([2], index=["a"])
        tm.assert_series_equal(result, expected)

    def test_idxmax_axis_2(self, float_frame):
        frame = float_frame
        msg = "No axis named 2 for object type DataFrame"
