Merge branch 'main' into cow_dead_regf

pandas-dev · Oct 21, 2023 · 4227598 · 4227598
2 parents 4664c52 + 00f10db
commit 4227598
Show file tree

Hide file tree

Showing 208 changed files with 1,899 additions and 1,276 deletions.
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -236,7 +236,7 @@ jobs:
           . ~/virtualenvs/pandas-dev/bin/activate
           python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
           python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
-          python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
+          python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
           python -m pip install --no-cache-dir --no-build-isolation -e .
           python -m pip list --no-cache-dir
           export PANDAS_CI=1
@@ -274,7 +274,7 @@ jobs:
           /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
           . ~/virtualenvs/pandas-dev/bin/activate
           python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
-          python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
+          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
           python -m pip install --no-cache-dir --no-build-isolation -e .
           python -m pip list --no-cache-dir
 
@@ -347,7 +347,7 @@ jobs:
           python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
           python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
           python -m pip install versioneer[toml]
-          python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
+          python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
           python -m pip install -ve . --no-build-isolation --no-index --no-deps
           python -m pip list
 

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -176,9 +176,8 @@ fi
 
 ### SINGLE-PAGE DOCS ###
 if [[ -z "$CHECK" || "$CHECK" == "single-docs" ]]; then
-    python doc/make.py --warnings-are-errors --single pandas.Series.value_counts
-    python doc/make.py --warnings-are-errors --single pandas.Series.str.split
-    python doc/make.py clean
+    python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.value_counts
+    python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.str.split
 fi
 
 exit $RET
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
@@ -7,7 +7,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml
@@ -8,7 +8,7 @@ dependencies:
   - versioneer[toml]
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
 
   # test dependencies
   - pytest>=7.3.2
@@ -29,5 +29,4 @@ dependencies:
     - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
     - "--pre"
     - "numpy"
-    - "scipy"
     - "tzdata>=2022.1"
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
@@ -7,7 +7,7 @@ dependencies:
   # build dependencies
   - versioneer[toml]
   - meson[ninja]=1.2.1
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson-python=0.13.1
 
   # test dependencies

diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml
@@ -8,7 +8,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
@@ -9,7 +9,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml
@@ -6,7 +6,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython>=0.29.33, <3.0.3
+  - cython>=0.29.33
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/doc/make.py b/doc/make.py
@@ -45,12 +45,14 @@ def __init__(
         single_doc=None,
         verbosity=0,
         warnings_are_errors=False,
+        no_browser=False,
     ) -> None:
         self.num_jobs = num_jobs
         self.include_api = include_api
         self.whatsnew = whatsnew
         self.verbosity = verbosity
         self.warnings_are_errors = warnings_are_errors
+        self.no_browser = no_browser
 
         if single_doc:
             single_doc = self._process_single_doc(single_doc)
@@ -234,11 +236,11 @@ def html(self):
             os.remove(zip_fname)
 
         if ret_code == 0:
-            if self.single_doc_html is not None:
+            if self.single_doc_html is not None and not self.no_browser:
                 self._open_browser(self.single_doc_html)
             else:
                 self._add_redirects()
-                if self.whatsnew:
+                if self.whatsnew and not self.no_browser:
                     self._open_browser(os.path.join("whatsnew", "index.html"))
 
         return ret_code
@@ -349,6 +351,12 @@ def main():
         action="store_true",
         help="fail if warnings are raised",
     )
+    argparser.add_argument(
+        "--no-browser",
+        help="Don't open browser",
+        default=False,
+        action="store_true",
+    )
     args = argparser.parse_args()
 
     if args.command not in cmds:
@@ -374,6 +382,7 @@ def main():
         args.single,
         args.verbosity,
         args.warnings_are_errors,
+        args.no_browser,
     )
     return getattr(builder, args.command)()
 

diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
@@ -540,7 +540,7 @@ xfail during the testing phase. To do so, use the ``request`` fixture:
 
     def test_xfail(request):
         mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here")
-        request.node.add_marker(mark)
+        request.applymarker(mark)
 
 xfail is not to be used for tests involving failure due to invalid user arguments.
 For these tests, we need to verify the correct exception type and error message

diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -164,24 +164,24 @@ The pandas equivalent would be:
 
     tips.groupby("sex").size()
 
-Notice that in the pandas code we used :meth:`~pandas.core.groupby.DataFrameGroupBy.size` and not
-:meth:`~pandas.core.groupby.DataFrameGroupBy.count`. This is because
-:meth:`~pandas.core.groupby.DataFrameGroupBy.count` applies the function to each column, returning
+Notice that in the pandas code we used :meth:`.DataFrameGroupBy.size` and not
+:meth:`.DataFrameGroupBy.count`. This is because
+:meth:`.DataFrameGroupBy.count` applies the function to each column, returning
 the number of ``NOT NULL`` records within each.
 
 .. ipython:: python
 
     tips.groupby("sex").count()
 
-Alternatively, we could have applied the :meth:`~pandas.core.groupby.DataFrameGroupBy.count` method
+Alternatively, we could have applied the :meth:`.DataFrameGroupBy.count` method
 to an individual column:
 
 .. ipython:: python
 
     tips.groupby("sex")["total_bill"].count()
 
 Multiple functions can also be applied at once. For instance, say we'd like to see how tip amount
-differs by day of the week - :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` allows you to pass a dictionary
+differs by day of the week - :meth:`.DataFrameGroupBy.agg` allows you to pass a dictionary
 to your grouped DataFrame, indicating which functions to apply to specific columns.
 
 .. code-block:: sql

diff --git a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst
@@ -266,7 +266,7 @@ For more information about :meth:`~DataFrame.pivot_table`, see the user guide se
 
     ::
 
-        air_quality.groupby(["parameter", "location"]).mean()
+        air_quality.groupby(["parameter", "location"])[["value"]].mean()
 
 .. raw:: html
 

diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
@@ -525,7 +525,7 @@ See the :ref:`Grouping section <groupby>`.
    df
 
 Grouping by a column label, selecting column labels, and then applying the
-:meth:`~pandas.core.groupby.DataFrameGroupBy.sum` function to the resulting
+:meth:`.DataFrameGroupBy.sum` function to the resulting
 groups:
 
 .. ipython:: python
@@ -763,12 +763,14 @@ Parquet
 Writing to a Parquet file:
 
 .. ipython:: python
+   :okwarning:
 
    df.to_parquet("foo.parquet")
 
 Reading from a Parquet file using :func:`read_parquet`:
 
 .. ipython:: python
+   :okwarning:
 
    pd.read_parquet("foo.parquet")
 

diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst
@@ -7,8 +7,8 @@ Copy-on-Write (CoW)
 *******************
 
 Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the
-optimizations that become possible through CoW are implemented and supported. A complete list
-can be found at :ref:`Copy-on-Write optimizations <copy_on_write.optimizations>`.
+optimizations that become possible through CoW are implemented and supported. All possible
+optimizations are supported starting from pandas 2.1.
 
 We expect that CoW will be enabled by default in version 3.0.
 
@@ -154,66 +154,86 @@ With copy on write this can be done by using ``loc``.
 
     df.loc[df["bar"] > 5, "foo"] = 100
 
+Read-only NumPy arrays
+----------------------
+
+Accessing the underlying NumPy array of a DataFrame will return a read-only array if the array
+shares data with the initial DataFrame.
+
+The array is a copy if the initial DataFrame consists of more than one array:
+
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
+    df.to_numpy()
+
+The array shares data with the DataFrame if the DataFrame consists of only one NumPy array:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    df.to_numpy()
+
+This array is read-only, which means that it can't be modified inplace:
+
+.. ipython:: python
+    :okexcept:
+
+    arr = df.to_numpy()
+    arr[0, 0] = 100
+
+The same holds true for a Series, since a Series always consists of a single array.
+
+There are two potential solutions to this:
+
+- Trigger a copy manually if you want to avoid updating DataFrames that share memory with your array.
+- Make the array writeable. This is a more performant solution but circumvents Copy-on-Write rules, so
+  it should be used with caution.
+
+.. ipython:: python
+
+    arr = df.to_numpy()
+    arr.flags.writeable = True
+    arr[0, 0] = 100
+    arr
+
+Patterns to avoid
+-----------------
+
+No defensive copy will be performed if two objects share the same data while
+you are modifying one object inplace.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df2 = df.reset_index()
+    df2.iloc[0, 0] = 100
+
+This creates two objects that share data and thus the setitem operation will trigger a
+copy. This is not necessary if the initial object ``df`` isn't needed anymore.
+Simply reassigning to the same variable will invalidate the reference that is
+held by the object.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df = df.reset_index()
+    df.iloc[0, 0] = 100
+
+No copy is necessary in this example.
+Creating multiple references keeps unnecessary references alive
+and thus will hurt performance with Copy-on-Write.
+
 .. _copy_on_write.optimizations:
 
 Copy-on-Write optimizations
 ---------------------------
 
 A new lazy copy mechanism that defers the copy until the object in question is modified
 and only if this object shares data with another object. This mechanism was added to
-following methods:
-
-  - :meth:`DataFrame.reset_index` / :meth:`Series.reset_index`
-  - :meth:`DataFrame.set_index`
-  - :meth:`DataFrame.set_axis` / :meth:`Series.set_axis`
-  - :meth:`DataFrame.set_flags` / :meth:`Series.set_flags`
-  - :meth:`DataFrame.rename_axis` / :meth:`Series.rename_axis`
-  - :meth:`DataFrame.reindex` / :meth:`Series.reindex`
-  - :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like`
-  - :meth:`DataFrame.assign`
-  - :meth:`DataFrame.drop`
-  - :meth:`DataFrame.dropna` / :meth:`Series.dropna`
-  - :meth:`DataFrame.select_dtypes`
-  - :meth:`DataFrame.align` / :meth:`Series.align`
-  - :meth:`Series.to_frame`
-  - :meth:`DataFrame.rename` / :meth:`Series.rename`
-  - :meth:`DataFrame.add_prefix` / :meth:`Series.add_prefix`
-  - :meth:`DataFrame.add_suffix` / :meth:`Series.add_suffix`
-  - :meth:`DataFrame.drop_duplicates` / :meth:`Series.drop_duplicates`
-  - :meth:`DataFrame.droplevel` / :meth:`Series.droplevel`
-  - :meth:`DataFrame.reorder_levels` / :meth:`Series.reorder_levels`
-  - :meth:`DataFrame.between_time` / :meth:`Series.between_time`
-  - :meth:`DataFrame.filter` / :meth:`Series.filter`
-  - :meth:`DataFrame.head` / :meth:`Series.head`
-  - :meth:`DataFrame.tail` / :meth:`Series.tail`
-  - :meth:`DataFrame.isetitem`
-  - :meth:`DataFrame.pipe` / :meth:`Series.pipe`
-  - :meth:`DataFrame.pop` / :meth:`Series.pop`
-  - :meth:`DataFrame.replace` / :meth:`Series.replace`
-  - :meth:`DataFrame.shift` / :meth:`Series.shift`
-  - :meth:`DataFrame.sort_index` / :meth:`Series.sort_index`
-  - :meth:`DataFrame.sort_values` / :meth:`Series.sort_values`
-  - :meth:`DataFrame.squeeze` / :meth:`Series.squeeze`
-  - :meth:`DataFrame.swapaxes`
-  - :meth:`DataFrame.swaplevel` / :meth:`Series.swaplevel`
-  - :meth:`DataFrame.take` / :meth:`Series.take`
-  - :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp`
-  - :meth:`DataFrame.to_period` / :meth:`Series.to_period`
-  - :meth:`DataFrame.truncate`
-  - :meth:`DataFrame.iterrows`
-  - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
-  - :meth:`DataFrame.fillna` / :meth:`Series.fillna`
-  - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate`
-  - :meth:`DataFrame.ffill` / :meth:`Series.ffill`
-  - :meth:`DataFrame.bfill` / :meth:`Series.bfill`
-  - :meth:`DataFrame.where` / :meth:`Series.where`
-  - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
-  - :meth:`DataFrame.astype` / :meth:`Series.astype`
-  - :meth:`DataFrame.convert_dtypes` / :meth:`Series.convert_dtypes`
-  - :meth:`DataFrame.join`
-  - :meth:`DataFrame.eval`
-  - :func:`concat`
-  - :func:`merge`
+methods that don't require a copy of the underlying data. Popular examples are :meth:`DataFrame.drop` for ``axis=1``
+and :meth:`DataFrame.rename`.
 
 These methods return views when Copy-on-Write is enabled, which provides a significant
 performance improvement compared to the regular execution.
-Original file line number
+Diff line change
@@ Expand Up @@
         ::
-            air_quality.groupby(["parameter", "location"]).mean()
+            air_quality.groupby(["parameter", "location"])[["value"]].mean()
     .. raw:: html
@@ Expand Down @@