Merge remote-tracking branch 'upstream/main' into chained_view_only

pandas-dev · Dec 4, 2023 · 0dffb5a · 0dffb5a
2 parents 93df035 + e0f3a18
commit 0dffb5a
Show file tree

Hide file tree

Showing 223 changed files with 3,595 additions and 2,520 deletions.
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -26,7 +26,7 @@ jobs:
     timeout-minutes: 90
     strategy:
       matrix:
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
         include:
@@ -69,6 +69,10 @@ jobs:
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
             pandas_copy_on_write: "1"
+          - name: "Copy-on-Write 3.12"
+            env_file: actions-312.yaml
+            pattern: "not slow and not network and not single_cpu"
+            pandas_copy_on_write: "1"
           - name: "Copy-on-Write 3.11 (warnings)"
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -190,7 +194,7 @@ jobs:
     strategy:
       matrix:
         os: [macos-latest, windows-latest]
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
       fail-fast: false
     runs-on: ${{ matrix.os }}
     name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -321,7 +325,7 @@ jobs:
     #    To freeze this file, uncomment the ``if: false`` condition, and migrate the jobs
     #    to the corresponding posix/windows-macos/sdist etc. workflows.
     # Feel free to modify this comment as necessary.
-    #if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+    if: false # Uncomment this to freeze the workflow, comment it to unfreeze
     defaults:
       run:
         shell: bash -eou pipefail {0}

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -240,8 +240,9 @@ repos:
             # pytest raises without context
             |\s\ pytest.raises
 
+            # TODO
             # pytest.warns (use tm.assert_produces_warning instead)
-            |pytest\.warns
+            # |pytest\.warns
 
             # os.remove
             |os\.remove

diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
@@ -6,12 +6,12 @@
 import pandas as pd
 from pandas import (
     DataFrame,
+    Index,
     Series,
     Timestamp,
     date_range,
     to_timedelta,
 )
-import pandas._testing as tm
 from pandas.core.algorithms import checked_add_with_arr
 
 from .pandas_vb_common import numeric_dtypes
@@ -323,8 +323,10 @@ class IndexArithmetic:
 
     def setup(self, dtype):
         N = 10**6
-        indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"}
-        self.index = getattr(tm, indexes[dtype])(N)
+        if dtype == "float":
+            self.index = Index(np.arange(N), dtype=np.float64)
+        elif dtype == "int":
+            self.index = Index(np.arange(N), dtype=np.int64)
 
     def time_add(self, dtype):
         self.index + 2

diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -6,8 +6,6 @@
 
 import pandas as pd
 
-from .pandas_vb_common import tm
-
 try:
     from pandas.api.types import union_categoricals
 except ImportError:
@@ -189,7 +187,7 @@ def setup(self):
         N = 10**5
         ncats = 15
 
-        self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
+        self.s_str = pd.Series(np.random.randint(0, ncats, size=N).astype(str))
         self.s_str_cat = pd.Series(self.s_str, dtype="category")
         with warnings.catch_warnings(record=True):
             str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True)
@@ -242,7 +240,7 @@ def time_categorical_series_is_monotonic_decreasing(self):
 class Contains:
     def setup(self):
         N = 10**5
-        self.ci = tm.makeCategoricalIndex(N)
+        self.ci = pd.CategoricalIndex(np.arange(N))
         self.c = self.ci.values
         self.key = self.ci.categories[0]
 
@@ -325,7 +323,7 @@ def time_sort_values(self):
 class SearchSorted:
     def setup(self):
         N = 10**5
-        self.ci = tm.makeCategoricalIndex(N).sort_values()
+        self.ci = pd.CategoricalIndex(np.arange(N)).sort_values()
         self.c = self.ci.values
         self.key = self.ci.categories[1]
 

diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
@@ -47,7 +47,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pymysql>=1.0.2
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
@@ -0,0 +1,63 @@
+name: pandas-dev-312
+channels:
+  - conda-forge
+dependencies:
+  - python=3.12
+
+  # build dependencies
+  - versioneer[toml]
+  - cython>=0.29.33
+  - meson[ninja]=1.2.1
+  - meson-python=0.13.1
+
+  # test dependencies
+  - pytest>=7.3.2
+  - pytest-cov
+  - pytest-xdist>=2.2.0
+  - pytest-localserver>=0.7.1
+  - pytest-qt>=4.2.0
+  - boto3
+
+  # required dependencies
+  - python-dateutil
+  - numpy<2
+  - pytz
+
+  # optional dependencies
+  - beautifulsoup4>=4.11.2
+  - blosc>=1.21.3
+  - bottleneck>=1.3.6
+  - fastparquet>=2022.12.0
+  - fsspec>=2022.11.0
+  - html5lib>=1.1
+  - hypothesis>=6.46.1
+  - gcsfs>=2022.11.0
+  - jinja2>=3.1.2
+  - lxml>=4.9.2
+  - matplotlib>=3.6.3
+  # - numba>=0.56.4
+  - numexpr>=2.8.4
+  - odfpy>=1.4.1
+  - qtpy>=2.3.0
+  - pyqt>=5.15.9
+  - openpyxl>=3.1.0
+  - psycopg2>=2.9.6
+  - pyarrow>=10.0.1
+  - pymysql>=1.0.2
+  - pyreadstat>=1.2.0
+  # - pytables>=3.8.0
+  - python-calamine>=0.1.7
+  - pyxlsb>=1.0.10
+  - s3fs>=2022.11.0
+  - scipy>=1.10.0
+  - sqlalchemy>=2.0.0
+  - tabulate>=0.9.0
+  - xarray>=2022.12.0
+  - xlrd>=2.0.1
+  - xlsxwriter>=3.0.5
+  - zstandard>=0.19.0
+
+  - pip:
+    - adbc-driver-postgresql>=0.8.0
+    - adbc-driver-sqlite>=0.8.0
+    - tzdata>=2022.7
diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml
@@ -48,7 +48,7 @@ dependencies:
   - pyqt=5.15.9
   - pyreadstat=1.2.0
   - pytables=3.8.0
-  - python-calamine=0.1.6
+  - python-calamine=0.1.7
   - pyxlsb=1.0.10
   - s3fs=2022.11.0
   - scipy=1.10.0

diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
@@ -44,8 +44,9 @@ and consult the ``Linux`` instructions below.
 **macOS**
 
 To use the :ref:`mamba <contributing.mamba>`-based compilers, you will need to install the
-Developer Tools using ``xcode-select --install``. Otherwise
-information about compiler installation can be found here:
+Developer Tools using ``xcode-select --install``.
+
+If you prefer to use a different compiler, general information can be found here:
 https://devguide.python.org/setup/#macos
 
 **Linux**
@@ -86,12 +87,12 @@ Before we begin, please:
 Option 1: using mamba (recommended)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-* Install `mamba <https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html>`_
+* Install miniforge to get `mamba <https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html>`_
 * Make sure your mamba is up to date (``mamba update mamba``)
+* Create and activate the ``pandas-dev`` mamba environment using the following commands:
 
 .. code-block:: none
 
-   # Create and activate the build environment
    mamba env create --file environment.yml
    mamba activate pandas-dev
 
@@ -273,13 +274,21 @@ uses to import the extension from the build folder, which may cause errors such
    You will need to repeat this step each time the C extensions change, for example
    if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``.
 
+**Checking the build**
+
 At this point you should be able to import pandas from your locally built version::
 
    $ python
    >>> import pandas
    >>> print(pandas.__version__)  # note: the exact output may differ
    2.0.0.dev0+880.g2b9e661fbb.dirty
 
+
+At this point you may want to try
+`running the test suite <https://pandas.pydata.org/docs/dev/development/contributing_codebase.html#running-the-test-suite>`_.
+
+**Keeping up to date with the latest build**
+
 When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified.
 By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson's
 output when importing pandas, you can set the environment variable ``MESONPY_EDITABLE_VERBOSE``. For example, this would be::

diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
@@ -449,9 +449,13 @@ which will be triggered when the tag is pushed.
     git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0"
     git push upstream main --follow-tags
 
-3. Build the source distribution (git must be in the tag commit)::
+3. Download the source distribution and wheels from the `wheel staging area <https://anaconda.org/scientific-python-nightly-wheels/pandas>`_.
+   Be careful to make sure that no wheels are missing (e.g. due to failed builds).
 
-    ./setup.py sdist --formats=gztar --quiet
+   Running scripts/download_wheels.sh with the version that you want to download wheels/the sdist for should do the trick.
+   This script will make a ``dist`` folder inside your clone of pandas and put the downloaded wheels and sdist there::
+
+    scripts/download_wheels.sh <VERSION>
 
 4. Create a `new GitHub release <https://github.com/pandas-dev/pandas/releases/new>`_:
 
@@ -463,23 +467,19 @@ which will be triggered when the tag is pushed.
    - Set as the latest release: Leave checked, unless releasing a patch release for an older version
      (e.g. releasing 1.4.5 after 1.5 has been released)
 
-5. The GitHub release will after some hours trigger an
+5. Upload wheels to PyPI::
+
+    twine upload pandas/dist/pandas-<version>*.{whl,tar.gz} --skip-existing
+
+6. The GitHub release will after some hours trigger an
    `automated conda-forge PR <https://github.com/conda-forge/pandas-feedstock/pulls>`_.
+   (If you don't want to wait, you can open an issue titled ``@conda-forge-admin, please update version`` to trigger the bot.)
    Merge it once the CI is green, and it will generate the conda-forge packages.
+
    In case a manual PR needs to be done, the version, sha256 and build fields are the
    ones that usually need to be changed. If anything else in the recipe has changed since
    the last release, those changes should be available in ``ci/meta.yaml``.
 
-6. Packages for supported versions in PyPI are built automatically from our CI.
-   Once all packages are build download all wheels from the
-   `Anaconda repository <https://anaconda.org/multibuild-wheels-staging/pandas/files?version=\<version\>>`_
-   where our CI published them to the ``dist/`` directory in your local pandas copy.
-   You can use the script ``scripts/download_wheels.sh`` to download all wheels at once.
-
-7. Upload wheels to PyPI::
-
-    twine upload pandas/dist/pandas-<version>*.{whl,tar.gz} --skip-existing
-
 Post-Release
 ````````````
 

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -281,7 +281,7 @@ xlrd                      2.0.1              excel           Reading Excel
 xlsxwriter                3.0.5              excel           Writing Excel
 openpyxl                  3.1.0              excel           Reading / writing for xlsx files
 pyxlsb                    1.0.10             excel           Reading for xlsb files
-python-calamine           0.1.6              excel           Reading for xls/xlsx/xlsb/ods files
+python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsb/ods files
 ========================= ================== =============== =============================================================
 
 HTML

diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
@@ -269,7 +269,7 @@ using ``fillna`` if you wish).
 .. ipython:: python
 
    df2 = df.copy()
-   df2["three"]["a"] = 1.0
+   df2.loc["a", "three"] = 1.0
    df
    df2
    df + df2

diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst
@@ -26,7 +26,7 @@ Previous behavior
 -----------------
 
 pandas indexing behavior is tricky to understand. Some operations return views while
-other return copies. Depending on the result of the operation, mutation one object
+others return copies. Depending on the result of the operation, mutating one object
 might accidentally mutate another:
 
 .. ipython:: python
@@ -138,6 +138,7 @@ Chained assignment references a technique where an object is updated through
 two subsequent indexing operations, e.g.
 
 .. ipython:: python
+    :okwarning:
 
     with pd.option_context("mode.copy_on_write", False):
         df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})

diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst
@@ -29,11 +29,10 @@ Highlights include:
 
    This would previously segfault:
 
-   .. ipython:: python
+   .. code-block:: python
 
       df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])})
       df["A"].iloc[0] = np.nan
-      df
 
    The recommended way to do this type of assignment is:
 

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
@@ -23,9 +23,11 @@ Bug fixes
 ~~~~~~~~~
 - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`)
 - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
+- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
 - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
 - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
+- Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
 - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
 
 .. ---------------------------------------------------------------------------