Merge branch 'main' into cow-warnings-frame-setitem

pandas-dev · Dec 2, 2023 · bca55bf · bca55bf
2 parents 1d42814 + 7c6d26f
commit bca55bf
Show file tree

Hide file tree

Showing 172 changed files with 2,512 additions and 1,871 deletions.
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -26,7 +26,7 @@ jobs:
     timeout-minutes: 90
     strategy:
       matrix:
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
         # Prevent the include jobs from overriding other jobs
         pattern: [""]
         include:
@@ -69,6 +69,10 @@ jobs:
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
             pandas_copy_on_write: "1"
+          - name: "Copy-on-Write 3.12"
+            env_file: actions-312.yaml
+            pattern: "not slow and not network and not single_cpu"
+            pandas_copy_on_write: "1"
           - name: "Copy-on-Write 3.11 (warnings)"
             env_file: actions-311.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -190,7 +194,7 @@ jobs:
     strategy:
       matrix:
         os: [macos-latest, windows-latest]
-        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
+        env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml]
       fail-fast: false
     runs-on: ${{ matrix.os }}
     name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -321,7 +325,7 @@ jobs:
     #    To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
     #    to the corresponding posix/windows-macos/sdist etc. workflows.
     # Feel free to modify this comment as necessary.
-    #if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+    if: false # Uncomment this to freeze the workflow, comment it to unfreeze
     defaults:
       run:
         shell: bash -eou pipefail {0}

diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
@@ -6,12 +6,12 @@
 import pandas as pd
 from pandas import (
     DataFrame,
+    Index,
     Series,
     Timestamp,
     date_range,
     to_timedelta,
 )
-import pandas._testing as tm
 from pandas.core.algorithms import checked_add_with_arr
 
 from .pandas_vb_common import numeric_dtypes
@@ -323,8 +323,10 @@ class IndexArithmetic:
 
     def setup(self, dtype):
         N = 10**6
-        indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"}
-        self.index = getattr(tm, indexes[dtype])(N)
+        if dtype == "float":
+            self.index = Index(np.arange(N), dtype=np.float64)
+        elif dtype == "int":
+            self.index = Index(np.arange(N), dtype=np.int64)
 
     def time_add(self, dtype):
         self.index + 2

diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
@@ -47,7 +47,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pymysql>=1.0.2
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
@@ -0,0 +1,63 @@
+name: pandas-dev-312
+channels:
+  - conda-forge
+dependencies:
+  - python=3.12
+
+  # build dependencies
+  - versioneer[toml]
+  - cython>=0.29.33
+  - meson[ninja]=1.2.1
+  - meson-python=0.13.1
+
+  # test dependencies
+  - pytest>=7.3.2
+  - pytest-cov
+  - pytest-xdist>=2.2.0
+  - pytest-localserver>=0.7.1
+  - pytest-qt>=4.2.0
+  - boto3
+
+  # required dependencies
+  - python-dateutil
+  - numpy<2
+  - pytz
+
+  # optional dependencies
+  - beautifulsoup4>=4.11.2
+  - blosc>=1.21.3
+  - bottleneck>=1.3.6
+  - fastparquet>=2022.12.0
+  - fsspec>=2022.11.0
+  - html5lib>=1.1
+  - hypothesis>=6.46.1
+  - gcsfs>=2022.11.0
+  - jinja2>=3.1.2
+  - lxml>=4.9.2
+  - matplotlib>=3.6.3
+  # - numba>=0.56.4
+  - numexpr>=2.8.4
+  - odfpy>=1.4.1
+  - qtpy>=2.3.0
+  - pyqt>=5.15.9
+  - openpyxl>=3.1.0
+  - psycopg2>=2.9.6
+  - pyarrow>=10.0.1
+  - pymysql>=1.0.2
+  - pyreadstat>=1.2.0
+  # - pytables>=3.8.0
+  - python-calamine>=0.1.7
+  - pyxlsb>=1.0.10
+  - s3fs>=2022.11.0
+  - scipy>=1.10.0
+  - sqlalchemy>=2.0.0
+  - tabulate>=0.9.0
+  - xarray>=2022.12.0
+  - xlrd>=2.0.1
+  - xlsxwriter>=3.0.5
+  - zstandard>=0.19.0
+
+  - pip:
+    - adbc-driver-postgresql>=0.8.0
+    - adbc-driver-sqlite>=0.8.0
+    - tzdata>=2022.7
diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml
@@ -48,7 +48,7 @@ dependencies:
   - pyqt=5.15.9
   - pyreadstat=1.2.0
   - pytables=3.8.0
-  - python-calamine=0.1.6
+  - python-calamine=0.1.7
   - pyxlsb=1.0.10
   - s3fs=2022.11.0
   - scipy=1.10.0

diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml
@@ -46,7 +46,7 @@ dependencies:
   - pyqt>=5.15.9
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
@@ -449,9 +449,13 @@ which will be triggered when the tag is pushed.
     git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0"
     git push upstream main --follow-tags
 
-3. Build the source distribution (git must be in the tag commit)::
+3. Download the source distribution and wheels from the `wheel staging area <https://anaconda.org/scientific-python-nightly-wheels/pandas>`_.
+   Be careful to make sure that no wheels are missing (e.g. due to failed builds).
 
-    ./setup.py sdist --formats=gztar --quiet
+   Running ``scripts/download_wheels.sh`` with the version whose wheels and sdist you want to download should do the trick.
+   This script will make a ``dist`` folder inside your clone of pandas and put the downloaded wheels and sdist there::
+
+    scripts/download_wheels.sh <VERSION>
 
 4. Create a `new GitHub release <https://github.com/pandas-dev/pandas/releases/new>`_:
 
@@ -463,23 +467,19 @@ which will be triggered when the tag is pushed.
    - Set as the latest release: Leave checked, unless releasing a patch release for an older version
      (e.g. releasing 1.4.5 after 1.5 has been released)
 
-5. The GitHub release will after some hours trigger an
+5. Upload wheels to PyPI::
+
+    twine upload pandas/dist/pandas-<version>*.{whl,tar.gz} --skip-existing
+
+6. The GitHub release will after some hours trigger an
    `automated conda-forge PR <https://github.com/conda-forge/pandas-feedstock/pulls>`_.
+   (If you don't want to wait, you can open an issue titled ``@conda-forge-admin, please update version`` to trigger the bot.)
    Merge it once the CI is green, and it will generate the conda-forge packages.
+
    In case a manual PR needs to be done, the version, sha256 and build fields are the
    ones that usually need to be changed. If anything else in the recipe has changed since
    the last release, those changes should be available in ``ci/meta.yaml``.
 
-6. Packages for supported versions in PyPI are built automatically from our CI.
-   Once all packages are build download all wheels from the
-   `Anaconda repository <https://anaconda.org/multibuild-wheels-staging/pandas/files?version=\<version\>>`_
-   where our CI published them to the ``dist/`` directory in your local pandas copy.
-   You can use the script ``scripts/download_wheels.sh`` to download all wheels at once.
-
-7. Upload wheels to PyPI::
-
-    twine upload pandas/dist/pandas-<version>*.{whl,tar.gz} --skip-existing
-
 Post-Release
 ````````````
 

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -281,7 +281,7 @@ xlrd                      2.0.1              excel           Reading Excel
 xlsxwriter                3.0.5              excel           Writing Excel
 openpyxl                  3.1.0              excel           Reading / writing for xlsx files
 pyxlsb                    1.0.10             excel           Reading for xlsb files
-python-calamine           0.1.6              excel           Reading for xls/xlsx/xlsb/ods files
+python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsb/ods files
 ========================= ================== =============== =============================================================
 
 HTML

diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst
@@ -26,7 +26,7 @@ Previous behavior
 -----------------
 
 pandas indexing behavior is tricky to understand. Some operations return views while
-other return copies. Depending on the result of the operation, mutation one object
+others return copies. Depending on the result of the operation, mutating one object
 might accidentally mutate another:
 
 .. ipython:: python

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
@@ -23,9 +23,11 @@ Bug fixes
 ~~~~~~~~~
 - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`)
 - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
+- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`)
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
 - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
 - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
+- Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
 - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
 
 .. ---------------------------------------------------------------------------

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -105,6 +105,37 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
 Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
 documentation.
 
+.. _whatsnew_220.enhancements.to_numpy_ea:
+
+ExtensionArray.to_numpy converts to suitable NumPy dtype
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`ExtensionArray.to_numpy` will now convert to a suitable NumPy dtype instead
+of ``object`` dtype for nullable extension dtypes.
+
+*Old behavior:*
+
+.. code-block:: ipython
+
+    In [1]: ser = pd.Series([1, 2, 3], dtype="Int64")
+    In [2]: ser.to_numpy()
+    Out[2]: array([1, 2, 3], dtype=object)
+
+*New behavior:*
+
+.. ipython:: python
+
+    ser = pd.Series([1, 2, 3], dtype="Int64")
+    ser.to_numpy()
+
+The default NumPy dtype (without any arguments) is determined as follows:
+
+- float dtypes are cast to NumPy floats
+- integer dtypes without missing values are cast to NumPy integer dtypes
+- integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as missing value indicator
+- boolean dtypes without missing values are cast to NumPy bool dtype
+- boolean dtypes with missing values keep object dtype
+
 .. _whatsnew_220.enhancements.struct_accessor:
 
 Series.struct accessor for PyArrow structured data
@@ -399,7 +430,9 @@ Other Deprecations
 - Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
 - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
 - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
+- Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`)
 - Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)
+- Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`)
 - Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`)
 - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`)
 - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`)
@@ -428,6 +461,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.str.get_dummies` when dtype is ``"string[pyarrow]"`` or ``"string[pyarrow_numpy]"`` (:issue:`56110`)
 - Performance improvement in :meth:`Series.str` methods (:issue:`55736`)
 - Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`)
+- Performance improvement in :meth:`DataFrameGroupBy.nunique` and :meth:`SeriesGroupBy.nunique` (:issue:`55972`)
 - Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
 - Performance improvement when indexing into a non-unique index (:issue:`55816`)
 - Performance improvement when indexing with more than 4 keys (:issue:`54550`)
@@ -520,7 +554,7 @@ Indexing
 
 Missing
 ^^^^^^^
--
+- Bug in :meth:`DataFrame.update` where it wasn't updating in-place for tz-aware datetime64 dtypes (:issue:`56227`)
 -
 
 MultiIndex
@@ -559,12 +593,14 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`)
 - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`)
 - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`)
+- Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`)
 - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
 - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
 - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
 - Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`)
 - Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`)
 - Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`)
+-
 
 Reshaping
 ^^^^^^^^^
@@ -593,13 +629,15 @@ Styler
 Other
 ^^^^^
 - Bug in :func:`DataFrame.describe` where, when formatting percentiles, the resulting percentile 99.999% is rounded to 100% (:issue:`55765`)
+- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`)
 - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`)
 - Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`)
 - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
 - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`)
 - Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
 - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`)
 - Bug in the error message when assigning an empty dataframe to a column (:issue:`55956`)
+-
 
 .. ***DO NOT USE THIS SECTION***
 

diff --git a/environment.yml b/environment.yml
@@ -48,7 +48,7 @@ dependencies:
   - pymysql>=1.0.2
   - pyreadstat>=1.2.0
   - pytables>=3.8.0
-  - python-calamine>=0.1.6
+  - python-calamine>=0.1.7
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0

diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h
@@ -21,8 +21,4 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
 char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit,
                  NPY_DATETIMEUNIT base, size_t *len);
 
-// TODO(username): this function doesn't do a lot; should augment or
-// replace with scaleNanosecToUnit
-npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
-
 char *int64ToIsoDuration(int64_t value, size_t *len);
diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h
@@ -35,7 +35,6 @@ typedef struct {
                                                  const npy_datetimestruct *);
   int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT);
   char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *);
-  npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT);
   char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *);
   npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT);
   char *(*int64ToIsoDuration)(int64_t, size_t *);