diff --git a/.github/workflows/cache-cleanup-weekly.yml b/.github/workflows/cache-cleanup-daily.yml similarity index 90% rename from .github/workflows/cache-cleanup-weekly.yml rename to .github/workflows/cache-cleanup-daily.yml index 6da31f7354457..8eadfb2ccd2a9 100644 --- a/.github/workflows/cache-cleanup-weekly.yml +++ b/.github/workflows/cache-cleanup-daily.yml @@ -1,8 +1,8 @@ -name: Purge caches once a week +name: Purge caches daily on: schedule: - # 4:10 UTC on Sunday - - cron: "10 4 * * 0" + # 4:10 UTC daily + - cron: "10 4 * * *" jobs: cleanup: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43dc202fce5b3..a5de902866611 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,6 +90,7 @@ repos: rev: v0.9.1 hooks: - id: sphinx-lint + args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v18.1.4 hooks: diff --git a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE deleted file mode 100644 index 8405e89a0b120..0000000000000 --- a/LICENSES/XARRAY_LICENSE +++ /dev/null @@ -1,191 +0,0 @@ -Apache License -Version 2.0, January 2004 -http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - -"License" shall mean the terms and conditions for use, reproduction, and -distribution as defined by Sections 1 through 9 of this document. - -"Licensor" shall mean the copyright owner or entity authorized by the copyright -owner that is granting the License. - -"Legal Entity" shall mean the union of the acting entity and all other entities -that control, are controlled by, or are under common control with that entity. -For the purposes of this definition, "control" means (i) the power, direct or -indirect, to cause the direction or management of such entity, whether by -contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the -outstanding shares, or (iii) beneficial ownership of such entity. - -"You" (or "Your") shall mean an individual or Legal Entity exercising -permissions granted by this License. - -"Source" form shall mean the preferred form for making modifications, including -but not limited to software source code, documentation source, and configuration -files. - -"Object" form shall mean any form resulting from mechanical transformation or -translation of a Source form, including but not limited to compiled object code, -generated documentation, and conversions to other media types. - -"Work" shall mean the work of authorship, whether in Source or Object form, made -available under the License, as indicated by a copyright notice that is included -in or attached to the work (an example is provided in the Appendix below). - -"Derivative Works" shall mean any work, whether in Source or Object form, that -is based on (or derived from) the Work and for which the editorial revisions, -annotations, elaborations, or other modifications represent, as a whole, an -original work of authorship. For the purposes of this License, Derivative Works -shall not include works that remain separable from, or merely link (or bind by -name) to the interfaces of, the Work and Derivative Works thereof. 
- -"Contribution" shall mean any work of authorship, including the original version -of the Work and any modifications or additions to that Work or Derivative Works -thereof, that is intentionally submitted to Licensor for inclusion in the Work -by the copyright owner or by an individual or Legal Entity authorized to submit -on behalf of the copyright owner. For the purposes of this definition, -"submitted" means any form of electronic, verbal, or written communication sent -to the Licensor or its representatives, including but not limited to -communication on electronic mailing lists, source code control systems, and -issue tracking systems that are managed by, or on behalf of, the Licensor for -the purpose of discussing and improving the Work, but excluding communication -that is conspicuously marked or otherwise designated in writing by the copyright -owner as "Not a Contribution." - -"Contributor" shall mean Licensor and any individual or Legal Entity on behalf -of whom a Contribution has been received by Licensor and subsequently -incorporated within the Work. - -2. Grant of Copyright License. - -Subject to the terms and conditions of this License, each Contributor hereby -grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, -irrevocable copyright license to reproduce, prepare Derivative Works of, -publicly display, publicly perform, sublicense, and distribute the Work and such -Derivative Works in Source or Object form. - -3. Grant of Patent License. - -Subject to the terms and conditions of this License, each Contributor hereby -grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, -irrevocable (except as stated in this section) patent license to make, have -made, use, offer to sell, sell, import, and otherwise transfer the Work, where -such license applies only to those patent claims licensable by such Contributor -that are necessarily infringed by their Contribution(s) alone or by combination -of their Contribution(s) with the Work to which such Contribution(s) was -submitted. If You institute patent litigation against any entity (including a -cross-claim or counterclaim in a lawsuit) alleging that the Work or a -Contribution incorporated within the Work constitutes direct or contributory -patent infringement, then any patent licenses granted to You under this License -for that Work shall terminate as of the date such litigation is filed. - -4. Redistribution. 
- -You may reproduce and distribute copies of the Work or Derivative Works thereof -in any medium, with or without modifications, and in Source or Object form, -provided that You meet the following conditions: - -You must give any other recipients of the Work or Derivative Works a copy of -this License; and -You must cause any modified files to carry prominent notices stating that You -changed the files; and -You must retain, in the Source form of any Derivative Works that You distribute, -all copyright, patent, trademark, and attribution notices from the Source form -of the Work, excluding those notices that do not pertain to any part of the -Derivative Works; and -If the Work includes a "NOTICE" text file as part of its distribution, then any -Derivative Works that You distribute must include a readable copy of the -attribution notices contained within such NOTICE file, excluding those notices -that do not pertain to any part of the Derivative Works, in at least one of the -following places: within a NOTICE text file distributed as part of the -Derivative Works; within the Source form or documentation, if provided along -with the Derivative Works; or, within a display generated by the Derivative -Works, if and wherever such third-party notices normally appear. The contents of -the NOTICE file are for informational purposes only and do not modify the -License. You may add Your own attribution notices within Derivative Works that -You distribute, alongside or as an addendum to the NOTICE text from the Work, -provided that such additional attribution notices cannot be construed as -modifying the License. -You may add Your own copyright statement to Your modifications and may provide -additional or different license terms and conditions for use, reproduction, or -distribution of Your modifications, or for any such Derivative Works as a whole, -provided Your use, reproduction, and distribution of the Work otherwise complies -with the conditions stated in this License. - -5. Submission of Contributions. - -Unless You explicitly state otherwise, any Contribution intentionally submitted -for inclusion in the Work by You to the Licensor shall be under the terms and -conditions of this License, without any additional terms or conditions. -Notwithstanding the above, nothing herein shall supersede or modify the terms of -any separate license agreement you may have executed with Licensor regarding -such Contributions. - -6. Trademarks. - -This License does not grant permission to use the trade names, trademarks, -service marks, or product names of the Licensor, except as required for -reasonable and customary use in describing the origin of the Work and -reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. - -Unless required by applicable law or agreed to in writing, Licensor provides the -Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, -including, without limitation, any warranties or conditions of TITLE, -NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are -solely responsible for determining the appropriateness of using or -redistributing the Work and assume any risks associated with Your exercise of -permissions under this License. - -8. Limitation of Liability. 
- -In no event and under no legal theory, whether in tort (including negligence), -contract, or otherwise, unless required by applicable law (such as deliberate -and grossly negligent acts) or agreed to in writing, shall any Contributor be -liable to You for damages, including any direct, indirect, special, incidental, -or consequential damages of any character arising as a result of this License or -out of the use or inability to use the Work (including but not limited to -damages for loss of goodwill, work stoppage, computer failure or malfunction, or -any and all other commercial damages or losses), even if such Contributor has -been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. - -While redistributing the Work or Derivative Works thereof, You may choose to -offer, and charge a fee for, acceptance of support, warranty, indemnity, or -other liability obligations and/or rights consistent with this License. However, -in accepting such obligations, You may act only on Your own behalf and on Your -sole responsibility, not on behalf of any other Contributor, and only if You -agree to indemnify, defend, and hold each Contributor harmless for any liability -incurred by, or claims asserted against, such Contributor by reason of your -accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work - -To apply the Apache License to your work, attach the following boilerplate -notice, with the fields enclosed by brackets "[]" replaced with your own -identifying information. (Don't include the brackets!) The text should be -enclosed in the appropriate comment syntax for the file format. We also -recommend that a file or class name and description of purpose be included on -the same "printed page" as the copyright notice for easier identification within -third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
\ No newline at end of file diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f93e87074d650..ec46e043c8294 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -148,7 +148,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.cat.set_categories PR01,PR02" \ -i "pandas.Series.dt.as_unit PR01,PR02" \ -i "pandas.Series.dt.ceil PR01,PR02" \ - -i "pandas.Series.dt.components SA01" \ -i "pandas.Series.dt.day_name PR01,PR02" \ -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ @@ -165,12 +164,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_convert PR01,PR02" \ -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ - -i "pandas.Series.eq SA01" \ -i "pandas.Series.ge SA01" \ -i "pandas.Series.gt SA01" \ - -i "pandas.Series.kurt RT03,SA01" \ - -i "pandas.Series.kurtosis RT03,SA01" \ - -i "pandas.Series.le SA01" \ -i "pandas.Series.list.__getitem__ SA01" \ -i "pandas.Series.list.flatten SA01" \ -i "pandas.Series.list.len SA01" \ @@ -221,24 +216,19 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.str.wrap RT03,SA01" \ -i "pandas.Series.str.zfill RT03" \ -i "pandas.Series.struct.dtypes SA01" \ - -i "pandas.Series.sum RT03" \ - -i "pandas.Series.swaplevel SA01" \ -i "pandas.Series.to_dict SA01" \ -i "pandas.Series.to_frame SA01" \ -i "pandas.Series.to_markdown SA01" \ - -i "pandas.Series.to_string SA01" \ -i "pandas.Series.update PR07,SA01" \ - -i "pandas.Series.var PR01,RT03,SA01" \ - -i "pandas.Timedelta PR07,SA01" \ -i "pandas.Timedelta.as_unit SA01" \ -i "pandas.Timedelta.asm8 SA01" \ -i "pandas.Timedelta.ceil SA01" \ -i "pandas.Timedelta.components SA01" \ -i "pandas.Timedelta.days SA01" \ -i "pandas.Timedelta.floor SA01" \ - -i "pandas.Timedelta.max PR02,PR07,SA01" \ - -i "pandas.Timedelta.min PR02,PR07,SA01" \ - -i "pandas.Timedelta.resolution PR02,PR07,SA01" \ + -i "pandas.Timedelta.max PR02" \ + -i "pandas.Timedelta.min PR02" \ + -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.round SA01" \ -i "pandas.Timedelta.to_numpy PR01" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ @@ -250,20 +240,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ - -i "pandas.Timestamp.astimezone SA01" \ - -i "pandas.Timestamp.ceil SA01" \ -i "pandas.Timestamp.combine PR01,SA01" \ -i "pandas.Timestamp.ctime SA01" \ -i "pandas.Timestamp.date SA01" \ -i "pandas.Timestamp.day GL08" \ - -i "pandas.Timestamp.dst SA01" \ -i "pandas.Timestamp.floor SA01" \ -i "pandas.Timestamp.fold GL08" \ -i "pandas.Timestamp.fromordinal SA01" \ -i "pandas.Timestamp.fromtimestamp PR01,SA01" \ -i "pandas.Timestamp.hour GL08" \ - -i "pandas.Timestamp.is_leap_year SA01" \ - -i "pandas.Timestamp.isocalendar SA01" \ -i "pandas.Timestamp.isoweekday SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.microsecond GL08" \ @@ -276,7 +261,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.now SA01" \ -i "pandas.Timestamp.quarter SA01" \ -i "pandas.Timestamp.replace PR07,SA01" \ - -i "pandas.Timestamp.resolution PR02,PR07,SA01" \ + -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.second GL08" \ -i "pandas.Timestamp.strptime PR01,SA01" \ -i "pandas.Timestamp.time SA01" \ @@ -290,7 +275,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.today SA01" \ -i 
"pandas.Timestamp.toordinal SA01" \ -i "pandas.Timestamp.tz SA01" \ - -i "pandas.Timestamp.tz_convert SA01" \ -i "pandas.Timestamp.tz_localize SA01" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.tzname SA01" \ @@ -299,10 +283,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.utcoffset SA01" \ -i "pandas.Timestamp.utctimetuple SA01" \ -i "pandas.Timestamp.value GL08" \ - -i "pandas.Timestamp.weekday SA01" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.extensions.ExtensionArray._from_sequence_of_strings SA01" \ - -i "pandas.api.extensions.ExtensionArray._hash_pandas_object RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray._pad_or_backfill PR01,RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray._reduce RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray._values_for_factorize SA01" \ @@ -310,7 +291,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.extensions.ExtensionArray.dropna RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.dtype SA01" \ -i "pandas.api.extensions.ExtensionArray.duplicated RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.equals SA01" \ -i "pandas.api.extensions.ExtensionArray.fillna SA01" \ -i "pandas.api.extensions.ExtensionArray.insert PR07,RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ @@ -319,21 +299,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.extensions.ExtensionArray.nbytes SA01" \ -i "pandas.api.extensions.ExtensionArray.ndim SA01" \ -i "pandas.api.extensions.ExtensionArray.ravel RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.shift SA01" \ -i "pandas.api.extensions.ExtensionArray.take RT03" \ -i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.view SA01" \ -i "pandas.api.indexers.VariableOffsetWindowIndexer PR01,SA01" \ -i "pandas.api.interchange.from_dataframe RT03,SA01" \ - -i "pandas.api.types.infer_dtype PR07,SA01" \ -i "pandas.api.types.is_any_real_numeric_dtype SA01" \ -i "pandas.api.types.is_bool PR01,SA01" \ -i "pandas.api.types.is_bool_dtype SA01" \ -i "pandas.api.types.is_categorical_dtype SA01" \ -i "pandas.api.types.is_complex PR01,SA01" \ -i "pandas.api.types.is_complex_dtype SA01" \ - -i "pandas.api.types.is_datetime64_any_dtype SA01" \ -i "pandas.api.types.is_datetime64_dtype SA01" \ -i "pandas.api.types.is_datetime64_ns_dtype SA01" \ -i "pandas.api.types.is_datetime64tz_dtype SA01" \ @@ -355,11 +332,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_period_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ - -i "pandas.api.types.is_scalar SA01" \ -i "pandas.api.types.is_signed_integer_dtype SA01" \ -i "pandas.api.types.is_sparse SA01" \ -i "pandas.api.types.is_string_dtype SA01" \ - -i "pandas.api.types.is_timedelta64_dtype SA01" \ -i "pandas.api.types.is_timedelta64_ns_dtype SA01" \ -i "pandas.api.types.is_unsigned_integer_dtype SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ @@ -465,7 +440,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.UnsupportedFunctionCall SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ - -i "pandas.get_option SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.interval_range RT03" \ -i "pandas.io.formats.style.Styler.apply RT03" \ @@ -530,9 +504,7 @@ if [[ -z "$CHECK" || 
"$CHECK" == "docstrings" ]]; then -i "pandas.testing.assert_index_equal PR07,SA01" \ -i "pandas.testing.assert_series_equal PR07,SA01" \ -i "pandas.timedelta_range SA01" \ - -i "pandas.tseries.api.guess_datetime_format SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ - -i "pandas.tseries.offsets.BMonthBegin PR02" \ -i "pandas.tseries.offsets.BQuarterBegin PR02" \ -i "pandas.tseries.offsets.BQuarterBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ @@ -541,7 +513,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \ - -i "pandas.tseries.offsets.BQuarterEnd PR02" \ -i "pandas.tseries.offsets.BQuarterEnd.freqstr SA01" \ -i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterEnd.n GL08" \ @@ -586,7 +557,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.BusinessHour.rule_code GL08" \ -i "pandas.tseries.offsets.BusinessHour.start GL08" \ -i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \ - -i "pandas.tseries.offsets.BusinessMonthBegin PR02" \ -i "pandas.tseries.offsets.BusinessMonthBegin.freqstr SA01" \ -i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \ @@ -653,7 +623,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.DateOffset.nanos GL08" \ -i "pandas.tseries.offsets.DateOffset.normalize GL08" \ -i "pandas.tseries.offsets.DateOffset.rule_code GL08" \ - -i "pandas.tseries.offsets.Day PR02" \ -i "pandas.tseries.offsets.Day.freqstr SA01" \ -i "pandas.tseries.offsets.Day.is_on_offset GL08" \ -i "pandas.tseries.offsets.Day.n GL08" \ @@ -758,7 +727,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.QuarterBegin.normalize GL08" \ -i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \ -i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \ - -i "pandas.tseries.offsets.QuarterEnd PR02" \ -i "pandas.tseries.offsets.QuarterEnd.freqstr SA01" \ -i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.QuarterEnd.n GL08" \ diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index f6ff95aa72c6c..4f4a1cd811b61 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -462,9 +462,9 @@ Post-Release the appropriate ones for the version you are releasing): - Log in to the server and use the correct user. - - `cd /var/www/html/pandas-docs/` - - `ln -sfn version/2.1 stable` (for a major or minor release) - - `ln -sfn version/2.0.3 version/2.0` (for a patch release) + - ``cd /var/www/html/pandas-docs/`` + - ``ln -sfn version/2.1 stable`` (for a major or minor release) + - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release) 2. 
If releasing a major or minor release, open a PR in our source code to update ``web/pandas/versions.json``, to have the desired versions in the documentation diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 53fc9245aa972..f3b849dc6de45 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -763,7 +763,7 @@ Joining a single Index to a MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can join a :class:`DataFrame` with a :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level. -The ``name`` of the :class:`Index` with match the level name of the :class:`MultiIndex`. +The ``name`` of the :class:`Index` will match the level name of the :class:`MultiIndex`. .. ipython:: python diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 8dafed1efee97..70982e723016f 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -445,7 +445,7 @@ Rolling/expanding moments improvements 3 5 dtype: float64 -- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (`mean=True`) so that +- :func:`rolling_window` now normalizes the weights properly in rolling mean mode (``mean=True``) so that the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`) diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index acc5409b86d09..a9003710540d7 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -212,7 +212,7 @@ Other enhancements: 4 True True True True - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on ``Timestamp`` class (:issue:`5351`). -- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. +- Added Google Analytics (``pandas.io.ga``) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). - Added ``Timedelta.to_timedelta64()`` method to the public API (:issue:`8884`). 
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c63d047f03823..8ddc8e5d058ca 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1653,7 +1653,7 @@ Timedelta - Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) - Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) - Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- Fixed bug in adding a :class:`DataFrame` with all-``timedelta64[ns]`` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) - Bug in :class:`TimedeltaIndex` where adding a timezone-aware datetime scalar incorrectly returned a timezone-naive :class:`DatetimeIndex` (:issue:`23215`) - Bug in :class:`TimedeltaIndex` where adding ``np.timedelta64('NaT')`` incorrectly returned an all-``NaT`` :class:`DatetimeIndex` instead of an all-``NaT`` :class:`TimedeltaIndex` (:issue:`23215`) - Bug in :class:`Timedelta` and :func:`to_timedelta()` have inconsistencies in supported unit string (:issue:`21762`) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 5cf5623f73036..50be28a912cf6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1082,7 +1082,7 @@ Numeric - Bug in :meth:`~pandas.eval` when comparing floats with scalar operators, for example: ``x < -0.1`` (:issue:`25928`) - Fixed bug where casting all-boolean array to integer extension array failed (:issue:`25211`) - Bug in ``divmod`` with a :class:`Series` object containing zeros incorrectly raising ``AttributeError`` (:issue:`26987`) -- Inconsistency in :class:`Series` floor-division (`//`) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`) +- Inconsistency in :class:`Series` floor-division (``//``) and ``divmod`` filling positive//zero with ``NaN`` instead of ``Inf`` (:issue:`27321`) - Conversion diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index ef0c4d741ca58..c6fda04070240 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -31,7 +31,7 @@ Fixed regressions - Fixed regression in setitem with :meth:`DataFrame.iloc` which raised error when trying to set a value while filtering with a boolean list (:issue:`36741`) - Fixed regression in setitem with a Series getting aligned before setting the values (:issue:`37427`) - Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`) -- Fixed regression in inplace arithmetic operation (`+=`) on a Series not updating the parent DataFrame/Series (:issue:`36373`) +- Fixed regression in inplace arithmetic operation (``+=``) on a Series not updating the parent DataFrame/Series (:issue:`36373`) .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5973cc0404e0b..a15da861cfbec 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -221,6 +221,7 @@ Removal of prior version deprecations/changes - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) - :func:`to_datetime` with a ``unit`` specified no longer parses strings into floats, instead parses them the same way as without ``unit`` (:issue:`50735`) - :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`) +- :meth:`ExtensionArray._reduce` now requires a ``keepdims: bool = False`` parameter in the signature (:issue:`52788`) - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`) - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`) - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`) @@ -246,6 +247,7 @@ Removal of prior version deprecations/changes - Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`) - Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`) - Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`) +- Stopped automatically casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) - Stopped performing dtype inference when setting a :class:`Index` into a :class:`DataFrame` (:issue:`56102`) - Stopped performing dtype inference with in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`) - Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`) @@ -335,6 +337,7 @@ Removal of prior version deprecations/changes Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Eliminated circular reference in to original pandas object in accessor attributes (e.g. :attr:`Series.str`). However, accessor instantiation is no longer cached (:issue:`47667`, :issue:`41357`) - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. 
(:issue:`57787`) - :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`) - :class:`Series` returns a :class:`RangeIndex` index when possible when ``data`` is a ``dict`` (:issue:`58118`) @@ -460,6 +463,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index e1999dd536999..e746933ac0bf7 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -15,7 +15,6 @@ "describe_option", "option_context", "options", - "using_copy_on_write", ] from pandas._config import config from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401 @@ -31,10 +30,6 @@ from pandas._config.display import detect_console_encoding -def using_copy_on_write() -> bool: - return True - - def using_pyarrow_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 8921e1b686303..1b91a7c3ee636 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -157,6 +157,12 @@ def get_option(pat: str) -> Any: ------ OptionError : if no such option exists + See Also + -------- + set_option : Set the value of the specified option or options. + reset_option : Reset one or more options to their default value. + describe_option : Print the description for one or more registered options. + Notes ----- For all available options, please view the :ref:`User Guide ` diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6a31ce84ed418..83373cb4b1d9f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -184,6 +184,13 @@ def is_scalar(val: object) -> bool: bool Return True if given object is scalar. + See Also + -------- + api.types.is_list_like : Check if the input is list-like. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + api.types.is_bool : Check if the input is a boolean. + Examples -------- >>> import datetime @@ -1442,6 +1449,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: Parameters ---------- value : scalar, list, ndarray, or pandas type + The input data to infer the dtype. skipna : bool, default True Ignore NaN values when inferring the type. @@ -1476,6 +1484,14 @@ def infer_dtype(value: object, skipna: bool = True) -> str: TypeError If ndarray-like but cannot infer the dtype + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. + api.types.is_list_like : Check if the input is list-like. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + api.types.is_bool : Check if the input is a boolean. 
+ Notes ----- - 'mixed' is the catchall for anything that is not otherwise diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index dbe9f72af9750..c483814a3ef74 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -419,6 +419,12 @@ class NaTType(_NaT): Monday == 0 ... Sunday == 6. + See Also + -------- + Timestamp.dayofweek : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isoweekday : Return the day of the week with Monday=1, Sunday=7. + datetime.date.weekday : Equivalent method in datetime module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01') @@ -528,6 +534,12 @@ class NaTType(_NaT): """ Return a named tuple containing ISO year, week number, and weekday. + See Also + -------- + DatetimeIndex.isocalendar : Return a 3-tuple containing ISO year, + week number, and weekday for the given DatetimeIndex object. + datetime.date.isocalendar : The equivalent method for `datetime.date` objects. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -542,6 +554,14 @@ class NaTType(_NaT): """ Return the daylight saving time (DST) adjustment. + This method returns the DST adjustment as a `datetime.timedelta` object + if the Timestamp is timezone-aware and DST is applicable. + + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2000-06-01 00:00:00', tz='Europe/Brussels') @@ -814,6 +834,11 @@ class NaTType(_NaT): """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- tz : str, pytz.timezone, dateutil.tz.tzfile or None @@ -829,6 +854,13 @@ class NaTType(_NaT): TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -1196,6 +1228,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.floor : Round down a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.ceil : Ceil the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, ceiling will take place relative to the @@ -1256,6 +1294,11 @@ timedelta}, default 'raise' """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- tz : str, pytz.timezone, dateutil.tz.tzfile or None @@ -1271,6 +1314,13 @@ timedelta}, default 'raise' TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. 
+ DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 046b4dfc5606b..0d681e0c2aae6 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1104,7 +1104,7 @@ cdef class Day(Tick): """ Offset ``n`` days. - Parameters + Attributes ---------- n : int, default 1 The number of days represented. @@ -2756,7 +2756,7 @@ cdef class BQuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -2838,7 +2838,7 @@ cdef class QuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... - Parameters + Attributes ---------- n : int, default 1 The number of quarters represented. @@ -3052,7 +3052,7 @@ cdef class BusinessMonthBegin(MonthOffset): BusinessMonthBegin goes to the next date which is the first business day of the month. - Parameters + Attributes ---------- n : int, default 1 The number of months represented. diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c448a7e7c01b5..35d2433a707a0 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -859,6 +859,10 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: """ Guess the datetime format of a given datetime string. + This function attempts to deduce the format of a given datetime string. It is + useful for situations where the datetime format is unknown and needs to be + determined for proper parsing. The function is not guaranteed to return a format. + Parameters ---------- dt_str : str @@ -876,6 +880,12 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: datetime format string (for `strftime` or `strptime`), or None if it can't be guessed. + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp : Pandas replacement for python datetime.datetime object. + DatetimeIndex : Immutable ndarray-like of datetime64 data. + Examples -------- >>> from pandas.tseries.api import guess_datetime_format diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 9078fd4116899..4ff2df34ac717 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1785,6 +1785,7 @@ class Timedelta(_Timedelta): Parameters ---------- value : Timedelta, timedelta, np.timedelta64, str, or int + Input value. unit : str, default 'ns' Denote the unit of the input, if input is an integer. @@ -1810,6 +1811,15 @@ class Timedelta(_Timedelta): Values for construction in compat with datetime.timedelta. Numpy ints and floats will be coerced to python ints and floats. + See Also + -------- + Timestamp : Represents a single timestamp in time. + TimedeltaIndex : Immutable Index of timedelta64 data. + DateOffset : Standard kind of date increment used for a date range. + to_timedelta : Convert argument to timedelta. + datetime.timedelta : Represents a duration in the datetime module. + numpy.timedelta64 : Represents a duration compatible with NumPy. 
+ Notes ----- The constructor may take in either both values of value and unit or diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index fb22563d34e11..04bd439b40b8d 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -820,9 +820,20 @@ cdef class _Timestamp(ABCTimestamp): """ Return True if year is a leap year. + A leap year is a year, which has 366 days (instead of 365) including 29th of + February as an intercalary day. Leap years are years which are multiples of + four with the exception of years divisible by 100 but not by 400. + Returns ------- bool + True if year is a leap year, else False + + See Also + -------- + Period.is_leap_year : Return True if the period’s year is in a leap year. + DatetimeIndex.is_leap_year : Boolean indicator if the date belongs to a + leap year. Examples -------- @@ -1658,6 +1669,14 @@ class Timestamp(_Timestamp): """ Return the daylight saving time (DST) adjustment. + This method returns the DST adjustment as a `datetime.timedelta` object + if the Timestamp is timezone-aware and DST is applicable. + + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + Timestamp.tz_convert : Convert timezone-aware Timestamp to another time zone. + Examples -------- >>> ts = pd.Timestamp('2000-06-01 00:00:00', tz='Europe/Brussels') @@ -1672,6 +1691,12 @@ class Timestamp(_Timestamp): """ Return a named tuple containing ISO year, week number, and weekday. + See Also + -------- + DatetimeIndex.isocalendar : Return a 3-tuple containing ISO year, + week number, and weekday for the given DatetimeIndex object. + datetime.date.isocalendar : The equivalent method for `datetime.date` objects. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -2266,6 +2291,12 @@ timedelta}, default 'raise' ------ ValueError if the freq cannot be converted. + See Also + -------- + Timestamp.floor : Round down a Timestamp to the specified resolution. + Timestamp.round : Round a Timestamp to the specified resolution. + Series.dt.ceil : Ceil the datetime values in a Series. + Notes ----- If the Timestamp has a timezone, ceiling will take place relative to the @@ -2451,6 +2482,11 @@ default 'raise' """ Convert timezone-aware Timestamp to another time zone. + This method is used to convert a timezone-aware Timestamp object to a + different time zone. The original UTC time remains the same; only the + time zone information is changed. If the Timestamp is timezone-naive, a + TypeError is raised. + Parameters ---------- tz : str, pytz.timezone, dateutil.tz.tzfile or None @@ -2466,6 +2502,13 @@ default 'raise' TypeError If Timestamp is tz-naive. + See Also + -------- + Timestamp.tz_localize : Localize the Timestamp to a timezone. + DatetimeIndex.tz_convert : Convert a DatetimeIndex to another time zone. + DatetimeIndex.tz_localize : Localize a DatetimeIndex to a specific time zone. + datetime.datetime.astimezone : Convert a datetime object to another time zone. + Examples -------- Create a timestamp object with UTC timezone: @@ -2709,6 +2752,12 @@ default 'raise' Monday == 0 ... Sunday == 6. + See Also + -------- + Timestamp.dayofweek : Return the day of the week with Monday=0, Sunday=6. + Timestamp.isoweekday : Return the day of the week with Monday=1, Sunday=7. + datetime.date.weekday : Equivalent method in datetime module. 
+ Examples -------- >>> ts = pd.Timestamp('2023-01-01') diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index 7ebed8857f0af..91b5d2a981bef 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -11,8 +11,6 @@ ) import uuid -from pandas._config import using_copy_on_write - from pandas.compat import PYPY from pandas.errors import ChainedAssignmentError @@ -158,34 +156,25 @@ def with_csv_dialect(name: str, **kwargs) -> Generator[None, None, None]: csv.unregister_dialect(name) -def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=()): +def raises_chained_assignment_error(extra_warnings=(), extra_match=()): from pandas._testing import assert_produces_warning - if not warn: - from contextlib import nullcontext - - return nullcontext() - - if PYPY and not extra_warnings: - from contextlib import nullcontext + if PYPY: + if not extra_warnings: + from contextlib import nullcontext - return nullcontext() - elif PYPY and extra_warnings: - return assert_produces_warning( - extra_warnings, - match=extra_match, - ) - else: - if using_copy_on_write(): - warning = ChainedAssignmentError - match = ( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment" - ) + return nullcontext() else: - warning = FutureWarning # type: ignore[assignment] - # TODO update match - match = "ChainedAssignmentError" + return assert_produces_warning( + extra_warnings, + match=extra_match, + ) + else: + warning = ChainedAssignmentError + match = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment" + ) if extra_warnings: warning = (warning, *extra_warnings) # type: ignore[assignment] return assert_produces_warning( diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 39c471c3db440..3acbfc3eabbac 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -195,17 +195,11 @@ def add_delegate_accessors(cls): return add_delegate_accessors -# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE -# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py -# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors -# 2. We use a UserWarning instead of a custom Warning - - -class CachedAccessor: +class Accessor: """ Custom property-like object. - A descriptor for caching accessors. + A descriptor for accessors. Parameters ---------- @@ -229,13 +223,12 @@ def __get__(self, obj, cls): if obj is None: # we're accessing the attribute of the class, i.e., Dataset.geo return self._accessor - accessor_obj = self._accessor(obj) - # Replace the property with the accessor object. 
Inspired by: - # https://www.pydanny.com/cached-property.html - # We need to use object.__setattr__ because we overwrite __setattr__ on - # NDFrame - object.__setattr__(obj, self._name, accessor_obj) - return accessor_obj + return self._accessor(obj) + + +# Alias kept for downstream libraries +# TODO: Deprecate as name is now misleading +CachedAccessor = Accessor @doc(klass="", examples="", others="") @@ -295,7 +288,7 @@ def decorator(accessor: TypeT) -> TypeT: UserWarning, stacklevel=find_stack_level(), ) - setattr(cls, name, CachedAccessor(name, accessor)) + setattr(cls, name, Accessor(name, accessor)) cls._accessors.add(name) return accessor diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 1fa610f35f56b..03c73489bd3d8 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) if self.ndim == 1: - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - name = names[0] if len(set(names)) == 1 else None + names = {getattr(x, "name") for x in inputs if hasattr(x, "name")} + name = names.pop() if len(names) == 1 else None reconstruct_kwargs = {"name": name} else: reconstruct_kwargs = {} diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e403ecfdef4bc..3d55513ab914c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1052,8 +1052,7 @@ def _pad_or_backfill( copy: bool = True, ) -> Self: if not self._hasna: - # TODO(CoW): Not necessary anymore when CoW is the default - return self.copy() + return self if limit is None and limit_area is None: method = missing.clean_fill_method(method) @@ -1084,7 +1083,6 @@ def fillna( copy: bool = True, ) -> Self: if not self._hasna: - # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() if limit is not None: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f7e6b513663f6..f83fdcd46b371 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -365,6 +365,16 @@ def _from_sequence_of_strings( ------- ExtensionArray + See Also + -------- + api.extensions.ExtensionArray._from_sequence : Construct a new ExtensionArray + from a sequence of scalars. + api.extensions.ExtensionArray._from_factorized : Reconstruct an ExtensionArray + after factorization. + api.extensions.ExtensionArray._from_scalars : Strict analogue to _from_sequence, + allowing only sequences of scalars that should be specifically inferred to + the given dtype. + Examples -------- >>> pd.arrays.IntegerArray._from_sequence_of_strings( @@ -1182,6 +1192,13 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: ExtensionArray Shifted. + See Also + -------- + api.extensions.ExtensionArray.transpose : Return a transposed view on + this array. + api.extensions.ExtensionArray.factorize : Encode the extension array as an + enumerated type. + Notes ----- If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is @@ -1317,12 +1334,23 @@ def equals(self, other: object) -> bool: boolean Whether the arrays are equivalent. + See Also + -------- + numpy.array_equal : Equivalent method for numpy array. + Series.equals : Equivalent method for Series. + DataFrame.equals : Equivalent method for DataFrame. 
+ Examples -------- >>> arr1 = pd.array([1, 2, np.nan]) >>> arr2 = pd.array([1, 2, np.nan]) >>> arr1.equals(arr2) True + + >>> arr1 = pd.array([1, 3, np.nan]) + >>> arr2 = pd.array([1, 2, np.nan]) + >>> arr1.equals(arr2) + False """ if type(self) != type(other): return False @@ -1933,12 +1961,6 @@ def _reduce( keepdims : bool, default False If False, a scalar is returned. If True, the result has dimension with size one along the reduced axis. - - .. versionadded:: 2.1 - - This parameter is not required in the _reduce signature to keep backward - compatibility, but will become required in the future. If the parameter - is not found in the method signature, a FutureWarning will be emitted. **kwargs Additional keyword arguments passed to the reduction function. Currently, `ddof` is the only supported kwarg. @@ -2011,6 +2033,13 @@ def _hash_pandas_object( Returns ------- np.ndarray[uint64] + An array of hashed values. + + See Also + -------- + api.extensions.ExtensionArray._values_for_factorize : Return an array and + missing value suitable for factorization. + util.hash_array : Given a 1d array, return an array of hashed values. Examples -------- diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 26095e0af7878..925858a20ce41 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -759,14 +759,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: values = ensure_wrapped_if_datetimelike(values) if not isinstance(values, type(self)): - inferable = [ - "timedelta", - "timedelta64", - "datetime", - "datetime64", - "date", - "period", - ] if values.dtype == object: values = lib.maybe_convert_objects( values, # type: ignore[arg-type] @@ -775,32 +767,11 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: ) if values.dtype != object: return self.isin(values) - - inferred = lib.infer_dtype(values, skipna=False) - if inferred not in inferable: - if inferred == "string": - pass - - elif "mixed" in inferred: - return isin(self.astype(object), values) - else: - return np.zeros(self.shape, dtype=bool) - - try: - values = type(self)._from_sequence(values) - except ValueError: - return isin(self.astype(object), values) - else: - warnings.warn( - # GH#53111 - f"The behavior of 'isin' with dtype={self.dtype} and " - "castable values (e.g. strings) is deprecated. In a " - "future version, these will not be considered matching " - "by isin. Explicitly cast to the appropriate dtype before " - "calling isin instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + else: + # TODO: Deprecate this case + # https://github.com/pandas-dev/pandas/pull/58645/files#r1604055791 + return isin(self.astype(object), values) + return np.zeros(self.shape, dtype=bool) if self.dtype.kind in "mM": self = cast("DatetimeArray | TimedeltaArray", self) diff --git a/pandas/core/common.py b/pandas/core/common.py index 77e986a26fbe9..96291991227d9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -335,11 +335,12 @@ def is_empty_slice(obj) -> bool: ) -def is_true_slices(line) -> list[bool]: +def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]: """ - Find non-trivial slices in "line": return a list of booleans with same length. + Find non-trivial slices in "line": yields a bool. """ - return [isinstance(k, slice) and not is_null_slice(k) for k in line] + for k in line: + yield isinstance(k, slice) and not is_null_slice(k) # TODO: used only once in indexing; belongs elsewhere? 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4d8d3c2816f69..2ac75a0700759 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -362,6 +362,13 @@ def is_timedelta64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the timedelta64 dtype. + See Also + -------- + api.types.is_timedelta64_ns_dtype : Check whether the provided array or dtype is + of the timedelta64[ns] dtype. + api.types.is_period_dtype : Check whether an array-like or dtype is of the + Period dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_timedelta64_dtype @@ -873,6 +880,15 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_dtype : Check whether an array-like or dtype is of the + datetime64 dtype. + api.is_datetime64_ns_dtype : Check whether the provided array or dtype is of the + datetime64[ns] dtype. + api.is_datetime64tz_dtype : Check whether an array-like or dtype is of a + DatetimeTZDtype dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_any_dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e10080604260a..45814ca77b70f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -680,10 +680,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None # categorical is aware of Sparse -> extract sparse subdtypes - dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] + subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) # extract the categories' dtype non_cat_dtypes = [ - x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes + x.categories.dtype if isinstance(x, CategoricalDtype) else x + for x in subtypes ] # TODO should categorical always give an answer? 
from pandas.core.dtypes.cast import find_common_type diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ede2c301c85e..c875ec78891d6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -21,7 +21,6 @@ Sequence, ) import functools -from inspect import signature from io import StringIO import itertools import operator @@ -125,7 +124,7 @@ ops, roperator, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor from pandas.core.apply import reconstruct_and_relabel_result from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin @@ -7000,19 +6999,19 @@ def sort_values( f" != length of by ({len(by)})" ) if len(by) > 1: - keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + keys = (self._get_label_or_level_values(x, axis=axis) for x in by) # need to rewrap columns in Series to apply key function if key is not None: - # error: List comprehension has incompatible type List[Series]; - # expected List[ndarray] - keys = [ - Series(k, name=name) # type: ignore[misc] - for (k, name) in zip(keys, by) - ] + keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)] + else: + # error: Argument 1 to "list" has incompatible type + # "Generator[ExtensionArray | ndarray[Any, Any], None, None]"; + # expected "Iterable[Series]" + keys_data = list(keys) # type: ignore[arg-type] indexer = lexsort_indexer( - keys, orders=ascending, na_position=na_position, key=key + keys_data, orders=ascending, na_position=na_position, key=key ) elif len(by): # len(by) == 1 @@ -11408,28 +11407,11 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) - dtype_has_keepdims: dict[ExtensionDtype, bool] = {} - def blk_func(values, axis: Axis = 1): if isinstance(values, ExtensionArray): if not is_1d_only_ea_dtype(values.dtype): return values._reduce(name, axis=1, skipna=skipna, **kwds) - has_keepdims = dtype_has_keepdims.get(values.dtype) - if has_keepdims is None: - sign = signature(values._reduce) - has_keepdims = "keepdims" in sign.parameters - dtype_has_keepdims[values.dtype] = has_keepdims - if has_keepdims: - return values._reduce(name, skipna=skipna, keepdims=True, **kwds) - else: - warnings.warn( - f"{type(values)}._reduce will require a `keepdims` parameter " - "in the future", - FutureWarning, - stacklevel=find_stack_level(), - ) - result = values._reduce(name, skipna=skipna, **kwds) - return np.array([result]) + return values._reduce(name, skipna=skipna, keepdims=True, **kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) @@ -13487,10 +13469,10 @@ def isin_(x): # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) + plot = Accessor("plot", pandas.plotting.PlotAccessor) hist = pandas.plotting.hist_frame boxplot = pandas.plotting.boxplot_frame - sparse = CachedAccessor("sparse", SparseFrameAccessor) + sparse = Accessor("sparse", SparseFrameAccessor) # ---------------------------------------------------------------------- # Internal Interface Methods diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0a048d11d0b4d..a20577e8d3df9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -387,7 +387,7 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: raise SpecificationError("nested renamer is not supported") if 
any(isinstance(x, (tuple, list)) for x in arg): - arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] + arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg) else: # list of functions / function names columns = (com.get_callable_name(f) or f for f in arg) @@ -2077,7 +2077,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: obj = self._obj_with_exclusions columns = obj.columns - sgbs = [ + sgbs = ( SeriesGroupBy( obj.iloc[:, i], selection=colname, @@ -2086,7 +2086,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: observed=self.observed, ) for i, colname in enumerate(obj.columns) - ] + ) results = [func(sgb) for sgb in sgbs] if not len(results): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4ebc149256336..1b58317c08736 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,6 +11,7 @@ class providing the base-class of operations. from collections.abc import ( Hashable, + Iterable, Iterator, Mapping, Sequence, @@ -758,7 +759,7 @@ def get_converter(s): ) raise ValueError(msg) from err - converters = [get_converter(s) for s in index_sample] + converters = (get_converter(s) for s in index_sample) names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) else: @@ -2645,7 +2646,7 @@ def _value_counts( } if isinstance(obj, Series): _name = obj.name - keys = [] if _name in in_axis_names else [obj] + keys: Iterable[Series] = [] if _name in in_axis_names else [obj] else: unique_cols = set(obj.columns) if subset is not None: @@ -2665,12 +2666,12 @@ def _value_counts( else: subsetted = unique_cols - keys = [ + keys = ( # Can't use .values because the column label needs to be preserved obj.iloc[:, idx] for idx, _name in enumerate(obj.columns) if _name not in in_axis_names and _name in subsetted - ] + ) groupings = list(self._grouper.groupings) for key in keys: diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 3dcd1fedc8d64..3cb51f7447677 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -489,10 +489,20 @@ def components(self) -> DataFrame: """ Return a Dataframe of the components of the Timedeltas. + Each row of the DataFrame corresponds to a Timedelta in the original + Series and contains the individual components (days, hours, minutes, + seconds, milliseconds, microseconds, nanoseconds) of the Timedelta. + Returns ------- DataFrame + See Also + -------- + TimedeltaIndex.components : Return a DataFrame of the individual resolution + components of the Timedeltas. + Series.dt.total_seconds : Return the total number of seconds in the duration. 
+ Examples -------- >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="s")) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index c5e3f3a50e10d..83e8df5072b92 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -212,20 +212,25 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: if kind == "special": result = indexes[0] - dtis = [x for x in indexes if isinstance(x, DatetimeIndex)] - dti_tzs = [x for x in dtis if x.tz is not None] - if len(dti_tzs) not in [0, len(dtis)]: + num_dtis = 0 + num_dti_tzs = 0 + for idx in indexes: + if isinstance(idx, DatetimeIndex): + num_dtis += 1 + if idx.tz is not None: + num_dti_tzs += 1 + if num_dti_tzs not in [0, num_dtis]: # TODO: this behavior is not tested (so may not be desired), # but is kept in order to keep behavior the same when # deprecating union_many # test_frame_from_dict_with_mixed_indexes raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - if len(dtis) == len(indexes): + if num_dtis == len(indexes): sort = True result = indexes[0] - elif len(dtis) > 1: + elif num_dtis > 1: # If we have mixed timezones, our casting behavior may depend on # the order of indexes, which we don't want. sort = False diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 308205678e388..6a3fb8bc851df 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -142,7 +142,7 @@ nanops, ops, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor import pandas.core.algorithms as algos from pandas.core.array_algos.putmask import ( setitem_datetimelike_compat, @@ -462,7 +462,7 @@ def _engine_type( _accessors = {"str"} - str = CachedAccessor("str", StringMethods) + str = Accessor("str", StringMethods) _references = None @@ -3140,7 +3140,7 @@ def _union(self, other: Index, sort: bool | None): # worth making this faster? a very unusual case value_set = set(lvals) - value_list.extend([x for x in rvals if x not in value_set]) + value_list.extend(x for x in rvals if x not in value_set) # If objects are unorderable, we must have object dtype. return np.array(value_list, dtype=object) @@ -7620,8 +7620,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: list A list representing the unanimous 'names' found. """ - name_tups = [tuple(i.names) for i in indexes] - name_sets = [{*ns} for ns in zip_longest(*name_tups)] + name_tups = (tuple(i.names) for i in indexes) + name_sets = ({*ns} for ns in zip_longest(*name_tups)) names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) return names diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a5bcf49c5490b..3927619a567bf 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1387,7 +1387,7 @@ def _formatter_func(self, tup): """ Formats each item in tup according to its level's formatter function. 
""" - formatter_funcs = [level._formatter_func for level in self.levels] + formatter_funcs = (level._formatter_func for level in self.levels) return tuple(func(val) for func, val in zip(formatter_funcs, tup)) def _get_values_for_csv( @@ -1537,7 +1537,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: if level is None: level = range(self.nlevels) else: - level = [self._get_level_number(lev) for lev in level] + level = (self._get_level_number(lev) for lev in level) # set the name for lev, name in zip(level, names): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d29bef02a16cf..e9bd3b389dd75 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2372,8 +2372,7 @@ def ravel(i): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): - # TODO(CoW): copy shouldn't be needed here - ser_values = ser.reindex(obj.axes[0][indexer[0]]).copy()._values + ser_values = ser.reindex(obj.axes[0][indexer[0]])._values # single indexer if len(indexer) > 1 and not multiindex_indexer: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index de6c5416e08c9..7055201b5a1ee 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -560,7 +560,7 @@ def get_result(self): # combine as columns in a frame else: - data = dict(zip(range(len(self.objs)), self.objs)) + data = dict(enumerate(self.objs)) # GH28330 Preserves subclassed objects through concat cons = sample._constructor_expanddim @@ -874,7 +874,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) - new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) + new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes) else: new_levels.append(new_index.unique()) single_codes = new_index.unique().get_indexer(new_index) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 4a4080a9dc57b..5426c72a356d6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -137,24 +137,24 @@ def __init__( self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) - # Bug fix GH 20601 - # If the data frame is too big, the number of unique index combination - # will cause int32 overflow on windows environments. - # We want to check and raise an warning before this happens - num_rows = np.max([index_level.size for index_level in self.new_index_levels]) - num_columns = self.removed_level.size - - # GH20601: This forces an overflow if the number of cells is too high. - num_cells = num_rows * num_columns - - # GH 26314: Previous ValueError raised was too restrictive for many users. - if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max: - warnings.warn( - f"The following operation may generate {num_cells} cells " - f"in the resulting pandas object.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) + if get_option("performance_warnings"): + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combination + # will cause int32 overflow on windows environments. 
+ # We want to check and raise a warning before this happens + num_rows = max(index_level.size for index_level in self.new_index_levels) + num_columns = self.removed_level.size + + # GH20601: This forces an overflow if the number of cells is too high. + # GH 26314: Previous ValueError raised was too restrictive for many users. + num_cells = num_rows * num_columns + if num_cells > np.iinfo(np.int32).max: + warnings.warn( + f"The following operation may generate {num_cells} cells " + f"in the resulting pandas object.", + PerformanceWarning, + stacklevel=find_stack_level(), + ) self._make_selectors() @@ -737,10 +737,10 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index: if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) - levs = [ + levs = ( [lev[c] if c >= 0 else None for c in codes] for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) - ] + ) # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) @@ -931,13 +931,26 @@ def _reorder_for_extension_array_stack( def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") - - # If we need to drop `level` from columns, it needs to be in descending order set_levels = set(level) - drop_levnums = sorted(level, reverse=True) stack_cols = frame.columns._drop_level_numbers( [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] ) + + result = stack_reshape(frame, level, set_levels, stack_cols) + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. + ratio = 0 if frame.empty else len(result) // len(frame) + + index_levels: list | FrozenList + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = list(np.tile(frame.index.codes, (1, ratio))) + else: + codes, uniques = factorize(frame.index, use_na_sentinel=False) + index_levels = [uniques] + index_codes = list(np.tile(codes, (1, ratio))) + if len(level) > 1: # Arrange columns in the order we want to take them, e.g. 
level=[2, 0, 1] sorter = np.argsort(level) @@ -945,13 +958,72 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: ordered_stack_cols = stack_cols._reorder_ilevels(sorter) else: ordered_stack_cols = stack_cols - - stack_cols_unique = stack_cols.unique() ordered_stack_cols_unique = ordered_stack_cols.unique() + if isinstance(ordered_stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols_unique] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + + # error: Incompatible types in assignment (expression has type "list[ndarray[Any, + # dtype[Any]]]", variable has type "FrozenList") + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + + return result + + +def stack_reshape( + frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index +) -> Series | DataFrame: + """Reshape the data of a frame for stack. + + This function takes care of most of the work that stack needs to do. Caller + will sort the result once the appropriate index is set. + + Parameters + ---------- + frame: DataFrame + DataFrame that is to be stacked. + level: list of ints. + Levels of the columns to stack. + set_levels: set of ints. + Same as level, but as a set. + stack_cols: Index. + Columns of the result when the DataFrame is stacked. + + Returns + ------- + The data of behind the stacked DataFrame. + """ + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level, reverse=True) # Grab data for each unique index to be stacked buf = [] - for idx in stack_cols_unique: + for idx in stack_cols.unique(): if len(frame.columns) == 1: data = frame.copy() else: @@ -978,10 +1050,8 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: data.columns = RangeIndex(len(data.columns)) buf.append(data) - result: Series | DataFrame if len(buf) > 0 and not frame.empty: result = concat(buf, ignore_index=True) - ratio = len(result) // len(frame) else: # input is empty if len(level) < frame.columns.nlevels: @@ -990,7 +1060,6 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: else: new_columns = [0] result = DataFrame(columns=new_columns, dtype=frame._values.dtype) - ratio = 0 if len(level) < frame.columns.nlevels: # concat column order may be different from dropping the levels @@ -998,46 +1067,4 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if not result.columns.equals(desired_columns): result = result[desired_columns] - # Construct the correct MultiIndex by combining the frame's index and - # stacked columns. 
- index_levels: list | FrozenList - if isinstance(frame.index, MultiIndex): - index_levels = frame.index.levels - index_codes = list(np.tile(frame.index.codes, (1, ratio))) - else: - codes, uniques = factorize(frame.index, use_na_sentinel=False) - index_levels = [uniques] - index_codes = list(np.tile(codes, (1, ratio))) - if isinstance(ordered_stack_cols, MultiIndex): - column_levels = ordered_stack_cols.levels - column_codes = ordered_stack_cols.drop_duplicates().codes - else: - column_levels = [ordered_stack_cols.unique()] - column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] - # error: Incompatible types in assignment (expression has type "list[ndarray[Any, - # dtype[Any]]]", variable has type "FrozenList") - column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment] - result.index = MultiIndex( - levels=index_levels + column_levels, - codes=index_codes + column_codes, - names=frame.index.names + list(ordered_stack_cols.names), - verify_integrity=False, - ) - - # sort result, but faster than calling sort_index since we know the order we need - len_df = len(frame) - n_uniques = len(ordered_stack_cols_unique) - indexer = np.arange(n_uniques) - idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) - result = result.take(idxs) - - # Reshape/rename if needed and dropna - if result.ndim == 2 and frame.columns.nlevels == len(level): - if len(result.columns) == 0: - result = Series(index=result.index) - else: - result = result.iloc[:, 0] - if result.ndim == 1: - result.name = None - return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 97a53650ec5ff..c49eef49f7393 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -101,7 +101,7 @@ ops, roperator, ) -from pandas.core.accessor import CachedAccessor +from pandas.core.accessor import Accessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray from pandas.core.arrays.arrow import ( @@ -1491,6 +1491,13 @@ def to_string( str or None String representation of Series if ``buf=None``, otherwise None. + See Also + -------- + Series.to_dict : Convert Series to dict object. + Series.to_frame : Convert Series to DataFrame object. + Series.to_markdown : Print Series in Markdown-friendly format. + Series.to_timestamp : Cast to DatetimeIndex of Timestamps. + Examples -------- >>> ser = pd.Series([1, 2, 3]).to_string() @@ -3968,26 +3975,44 @@ def nsmallest( """ return selectn.SelectNSeries(self, n=n, keep=keep).nsmallest() - @doc( - klass=_shared_doc_kwargs["klass"], - extra_params=dedent( - """copy : bool, default True - Whether to copy underlying data. + def swaplevel( + self, i: Level = -2, j: Level = -1, copy: bool | lib.NoDefault = lib.no_default + ) -> Series: + """ + Swap levels i and j in a :class:`MultiIndex`. - .. note:: - The `copy` keyword will change behavior in pandas 3.0. - `Copy-on-Write - `__ - will be enabled by default, which means that all methods with a - `copy` keyword will use a lazy copy mechanism to defer the copy and - ignore the `copy` keyword. The `copy` keyword will be removed in a - future version of pandas. + Default is to swap the two innermost levels of the index. + + Parameters + ---------- + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + copy : bool, default True + Whether to copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. 
+ `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy + and ignore the `copy` keyword. The `copy` keyword will be + removed in a future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + + Returns + ------- + Series + Series with levels swapped in MultiIndex. + + See Also + -------- + DataFrame.swaplevel : Swap levels i and j in a :class:`DataFrame`. + Series.reorder_levels : Rearrange index levels using input order. + MultiIndex.swaplevel : Swap levels i and j in a :class:`MultiIndex`. - You can already get the future behavior and improvements through - enabling copy on write ``pd.options.mode.copy_on_write = True``""" - ), - examples=dedent( - """\ Examples -------- >>> s = pd.Series( @@ -4037,29 +4062,7 @@ def nsmallest( Geography Final exam February B History Coursework March A Geography Coursework April C - dtype: object""" - ), - ) - def swaplevel( - self, i: Level = -2, j: Level = -1, copy: bool | lib.NoDefault = lib.no_default - ) -> Series: - """ - Swap levels i and j in a :class:`MultiIndex`. - - Default is to swap the two innermost levels of the index. - - Parameters - ---------- - i, j : int or str - Levels of the indices to be swapped. Can pass level name as string. - {extra_params} - - Returns - ------- - {klass} - {klass} with levels swapped in MultiIndex. - - {examples} + dtype: object """ self._check_copy_deprecation(copy) assert isinstance(self.index, MultiIndex) @@ -5754,13 +5757,13 @@ def to_period( # ---------------------------------------------------------------------- # Accessor Methods # ---------------------------------------------------------------------- - str = CachedAccessor("str", StringMethods) - dt = CachedAccessor("dt", CombinedDatetimelikeProperties) - cat = CachedAccessor("cat", CategoricalAccessor) - plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) - sparse = CachedAccessor("sparse", SparseAccessor) - struct = CachedAccessor("struct", StructAccessor) - list = CachedAccessor("list", ListAccessor) + str = Accessor("str", StringMethods) + dt = Accessor("dt", CombinedDatetimelikeProperties) + cat = Accessor("cat", CategoricalAccessor) + plot = Accessor("plot", pandas.plotting.PlotAccessor) + sparse = Accessor("sparse", SparseAccessor) + struct = Accessor("struct", StructAccessor) + list = Accessor("list", ListAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series @@ -5916,7 +5919,6 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0 return op(self, other) - @Appender(ops.make_flex_doc("eq", "series")) def eq( self, other, @@ -5924,6 +5926,63 @@ def eq( fill_value: float | None = None, axis: Axis = 0, ) -> Series: + """ + Return Equal to of series and other, element-wise (binary operator `eq`). + + Equivalent to ``series == other``, but with support to substitute a fill_value + for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. 
+ If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.ge : Return elementwise Greater than or equal to of series and other. + Series.le : Return elementwise Less than or equal to of series and other. + Series.gt : Return elementwise Greater than of series and other. + Series.lt : Return elementwise Less than of series and other. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + dtype: float64 + >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + >>> b + a 1.0 + b NaN + d 1.0 + e NaN + dtype: float64 + >>> a.eq(b, fill_value=0) + a True + b False + c False + d False + e False + dtype: bool + """ return self._flex_method( other, operator.eq, level=level, fill_value=fill_value, axis=axis ) @@ -5934,8 +5993,68 @@ def ne(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: other, operator.ne, level=level, fill_value=fill_value, axis=axis ) - @Appender(ops.make_flex_doc("le", "series")) def le(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: + """ + Return Less than or equal to of series and other, \ + element-wise (binary operator `le`). + + Equivalent to ``series <= other``, but with support to substitute a + fill_value for missing data in either one of the inputs. + + Parameters + ---------- + other : Series or scalar value + The second operand in this operation. + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : None or float value, default None (NaN) + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result of filling (at that location) will be missing. + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. + + Returns + ------- + Series + The result of the operation. + + See Also + -------- + Series.ge : Return elementwise Greater than or equal to of series and other. + Series.lt : Return elementwise Less than of series and other. + Series.gt : Return elementwise Greater than of series and other. + Series.eq : Return elementwise equal to of series and other. + + Examples + -------- + >>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 1.0 + b 1.0 + c 1.0 + d NaN + e 1.0 + dtype: float64 + >>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f']) + >>> b + a 0.0 + b 1.0 + c 2.0 + d NaN + f 1.0 + dtype: float64 + >>> a.le(b, fill_value=0) + a False + b True + c True + d False + e False + f True + dtype: bool + """ return self._flex_method( other, operator.le, level=level, fill_value=fill_value, axis=axis ) @@ -6528,7 +6647,6 @@ def max( ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="sum") - @doc(make_doc("sum", ndim=1)) def sum( self, axis: Axis | None = None, @@ -6537,6 +6655,89 @@ def sum( min_count: int = 0, **kwargs, ): + """ + Return the sum of the values over the requested axis. + + This is equivalent to the method ``numpy.sum``. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. 
warning:: + + The behavior of DataFrame.sum with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar or Series (if level specified) + Sum of the values for the requested axis. + + See Also + -------- + numpy.sum : Equivalent numpy function for computing sum. + Series.mean : Mean of the values. + Series.median : Median of the values. + Series.std : Standard deviation of the values. + Series.var : Variance of the values. + Series.min : Minimum value. + Series.max : Maximum value. + + Examples + -------- + >>> idx = pd.MultiIndex.from_arrays( + ... [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]], + ... names=["blooded", "animal"], + ... ) + >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx) + >>> s + blooded animal + warm dog 4 + falcon 2 + cold fish 0 + spider 8 + Name: legs, dtype: int64 + + >>> s.sum() + 14 + + By default, the sum of an empty or all-NA Series is ``0``. + + >>> pd.Series([], dtype="float64").sum() # min_count=0 is the default + 0.0 + + This can be controlled with the ``min_count`` parameter. For example, if + you'd like the sum of an empty series to be NaN, pass ``min_count=1``. + + >>> pd.Series([], dtype="float64").sum(min_count=1) + nan + + Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and + empty series identically. + + >>> pd.Series([np.nan]).sum() + 0.0 + + >>> pd.Series([np.nan]).sum(min_count=1) + nan + """ return NDFrame.sum( self, axis=axis, @@ -6720,7 +6921,6 @@ def sem( ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="var") - @doc(make_doc("var", ndim=1)) def var( self, axis: Axis | None = None, @@ -6729,6 +6929,75 @@ def var( numeric_only: bool = False, **kwargs, ): + """ + Return unbiased variance over requested axis. + + Normalized by N-1 by default. This can be changed using the ddof argument. + + Parameters + ---------- + axis : {index (0)} + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.var with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar. + To retain the old behavior, pass axis=0 (or do not pass axis). + + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + **kwargs : + Additional keywords passed. + + Returns + ------- + scalar or Series (if level specified) + Unbiased variance over requested axis. + + See Also + -------- + numpy.var : Equivalent function in NumPy. + Series.std : Returns the standard deviation of the Series. + DataFrame.var : Returns the variance of the DataFrame. 
+ DataFrame.std : Return standard deviation of the values over + the requested axis. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "person_id": [0, 1, 2, 3], + ... "age": [21, 25, 62, 43], + ... "height": [1.61, 1.87, 1.49, 2.01], + ... } + ... ).set_index("person_id") + >>> df + age height + person_id + 0 21 1.61 + 1 25 1.87 + 2 62 1.49 + 3 43 2.01 + + >>> df.var() + age 352.916667 + height 0.056367 + dtype: float64 + + Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1: + + >>> df.var(ddof=0) + age 264.687500 + height 0.042275 + dtype: float64 + """ return NDFrame.var( self, axis=axis, @@ -6771,7 +7040,6 @@ def skew( ) @deprecate_nonkeyword_arguments(version="3.0", allowed_args=["self"], name="kurt") - @doc(make_doc("kurt", ndim=1)) def kurt( self, axis: Axis | None = 0, @@ -6779,6 +7047,54 @@ def kurt( numeric_only: bool = False, **kwargs, ): + """ + Return unbiased kurtosis over requested axis. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + axis : {index (0)} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + For DataFrames, specifying ``axis=None`` will apply the aggregation + across both axes. + + .. versionadded:: 2.0.0 + + skipna : bool, default True + Exclude NA/null values when computing the result. + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + scalar + Unbiased kurtosis. + + See Also + -------- + Series.skew : Return unbiased skew over requested axis. + Series.var : Return unbiased variance over requested axis. + Series.std : Return unbiased standard deviation over requested axis. + + Examples + -------- + >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"]) + >>> s + cat 1 + dog 2 + dog 2 + mouse 3 + dtype: int64 + >>> s.kurt() + 1.5 + """ return NDFrame.kurt( self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs ) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 476856e8038d6..d4b4fd90658ad 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -73,7 +73,7 @@ def read_orc( .. versionadded:: 2.0 filesystem : fsspec or pyarrow filesystem, default None - Filesystem object to use when reading the parquet file. + Filesystem object to use when reading the orc file. .. versionadded:: 2.1.0 @@ -99,7 +99,7 @@ def read_orc( -------- >>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP """ - # we require a newer version of pyarrow than we support for parquet + # we require a newer version of pyarrow than we support for orc orc = import_optional_dependency("pyarrow.orc") diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 2ed4d8b66960c..66edbcaa755ed 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -322,9 +322,9 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): You can also pass: - "ISO8601", to parse any `ISO8601 `_ - time string (not necessarily in exactly the same format); + time string (not necessarily in exactly the same format); - "mixed", to infer the format for each element individually. This is risky, - and you should probably use it along with `dayfirst`. + and you should probably use it along with `dayfirst`. .. 
versionadded:: 2.0.0 dayfirst : bool, default False diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index e2121526c16af..50fa722f6dd72 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -430,7 +430,7 @@ def __call__(self): freq = f"{interval}ms" tz = self.tz.tzname(None) st = dmin.replace(tzinfo=None) - ed = dmin.replace(tzinfo=None) + ed = dmax.replace(tzinfo=None) all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) try: diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 2d959bb16e7d5..d1e4104e16465 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -6,7 +6,6 @@ from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td -import pandas as pd from pandas import ( DataFrame, Series, @@ -79,8 +78,7 @@ def test_astype_different_target_dtype(dtype): def test_astype_numpy_to_ea(): ser = Series([1, 2, 3]) - with pd.option_context("mode.copy_on_write", True): - result = ser.astype("Int64") + result = ser.astype("Int64") assert np.shares_memory(get_array(ser), get_array(result)) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e27c782c1bdcf..6f30dcfaaba7e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -322,6 +322,8 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_as_apply, exp_as_apply) tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) + +def test_groupby_as_index_apply_str(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -379,8 +381,8 @@ def f(piece): {"value": piece, "demeaned": piece - piece.mean(), "logged": logged} ) - dr = bdate_range("1/1/2000", periods=100) - ts = Series(np.random.default_rng(2).standard_normal(100), index=dr) + dr = bdate_range("1/1/2000", periods=10) + ts = Series(np.random.default_rng(2).standard_normal(10), index=dr) grouped = ts.groupby(lambda x: x.month, group_keys=False) result = grouped.apply(f) @@ -639,13 +641,13 @@ def reindex_helper(x): def test_apply_corner_cases(): # #535, can't use sliding iterator - N = 1000 + N = 10 labels = np.random.default_rng(2).integers(0, 100, size=N) df = DataFrame( { "key": labels, "value1": np.random.default_rng(2).standard_normal(N), - "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + "value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5), } ) @@ -680,6 +682,8 @@ def test_apply_numeric_coercion_when_datetime(): result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) + +def test_apply_numeric_coercion_when_datetime_getitem(): # GH 15421 df = DataFrame( {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3} @@ -695,6 +699,8 @@ def get_B(g): expected.index = df.A tm.assert_series_equal(result, expected) + +def test_apply_numeric_coercion_when_datetime_with_nat(): # GH 14423 def predictions(tool): out = Series(index=["p1", "p2", "useTime"], dtype=object) @@ -843,10 +849,24 @@ def test_func(x): tm.assert_frame_equal(result, expected) -def test_groupby_apply_none_first(): +@pytest.mark.parametrize( + "in_data, out_idx, out_data", + [ + [ + {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}, + [[1, 1], [0, 2]], + {"groups": [1, 1], "vars": [0, 2]}, + ], + [ + {"groups": [1, 2, 2, 2], 
"vars": [0, 1, 2, 3]}, + [[2, 2], [1, 3]], + {"groups": [2, 2], "vars": [1, 3]}, + ], + ], +) +def test_groupby_apply_none_first(in_data, out_idx, out_data): # GH 12824. Tests if apply returns None first. - test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}) - test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}) + test_df1 = DataFrame(in_data) def test_func(x): if x.shape[0] < 2: @@ -856,14 +876,9 @@ def test_func(x): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(DeprecationWarning, match=msg): result1 = test_df1.groupby("groups").apply(test_func) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = test_df2.groupby("groups").apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) - expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) - expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2) + index1 = MultiIndex.from_arrays(out_idx, names=["groups", None]) + expected1 = DataFrame(out_data, index=index1) tm.assert_frame_equal(result1, expected1) - tm.assert_frame_equal(result2, expected2) def test_groupby_apply_return_empty_chunk(): @@ -883,18 +898,16 @@ def test_groupby_apply_return_empty_chunk(): tm.assert_series_equal(result, expected) -def test_apply_with_mixed_types(): +@pytest.mark.parametrize("meth", ["apply", "transform"]) +def test_apply_with_mixed_types(meth): # gh-20949 df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) g = df.groupby("A", group_keys=False) - result = g.transform(lambda x: x / x.sum()) + result = getattr(g, meth)(lambda x: x / x.sum()) expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x / x.sum()) - tm.assert_frame_equal(result, expected) - def test_func_returns_object(): # GH 28652 @@ -1106,7 +1119,7 @@ def test_apply_function_with_indexing_return_column(): @pytest.mark.parametrize( "udf", - [(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))], + [lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)], ) @pytest.mark.parametrize("group_keys", [True, False]) def test_apply_result_type(group_keys, udf): @@ -1214,7 +1227,7 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) - expected = expected.drop(columns="idx") + expected = expected.drop(columns=["idx"]) tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5a43a42aa936f..2194e5692aa0e 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -82,7 +82,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(using_infer_string): # TODO: split this test +def test_basic(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -95,17 +95,20 @@ def test_basic(using_infer_string): # TODO: split this test result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) + +def test_basic_single_grouper(): cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", 
"c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) + +def test_basic_string(using_infer_string): # GH 8623 x = DataFrame( [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], @@ -133,8 +136,9 @@ def f(x): expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) + +def test_basic_monotonic(): # GH 9921 - # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) @@ -165,7 +169,8 @@ def f(x): tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"]) tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) - # Non-monotonic + +def test_basic_non_monotonic(): df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) @@ -183,6 +188,8 @@ def f(x): df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]] ) + +def test_basic_cut_grouping(): # GH 9603 df = DataFrame({"a": [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) @@ -193,13 +200,14 @@ def f(x): expected.index.name = "a" tm.assert_series_equal(result, expected) - # more basic + +def test_more_basic(): levels = ["foo", "bar", "baz", "qux"] - codes = np.random.default_rng(2).integers(0, 4, size=100) + codes = np.random.default_rng(2).integers(0, 4, size=10) cats = Categorical.from_codes(codes, levels, ordered=True) - data = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + data = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) result = data.groupby(cats, observed=False).mean() @@ -225,9 +233,9 @@ def f(x): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + tm.assert_index_equal(desc_result.stack().index.get_level_values(0), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal(desc_result.stack().index.get_level_values(1), exp) def test_level_get_group(observed): @@ -352,6 +360,8 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) + +def test_observed_single_column(observed): # https://github.com/pandas-dev/pandas/issues/8138 d = { "cat": Categorical( @@ -362,7 +372,6 @@ def test_observed(observed): } df = DataFrame(d) - # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() @@ -378,7 +387,17 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) - # Grouping on two columns + +def test_observed_two_columns(observed): + # https://github.com/pandas-dev/pandas/issues/8138 + d = { + "cat": Categorical( + ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 1, 2, 2], + "val": [10, 20, 30, 40], + } + df = DataFrame(d) groups_double_key = df.groupby(["cat", "ints"], observed=observed) result = groups_double_key.agg("mean") expected = DataFrame( @@ -404,6 +423,8 @@ def test_observed(observed): expected = df[(df.cat == c) & (df.ints == i)] tm.assert_frame_equal(result, expected) + +def 
test_observed_with_as_index(observed): # gh-8869 # with as_index d = { @@ -591,7 +612,6 @@ def test_dataframe_categorical_with_nan(observed): @pytest.mark.parametrize("ordered", [True, False]) -@pytest.mark.parametrize("observed", [True, False]) def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # GH 25871: Fix groupby sorting on ordered Categoricals # GH 25167: Groupby with observed=True doesn't sort @@ -627,11 +647,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range("2014-01-01", periods=4) - codes = np.random.default_rng(2).integers(0, 4, size=100) + codes = np.random.default_rng(2).integers(0, 4, size=10) cats = Categorical.from_codes(codes, levels, ordered=True) - data = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + data = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() @@ -832,7 +852,10 @@ def test_preserve_categories(): df.groupby("A", sort=False, observed=False).first().index, nosort_index ) - # ordered=False + +def test_preserve_categories_ordered_false(): + # GH-13179 + categories = list("abc") df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) sort_index = CategoricalIndex(categories, categories, ordered=False, name="A") # GH#48749 - don't change order of categories @@ -846,7 +869,8 @@ def test_preserve_categories(): ) -def test_preserve_categorical_dtype(): +@pytest.mark.parametrize("col", ["C1", "C2"]) +def test_preserve_categorical_dtype(col): # GH13743, GH13854 df = DataFrame( { @@ -865,18 +889,15 @@ def test_preserve_categorical_dtype(): "C2": Categorical(list("bac"), categories=list("bac"), ordered=True), } ) - for col in ["C1", "C2"]: - result1 = df.groupby(by=col, as_index=False, observed=False).mean( - numeric_only=True - ) - result2 = ( - df.groupby(by=col, as_index=True, observed=False) - .mean(numeric_only=True) - .reset_index() - ) - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) + result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True) + result2 = ( + df.groupby(by=col, as_index=True, observed=False) + .mean(numeric_only=True) + .reset_index() + ) + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) @pytest.mark.parametrize( @@ -931,6 +952,8 @@ def test_categorical_no_compress(): ) tm.assert_series_equal(result, exp) + +def test_categorical_no_compress_string(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -965,7 +988,7 @@ def test_sort(): # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)}) + df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)}) labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 2622895f9f8d2..47ad18c9ad2c8 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -321,19 +321,22 @@ def test_count_object(): expected = Series([3, 3], 
index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) + +def test_count_object_nan(): df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) result = df.groupby("c").a.count() expected = Series([1, 3], index=Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) -def test_count_cross_type(): +@pytest.mark.parametrize("typ", ["object", "float32"]) +def test_count_cross_type(typ): # GH8169 # Set float64 dtype to avoid upcast when setting nan below vals = np.hstack( ( - np.random.default_rng(2).integers(0, 5, (100, 2)), - np.random.default_rng(2).integers(0, 2, (100, 2)), + np.random.default_rng(2).integers(0, 5, (10, 2)), + np.random.default_rng(2).integers(0, 2, (10, 2)), ) ).astype("float64") @@ -341,11 +344,10 @@ def test_count_cross_type(): df[df == 2] = np.nan expected = df.groupby(["c", "d"]).count() - for t in ["float32", "object"]: - df["a"] = df["a"].astype(t) - df["b"] = df["b"].astype(t) - result = df.groupby(["c", "d"]).count() - tm.assert_frame_equal(result, expected) + df["a"] = df["a"].astype(typ) + df["b"] = df["b"].astype(typ) + result = df.groupby(["c", "d"]).count() + tm.assert_frame_equal(result, expected) def test_lower_int_prec_count(): diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index 28dcb38d173f2..b0a0414c1feb2 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -94,21 +94,28 @@ def test_groupby_cumprod_nan_influences_other_columns(): def test_cummin(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] df = base_df.astype(dtype) - expected = DataFrame({"B": expected_mins}).astype(dtype) result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ min value for dtype + +def test_cummin_min_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected = DataFrame({"B": expected_mins}).astype(dtype) + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = min_val df.loc[[1, 5], "B"] = min_val + 1 expected.loc[[2, 3, 6, 7], "B"] = min_val @@ -120,8 +127,10 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected, check_exact=True) - # Test nan in some values + +def test_cummin_nan_in_some_values(dtypes_for_minmax): # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) @@ -132,6 +141,8 @@ def test_cummin(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummin_datetime(): # GH 15561 df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) expected = Series(pd.to_datetime("2001"), index=[0], name="b") @@ -139,6 +150,8 @@ def test_cummin(dtypes_for_minmax): result = df.groupby("a")["b"].cummin() tm.assert_series_equal(expected, result) + +def test_cummin_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], 
"b": [1, 2, 2]}) result = df.groupby("a").b.cummin() @@ -163,7 +176,6 @@ def test_cummin_max_all_nan_column(method, dtype): def test_cummax(dtypes_for_minmax): dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] # GH 15048 base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) @@ -177,8 +189,18 @@ def test_cummax(dtypes_for_minmax): result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) - # Test w/ max value for dtype + +def test_cummax_min_value_for_dtype(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) df.loc[[2, 6], "B"] = max_val + expected = DataFrame({"B": expected_maxs}).astype(dtype) expected.loc[[2, 3, 6, 7], "B"] = max_val result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) @@ -187,8 +209,11 @@ def test_cummax(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummax_nan_in_some_values(dtypes_for_minmax): # Test nan in some values # Explicit cast to float to avoid implicit cast when setting nan + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) base_df = base_df.astype({"B": "float"}) base_df.loc[[0, 2, 4, 6], "B"] = np.nan expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) @@ -199,6 +224,8 @@ def test_cummax(dtypes_for_minmax): ) tm.assert_frame_equal(result, expected) + +def test_cummax_datetime(): # GH 15561 df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) expected = Series(pd.to_datetime("2001"), index=[0], name="b") @@ -206,6 +233,8 @@ def test_cummax(dtypes_for_minmax): result = df.groupby("a")["b"].cummax() tm.assert_series_equal(expected, result) + +def test_cummax_getattr_series(): # GH 15635 df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) result = df.groupby("a").b.cummax() @@ -292,15 +321,12 @@ def test_nullable_int_not_cast_as_float(method, dtype, val): tm.assert_frame_equal(result, expected) -def test_cython_api2(): +def test_cython_api2(as_index): # this takes the fast apply path # cumsum (GH5614) + # GH 5755 - cumsum is a transformer and should ignore as_index df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) - result = df.groupby("A").cumsum() - tm.assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby("A", as_index=False).cumsum() + result = df.groupby("A", as_index=as_index).cumsum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index a34170e9b55db..04883b3ef6b78 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -85,6 +85,9 @@ def test_filter_out_no_groups(): grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) + + +def test_filter_out_no_groups_dataframe(): df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) @@ -100,6 +103,9 @@ def test_filter_out_all_groups_in_df(): expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) + +def 
test_filter_out_all_groups_in_df_dropna_true(): + # GH12768 df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) res = df.groupby("a") res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) @@ -179,7 +185,7 @@ def test_filter_pdna_is_false(): def test_filter_against_workaround_ints(): # Series of ints - s = Series(np.random.default_rng(2).integers(0, 100, 100)) + s = Series(np.random.default_rng(2).integers(0, 100, 10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -191,7 +197,7 @@ def test_filter_against_workaround_ints(): def test_filter_against_workaround_floats(): # Series of floats - s = 100 * Series(np.random.default_rng(2).random(100)) + s = 100 * Series(np.random.default_rng(2).random(10)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -203,13 +209,13 @@ def test_filter_against_workaround_floats(): def test_filter_against_workaround_dataframe(): # Set up DataFrame of ints, floats, strings. letters = np.array(list(ascii_lowercase)) - N = 100 + N = 10 random_letters = letters.take( np.random.default_rng(2).integers(0, 26, N, dtype=int) ) df = DataFrame( { - "ints": Series(np.random.default_rng(2).integers(0, 100, N)), + "ints": Series(np.random.default_rng(2).integers(0, 10, N)), "floats": N / 10 * Series(np.random.default_rng(2).random(N)), "letters": Series(random_letters), } @@ -217,26 +223,26 @@ def test_filter_against_workaround_dataframe(): # Group by ints; filter on floats. grouped = df.groupby("ints") - old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) + old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) - old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")] + new_way = grouped.filter(lambda x: len(x.letters) < N / 2) tm.assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. grouped = df.groupby("letters") - old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] - new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) + old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")] + new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2) tm.assert_frame_equal(new_way, old_way) def test_filter_using_len(): - # BUG GH4447 + # GH 4447 df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) grouped = df.groupby("B") actual = grouped.filter(lambda x: len(x) > 2) @@ -250,8 +256,10 @@ def test_filter_using_len(): expected = df.loc[[]] tm.assert_frame_equal(actual, expected) - # Series have always worked properly, but we'll test anyway. 
- s = df["B"] + +def test_filter_using_len_series(): + # GH 4447 + s = Series(list("aabbbbcc"), name="B") grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B") @@ -262,10 +270,14 @@ def test_filter_using_len(): tm.assert_series_equal(actual, expected) -def test_filter_maintains_ordering(): - # Simple case: index is sequential. #4621 +@pytest.mark.parametrize( + "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]] +) +def test_filter_maintains_ordering(index): + # GH 4621 df = DataFrame( - {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, ) s = df["pid"] grouped = df.groupby("tag") @@ -278,33 +290,6 @@ def test_filter_maintains_ordering(): expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) - # Now index is sequentially decreasing. - df.index = np.arange(len(df) - 1, -1, -1) - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - - # Index is shuffled. - SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df["pid"] - grouped = df.groupby("tag") - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - tm.assert_frame_equal(actual, expected) - - grouped = s.groupby(df["tag"]) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - tm.assert_series_equal(actual, expected) - def test_filter_multiple_timestamp(): # GH 10114 diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 338509dd239e6..e5956f808286d 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -75,6 +75,5 @@ def test_inference_on_pandas_objects(self): def test_constructor_not_read_only(self): # GH#57130 ser = Series([1, 2], dtype=object) - with pd.option_context("mode.copy_on_write", True): - idx = Index(ser) - assert idx._values.flags.writeable + idx = Index(ser) + assert idx._values.flags.writeable diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index b28c3cba7d310..efae0b4dd84cc 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -284,6 +284,7 @@ def test_detect_chained_assignment_changing_dtype(self): with tm.raises_chained_assignment_error(): df.loc[2]["C"] = "foo" tm.assert_frame_equal(df, df_original) + # TODO: Use tm.raises_chained_assignment_error() when PDEP-6 is enforced with tm.raises_chained_assignment_error( extra_warnings=(FutureWarning,), extra_match=(None,) ): diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index ff8c6a98e1819..2511474e03ff7 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -1,3 +1,5 @@ +import weakref + import numpy as np import pytest @@ -68,6 +70,15 @@ def test_api(any_string_dtype): assert isinstance(Series([""], dtype=any_string_dtype).str, StringMethods) +def test_no_circular_reference(any_string_dtype): + # GH 47667 + ser = Series([""], 
dtype=any_string_dtype) + ref = weakref.ref(ser) + ser.str # Used to cache and cause circular reference + del ser + assert ref() is None + + def test_api_mi_raises(): # GH 23679 mi = MultiIndex.from_arrays([["a", "b", "c"]]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 1b5d33fc10595..6da6ad27f853f 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -990,21 +990,18 @@ def test_isin_datetimelike_all_nat(self, dtype): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"]) - def test_isin_datetimelike_strings_deprecated(self, dtype): + def test_isin_datetimelike_strings_returns_false(self, dtype): # GH#53111 dta = date_range("2013-01-01", periods=3)._values arr = Series(dta.view("i8")).array.view(dtype) vals = [str(x) for x in arr] - msg = "The behavior of 'isin' with dtype=.* is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = algos.isin(arr, vals) - assert res.all() + res = algos.isin(arr, vals) + assert not res.any() vals2 = np.array(vals, dtype=str) - with tm.assert_produces_warning(FutureWarning, match=msg): - res2 = algos.isin(arr, vals2) - assert res2.all() + res2 = algos.isin(arr, vals2) + assert not res2.any() def test_isin_dt64tz_with_nat(self): # the all-NaT values used to get inferred to tznaive, which was evaluated diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index 4e569dc40005d..9deff56139394 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -1,5 +1,6 @@ from collections.abc import Generator import contextlib +import weakref import pytest @@ -101,3 +102,22 @@ def __init__(self, data) -> None: with pytest.raises(AttributeError, match="whoops"): pd.Series([], dtype=object).bad + + +@pytest.mark.parametrize( + "klass, registrar", + [ + (pd.Series, pd.api.extensions.register_series_accessor), + (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), + (pd.Index, pd.api.extensions.register_index_accessor), + ], +) +def test_no_circular_reference(klass, registrar): + # GH 41357 + with ensure_removed(klass, "access"): + registrar("access")(MyAccessor) + obj = klass([0]) + ref = weakref.ref(obj) + assert obj.access.obj is obj + del obj + assert ref() is None
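
A note on the accessor change that the two weakref tests above exercise: the old CachedAccessor descriptor cached the constructed accessor object on the parent Series/DataFrame/Index, and because the accessor in turn holds a reference back to its parent (the test above asserts ``obj.access.obj is obj``), that produced the reference cycle tracked in GH 41357 and GH 47667. The sketch below is a minimal standalone illustration of that difference, not pandas' implementation; the names ``CachingAccessor``, ``NonCachingAccessor`` and ``StrMethods`` are made up for the example, and it assumes the replacement ``Accessor`` simply rebuilds the accessor on each attribute access instead of caching it on the instance.

import weakref


class CachingAccessor:
    # Illustrative descriptor that caches the accessor on the instance,
    # so the instance ends up referencing an object that references it back.
    def __init__(self, name, accessor):
        self._name = name
        self._accessor = accessor

    def __get__(self, obj, cls):
        if obj is None:
            return self._accessor
        acc = self._accessor(obj)
        object.__setattr__(obj, self._name, acc)  # obj -> acc -> obj cycle
        return acc


class NonCachingAccessor(CachingAccessor):
    # Illustrative non-caching variant: build the accessor on every access,
    # so nothing stored on the instance points back at it.
    def __get__(self, obj, cls):
        if obj is None:
            return self._accessor
        return self._accessor(obj)


class StrMethods:
    def __init__(self, data) -> None:
        self.obj = data  # accessors hold a reference to their parent object


class Cached:
    str = CachingAccessor("str", StrMethods)


class Uncached:
    str = NonCachingAccessor("str", StrMethods)


for klass in (Cached, Uncached):
    obj = klass()
    obj.str  # touch the accessor once
    ref = weakref.ref(obj)
    del obj
    # The cached variant stays alive until the cyclic garbage collector runs;
    # the non-cached variant is freed as soon as the last reference is dropped.
    print(klass.__name__, "freed immediately:", ref() is None)

Under that assumption, the tests added above (for Series.str and for registered custom accessors) check exactly the second behaviour: the weakref dies as soon as ``del`` drops the last explicit reference, with no ``gc.collect()`` needed.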