diff --git a/.circleci/config.yml b/.circleci/config.yml index 50f6a116a6630..ba124533e953a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,7 @@ jobs: name: Build aarch64 wheels no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | - pip3 install cibuildwheel==2.14.1 + pip3 install cibuildwheel==2.15.0 cibuildwheel --prerelease-pythons --output-dir wheelhouse environment: CIBW_BUILD: << parameters.cibw-build >> @@ -92,5 +92,4 @@ workflows: only: /^v.*/ matrix: parameters: - # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"]#, "cp312-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"] diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 9daa1a67bc2c1..3bd68c07dcbc3 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -33,7 +33,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -109,7 +109,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --strict --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same build_docker_dev_environment: name: Build Docker Dev Environment @@ -143,7 +143,7 @@ jobs: run: docker image prune -f - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -164,7 +164,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 8715c5306a3b0..2182e89731990 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,7 +27,7 @@ jobs: - python steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index 2550d4de34a45..55dd733d25b50 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -51,7 +51,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index e05f12ac6416a..deaf2be0a0423 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -36,7 +36,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 04abcf4ce8816..64a94d7fde5a9 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -34,7 +34,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -62,7 +62,7 @@ jobs: cancel-in-progress: true steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 
030c9546fecca..139cfcde95b2c 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -136,7 +136,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -194,7 +194,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -230,11 +230,13 @@ jobs: git -c user.email="you@example.com" merge --no-commit my_ref_name fi - name: Build environment and Run Tests + # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -271,7 +273,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -330,7 +332,7 @@ jobs: PYTEST_TARGET: pandas steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -342,7 +344,7 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.0.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 5f541f1bae1fd..83d14b51092e6 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -48,7 +48,7 @@ jobs: sdist_file: ${{ steps.save-path.outputs.sdist_name }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -97,14 +97,13 @@ jobs: - [macos-12, macosx_*] - [windows-2022, win_amd64] # TODO: support PyPy? 
- # TODO: Enable Python 3.12 wheels when numpy releases a version that supports Python 3.12 - python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]#, ["cp312", "3.12"]] + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -150,8 +149,10 @@ jobs: uses: mamba-org/setup-micromamba@v1 with: environment-name: wheel-env + # Use a fixed Python, since we might have an unreleased Python not + # yet present on conda-forge create-args: >- - python=${{ matrix.python[1] }} + python=3.11 anaconda-client wheel cache-downloads: true @@ -167,12 +168,13 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; - python -m pip install --find-links=pandas\wheelhouse --no-index pandas; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ - docker pull python:${{ matrix.python[1] }}-windowsservercore - docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD + # add rc to the end of the image name if the Python version is unreleased + docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - uses: actions/upload-artifact@v3 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1bda47e0631a0..c01bf65818167 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.284 + rev: v0.0.287 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: alias: ruff-selected-autofixes args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix] - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.7' + rev: 'v2.9.1' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -84,7 +84,7 @@ repos: '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] - repo: https://github.com/pylint-dev/pylint - rev: v3.0.0a6 + rev: v3.0.0a7 hooks: - id: pylint stages: [manual] @@ -124,7 +124,7 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.6.7 + rev: v0.6.8 hooks: - id: sphinx-lint - repo: local diff --git a/LICENSES/BOTTLENECK_LICENCE b/LICENSES/BOTTLENECK_LICENCE new file mode 100644 index 0000000000000..f4bdbb1647ee6 --- /dev/null +++ b/LICENSES/BOTTLENECK_LICENCE @@ -0,0 +1,25 @@ +Copyright (c) 2010-2019 Keith Goodman +Copyright (c) 2019 Bottleneck Developers +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/DATEUTIL_LICENSE b/LICENSES/DATEUTIL_LICENSE index 6053d35cfc60b..1e65815cf0b31 100644 --- a/LICENSES/DATEUTIL_LICENSE +++ b/LICENSES/DATEUTIL_LICENSE @@ -51,4 +51,4 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -The above BSD License Applies to all code, even that also covered by Apache 2.0. +The above BSD License Applies to all code, even that also covered by Apache 2.0. \ No newline at end of file diff --git a/LICENSES/KLIB_LICENSE b/LICENSES/KLIB_LICENSE index 0a996fae3360f..2de13e402643b 100644 --- a/LICENSES/KLIB_LICENSE +++ b/LICENSES/KLIB_LICENSE @@ -20,4 +20,4 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. \ No newline at end of file diff --git a/LICENSES/MUSL_LICENSE b/LICENSES/MUSL_LICENSE index a8833d4bc4744..5ff71b5f5a7fe 100644 --- a/LICENSES/MUSL_LICENSE +++ b/LICENSES/MUSL_LICENSE @@ -1,7 +1,7 @@ musl as a whole is licensed under the following standard MIT license: ---------------------------------------------------------------------- -Copyright © 2005-2014 Rich Felker, et al. +Copyright © 2005-2020 Rich Felker, et al. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -25,37 +25,88 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Authors/contributors include: +A. Wilcox +Ada Worcester +Alex Dowad +Alex Suykov +Alexander Monakov +Andre McCurdy +Andrew Kelley Anthony G. Basile +Aric Belsito Arvid Picciani +Bartosz Brachaczek +Benjamin Peterson Bobby Bingham Boris Brezillon Brent Cook Chris Spiegel Clément Vasseur +Daniel Micay +Daniel Sabogal +Daurnimator +David Carlier +David Edelsohn +Denys Vlasenko +Dmitry Ivanov +Dmitry V. 
Levin +Drew DeVault Emil Renner Berthing +Fangrui Song +Felix Fietkau +Felix Janda +Gianluca Anzolin +Hauke Mehrtens +He X Hiltjo Posthuma Isaac Dunham +Jaydeep Patil Jens Gustedt Jeremy Huntwork +Jo-Philipp Wich +Joakim Sindholt John Spencer +Julien Ramseier Justin Cormack +Kaarle Ritvanen +Khem Raj +Kylie McClain +Leah Neukirchen Luca Barbato Luka Perkov M Farkas-Dyck (Strake) +Mahesh Bodapati +Markus Wichmann +Masanori Ogino +Michael Clark Michael Forney +Mikhail Kremnyov +Natanael Copa Nicholas J. Kain orc Pascal Cuoq +Patrick Oppenlander +Petr Hosek +Petr Skocik Pierre Carrier +Reini Urban Rich Felker Richard Pennington +Ryan Fairfax +Samuel Holland +Segev Finer +Shiz sin Solar Designer Stefan Kristiansson +Stefan O'Rear Szabolcs Nagy Timo Teräs +Trutz Behn Valentin Ochs +Will Dietz William Haddon +William Pitcock Portions of this software are derived from third-party works licensed under terms compatible with the above MIT license: @@ -71,18 +122,22 @@ Copyright © 1993,2004 Sun Microsystems or Copyright © 2003-2011 David Schultz or Copyright © 2003-2009 Steven G. Kargl or Copyright © 2003-2009 Bruce D. Evans or -Copyright © 2008 Stephen L. Moshier +Copyright © 2008 Stephen L. Moshier or +Copyright © 2017-2018 Arm Limited and labelled as such in comments in the individual source files. All have been licensed under extremely permissive terms. -The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008 +The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008 The Android Open Source Project and is licensed under a two-clause BSD license. It was taken from Bionic libc, used on Android. -The implementation of DES for crypt (src/misc/crypt_des.c) is +The AArch64 memcpy and memset code (src/string/aarch64/*) are +Copyright © 1999-2019, Arm Limited. + +The implementation of DES for crypt (src/crypt/crypt_des.c) is Copyright © 1994 David Burren. It is licensed under a BSD license. -The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was +The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was originally written by Solar Designer and placed into the public domain. The code also comes with a fallback permissive license for use in jurisdictions that may not recognize the public domain. @@ -90,22 +145,17 @@ in jurisdictions that may not recognize the public domain. The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 Valentin Ochs and is licensed under an MIT-style license. -The BSD PRNG implementation (src/prng/random.c) and XSI search API -(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and -licensed under following terms: "Permission to use, copy, modify, -and/or distribute this code for any purpose with or without fee is -hereby granted. There is no warranty." - -The x86_64 port was written by Nicholas J. Kain. Several files (crt) -were released into the public domain; others are licensed under the -standard MIT license terms at the top of this file. See individual -files for their copyright status. +The x86_64 port was written by Nicholas J. Kain and is licensed under +the standard MIT terms. The mips and microblaze ports were originally written by Richard Pennington for use in the ellcc project. The original code was adapted by Rich Felker for build system and code conventions during upstream integration. It is licensed under the standard MIT terms. +The mips64 port was contributed by Imagination Technologies and is +licensed under the standard MIT terms. 
+ The powerpc port was also originally written by Richard Pennington, and later supplemented and integrated by John Spencer. It is licensed under the standard MIT terms. @@ -118,15 +168,26 @@ can be found in the git version control history of the project. The omission of copyright and license comments in each file is in the interest of source tree size. -All public header files (include/* and arch/*/bits/*) should be -treated as Public Domain as they intentionally contain no content -which can be covered by copyright. Some source modules may fall in -this category as well. If you believe that a file is so trivial that -it should be in the Public Domain, please contact the authors and -request an explicit statement releasing it from copyright. +In addition, permission is hereby granted for all public header files +(include/* and arch/*/bits/*) and crt files intended to be linked into +applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit +the copyright notice and permission notice otherwise required by the +license, and to use these files without any requirement of +attribution. These files include substantial contributions from: + +Bobby Bingham +John Spencer +Nicholas J. Kain +Rich Felker +Richard Pennington +Stefan Kristiansson +Szabolcs Nagy -The following files are trivial, believed not to be copyrightable in -the first place, and hereby explicitly released to the Public Domain: +all of whom have explicitly granted such permission. -All public headers: include/*, arch/*/bits/* -Startup files: crt/* +This file previously contained text expressing a belief that most of +the files covered by the above exception were sufficiently trivial not +to be subject to copyright, resulting in confusion over whether it +negated the permissions granted in the license. In the spirit of +permissive licensing, and of not having licensing issues being an +obstacle to adoption, that text has been removed. \ No newline at end of file diff --git a/LICENSES/NUMPY_LICENSE b/LICENSES/NUMPY_LICENSE index 7e972cff80759..f2d647bf0bc48 100644 --- a/LICENSES/NUMPY_LICENSE +++ b/LICENSES/NUMPY_LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2005-2011, NumPy Developers. +Copyright (c) 2005-2023, NumPy Developers. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -27,4 +27,4 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/OTHER b/LICENSES/OTHER deleted file mode 100644 index e156152990cc4..0000000000000 --- a/LICENSES/OTHER +++ /dev/null @@ -1,57 +0,0 @@ -Bottleneck license ------------------- - -Copyright (c) 2010-2012 Archipel Asset Management AB. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -Pyperclip v1.3 license ----------------------- - -Copyright (c) 2010, Albert Sweigart -All rights reserved. - -BSD-style license: - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the pyperclip nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/PACKAGING_LICENSE b/LICENSES/PACKAGING_LICENSE index 4216ea1ce2379..2bfcd5297c470 100644 --- a/LICENSES/PACKAGING_LICENSE +++ b/LICENSES/PACKAGING_LICENSE @@ -199,4 +199,4 @@ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/PSF_LICENSE b/LICENSES/PSF_LICENSE index 5cdb01e8d24af..f26bcf4d2de6e 100644 --- a/LICENSES/PSF_LICENSE +++ b/LICENSES/PSF_LICENSE @@ -2,25 +2,24 @@ A. HISTORY OF THE SOFTWARE ========================== Python was created in the early 1990s by Guido van Rossum at Stichting -Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands as a successor of a language called ABC. Guido remains Python's principal author, although it includes many contributions from others. 
In 1995, Guido continued his work on Python at the Corporation for -National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) in Reston, Virginia where he released several versions of the software. In May 2000, Guido and the Python core development team moved to BeOpen.com to form the BeOpen PythonLabs team. In October of the same -year, the PythonLabs team moved to Digital Creations (now Zope -Corporation, see http://www.zope.com). In 2001, the Python Software -Foundation (PSF, see http://www.python.org/psf/) was formed, a -non-profit organization created specifically to own Python-related -Intellectual Property. Zope Corporation is a sponsoring member of -the PSF. - -All Python releases are Open Source (see http://www.opensource.org for +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. + +All Python releases are Open Source (see https://opensource.org for the Open Source Definition). Historically, most, but not all, Python releases have also been GPL-compatible; the table below summarizes the various releases. @@ -36,34 +35,9 @@ the various releases. 2.1 2.0+1.6.1 2001 PSF no 2.0.1 2.0+1.6.1 2001 PSF yes 2.1.1 2.1+2.0.1 2001 PSF yes - 2.2 2.1.1 2001 PSF yes 2.1.2 2.1.1 2002 PSF yes 2.1.3 2.1.2 2002 PSF yes - 2.2.1 2.2 2002 PSF yes - 2.2.2 2.2.1 2002 PSF yes - 2.2.3 2.2.2 2003 PSF yes - 2.3 2.2.2 2002-2003 PSF yes - 2.3.1 2.3 2002-2003 PSF yes - 2.3.2 2.3.1 2002-2003 PSF yes - 2.3.3 2.3.2 2002-2003 PSF yes - 2.3.4 2.3.3 2004 PSF yes - 2.3.5 2.3.4 2005 PSF yes - 2.4 2.3 2004 PSF yes - 2.4.1 2.4 2005 PSF yes - 2.4.2 2.4.1 2005 PSF yes - 2.4.3 2.4.2 2006 PSF yes - 2.4.4 2.4.3 2006 PSF yes - 2.5 2.4 2006 PSF yes - 2.5.1 2.5 2007 PSF yes - 2.5.2 2.5.1 2008 PSF yes - 2.5.3 2.5.2 2008 PSF yes - 2.6 2.5 2008 PSF yes - 2.6.1 2.6 2008 PSF yes - 2.6.2 2.6.1 2009 PSF yes - 2.6.3 2.6.2 2009 PSF yes - 2.6.4 2.6.3 2009 PSF yes - 2.6.5 2.6.4 2010 PSF yes - 2.7 2.6 2010 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes Footnotes: @@ -85,6 +59,17 @@ direction to make these releases possible. B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON =============================================================== +Python software and documentation are licensed under the +Python Software Foundation License Version 2. + +Starting with Python 3.8.6, examples, recipes, and other code in +the documentation are dual licensed under the PSF License Version 2 +and the Zero-Clause BSD license. + +Some software incorporated into Python is under different licenses. +The licenses are listed with code falling under that license. + + PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 -------------------------------------------- @@ -98,9 +83,10 @@ grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use Python alone or in any derivative version, provided, however, that PSF's License Agreement and PSF's notice of copyright, -i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 -Python Software Foundation; All Rights Reserved" are retained in Python alone or -in any derivative version prepared by Licensee. 
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. 3. In the event Licensee prepares a derivative work that is based on or incorporates Python or any part thereof, and wants to make @@ -205,9 +191,9 @@ version prepared by Licensee. Alternately, in lieu of CNRI's License Agreement, Licensee may substitute the following text (omitting the quotes): "Python 1.6.1 is made available subject to the terms and conditions in CNRI's License Agreement. This Agreement together with -Python 1.6.1 may be located on the Internet using the following +Python 1.6.1 may be located on the internet using the following unique, persistent identifier (known as a handle): 1895.22/1013. This -Agreement may also be obtained from a proxy server on the Internet +Agreement may also be obtained from a proxy server on the internet using the following URL: http://hdl.handle.net/1895.22/1013". 3. In the event Licensee prepares a derivative work that is based on @@ -277,3 +263,17 @@ FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION +---------------------------------------------------------------------- + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff --git a/LICENSES/PYPERCLIP_LICENSE b/LICENSES/PYPERCLIP_LICENSE new file mode 100644 index 0000000000000..07cc746cd5ad6 --- /dev/null +++ b/LICENSES/PYPERCLIP_LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2014, Al Sweigart +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the {organization} nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/PYUPGRADE_LICENSE b/LICENSES/PYUPGRADE_LICENSE index 522fbe20b8991..edeac73dade04 100644 --- a/LICENSES/PYUPGRADE_LICENSE +++ b/LICENSES/PYUPGRADE_LICENSE @@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. +THE SOFTWARE. \ No newline at end of file diff --git a/LICENSES/SAS7BDAT_LICENSE b/LICENSES/SAS7BDAT_LICENSE index 8fbf194013e93..94b7fe934e85c 100644 --- a/LICENSES/SAS7BDAT_LICENSE +++ b/LICENSES/SAS7BDAT_LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2015 Jared Hobbs +Copyright (c) 2015-2019 Jared Hobbs Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. \ No newline at end of file diff --git a/LICENSES/SCIPY_LICENSE b/LICENSES/SCIPY_LICENSE deleted file mode 100644 index d887ce5f9890f..0000000000000 --- a/LICENSES/SCIPY_LICENSE +++ /dev/null @@ -1,31 +0,0 @@ -Copyright (c) 2001, 2002 Enthought, Inc. -All rights reserved. - -Copyright (c) 2003-2012 SciPy Developers. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - a. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - b. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - c. Neither the name of Enthought nor the names of the SciPy Developers - may be used to endorse or promote products derived from this software - without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. - diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE index a905fb017d813..58fc9dfc0a35b 100644 --- a/LICENSES/ULTRAJSON_LICENSE +++ b/LICENSES/ULTRAJSON_LICENSE @@ -1,21 +1,22 @@ -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +Developed by ESN, an Electronic Arts Inc. studio. +Copyright (c) 2014, Electronic Arts Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the ESN Social Software AB nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of ESN, Electronic Arts Inc. nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +DISCLAIMED. IN NO EVENT SHALL ELECTRONIC ARTS INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -23,12 +24,91 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +---- Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from TCL library -http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + Copyright 2005, 2006, 2007 + Nick Galbreath -- nickg [at] modp [dot] com + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + Neither the name of the modp.com nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This is the standard "new" BSD license: + http://www.opensource.org/licenses/bsd-license.php + +https://github.com/client9/stringencoders/blob/cfd5c1507325ae497ea9bacdacba12c0ffd79d30/COPYING + +---- + +Numeric decoder derived from from TCL library +https://opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. + + This software is copyrighted by the Regents of the University of + California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + Corporation and other parties. The following terms apply to all files + associated with the software unless explicitly disclaimed in + individual files. + + The authors hereby grant permission to use, copy, modify, distribute, + and license this software and its documentation for any purpose, provided + that existing copyright notices are retained in all copies and that this + notice is included verbatim in any distributions. No written agreement, + license, or royalty fee is required for any of the authorized uses. + Modifications to this software may be copyrighted by their authors + and need not follow the licensing terms described here, provided that + the new terms are clearly indicated on the first page of each file where + they apply. + + IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + MODIFICATIONS. 
+ + GOVERNMENT USE: If you are acquiring this software on behalf of the + U.S. government, the Government shall have only "Restricted Rights" + in the software and related documentation as defined in the Federal + Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + are acquiring the software on behalf of the Department of Defense, the + software shall be classified as "Commercial Computer Software" and the + Government shall have only "Restricted Rights" as defined in Clause + 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + authors grant the U.S. Government and others acting in its behalf + permission to use and distribute the software in accordance with the + terms specified in this license. \ No newline at end of file diff --git a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE index 6bafeb9d3d80e..8405e89a0b120 100644 --- a/LICENSES/XARRAY_LICENSE +++ b/LICENSES/XARRAY_LICENSE @@ -1,7 +1,3 @@ -Copyright 2014-2019, xarray Developers - --------------------------------------------------------------------------------- - Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -192,4 +188,4 @@ third-party archives. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. \ No newline at end of file diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 4fd9740f184c8..49543c166d047 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -262,7 +262,7 @@ class Timeseries: def setup(self, tz): N = 10**6 halfway = (N // 2) - 1 - self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz)) + self.s = Series(date_range("20010101", periods=N, freq="min", tz=tz)) self.ts = self.s[halfway] self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz)) @@ -460,7 +460,7 @@ class OffsetArrayArithmetic: def setup(self, offset): N = 10000 - rng = date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="min") self.rng = rng self.ser = Series(rng) @@ -479,7 +479,7 @@ class ApplyIndex: def setup(self, offset): N = 10000 - rng = date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="min") self.rng = rng def time_apply_index(self, offset): diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 09c4acc0ab309..0229cf15fbfb8 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -90,7 +90,7 @@ def time_setitem(self, multiple_chunks): self.array[i] = "foo" def time_setitem_list(self, multiple_chunks): - indexer = list(range(0, 50)) + list(range(-1000, 0, 50)) + indexer = list(range(50)) + list(range(-1000, 0, 50)) self.array[indexer] = ["foo"] * len(indexer) def time_setitem_slice(self, multiple_chunks): diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 8a3d224c59a09..656d16a910a9f 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -44,7 +44,7 @@ class Query: def setup(self): N = 10**6 halfway = (N // 2) - 1 - index = pd.date_range("20010101", periods=N, freq="T") + index = pd.date_range("20010101", periods=N, freq="min") s = pd.Series(index) self.ts = s.iloc[halfway] self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index) diff --git 
a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 96fe90ce08c5f..e56fbf1d8c32f 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -693,20 +693,30 @@ def time_frame_sort_values(self, ascending): self.df.sort_values(by="A", ascending=ascending) -class SortIndexByColumns: - def setup(self): +class SortMultiKey: + params = [True, False] + param_names = ["monotonic"] + + def setup(self, monotonic): N = 10000 K = 10 - self.df = DataFrame( + df = DataFrame( { "key1": tm.makeStringIndex(N).values.repeat(K), "key2": tm.makeStringIndex(N).values.repeat(K), "value": np.random.randn(N * K), } ) + if monotonic: + df = df.sort_values(["key1", "key2"]) + self.df_by_columns = df + self.df_by_index = df.set_index(["key1", "key2"]) + + def time_sort_values(self, monotonic): + self.df_by_columns.sort_values(by=["key1", "key2"]) - def time_frame_sort_values_by_columns(self): - self.df.sort_values(by=["key1", "key2"]) + def time_sort_index(self, monotonic): + self.df_by_index.sort_index() class Quantile: diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 4d5c31d2dddf8..4993ffd2c47d0 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -178,7 +178,7 @@ def time_kth_smallest(self): class ParallelDatetimeFields: def setup(self): N = 10**6 - self.dti = date_range("1900-01-01", periods=N, freq="T") + self.dti = date_range("1900-01-01", periods=N, freq="min") self.period = self.dti.to_period("D") def time_datetime_field_year(self): diff --git a/asv_bench/benchmarks/index_cached_properties.py b/asv_bench/benchmarks/index_cached_properties.py index b3d8de39a858a..d21bbe15c4cc8 100644 --- a/asv_bench/benchmarks/index_cached_properties.py +++ b/asv_bench/benchmarks/index_cached_properties.py @@ -25,14 +25,14 @@ def setup(self, index_type): N = 10**5 if index_type == "MultiIndex": self.idx = pd.MultiIndex.from_product( - [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]] + [pd.date_range("1/1/2000", freq="min", periods=N // 2), ["a", "b"]] ) elif index_type == "DatetimeIndex": - self.idx = pd.date_range("1/1/2000", freq="T", periods=N) + self.idx = pd.date_range("1/1/2000", freq="min", periods=N) elif index_type == "Int64Index": self.idx = pd.Index(range(N), dtype="int64") elif index_type == "PeriodIndex": - self.idx = pd.period_range("1/1/2000", freq="T", periods=N) + self.idx = pd.period_range("1/1/2000", freq="min", periods=N) elif index_type == "RangeIndex": self.idx = pd.RangeIndex(start=0, stop=N) elif index_type == "IntervalIndex": diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index bdc8a6a7aa1df..2d8014570466e 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -25,7 +25,7 @@ class SetOperations: def setup(self, index_structure, dtype, method): N = 10**5 - dates_left = date_range("1/1/2000", periods=N, freq="T") + dates_left = date_range("1/1/2000", periods=N, freq="min") fmt = "%Y-%m-%d %H:%M:%S" date_str_left = Index(dates_left.strftime(fmt)) int_left = Index(np.arange(N)) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 9eaffddd8b87f..bebf6ee993aba 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -290,7 +290,7 @@ def time_float_longint_str_lines(self): class ToJSONMem: def setup_cache(self): df = DataFrame([[1]]) - df2 = DataFrame(range(8), date_range("1/1/2000", periods=8, freq="T")) + df2 = 
DataFrame(range(8), date_range("1/1/2000", periods=8, freq="min")) frames = {"int": df, "float": df.astype(float), "datetime": df2} return frames diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 4f325335829af..04ac47a892a22 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -212,7 +212,7 @@ class JoinNonUnique: # outer join of non-unique # GH 6329 def setup(self): - date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T") + date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="min") daily_dates = date_index.to_period("D").to_timestamp("S", "S") self.fracofday = date_index.values - daily_dates.values self.fracofday = self.fracofday.astype("timedelta64[ns]") @@ -338,7 +338,7 @@ class MergeDatetime: def setup(self, units, tz): unit_left, unit_right = units N = 10_000 - keys = Series(date_range("2012-01-01", freq="T", periods=N, tz=tz)) + keys = Series(date_range("2012-01-01", freq="min", periods=N, tz=tz)) self.left = DataFrame( { "key": keys.sample(N * 10, replace=True).dt.as_unit(unit_left), @@ -360,14 +360,14 @@ class MergeCategoricals: def setup(self): self.left_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Y": np.random.choice(["one", "two", "three"], size=(10000,)), } ) self.right_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)), } ) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index c8a9a9e6e9176..22a5511e4c678 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -22,7 +22,7 @@ class SparseSeriesToFrame: def setup(self): K = 50 N = 50001 - rng = date_range("1/1/2000", periods=N, freq="T") + rng = date_range("1/1/2000", periods=N, freq="min") self.series = {} for i in range(1, K): data = np.random.randn(N)[:-i] diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 1253fefde2d5f..8c78a9c1723df 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -116,7 +116,7 @@ def time_infer_freq(self, freq): class TimeDatetimeConverter: def setup(self): N = 100000 - self.rng = date_range(start="1/1/2000", periods=N, freq="T") + self.rng = date_range(start="1/1/2000", periods=N, freq="min") def time_convert(self): DatetimeConverter.convert(self.rng, None, None) @@ -129,9 +129,9 @@ class Iteration: def setup(self, time_index): N = 10**6 if time_index is timedelta_range: - self.idx = time_index(start=0, freq="T", periods=N) + self.idx = time_index(start=0, freq="min", periods=N) else: - self.idx = time_index(start="20140101", freq="T", periods=N) + self.idx = time_index(start="20140101", freq="min", periods=N) self.exit = 10000 def time_iter(self, time_index): @@ -149,7 +149,7 @@ class ResampleDataFrame: param_names = ["method"] def setup(self, method): - rng = date_range(start="20130101", periods=100000, freq="50L") + rng = date_range(start="20130101", periods=100000, freq="50ms") df = DataFrame(np.random.randn(100000, 2), index=rng) self.resample = getattr(df.resample("1s"), method) @@ -163,8 +163,8 @@ class ResampleSeries: def setup(self, index, freq, method): indexes = { - "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"), - "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"), + "period": period_range(start="1/1/2000", 
end="1/1/2001", freq="min"), + "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="min"), } idx = indexes[index] ts = Series(np.random.randn(len(idx)), index=idx) @@ -178,7 +178,7 @@ class ResampleDatetetime64: # GH 7754 def setup(self): rng3 = date_range( - start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U" + start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000us" ) self.dt_ts = Series(5, rng3, dtype="datetime64[ns]") @@ -270,7 +270,7 @@ class DatetimeAccessor: def setup(self, tz): N = 100000 - self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz)) + self.series = Series(date_range(start="1/1/2000", periods=N, freq="min", tz=tz)) def time_dt_accessor(self, tz): self.series.dt diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index d7706a39dfae5..082220ee0dff2 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -136,10 +136,10 @@ def time_to_julian_date(self, tz): self.ts.to_julian_date() def time_floor(self, tz): - self.ts.floor("5T") + self.ts.floor("5min") def time_ceil(self, tz): - self.ts.ceil("5T") + self.ts.ceil("5min") class TimestampAcrossDst: diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 638aaedecf2b5..2522605cf5c38 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 4d925786c22d6..dc67122c6e72e 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies @@ -35,7 +35,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 2cd4d5f3528f8..7fd3a65ec91f8 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,8 +6,9 @@ dependencies: # build dependencies - versioneer[toml] - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 + - cython>=0.29.33 # test dependencies - pytest>=7.3.2 @@ -25,7 +26,6 @@ dependencies: - pip - pip: - - "cython" - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index f24e866af0439..893341350f4ef 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - 
meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - cython>=0.29.33 - meson-python=0.13.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 698107341c26d..c9faaa2146235 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 # - pytables>=3.7.0, 3.8.0 is first version that supports 3.11 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index a0aafb1d772a7..b7cc6e9e891ce 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies @@ -48,6 +48,7 @@ dependencies: - pymysql=1.0.2 - pyreadstat=1.1.5 - pytables=3.7.0 + - python-calamine=0.1.6 - pyxlsb=1.0.9 - s3fs=2022.05.0 - scipy=1.8.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 6c6b1de165496..f51a8e04fbc7e 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 @@ -46,6 +46,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 035395d55eb3a..4923c94ab08f3 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -10,7 +10,7 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index d436e90fa6186..d1a9e336eaeac 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies @@ -34,7 +34,7 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 # test_numba_vs_cython segfaults with numba 0.57 - numba>=0.55.2, <0.57.0 - numexpr>=2.8.0 @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 # - pyreadstat>=1.1.5 not available on ARM - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md new file mode 100644 index 0000000000000..6c33de104ed90 --- /dev/null +++ b/doc/cheatsheet/README.md @@ -0,0 +1,22 @@ +# Pandas Cheat Sheet + +The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. 
+To create the PDF version, within Powerpoint, simply do a "Save As" +and pick "PDF" as the format. + +This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). + +| Topic | PDF | PPT | +|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas_Cheat_Sheet | | | +| Pandas_Cheat_Sheet_JA | | | + + +**Alternative** + +Alternatively, if you want to complement your learning, you can use the Pandas Cheat sheets +developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats. + +| Topic | PDF | Streamlit | Google Colab | +|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas | | | Open In Colab | diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt deleted file mode 100644 index c57da38b31777..0000000000000 --- a/doc/cheatsheet/README.txt +++ /dev/null @@ -1,8 +0,0 @@ -The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. -To create the PDF version, within Powerpoint, simply do a "Save As" -and pick "PDF" as the format. - -This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2]. - -[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf -[2]: https://www.princetonoptimization.com/ diff --git a/doc/make.py b/doc/make.py index 937b2638fb098..23b18d69acc04 100755 --- a/doc/make.py +++ b/doc/make.py @@ -123,14 +123,14 @@ def _sphinx_build(self, kind: str): Parameters ---------- - kind : {'html', 'latex'} + kind : {'html', 'latex', 'linkcheck'} Examples -------- >>> DocBuilder(num_jobs=4)._sphinx_build('html') """ - if kind not in ("html", "latex"): - raise ValueError(f"kind must be html or latex, not {kind}") + if kind not in ("html", "latex", "linkcheck"): + raise ValueError(f"kind must be html, latex or linkcheck, not {kind}") cmd = ["sphinx-build", "-b", kind] if self.num_jobs: @@ -159,10 +159,10 @@ def _get_page_title(self, page): Open the rst file `page` and extract its title. 
""" fname = os.path.join(SOURCE_PATH, f"{page}.rst") - option_parser = docutils.frontend.OptionParser( - components=(docutils.parsers.rst.Parser,) + doc = docutils.utils.new_document( + "", + docutils.frontend.get_default_settings(docutils.parsers.rst.Parser), ) - doc = docutils.utils.new_document("", option_parser.get_default_values()) with open(fname, encoding="utf-8") as f: data = f.read() @@ -288,6 +288,12 @@ def zip_html(self): os.chdir(dirname) self._run_os("zip", zip_fname, "-r", "-q", *fnames) + def linkcheck(self): + """ + Check for broken links in the documentation. + """ + return self._sphinx_build("linkcheck") + def main(): cmds = [method for method in dir(DocBuilder) if not method.startswith("_")] diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 247f5cb0cb62d..82af8122a6bbd 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -311,6 +311,7 @@ If using :ref:`mamba `, run: .. code-block:: shell git checkout main + git fetch upstream git merge upstream/main mamba activate pandas-dev mamba env update -f environment.yml --prune @@ -320,6 +321,7 @@ If using :ref:`pip ` , do: .. code-block:: shell git checkout main + git fetch upstream git merge upstream/main # activate the virtual environment based on your platform python -m pip install --upgrade -r requirements-dev.txt diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst index 9ac4cf4083475..27f812b65e261 100644 --- a/doc/source/development/debugging_extensions.rst +++ b/doc/source/development/debugging_extensions.rst @@ -21,12 +21,16 @@ By default building pandas from source will generate a release build. To generat pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" +.. note:: + + conda environements update CFLAGS/CPPFLAGS with flags that are geared towards generating releases. If using conda, you may need to set ``CFLAGS="$CFLAGS -O0"`` and ``CPPFLAGS="$CPPFLAGS -O0"`` to ensure optimizations are turned off for debugging + By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types. Editor support -------------- -The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-defintion and error checking support as you type. +The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-definition and error checking support as you type. How each language server / IDE chooses to look for the compilation database may vary. When in doubt you may want to create a symlink at the root of the project that points to the compilation database in your build directory. 
Assuming you used *debug* as your directory name, you can run:: diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ae7c9d4ea9c62..2c0787397e047 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -281,6 +281,7 @@ xlrd 2.0.1 excel Reading Excel xlsxwriter 3.0.3 excel Writing Excel openpyxl 3.0.10 excel Reading / writing for xlsx files pyxlsb 1.0.9 excel Reading for xlsb files +python-calamine 0.1.6 excel Reading for xls/xlsx/xlsb/ods files ========================= ================== =============== ============================================================= HTML diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index 2dcc8b0abe3b8..caaff3557ae40 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -106,9 +106,9 @@ between square brackets ``[]``. .. note:: - If you are familiar to Python + If you are familiar with Python :ref:`dictionaries `, the selection of a - single column is very similar to selection of dictionary values based on + single column is very similar to the selection of dictionary values based on the key. You can create a ``Series`` from scratch as well: diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index 470b3908802b2..b0530087e5b84 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in .. ipython:: python - monthly_max = no_2.resample("M").max() + monthly_max = no_2.resample("ME").max() monthly_max A very powerful method on time series data with a datetime index, is the diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index eb2529716b55e..83f830bb11198 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -34,11 +34,13 @@ objects. api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type + api.extensions.ExtensionArray._explode api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings api.extensions.ExtensionArray._hash_pandas_object + api.extensions.ExtensionArray._pad_or_backfill api.extensions.ExtensionArray._reduce api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize @@ -54,7 +56,6 @@ objects. api.extensions.ExtensionArray.interpolate api.extensions.ExtensionArray.isin api.extensions.ExtensionArray.isna - api.extensions.ExtensionArray.pad_or_backfill api.extensions.ExtensionArray.ravel api.extensions.ExtensionArray.repeat api.extensions.ExtensionArray.searchsorted diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 58351bab07b22..9acbab7a42800 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -525,6 +525,29 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.from_coo Series.sparse.to_coo + +.. 
_api.series.struct: + +Struct accessor +~~~~~~~~~~~~~~~ + +Arrow struct-dtype specific methods and attributes are provided under the +``Series.struct`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_attribute.rst + + Series.struct.dtypes + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + + Series.struct.field + Series.struct.explode + + .. _api.series.flags: Flags diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 1a891dca839e3..5def84b91705c 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -610,7 +610,7 @@ financial applications. See the :ref:`Time Series section `. .. ipython:: python - rng = pd.date_range("1/1/2012", periods=100, freq="S") + rng = pd.date_range("1/1/2012", periods=100, freq="s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) ts.resample("5Min").sum() diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 9efa7df3ff669..34d04745ccdb5 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -832,9 +832,6 @@ The following table summarizes the results of merging ``Categoricals``: | category (int) | category (float) | False | float (dtype is inferred) | +-------------------+------------------------+----------------------+-----------------------------+ -See also the section on :ref:`merge dtypes` for notes about -preserving merge dtypes and performance. - .. _categorical.union: Unioning diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 66ee571d6b5a5..b1a6aa8753be1 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -459,14 +459,14 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False) `Using get_group `__ .. 
ipython:: python - gb = df.groupby(["animal"]) + gb = df.groupby("animal") gb.get_group("cat") `Apply to different items in a group @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp) + expected_df = gb.apply(GrowUp, include_groups=False) expected_df `Expanding apply @@ -771,7 +771,7 @@ To create year and month cross tabulation: df = pd.DataFrame( {"value": np.random.randn(36)}, - index=pd.date_range("2011-01-01", freq="M", periods=36), + index=pd.date_range("2011-01-01", freq="ME", periods=36), ) pd.pivot_table( @@ -794,12 +794,12 @@ Apply index=["I", "II", "III"], ) - def make_df(ser): - new_vals = [pd.Series(value, name=name) for name, value in ser.items()] - return pd.DataFrame(new_vals) - - df_orgz = pd.concat({ind: row.pipe(make_df) for ind, row in df.iterrows()}) + def SeriesFromSubList(aList): + return pd.Series(aList) + df_orgz = pd.concat( + {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()} + ) df_orgz `Rolling apply with a DataFrame returning a Series diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index d60532f5f4027..d1e981ee1bbdc 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -308,7 +308,7 @@ The row and column labels can be accessed respectively by accessing the From dict of ndarrays / lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ndarrays must all be the same length. If an index is passed, it must +All ndarrays must share the same length. If an index is passed, it must also be the same length as the arrays. If no index is passed, the result will be ``range(n)``, where ``n`` is the array length. diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index c00a236ff4e9d..99c85ac66623d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -379,7 +379,7 @@ constructors using something similar to the following: .. ipython:: python x = np.array(list(range(10)), ">i4") # big endian - newx = x.byteswap().newbyteorder() # force native byteorder + newx = x.byteswap().view(x.dtype.newbyteorder()) # force native byteorder s = pd.Series(newx) See `the NumPy documentation on byte order diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 75c816f66d5e4..b2df1eaec60cc 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -211,9 +211,9 @@ For example, the groups created by ``groupby()`` below are in the order they app .. ipython:: python df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) - df3.groupby(["X"]).get_group("A") + df3.groupby("X").get_group("A") - df3.groupby(["X"]).get_group("B") + df3.groupby(["X"]).get_group(("B",)) .. _groupby.dropna: @@ -420,6 +420,12 @@ This is mainly syntactic sugar for the alternative, which is much more verbose: Additionally, this method avoids recomputing the internal grouping information derived from the passed key. +You can also include the grouping columns if you want to operate on them. + +.. ipython:: python + + grouped[["A", "B"]].sum() + .. _groupby.iterating-label: Iterating through groups @@ -1053,7 +1059,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D").ffill() + df_re.groupby("group").resample("1D", include_groups=False).ffill() .. 
_groupby.filter: @@ -1207,6 +1213,19 @@ The dimension of the returned result can also change: grouped.apply(f) +``apply`` on a Series can operate on a returned value from the applied function +that is itself a series, and possibly upcast the result to a DataFrame: + +.. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) + Similar to :ref:`groupby.aggregate.agg`, the resulting dtype will reflect that of the apply function. If the results from different groups have different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction. @@ -1219,13 +1238,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x) + df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x) + df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) Numba Accelerated Routines @@ -1397,7 +1416,7 @@ Groupby a specific column with the desired frequency. This is like resampling. .. ipython:: python - df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="1ME", key="Date"), "Buyer"])[["Quantity"]].sum() When ``freq`` is specified, the object returned by ``pd.Grouper`` will be an instance of ``pandas.api.typing.TimeGrouper``. You have an ambiguous specification @@ -1407,9 +1426,9 @@ in that you have a named index and a column that could be potential groupers. df = df.set_index("Date") df["Date"] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6ME", key="Date"), "Buyer"])[["Quantity"]].sum() - df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6ME", level="Date"), "Buyer"])[["Quantity"]].sum() Taking the first rows of each group @@ -1709,7 +1728,7 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics, include_groups=False) result diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 52bc43f52b1d3..7541cf3c8af3c 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1837,7 +1837,7 @@ This however is operating on a copy and will not work. :okwarning: :okexcept: - with option_context('mode.chained_assignment','warn'): + with pd.option_context('mode.chained_assignment','warn'): dfb[dfb['a'].str.startswith('o')]['c'] = 42 A chained assignment can also crop up in setting in a mixed dtype frame. @@ -1879,7 +1879,7 @@ Last, the subsequent example will **not** work at all, and so should be avoided: :okwarning: :okexcept: - with option_context('mode.chained_assignment','raise'): + with pd.option_context('mode.chained_assignment','raise'): dfd.loc[0]['a'] = 1111 .. warning:: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 93294d3cbdcfe..6bd181740c78d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1811,8 +1811,8 @@ Writing JSON A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. 
Use ``to_json`` with optional parameters: -* ``path_or_buf`` : the pathname or buffer to write the output - This can be ``None`` in which case a JSON string is returned +* ``path_or_buf`` : the pathname or buffer to write the output. + This can be ``None`` in which case a JSON string is returned. * ``orient`` : ``Series``: @@ -2448,7 +2448,7 @@ Read a URL with no options: .. code-block:: ipython - In [320]: "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" + In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" In [321]: pd.read_html(url) Out[321]: [ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund @@ -2619,7 +2619,7 @@ columns to strings. .. code-block:: python - url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code" + url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code?oldid=899173761" dfs = pd.read_html( url_mcc, match="Telekom Albania", @@ -3453,7 +3453,8 @@ Excel files The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files can be read using ``xlrd``. Binary Excel (``.xlsb``) -files can be read using ``pyxlsb``. +files can be read using ``pyxlsb``. All formats can be read +using :ref:`calamine` engine. The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. @@ -3494,6 +3495,9 @@ using internally. * For the engine odf, pandas is using :func:`odf.opendocument.load` to read in (``.ods``) files. +* For the engine calamine, pandas is using :func:`python_calamine.load_workbook` + to read in (``.xlsx``), (``.xlsm``), (``.xls``), (``.xlsb``), (``.ods``) files. + .. code-block:: python # Returns a DataFrame @@ -3935,7 +3939,8 @@ The :func:`~pandas.read_excel` method can also read binary Excel files using the ``pyxlsb`` module. The semantics and features for reading binary Excel files mostly match what can be done for `Excel files`_ using ``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types -in files and will return floats instead. +in files and will return floats instead (you can use :ref:`calamine` +if you need recognize datetime types). .. code-block:: python @@ -3947,6 +3952,20 @@ in files and will return floats instead. Currently pandas only supports *reading* binary Excel files. Writing is not implemented. +.. _io.calamine: + +Calamine (Excel and ODS files) +------------------------------ + +The :func:`~pandas.read_excel` method can read Excel file (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) +and OpenDocument spreadsheets (``.ods``) using the ``python-calamine`` module. +This module is a binding for Rust library `calamine `__ +and is faster than other engines in most cases. The optional dependency 'python-calamine' needs to be installed. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel("path_to_file.xlsb", engine="calamine") .. _io.clipboard: diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 10793a6973f8a..3e0e3245e8d64 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -15,27 +15,27 @@ Merge, join, concatenate and compare ************************************ -pandas provides various facilities for easily combining together Series or -DataFrame with various kinds of set logic for the indexes -and relational algebra functionality in the case of join / merge-type -operations. 
+pandas provides various methods for combining and comparing :class:`Series` or +:class:`DataFrame`. -In addition, pandas also provides utilities to compare two Series or DataFrame -and summarize their differences. +* :func:`~pandas.concat`: Merge multiple :class:`Series` or :class:`DataFrame` objects along a shared index or column +* :meth:`DataFrame.join`: Merge multiple :class:`DataFrame` objects along the columns +* :meth:`DataFrame.combine_first`: Update missing values with non-missing values in the same location +* :func:`~pandas.merge`: Combine two :class:`Series` or :class:`DataFrame` objects with SQL-style joining +* :func:`~pandas.merge_ordered`: Combine two :class:`Series` or :class:`DataFrame` objects along an ordered axis +* :func:`~pandas.merge_asof`: Combine two :class:`Series` or :class:`DataFrame` objects by near instead of exact matching keys +* :meth:`Series.compare` and :meth:`DataFrame.compare`: Show differences in values between two :class:`Series` or :class:`DataFrame` objects .. _merging.concat: -Concatenating objects ---------------------- - -The :func:`~pandas.concat` function (in the main pandas namespace) does all of -the heavy lifting of performing concatenation operations along an axis while -performing optional set logic (union or intersection) of the indexes (if any) on -the other axes. Note that I say "if any" because there is only a single possible -axis of concatenation for Series. +:func:`~pandas.concat` +---------------------- -Before diving into all of the details of ``concat`` and what it can do, here is -a simple example: +The :func:`~pandas.concat` function concatenates an arbitrary amount of +:class:`Series` or :class:`DataFrame` objects along an axis while +performing optional set logic (union or intersection) of the indexes on +the other axes. Like ``numpy.concatenate``, :func:`~pandas.concat` +takes a list or dict of homogeneously-typed objects and concatenates them. .. ipython:: python @@ -71,6 +71,7 @@ a simple example: frames = [df1, df2, df3] result = pd.concat(frames) + result .. ipython:: python :suppress: @@ -79,81 +80,12 @@ a simple example: p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True); plt.close("all"); -Like its sibling function on ndarrays, ``numpy.concatenate``, ``pandas.concat`` -takes a list or dict of homogeneously-typed objects and concatenates them with -some configurable handling of "what to do with the other axes": - -:: - - pd.concat( - objs, - axis=0, - join="outer", - ignore_index=False, - keys=None, - levels=None, - names=None, - verify_integrity=False, - copy=True, - ) - -* ``objs`` : a sequence or mapping of Series or DataFrame objects. If a - dict is passed, the sorted keys will be used as the ``keys`` argument, unless - it is passed, in which case the values will be selected (see below). Any None - objects will be dropped silently unless they are all None in which case a - ValueError will be raised. -* ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along. -* ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on - other axis(es). Outer for union and inner for intersection. -* ``ignore_index`` : boolean, default False. If True, do not use the index - values on the concatenation axis. The resulting axis will be labeled 0, ..., - n - 1. This is useful if you are concatenating objects where the - concatenation axis does not have meaningful indexing information. Note - the index values on the other axes are still respected in the join. 
-* ``keys`` : sequence, default None. Construct hierarchical index using the - passed keys as the outermost level. If multiple levels passed, should - contain tuples. -* ``levels`` : list of sequences, default None. Specific levels (unique values) - to use for constructing a MultiIndex. Otherwise they will be inferred from the - keys. -* ``names`` : list, default None. Names for the levels in the resulting - hierarchical index. -* ``verify_integrity`` : boolean, default False. Check whether the new - concatenated axis contains duplicates. This can be very expensive relative - to the actual data concatenation. -* ``copy`` : boolean, default True. If False, do not copy data unnecessarily. - -Without a little bit of context many of these arguments don't make much sense. -Let's revisit the above example. Suppose we wanted to associate specific keys -with each of the pieces of the chopped up DataFrame. We can do this using the -``keys`` argument: - -.. ipython:: python - - result = pd.concat(frames, keys=["x", "y", "z"]) - -.. ipython:: python - :suppress: - - @savefig merging_concat_keys.png - p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True) - plt.close("all"); - -As you can see (if you've read the rest of the documentation), the resulting -object's index has a :ref:`hierarchical index `. This -means that we can now select out each chunk by key: - -.. ipython:: python - - result.loc["y"] - -It's not a stretch to see how this can be very useful. More detail on this -functionality below. - .. note:: - It is worth noting that :func:`~pandas.concat` makes a full copy of the data, and that constantly - reusing this function can create a significant performance hit. If you need - to use the operation over several datasets, use a list comprehension. + + :func:`~pandas.concat` makes a full copy of the data, and iteratively + reusing :func:`~pandas.concat` can create unnecessary copies. Collect all + :class:`DataFrame` or :class:`Series` objects in a list before using + :func:`~pandas.concat`. .. code-block:: python @@ -162,26 +94,20 @@ functionality below. .. note:: - When concatenating DataFrames with named axes, pandas will attempt to preserve + When concatenating :class:`DataFrame` with named axes, pandas will attempt to preserve these index/column names whenever possible. In the case where all inputs share a common name, this name will be assigned to the result. When the input names do not all agree, the result will be unnamed. The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. -Set logic on the other axes -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Joining logic of the resulting axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When gluing together multiple DataFrames, you have a choice of how to handle -the other axes (other than the one being concatenated). This can be done in -the following two ways: +The ``join`` keyword specifies how to handle axis values that don't exist in the first +:class:`DataFrame`. -* Take the union of them all, ``join='outer'``. This is the default - option as it results in zero information loss. -* Take the intersection, ``join='inner'``. - -Here is an example of each of these methods. First, the default ``join='outer'`` -behavior: +``join='outer'`` takes the union of all axis values .. ipython:: python @@ -194,6 +120,7 @@ behavior: index=[2, 3, 6, 7], ) result = pd.concat([df1, df4], axis=1) + result .. 
ipython:: python @@ -203,11 +130,12 @@ behavior: p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); -Here is the same thing with ``join='inner'``: +``join='inner'`` takes the intersection of the axis values .. ipython:: python result = pd.concat([df1, df4], axis=1, join="inner") + result .. ipython:: python :suppress: @@ -216,18 +144,13 @@ Here is the same thing with ``join='inner'``: p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); -Lastly, suppose we just wanted to reuse the *exact index* from the original -DataFrame: +To perform an effective "left" join using the *exact index* from the original +:class:`DataFrame`, result can be reindexed. .. ipython:: python result = pd.concat([df1, df4], axis=1).reindex(df1.index) - -Similarly, we could index before the concatenation: - -.. ipython:: python - - pd.concat([df1, df4.reindex(df1.index)], axis=1) + result .. ipython:: python :suppress: @@ -240,13 +163,14 @@ Similarly, we could index before the concatenation: Ignoring indexes on the concatenation axis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For ``DataFrame`` objects which don't have a meaningful index, you may wish -to append them and ignore the fact that they may have overlapping indexes. To -do this, use the ``ignore_index`` argument: + +For :class:`DataFrame` objects which don't have a meaningful index, the ``ignore_index`` +ignores overlapping indexes. .. ipython:: python result = pd.concat([df1, df4], ignore_index=True, sort=False) + result .. ipython:: python :suppress: @@ -257,17 +181,18 @@ do this, use the ``ignore_index`` argument: .. _merging.mixed_ndims: -Concatenating with mixed ndims -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Concatenating :class:`Series` and :class:`DataFrame` together +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can concatenate a mix of ``Series`` and ``DataFrame`` objects. The -``Series`` will be transformed to ``DataFrame`` with the column name as -the name of the ``Series``. +You can concatenate a mix of :class:`Series` and :class:`DataFrame` objects. The +:class:`Series` will be transformed to :class:`DataFrame` with the column name as +the name of the :class:`Series`. .. ipython:: python s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X") result = pd.concat([df1, s1], axis=1) + result .. ipython:: python :suppress: @@ -276,19 +201,13 @@ the name of the ``Series``. p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False); plt.close("all"); -.. note:: - - Since we're concatenating a ``Series`` to a ``DataFrame``, we could have - achieved the same result with :meth:`DataFrame.assign`. To concatenate an - arbitrary number of pandas objects (``DataFrame`` or ``Series``), use - ``concat``. - -If unnamed ``Series`` are passed they will be numbered consecutively. +Unnamed :class:`Series` will be numbered consecutively. .. ipython:: python s2 = pd.Series(["_0", "_1", "_2", "_3"]) result = pd.concat([df1, s2, s2, s2], axis=1) + result .. ipython:: python :suppress: @@ -297,11 +216,12 @@ If unnamed ``Series`` are passed they will be numbered consecutively. p.plot([df1, s2], result, labels=["df1", "s2"], vertical=False); plt.close("all"); -Passing ``ignore_index=True`` will drop all name references. +``ignore_index=True`` will drop all name references. .. ipython:: python result = pd.concat([df1, s1], axis=1, ignore_index=True) + result .. ipython:: python :suppress: @@ -310,48 +230,45 @@ Passing ``ignore_index=True`` will drop all name references. 
p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False); plt.close("all"); -More concatenating with group keys -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Resulting ``keys`` +~~~~~~~~~~~~~~~~~~ -A fairly common use of the ``keys`` argument is to override the column names -when creating a new ``DataFrame`` based on existing ``Series``. -Notice how the default behaviour consists on letting the resulting ``DataFrame`` -inherit the parent ``Series``' name, when these existed. +The ``keys`` argument adds another axis level to the resulting index or column (creating +a :class:`MultiIndex`) associate specific keys with each original :class:`DataFrame`. .. ipython:: python - s3 = pd.Series([0, 1, 2, 3], name="foo") - s4 = pd.Series([0, 1, 2, 3]) - s5 = pd.Series([0, 1, 4, 5]) - - pd.concat([s3, s4, s5], axis=1) - -Through the ``keys`` argument we can override the existing column names. + result = pd.concat(frames, keys=["x", "y", "z"]) + result + result.loc["y"] .. ipython:: python + :suppress: - pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"]) + @savefig merging_concat_keys.png + p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True) + plt.close("all"); -Let's consider a variation of the very first example presented: +The ``keys`` argument cane override the column names +when creating a new :class:`DataFrame` based on existing :class:`Series`. .. ipython:: python - result = pd.concat(frames, keys=["x", "y", "z"]) - -.. ipython:: python - :suppress: + s3 = pd.Series([0, 1, 2, 3], name="foo") + s4 = pd.Series([0, 1, 2, 3]) + s5 = pd.Series([0, 1, 4, 5]) - @savefig merging_concat_group_keys2.png - p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True); - plt.close("all"); + pd.concat([s3, s4, s5], axis=1) + pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"]) -You can also pass a dict to ``concat`` in which case the dict keys will be used -for the ``keys`` argument (unless other keys are specified): +You can also pass a dict to :func:`concat` in which case the dict keys will be used +for the ``keys`` argument unless other ``keys`` argument is specified: .. ipython:: python pieces = {"x": df1, "y": df2, "z": df3} result = pd.concat(pieces) + result .. ipython:: python :suppress: @@ -363,6 +280,7 @@ for the ``keys`` argument (unless other keys are specified): .. ipython:: python result = pd.concat(pieces, keys=["z", "y"]) + result .. ipython:: python :suppress: @@ -371,21 +289,21 @@ for the ``keys`` argument (unless other keys are specified): p.plot([df1, df2, df3], result, labels=["df1", "df2", "df3"], vertical=True); plt.close("all"); -The MultiIndex created has levels that are constructed from the passed keys and -the index of the ``DataFrame`` pieces: +The :class:`MultiIndex` created has levels that are constructed from the passed keys and +the index of the :class:`DataFrame` pieces: .. ipython:: python result.index.levels -If you wish to specify other levels (as will occasionally be the case), you can -do so using the ``levels`` argument: +``levels`` argument allows specifying resulting levels associated with the ``keys`` .. ipython:: python result = pd.concat( pieces, keys=["x", "y", "z"], levels=[["z", "y", "x", "w"]], names=["group_key"] ) + result .. ipython:: python :suppress: @@ -398,21 +316,19 @@ do so using the ``levels`` argument: result.index.levels -This is fairly esoteric, but it is actually necessary for implementing things -like GroupBy where the order of a categorical variable is meaningful. - .. 
_merging.append.row: -Appending rows to a DataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Appending rows to a :class:`DataFrame` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you have a series that you want to append as a single row to a ``DataFrame``, you can convert the row into a -``DataFrame`` and use ``concat`` +If you have a :class:`Series` that you want to append as a single row to a :class:`DataFrame`, you can convert the row into a +:class:`DataFrame` and use :func:`concat` .. ipython:: python s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"]) result = pd.concat([df1, s2.to_frame().T], ignore_index=True) + result .. ipython:: python :suppress: @@ -421,131 +337,35 @@ If you have a series that you want to append as a single row to a ``DataFrame``, p.plot([df1, s2], result, labels=["df1", "s2"], vertical=True); plt.close("all"); -You should use ``ignore_index`` with this method to instruct DataFrame to -discard its index. If you wish to preserve the index, you should construct an -appropriately-indexed DataFrame and append or concatenate those objects. - .. _merging.join: -Database-style DataFrame or named Series joining/merging --------------------------------------------------------- - -pandas has full-featured, **high performance** in-memory join operations -idiomatically very similar to relational databases like SQL. These methods -perform significantly better (in some cases well over an order of magnitude -better) than other open source implementations (like ``base::merge.data.frame`` -in R). The reason for this is careful algorithmic design and the internal layout -of the data in ``DataFrame``. - -See the :ref:`cookbook` for some advanced strategies. +:func:`~pandas.merge` +--------------------- -Users who are familiar with SQL but new to pandas might be interested in a +:func:`~pandas.merge` performs join operations similar to relational databases like SQL. +Users who are familiar with SQL but new to pandas can reference a :ref:`comparison with SQL`. -pandas provides a single function, :func:`~pandas.merge`, as the entry point for -all standard database join operations between ``DataFrame`` or named ``Series`` objects: - -:: - - pd.merge( - left, - right, - how="inner", - on=None, - left_on=None, - right_on=None, - left_index=False, - right_index=False, - sort=True, - suffixes=("_x", "_y"), - copy=True, - indicator=False, - validate=None, - ) +Merge types +~~~~~~~~~~~ + +:func:`~pandas.merge` implements common SQL style joining operations. -* ``left``: A DataFrame or named Series object. -* ``right``: Another DataFrame or named Series object. -* ``on``: Column or index level names to join on. Must be found in both the left - and right DataFrame and/or Series objects. If not passed and ``left_index`` and - ``right_index`` are ``False``, the intersection of the columns in the - DataFrames and/or Series will be inferred to be the join keys. -* ``left_on``: Columns or index levels from the left DataFrame or Series to use as - keys. Can either be column names, index level names, or arrays with length - equal to the length of the DataFrame or Series. -* ``right_on``: Columns or index levels from the right DataFrame or Series to use as - keys. Can either be column names, index level names, or arrays with length - equal to the length of the DataFrame or Series. -* ``left_index``: If ``True``, use the index (row labels) from the left - DataFrame or Series as its join key(s). 
In the case of a DataFrame or Series with a MultiIndex - (hierarchical), the number of levels must match the number of join keys - from the right DataFrame or Series. -* ``right_index``: Same usage as ``left_index`` for the right DataFrame or Series -* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``, ``'cross'``. Defaults - to ``inner``. See below for more detailed description of each method. -* ``sort``: Sort the result DataFrame by the join keys in lexicographical - order. Defaults to ``True``, setting to ``False`` will improve performance - substantially in many cases. -* ``suffixes``: A tuple of string suffixes to apply to overlapping - columns. Defaults to ``('_x', '_y')``. -* ``copy``: Always copy data (default ``True``) from the passed DataFrame or named Series - objects, even when reindexing is not necessary. Cannot be avoided in many - cases but may improve performance / memory usage. The cases where copying - can be avoided are somewhat pathological but this option is provided - nonetheless. -* ``indicator``: Add a column to the output DataFrame called ``_merge`` - with information on the source of each row. ``_merge`` is Categorical-type - and takes on a value of ``left_only`` for observations whose merge key - only appears in ``'left'`` DataFrame or Series, ``right_only`` for observations whose - merge key only appears in ``'right'`` DataFrame or Series, and ``both`` if the - observation's merge key is found in both. - -* ``validate`` : string, default None. - If specified, checks if merge is of specified type. - - * "one_to_one" or "1:1": checks if merge keys are unique in both - left and right datasets. - * "one_to_many" or "1:m": checks if merge keys are unique in left - dataset. - * "many_to_one" or "m:1": checks if merge keys are unique in right - dataset. - * "many_to_many" or "m:m": allowed, but does not result in checks. - -The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` or named ``Series`` -and ``right`` is a subclass of ``DataFrame``, the return type will still be ``DataFrame``. - -``merge`` is a function in the pandas namespace, and it is also available as a -``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling -``DataFrame`` being implicitly considered the left object in the join. - -The related :meth:`~DataFrame.join` method, uses ``merge`` internally for the -index-on-index (by default) and column(s)-on-index join. If you are joining on -index only, you may wish to use ``DataFrame.join`` to save yourself some typing. - -Brief primer on merge methods (relational algebra) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Experienced users of relational databases like SQL will be familiar with the -terminology used to describe join operations between two SQL-table like -structures (``DataFrame`` objects). There are several cases to consider which -are very important to understand: - -* **one-to-one** joins: for example when joining two ``DataFrame`` objects on - their indexes (which must contain unique values). -* **many-to-one** joins: for example when joining an index (unique) to one or - more columns in a different ``DataFrame``. -* **many-to-many** joins: joining columns on columns. +* **one-to-one**: joining two :class:`DataFrame` objects on + their indexes which must contain unique values. +* **many-to-one**: joining a unique index to one or + more columns in a different :class:`DataFrame`. +* **many-to-many** : joining columns on columns. .. 
note:: - When joining columns on columns (potentially a many-to-many join), any - indexes on the passed ``DataFrame`` objects **will be discarded**. + When joining columns on columns, potentially a many-to-many join, any + indexes on the passed :class:`DataFrame` objects **will be discarded**. -It is worth spending some time understanding the result of the **many-to-many** -join case. In SQL / standard relational algebra, if a key combination appears -more than once in both tables, the resulting table will have the **Cartesian -product** of the associated data. Here is a very basic example with one unique -key combination: +For a **many-to-many** join, if a key combination appears +more than once in both tables, the :class:`DataFrame` will have the **Cartesian +product** of the associated data. .. ipython:: python @@ -565,6 +385,7 @@ key combination: } ) result = pd.merge(left, right, on="key") + result .. ipython:: python :suppress: @@ -573,41 +394,8 @@ key combination: p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -Here is a more complicated example with multiple join keys. Only the keys -appearing in ``left`` and ``right`` are present (the intersection), since -``how='inner'`` by default. - -.. ipython:: python - - left = pd.DataFrame( - { - "key1": ["K0", "K0", "K1", "K2"], - "key2": ["K0", "K1", "K0", "K1"], - "A": ["A0", "A1", "A2", "A3"], - "B": ["B0", "B1", "B2", "B3"], - } - ) - - right = pd.DataFrame( - { - "key1": ["K0", "K1", "K1", "K2"], - "key2": ["K0", "K0", "K0", "K0"], - "C": ["C0", "C1", "C2", "C3"], - "D": ["D0", "D1", "D2", "D3"], - } - ) - - result = pd.merge(left, right, on=["key1", "key2"]) - -.. ipython:: python - :suppress: - - @savefig merging_merge_on_key_multiple.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -The ``how`` argument to ``merge`` specifies how to determine which keys are to -be included in the resulting table. If a key combination **does not appear** in +The ``how`` argument to :func:`~pandas.merge` specifies which keys are +included in the resulting table. If a key combination **does not appear** in either the left or right tables, the values in the joined table will be ``NA``. Here is a summary of the ``how`` options and their SQL equivalent names: @@ -623,7 +411,24 @@ either the left or right tables, the values in the joined table will be .. ipython:: python + left = pd.DataFrame( + { + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) + right = pd.DataFrame( + { + "key1": ["K0", "K1", "K1", "K2"], + "key2": ["K0", "K0", "K0", "K0"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) result = pd.merge(left, right, how="left", on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -635,6 +440,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python result = pd.merge(left, right, how="right", on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -645,6 +451,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python result = pd.merge(left, right, how="outer", on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -656,6 +463,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python result = pd.merge(left, right, how="inner", on=["key1", "key2"]) + result .. 
ipython:: python :suppress: @@ -667,6 +475,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python result = pd.merge(left, right, how="cross") + result .. ipython:: python :suppress: @@ -675,10 +484,9 @@ either the left or right tables, the values in the joined table will be p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -You can merge a mult-indexed Series and a DataFrame, if the names of -the MultiIndex correspond to the columns from the DataFrame. Transform -the Series to a DataFrame using :meth:`Series.reset_index` before merging, -as shown in the following example. +You can :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of +the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. Transform +the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` before merging .. ipython:: python @@ -696,7 +504,7 @@ as shown in the following example. pd.merge(df, ser.reset_index(), on=["Let", "Num"]) -Here is another example with duplicate join keys in DataFrames: +Performing an outer join with duplicate join keys in :class:`DataFrame` .. ipython:: python @@ -705,6 +513,7 @@ Here is another example with duplicate join keys in DataFrames: right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) result = pd.merge(left, right, on="B", how="outer") + result .. ipython:: python :suppress: @@ -716,21 +525,17 @@ Here is another example with duplicate join keys in DataFrames: .. warning:: - Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, which may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames. + Merging on duplicate keys sigificantly increase the dimensions of the result + and can cause a memory overflow. .. _merging.validation: -Checking for duplicate keys -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Users can use the ``validate`` argument to automatically check whether there -are unexpected duplicates in their merge keys. Key uniqueness is checked before -merge operations and so should protect against memory overflows. Checking key -uniqueness is also a good way to ensure user data structures are as expected. +Merge key uniqueness +~~~~~~~~~~~~~~~~~~~~ -In the following example, there are duplicate values of ``B`` in the right -``DataFrame``. As this is not a one-to-one merge -- as specified in the -``validate`` argument -- an exception will be raised. +The ``validate`` argument checks whether the uniqueness of merge keys. +Key uniqueness is checked before merge operations and can protect against memory overflows +and unexpected key duplication. .. ipython:: python :okexcept: @@ -739,8 +544,8 @@ In the following example, there are duplicate values of ``B`` in the right right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) result = pd.merge(left, right, on="B", how="outer", validate="one_to_one") -If the user is aware of the duplicates in the right ``DataFrame`` but wants to -ensure there are no duplicates in the left DataFrame, one can use the +If the user is aware of the duplicates in the right :class:`DataFrame` but wants to +ensure there are no duplicates in the left :class:`DataFrame`, one can use the ``validate='one_to_many'`` argument instead, which will not raise an exception. .. ipython:: python @@ -750,8 +555,8 @@ ensure there are no duplicates in the left DataFrame, one can use the .. 
_merging.indicator: -The merge indicator -~~~~~~~~~~~~~~~~~~~ +Merge result indicator +~~~~~~~~~~~~~~~~~~~~~~ :func:`~pandas.merge` accepts the argument ``indicator``. If ``True``, a Categorical-type column called ``_merge`` will be added to the output object @@ -771,97 +576,53 @@ that takes on values: df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) pd.merge(df1, df2, on="col1", how="outer", indicator=True) -The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. +A string argument to ``indicator`` will use the value as the name for the indicator column. .. ipython:: python pd.merge(df1, df2, on="col1", how="outer", indicator="indicator_column") -.. _merging.dtypes: - -Merge dtypes -~~~~~~~~~~~~ - -Merging will preserve the dtype of the join keys. - -.. ipython:: python - - left = pd.DataFrame({"key": [1], "v1": [10]}) - left - right = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) - right - -We are able to preserve the join keys: - -.. ipython:: python - - pd.merge(left, right, how="outer") - pd.merge(left, right, how="outer").dtypes - -Of course if you have missing values that are introduced, then the -resulting dtype will be upcast. - -.. ipython:: python - - pd.merge(left, right, how="outer", on="key") - pd.merge(left, right, how="outer", on="key").dtypes - -Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals `. +Overlapping value columns +~~~~~~~~~~~~~~~~~~~~~~~~~ -The left frame. +The merge ``suffixes`` argument takes a tuple of list of strings to append to +overlapping column names in the input :class:`DataFrame` to disambiguate the result +columns: .. ipython:: python - from pandas.api.types import CategoricalDtype - - X = pd.Series(np.random.choice(["foo", "bar"], size=(10,))) - X = X.astype(CategoricalDtype(categories=["foo", "bar"])) - - left = pd.DataFrame( - {"X": X, "Y": np.random.choice(["one", "two", "three"], size=(10,))} - ) - left - left.dtypes + left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]}) + right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]}) -The right frame. + result = pd.merge(left, right, on="k") + result .. ipython:: python + :suppress: - right = pd.DataFrame( - { - "X": pd.Series(["foo", "bar"], dtype=CategoricalDtype(["foo", "bar"])), - "Z": [1, 2], - } - ) - right - right.dtypes - -The merged result: + @savefig merging_merge_overlapped.png + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = pd.merge(left, right, how="outer") + result = pd.merge(left, right, on="k", suffixes=("_l", "_r")) result - result.dtypes - -.. note:: - The category dtypes must be *exactly* the same, meaning the same categories and the ordered attribute. - Otherwise the result will coerce to the categories' dtype. - -.. note:: - - Merging on ``category`` dtypes that are the same can be quite performant compared to ``object`` dtype merging. +.. ipython:: python + :suppress: -.. _merging.join.index: + @savefig merging_merge_overlapped_suffix.png + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); -Joining on index -~~~~~~~~~~~~~~~~ +:meth:`DataFrame.join` +---------------------- -:meth:`DataFrame.join` is a convenient method for combining the columns of two -potentially differently-indexed ``DataFrames`` into a single result -``DataFrame``. 
Here is a very basic example: +:meth:`DataFrame.join` combines the columns of multiple, +potentially differently-indexed :class:`DataFrame` into a single result +:class:`DataFrame`. .. ipython:: python @@ -874,6 +635,7 @@ potentially differently-indexed ``DataFrames`` into a single result ) result = left.join(right) + result .. ipython:: python :suppress: @@ -885,6 +647,7 @@ potentially differently-indexed ``DataFrames`` into a single result .. ipython:: python result = left.join(right, how="outer") + result .. ipython:: python :suppress: @@ -893,11 +656,10 @@ potentially differently-indexed ``DataFrames`` into a single result p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -The same as above, but with ``how='inner'``. - .. ipython:: python result = left.join(right, how="inner") + result .. ipython:: python :suppress: @@ -906,50 +668,9 @@ The same as above, but with ``how='inner'``. p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -The data alignment here is on the indexes (row labels). This same behavior can -be achieved using ``merge`` plus additional arguments instructing it to use the -indexes: - -.. ipython:: python - - result = pd.merge(left, right, left_index=True, right_index=True, how="outer") - -.. ipython:: python - :suppress: - - @savefig merging_merge_index_outer.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -.. ipython:: python - - result = pd.merge(left, right, left_index=True, right_index=True, how="inner") - -.. ipython:: python - :suppress: - - @savefig merging_merge_index_inner.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -Joining key columns on an index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -:meth:`~DataFrame.join` takes an optional ``on`` argument which may be a column -or multiple column names, which specifies that the passed ``DataFrame`` is to be -aligned on that column in the ``DataFrame``. These two function calls are -completely equivalent: - -:: - - left.join(right, on=key_or_keys) - pd.merge( - left, right, left_on=key_or_keys, right_index=True, how="left", sort=False - ) - -Obviously you can choose whichever form you find more convenient. For -many-to-one joins (where one of the ``DataFrame``'s is already indexed by the -join key), using ``join`` may be more convenient. Here is a simple example: +:meth:`DataFrame.join` takes an optional ``on`` argument which may be a column +or multiple column names that the passed :class:`DataFrame` is to be +aligned. .. ipython:: python @@ -964,6 +685,7 @@ join key), using ``join`` may be more convenient. Here is a simple example: right = pd.DataFrame({"C": ["C0", "C1"], "D": ["D0", "D1"]}, index=["K0", "K1"]) result = left.join(right, on="key") + result .. ipython:: python :suppress: @@ -977,6 +699,7 @@ join key), using ``join`` may be more convenient. Here is a simple example: result = pd.merge( left, right, left_on="key", right_index=True, how="left", sort=False ) + result .. ipython:: python :suppress: @@ -987,7 +710,7 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. _merging.multikey_join: -To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: +To join on multiple keys, the passed :class:`DataFrame` must have a :class:`MultiIndex`: .. 
ipython:: python @@ -1006,12 +729,8 @@ To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: right = pd.DataFrame( {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=index ) - -Now this can be joined by passing the two key column names: - -.. ipython:: python - result = left.join(right, on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -1022,14 +741,14 @@ Now this can be joined by passing the two key column names: .. _merging.df_inner_join: -The default for ``DataFrame.join`` is to perform a left join (essentially a -"VLOOKUP" operation, for Excel users), which uses only the keys found in the -calling DataFrame. Other join types, for example inner join, can be just as -easily performed: +The default for :meth:`DataFrame.join` is to perform a left join, +which uses only the keys found in the +calling :class:`DataFrame`. Other join types can be specified with ``how``. .. ipython:: python result = left.join(right, on=["key1", "key2"], how="inner") + result .. ipython:: python :suppress: @@ -1038,16 +757,13 @@ easily performed: p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -As you can see, this drops any rows where there was no match. - .. _merging.join_on_mi: Joining a single Index to a MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can join a singly-indexed ``DataFrame`` with a level of a MultiIndexed ``DataFrame``. -The level will match on the name of the index of the singly-indexed frame against -a level name of the MultiIndexed frame. +You can join a :class:`DataFrame` with an :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level. +The ``name`` of the :class:`Index` will match the level name of the :class:`MultiIndex`. .. ipython:: python @@ -1066,6 +782,7 @@ a level name of the MultiIndexed frame. ) result = left.join(right, how="inner") + result .. ipython:: python @@ -1075,29 +792,13 @@ a level name of the MultiIndexed frame. p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -This is equivalent but less verbose and more memory efficient / faster than this. - -.. ipython:: python - - result = pd.merge( - left.reset_index(), right.reset_index(), on=["key"], how="inner" - ).set_index(["key","Y"]) - -.. ipython:: python - :suppress: - - @savefig merging_merge_multiindex_alternative.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - .. _merging.join_with_two_multi_indexes: -Joining with two MultiIndexes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Joining with two :class:`MultiIndex` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This is supported in a limited way, provided that the index for the right -argument is completely used in the join, and is a subset of the indices in -the left argument, as in this example: +The :class:`MultiIndex` of the right argument must be completely used +in the join and must be a subset of the indices in the left argument. .. ipython:: python @@ -1115,9 +816,6 @@ the left argument, as in this example: left.join(right, on=["abc", "xy"], how="inner") -If that condition is not satisfied, a join with two multi-indexes can be -done using the following code. - .. ipython:: python leftindex = pd.MultiIndex.from_tuples( @@ -1137,6 +835,7 @@ done using the following code. result = pd.merge( left.reset_index(), right.reset_index(), on=["key"], how="inner" ).set_index(["key", "X", "Y"]) + result ..
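As a hedged illustration of the level-name matching described above, here is a small self-contained sketch (the frame names and values are invented for this sketch and are not the ``left``/``right`` frames built elsewhere in this section). A singly-indexed frame whose index ``name`` matches a level name of the other frame's :class:`MultiIndex` can be joined directly:

.. code-block:: python

    import pandas as pd

    left = pd.DataFrame(
        {"A": ["A0", "A1", "A2"]},
        index=pd.Index(["K0", "K1", "K2"], name="key"),
    )
    right = pd.DataFrame(
        {"C": ["C0", "C1", "C2", "C3"]},
        index=pd.MultiIndex.from_tuples(
            [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")],
            names=["key", "Y"],
        ),
    )

    # The single "key" index of ``left`` is matched against the "key" level
    # of ``right``'s MultiIndex; the result keeps both levels of the MultiIndex.
    result = left.join(right, how="inner")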
ipython:: python :suppress: @@ -1152,7 +851,7 @@ Merging on a combination of columns and index levels Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters may refer to either column names or index level names. This enables merging -``DataFrame`` instances on a combination of index levels and columns without +:class:`DataFrame` instances on a combination of index levels and columns without resetting indexes. .. ipython:: python @@ -1180,6 +879,7 @@ resetting indexes. ) result = left.merge(right, on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -1190,76 +890,23 @@ resetting indexes. .. note:: - When DataFrames are merged on a string that matches an index level in both - frames, the index level is preserved as an index level in the resulting - DataFrame. - -.. note:: - When DataFrames are merged using only some of the levels of a ``MultiIndex``, - the extra levels will be dropped from the resulting merge. In order to - preserve those levels, use ``reset_index`` on those level names to move - those levels to columns prior to doing the merge. + When :class:`DataFrame` objects are joined on a string that matches an index level in both + arguments, the index level is preserved as an index level in the resulting + :class:`DataFrame`. .. note:: - If a string matches both a column name and an index level name, then a - warning is issued and the column takes precedence. This will result in an - ambiguity error in a future version. - -Overlapping value columns -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The merge ``suffixes`` argument takes a tuple of list of strings to append to -overlapping column names in the input ``DataFrame``\ s to disambiguate the result -columns: - -.. ipython:: python - - left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]}) - right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]}) - - result = pd.merge(left, right, on="k") - -.. ipython:: python - :suppress: - - @savefig merging_merge_overlapped.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -.. ipython:: python - - result = pd.merge(left, right, on="k", suffixes=("_l", "_r")) - -.. ipython:: python - :suppress: - - @savefig merging_merge_overlapped_suffix.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -:meth:`DataFrame.join` has ``lsuffix`` and ``rsuffix`` arguments which behave -similarly. - -.. ipython:: python - - left = left.set_index("k") - right = right.set_index("k") - result = left.join(right, lsuffix="_l", rsuffix="_r") - -.. ipython:: python - :suppress: - - @savefig merging_merge_overlapped_multi_suffix.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); + When :class:`DataFrame` objects are joined using only some of the levels of a :class:`MultiIndex`, + the extra levels will be dropped from the resulting join. To + preserve those levels, use :meth:`DataFrame.reset_index` on those level + names to move those levels to columns prior to the join. .. _merging.multiple_join: -Joining multiple DataFrames -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Joining multiple :class:`DataFrame` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A list or tuple of ``DataFrames`` can also be passed to :meth:`~DataFrame.join` +A list or tuple of :class:`DataFrame` can also be passed to :meth:`~DataFrame.join` to join them together on their indexes. .. ipython:: python @@ -1281,12 +928,12 @@ to join them together on their indexes. ..
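A minimal sketch of the list form (the frame names and values here are illustrative only): passing several frames joins them all on their indexes in a single call.

.. code-block:: python

    import pandas as pd

    left = pd.DataFrame({"v1": [1, 2]}, index=["a", "b"])
    mid = pd.DataFrame({"v2": [3, 4]}, index=["a", "b"])
    right = pd.DataFrame({"v3": [5, 6]}, index=["a", "b"])

    # One call joins every frame in the list on its index
    result = left.join([mid, right])
    #    v1  v2  v3
    # a   1   3   5
    # b   2   4   6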
_merging.combine_first.update: -Merging together values within Series or DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +:meth:`DataFrame.combine_first` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Another fairly common situation is to have two like-indexed (or similarly -indexed) ``Series`` or ``DataFrame`` objects and wanting to "patch" values in -one object from values for matching indices in the other. Here is an example: +:meth:`DataFrame.combine_first` updates missing values from one :class:`DataFrame` +with the non-missing values in another :class:`DataFrame` in the corresponding +location. .. ipython:: python @@ -1294,12 +941,8 @@ one object from values for matching indices in the other. Here is an example: [[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]] ) df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2]) - -For this, use the :meth:`~DataFrame.combine_first` method: - -.. ipython:: python - result = df1.combine_first(df2) + result .. ipython:: python :suppress: @@ -1308,39 +951,13 @@ For this, use the :meth:`~DataFrame.combine_first` method: p.plot([df1, df2], result, labels=["df1", "df2"], vertical=False); plt.close("all"); -Note that this method only takes values from the right ``DataFrame`` if they are -missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`, -alters non-NA values in place: - -.. ipython:: python - :suppress: - - df1_copy = df1.copy() - -.. ipython:: python - - df1.update(df2) - -.. ipython:: python - :suppress: - - @savefig merging_update.png - p.plot([df1_copy, df2], df1, labels=["df1", "df2"], vertical=False); - plt.close("all"); - -.. _merging.time_series: - -Timeseries friendly merging ---------------------------- - .. _merging.merge_ordered: -Merging ordered data -~~~~~~~~~~~~~~~~~~~~ +:func:`merge_ordered` +--------------------- -A :func:`merge_ordered` function allows combining time series and other -ordered data. In particular it has an optional ``fill_method`` keyword to -fill/interpolate missing data: +:func:`merge_ordered` combines ordered data such as numeric or time series data +with optional filling of missing data using ``fill_method``. .. ipython:: python @@ -1354,19 +971,16 @@ fill/interpolate missing data: .. _merging.merge_asof: -Merging asof -~~~~~~~~~~~~ - -A :func:`merge_asof` is similar to an ordered left-join except that we match on -nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``, -we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less -than the left's key. Both DataFrames must be sorted by the key. +:func:`merge_asof` +--------------------- -Optionally an asof merge can perform a group-wise merge. This matches the -``by`` key equally, in addition to the nearest match on the ``on`` key. +:func:`merge_asof` is similar to an ordered left-join except that matches are on the +nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`, +the last row in the ``right`` :class:`DataFrame` is selected where the ``on`` key is less +than the left's key. Both :class:`DataFrame` objects must be sorted by the key. -For example; we might have ``trades`` and ``quotes`` and we want to ``asof`` -merge them. +Optionally, :func:`merge_asof` can perform a group-wise merge by matching the +``by`` key in addition to the nearest match on the ``on`` key. .. ipython:: python @@ -1408,25 +1022,17 @@ merge them. }, columns=["time", "ticker", "bid", "ask"], ) - ..
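Before the ``trades``/``quotes`` example continues below, here is a small self-contained sketch of the nearest-key matching (the timestamps and prices are made up and are not the doc's ``trades``/``quotes`` frames): for each left row, the most recent right row at or before the left key is taken.

.. code-block:: python

    import pandas as pd

    left = pd.DataFrame(
        {"time": pd.to_datetime(
            ["2023-01-01 09:30:00.023", "2023-01-01 09:30:00.048"]
        ),
         "price": [51.95, 720.92]}
    )
    right = pd.DataFrame(
        {"time": pd.to_datetime(
            ["2023-01-01 09:30:00.015", "2023-01-01 09:30:00.040"]
        ),
         "bid": [51.93, 720.50]}
    )

    # Both frames must be sorted by "time"; each left row picks up the most
    # recent right row at or before its own timestamp.
    pd.merge_asof(left, right, on="time")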
ipython:: python - trades quotes - -By default we are taking the asof of the quotes. - -.. ipython:: python - pd.merge_asof(trades, quotes, on="time", by="ticker") -We only asof within ``2ms`` between the quote time and the trade time. +:func:`merge_asof` within ``2ms`` between the quote time and the trade time. .. ipython:: python pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")) -We only asof within ``10ms`` between the quote time and the trade time and we +:func:`merge_asof` within ``10ms`` between the quote time and the trade time and exclude exact matches on time. Note that though we exclude the exact matches (of the quotes), prior quotes **do** propagate to that point in time. @@ -1443,14 +1049,11 @@ exclude exact matches on time. Note that though we exclude the exact matches .. _merging.compare: -Comparing objects ------------------ - -The :meth:`~Series.compare` and :meth:`~DataFrame.compare` methods allow you to -compare two DataFrame or Series, respectively, and summarize their differences. +:meth:`~Series.compare` +----------------------- -For example, you might want to compare two ``DataFrame`` and stack their differences -side by side. +The :meth:`Series.compare` and :meth:`DataFrame.compare` methods allow you to +compare two :class:`DataFrame` or :class:`Series`, respectively, and summarize their differences. .. ipython:: python @@ -1463,36 +1066,29 @@ side by side. columns=["col1", "col2", "col3"], ) df - -.. ipython:: python - df2 = df.copy() df2.loc[0, "col1"] = "c" df2.loc[2, "col3"] = 4.0 df2 - -.. ipython:: python - df.compare(df2) By default, if two corresponding values are equal, they will be shown as ``NaN``. Furthermore, if all values in an entire row / column, the row / column will be omitted from the result. The remaining differences will be aligned on columns. -If you wish, you may choose to stack the differences on rows. +Stack the differences on rows. .. ipython:: python df.compare(df2, align_axis=0) -If you wish to keep all original rows and columns, set ``keep_shape`` argument -to ``True``. +Keep all original rows and columns with ``keep_shape=True`` .. ipython:: python df.compare(df2, keep_shape=True) -You may also keep all the original values even if they are equal. +Keep all the original values even if they are equal. .. ipython:: python diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index e0e752099b77a..4a2aa565dd15c 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -401,7 +401,7 @@ Limit the number of NA values filled df.ffill(limit=1) -NA values can be replaced with corresponding value from a :class:`Series`` or :class:`DataFrame`` +NA values can be replaced with corresponding value from a :class:`Series` or :class:`DataFrame` where the index and column aligns between the original object and the filled object. .. ipython:: python @@ -660,7 +660,7 @@ Pass a list of regular expressions that will replace matches with a scalar. .. ipython:: python - df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True) + df.replace([r"\s*\.\s*", r"a|b"], "placeholder", regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -669,7 +669,7 @@ dictionary. .. ipython:: python - df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan) + df.replace(regex=[r"\s*\.\s*", r"a|b"], value="placeholder") .. 
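A compact, self-contained sketch of the two regex calls above (the column values here are invented): ``regex=True`` treats ``to_replace`` as regular expressions and substitutes the matching text.

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"col": ["a", "b", " . ", "c"]})

    # Cells matching either pattern have the matching text replaced;
    # "c" matches neither pattern and is left untouched.
    df.replace([r"\s*\.\s*", r"a|b"], "placeholder", regex=True)

    # Equivalent spelling with the ``regex`` keyword
    df.replace(regex=[r"\s*\.\s*", r"a|b"], value="placeholder")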
note:: diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index fabe498cb3bdb..83bf453b560ec 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -136,7 +136,7 @@ Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For d .. ipython:: python - pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C") + pd.pivot_table(df, values="D", index=pd.Grouper(freq="ME", key="F"), columns="C") .. _reshaping.pivot.margins: diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index bc49c7f958cb7..b262de5d71439 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -40,7 +40,7 @@ Suppose our raw dataset on disk has many columns. return df timeseries = [ - make_timeseries(freq="1T", seed=i).rename(columns=lambda x: f"{x}_{i}") + make_timeseries(freq="1min", seed=i).rename(columns=lambda x: f"{x}_{i}") for i in range(10) ] ts_wide = pd.concat(timeseries, axis=1) @@ -87,7 +87,7 @@ can store larger datasets in memory. .. ipython:: python :okwarning: - ts = make_timeseries(freq="30S", seed=0) + ts = make_timeseries(freq="30s", seed=0) ts.to_parquet("timeseries.parquet") ts = pd.read_parquet("timeseries.parquet") ts @@ -173,7 +173,7 @@ files. Each file in the directory represents a different year of the entire data pathlib.Path("data/timeseries").mkdir(exist_ok=True) for i, (start, end) in enumerate(zip(starts, ends)): - ts = make_timeseries(start=start, end=end, freq="1T", seed=i) + ts = make_timeseries(start=start, end=end, freq="1min", seed=i) ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet") diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index a6eb96f91a4bf..cd567f8442671 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -390,7 +390,7 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases `: .. ipython:: python - pd.date_range(start, periods=1000, freq="M") + pd.date_range(start, periods=1000, freq="ME") pd.bdate_range(start, periods=250, freq="BQS") @@ -603,7 +603,7 @@ would include matching times on an included date: dft = pd.DataFrame( np.random.randn(100000, 1), columns=["A"], - index=pd.date_range("20130101", periods=100000, freq="T"), + index=pd.date_range("20130101", periods=100000, freq="min"), ) dft dft.loc["2013"] @@ -882,7 +882,7 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ :class:`~pandas.tseries.offsets.Week`, ``'W'``, "one week, optionally anchored on a day of the week" :class:`~pandas.tseries.offsets.WeekOfMonth`, ``'WOM'``, "the x-th day of the y-th week of each month" :class:`~pandas.tseries.offsets.LastWeekOfMonth`, ``'LWOM'``, "the x-th day of the last week of each month" - :class:`~pandas.tseries.offsets.MonthEnd`, ``'M'``, "calendar month end" + :class:`~pandas.tseries.offsets.MonthEnd`, ``'ME'``, "calendar month end" :class:`~pandas.tseries.offsets.MonthBegin`, ``'MS'``, "calendar month begin" :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BM'``, "business month end" :class:`~pandas.tseries.offsets.BMonthBegin` or :class:`~pandas.tseries.offsets.BusinessMonthBegin`, ``'BMS'``, "business month begin" @@ -905,11 +905,11 @@ into ``freq`` keyword arguments. 
The available date offsets and associated frequ :class:`~pandas.tseries.offsets.CustomBusinessHour`, ``'CBH'``, "custom business hour" :class:`~pandas.tseries.offsets.Day`, ``'D'``, "one absolute day" :class:`~pandas.tseries.offsets.Hour`, ``'H'``, "one hour" - :class:`~pandas.tseries.offsets.Minute`, ``'T'`` or ``'min'``,"one minute" - :class:`~pandas.tseries.offsets.Second`, ``'S'``, "one second" - :class:`~pandas.tseries.offsets.Milli`, ``'L'`` or ``'ms'``, "one millisecond" - :class:`~pandas.tseries.offsets.Micro`, ``'U'`` or ``'us'``, "one microsecond" - :class:`~pandas.tseries.offsets.Nano`, ``'N'``, "one nanosecond" + :class:`~pandas.tseries.offsets.Minute`, ``'min'``,"one minute" + :class:`~pandas.tseries.offsets.Second`, ``'s'``, "one second" + :class:`~pandas.tseries.offsets.Milli`, ``'ms'``, "one millisecond" + :class:`~pandas.tseries.offsets.Micro`, ``'us'``, "one microsecond" + :class:`~pandas.tseries.offsets.Nano`, ``'ns'``, "one nanosecond" ``DateOffsets`` additionally have :meth:`rollforward` and :meth:`rollback` methods for moving a date forward or backward respectively to a valid offset @@ -1246,7 +1246,7 @@ frequencies. We will refer to these aliases as *offset aliases*. "C", "custom business day frequency" "D", "calendar day frequency" "W", "weekly frequency" - "M", "month end frequency" + "ME", "month end frequency" "SM", "semi-month end frequency (15th and end of month)" "BM", "business month end frequency" "CBM", "custom business month end frequency" @@ -1264,11 +1264,16 @@ frequencies. We will refer to these aliases as *offset aliases*. "BAS, BYS", "business year start frequency" "BH", "business hour frequency" "H", "hourly frequency" - "T, min", "minutely frequency" - "S", "secondly frequency" - "L, ms", "milliseconds" - "U, us", "microseconds" - "N", "nanoseconds" + "min", "minutely frequency" + "s", "secondly frequency" + "ms", "milliseconds" + "us", "microseconds" + "ns", "nanoseconds" + +.. deprecated:: 2.2.0 + + Aliases ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases + ``min``, ``s``, ``ms``, ``us``, and ``ns``. .. note:: @@ -1318,11 +1323,16 @@ frequencies. We will refer to these aliases as *period aliases*. "Q", "quarterly frequency" "A, Y", "yearly frequency" "H", "hourly frequency" - "T, min", "minutely frequency" - "S", "secondly frequency" - "L, ms", "milliseconds" - "U, us", "microseconds" - "N", "nanoseconds" + "min", "minutely frequency" + "s", "secondly frequency" + "ms", "milliseconds" + "us", "microseconds" + "ns", "nanoseconds" + +.. deprecated:: 2.2.0 + + Aliases ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases + ``min``, ``s``, ``ms``, ``us``, and ``ns``. Combining aliases @@ -1343,7 +1353,7 @@ You can combine together day and intraday offsets: pd.date_range(start, periods=10, freq="2h20min") - pd.date_range(start, periods=10, freq="1D10U") + pd.date_range(start, periods=10, freq="1D10us") Anchored offsets ~~~~~~~~~~~~~~~~ @@ -1635,7 +1645,7 @@ Basics .. ipython:: python - rng = pd.date_range("1/1/2012", periods=100, freq="S") + rng = pd.date_range("1/1/2012", periods=100, freq="s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) @@ -1680,7 +1690,7 @@ the end of the interval. .. warning:: The default values for ``label`` and ``closed`` is '**left**' for all - frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' + frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. 
This might unintendedly lead to looking ahead, where the value for a later @@ -1725,11 +1735,11 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to # from secondly to every 250 milliseconds - ts[:2].resample("250L").asfreq() + ts[:2].resample("250ms").asfreq() - ts[:2].resample("250L").ffill() + ts[:2].resample("250ms").ffill() - ts[:2].resample("250L").ffill(limit=2) + ts[:2].resample("250ms").ffill(limit=2) Sparse resampling ~~~~~~~~~~~~~~~~~ @@ -1752,7 +1762,7 @@ If we want to resample to the full range of the series: .. ipython:: python - ts.resample("3T").sum() + ts.resample("3min").sum() We can instead only resample those groups where we have points as follows: @@ -1766,7 +1776,7 @@ We can instead only resample those groups where we have points as follows: freq = to_offset(freq) return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value) - ts.groupby(partial(round, freq="3T")).sum() + ts.groupby(partial(round, freq="3min")).sum() .. _timeseries.aggregate: @@ -1783,10 +1793,10 @@ Resampling a ``DataFrame``, the default will be to act on all columns with the s df = pd.DataFrame( np.random.randn(1000, 3), - index=pd.date_range("1/1/2012", freq="S", periods=1000), + index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) - r = df.resample("3T") + r = df.resample("3min") r.mean() We can select a specific column or columns using standard getitem. @@ -1846,7 +1856,7 @@ to resample based on datetimelike column in the frame, it can passed to the ), ) df - df.resample("M", on="date")[["a"]].sum() + df.resample("ME", on="date")[["a"]].sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1854,7 +1864,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample("M", level="d")[["a"]].sum() + df.resample("ME", level="d")[["a"]].sum() .. _timeseries.iterating-label: @@ -2127,7 +2137,7 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th pi.astype("datetime64[ns]") # convert to PeriodIndex - dti = pd.date_range("2011-01-01", freq="M", periods=3) + dti = pd.date_range("2011-01-01", freq="ME", periods=3) dti dti.astype("period[M]") @@ -2155,7 +2165,7 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par dfp = pd.DataFrame( np.random.randn(600, 1), columns=["A"], - index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"), + index=pd.period_range("2013-01-01 9:00", periods=600, freq="min"), ) dfp dfp.loc["2013-01-01 10H"] @@ -2246,7 +2256,7 @@ and vice-versa using ``to_timestamp``: .. ipython:: python - rng = pd.date_range("1/1/2012", periods=5, freq="M") + rng = pd.date_range("1/1/2012", periods=5, freq="ME") ts = pd.Series(np.random.randn(len(rng)), index=rng) diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index e933b3e84c481..dbc7800196007 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,8 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.2 + v2.1.1 v2.1.0 Version 2.0 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 3425986a37743..be50c34d7d14c 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -180,19 +180,36 @@ labeled the aggregated group with the end of the interval: the next day). DataFrame constructor with no columns specified. The v0.9.0 behavior (names ``X0``, ``X1``, ...) 
can be reproduced by specifying ``prefix='X'``: -.. ipython:: python - :okexcept: +.. code-block:: ipython + + In [6]: import io + + In [7]: data = """ + ...: a,b,c + ...: 1,Yes,2 + ...: 3,No,4 + ...: """ + ...: + + In [8]: print(data) + + a,b,c + 1,Yes,2 + 3,No,4 - import io + In [9]: pd.read_csv(io.StringIO(data), header=None) + Out[9]: + 0 1 2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 - data = """ - a,b,c - 1,Yes,2 - 3,No,4 - """ - print(data) - pd.read_csv(io.StringIO(data), header=None) - pd.read_csv(io.StringIO(data), header=None, prefix="X") + In [10]: pd.read_csv(io.StringIO(data), header=None, prefix="X") + Out[10]: + X0 X1 X2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 - Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, though this can be controlled by new ``true_values`` and ``false_values`` @@ -244,26 +261,15 @@ Convenience methods ``ffill`` and ``bfill`` have been added: function, that is itself a series, and possibly upcast the result to a DataFrame - .. code-block:: python - - >>> def f(x): - ... return pd.Series([x, x ** 2], index=["x", "x^2"]) - >>> - >>> s = pd.Series(np.random.rand(5)) - >>> s - 0 0.340445 - 1 0.984729 - 2 0.919540 - 3 0.037772 - 4 0.861549 - dtype: float64 - >>> s.apply(f) - x x^2 - 0 0.340445 0.115903 - 1 0.984729 0.969691 - 2 0.919540 0.845555 - 3 0.037772 0.001427 - 4 0.861549 0.742267 + .. ipython:: python + + def f(x): + return pd.Series([x, x ** 2], index=["x", "x^2"]) + + + s = pd.Series(np.random.rand(5)) + s + s.apply(f) - New API functions for working with pandas options (:issue:`2097`): diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index aa83e7933a444..f05cbc7f07d7d 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -367,15 +367,27 @@ Enhancements - You can now select with a string from a DataFrame with a datelike index, in a similar way to a Series (:issue:`3070`) - .. ipython:: python - :okexcept: + .. code-block:: ipython - idx = pd.date_range("2001-10-1", periods=5, freq='M') - ts = pd.Series(np.random.rand(len(idx)), index=idx) - ts['2001'] + In [30]: idx = pd.date_range("2001-10-1", periods=5, freq='M') - df = pd.DataFrame({'A': ts}) - df['2001'] + In [31]: ts = pd.Series(np.random.rand(len(idx)), index=idx) + + In [32]: ts['2001'] + Out[32]: + 2001-10-31 0.117967 + 2001-11-30 0.702184 + 2001-12-31 0.414034 + Freq: M, dtype: float64 + + In [33]: df = pd.DataFrame({'A': ts}) + + In [34]: df['2001'] + Out[34]: + A + 2001-10-31 0.117967 + 2001-11-30 0.702184 + 2001-12-31 0.414034 - ``Squeeze`` to possibly remove length 1 dimensions from an object. diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index c60a821968c0c..f2e29121760ab 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -537,7 +537,6 @@ Enhancements is frequency conversion. See :ref:`the docs` for the docs. .. ipython:: python - :okexcept: import datetime td = pd.Series(pd.date_range('20130101', periods=4)) - pd.Series( @@ -546,13 +545,41 @@ Enhancements td[3] = np.nan td + .. 
code-block:: ipython + # to days - td / np.timedelta64(1, 'D') - td.astype('timedelta64[D]') + In [63]: td / np.timedelta64(1, 'D') + Out[63]: + 0 31.000000 + 1 31.000000 + 2 31.003507 + 3 NaN + dtype: float64 + + In [64]: td.astype('timedelta64[D]') + Out[64]: + 0 31.0 + 1 31.0 + 2 31.0 + 3 NaN + dtype: float64 # to seconds - td / np.timedelta64(1, 's') - td.astype('timedelta64[s]') + In [65]: td / np.timedelta64(1, 's') + Out[65]: + 0 2678400.0 + 1 2678400.0 + 2 2678703.0 + 3 NaN + dtype: float64 + + In [66]: td.astype('timedelta64[s]') + Out[66]: + 0 2678400.0 + 1 2678400.0 + 2 2678703.0 + 3 NaN + dtype: float64 Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series @@ -642,9 +669,16 @@ Enhancements Period conversions in the range of seconds and below were reworked and extended up to nanoseconds. Periods in the nanosecond range are now available. - .. ipython:: python + .. code-block:: python - pd.date_range('2013-01-01', periods=5, freq='5N') + In [79]: pd.date_range('2013-01-01', periods=5, freq='5N') + Out[79]: + DatetimeIndex([ '2013-01-01 00:00:00', + '2013-01-01 00:00:00.000000005', + '2013-01-01 00:00:00.000000010', + '2013-01-01 00:00:00.000000015', + '2013-01-01 00:00:00.000000020'], + dtype='datetime64[ns]', freq='5N') or with frequency as offset diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 92c37243b7e81..efb30d087b8b4 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -328,19 +328,36 @@ More consistent behavior for some groupby methods: - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: - .. ipython:: python + .. code-block:: ipython - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - g.head(1) # filters DataFrame + In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + + In [2]: g = df.groupby('A') + + In [3]: g.head(1) # filters DataFrame + Out[3]: + A B + 0 1 2 + 2 5 6 - g.apply(lambda x: x.head(1)) # used to simply fall-through + In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through + Out[4]: + A B + A + 1 0 1 2 + 5 2 5 6 - groupby head and tail respect column selection: - .. ipython:: python + .. code-block:: ipython + + In [19]: g[['B']].head(1) + Out[19]: + B + 0 2 + 2 6 - g[['B']].head(1) + [2 rows x 1 columns] - groupby ``nth`` now reduces by default; filtering can be achieved by passing ``as_index=False``. With an optional ``dropna`` argument to ignore NaN. See :ref:`the docs `. @@ -843,10 +860,20 @@ Enhancements datetime.datetime(2013, 9, 5, 10, 0)]}) df - df.pivot_table(values='Quantity', - index=pd.Grouper(freq='M', key='Date'), - columns=pd.Grouper(freq='M', key='PayDay'), - aggfunc="sum") + .. 
code-block:: ipython + + In [75]: df.pivot_table(values='Quantity', + ....: index=pd.Grouper(freq='M', key='Date'), + ....: columns=pd.Grouper(freq='M', key='PayDay'), + ....: aggfunc="sum") + Out[75]: + PayDay 2013-09-30 2013-10-31 2013-11-30 + Date + 2013-09-30 NaN 3.0 NaN + 2013-10-31 6.0 NaN 1.0 + 2013-11-30 NaN 9.0 NaN + + [3 rows x 3 columns] - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`) - Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 6b962cbb49c74..dffb4c7b9ff9e 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -185,7 +185,29 @@ Constructing a ``TimedeltaIndex`` with a regular range .. ipython:: python pd.timedelta_range('1 days', periods=5, freq='D') - pd.timedelta_range(start='1 days', end='2 days', freq='30T') + +.. code-block:: python + + In [20]: pd.timedelta_range(start='1 days', end='2 days', freq='30T') + Out[20]: + TimedeltaIndex(['1 days 00:00:00', '1 days 00:30:00', '1 days 01:00:00', + '1 days 01:30:00', '1 days 02:00:00', '1 days 02:30:00', + '1 days 03:00:00', '1 days 03:30:00', '1 days 04:00:00', + '1 days 04:30:00', '1 days 05:00:00', '1 days 05:30:00', + '1 days 06:00:00', '1 days 06:30:00', '1 days 07:00:00', + '1 days 07:30:00', '1 days 08:00:00', '1 days 08:30:00', + '1 days 09:00:00', '1 days 09:30:00', '1 days 10:00:00', + '1 days 10:30:00', '1 days 11:00:00', '1 days 11:30:00', + '1 days 12:00:00', '1 days 12:30:00', '1 days 13:00:00', + '1 days 13:30:00', '1 days 14:00:00', '1 days 14:30:00', + '1 days 15:00:00', '1 days 15:30:00', '1 days 16:00:00', + '1 days 16:30:00', '1 days 17:00:00', '1 days 17:30:00', + '1 days 18:00:00', '1 days 18:30:00', '1 days 19:00:00', + '1 days 19:30:00', '1 days 20:00:00', '1 days 20:30:00', + '1 days 21:00:00', '1 days 21:30:00', '1 days 22:00:00', + '1 days 22:30:00', '1 days 23:00:00', '1 days 23:30:00', + '2 days 00:00:00'], + dtype='timedelta64[ns]', freq='30T') You can now use a ``TimedeltaIndex`` as the index of a pandas object diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index bb7beef449d93..acc5409b86d09 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -24,25 +24,61 @@ API changes - Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though a lexically sorted index will have a better performance. (:issue:`2646`) - .. ipython:: python - :okexcept: - :okwarning: + .. 
code-block:: ipython + + In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1], + ...: 'joe':['x', 'x', 'z', 'y'], + ...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) + ...: - df = pd.DataFrame({'jim':[0, 0, 1, 1], - 'joe':['x', 'x', 'z', 'y'], - 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) - df - df.index.lexsort_depth + In [2]: df + Out[2]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 z 0.260476 + y 0.897237 + + [4 rows x 1 columns] + + In [3]: df.index.lexsort_depth + Out[3]: 1 # in prior versions this would raise a KeyError # will now show a PerformanceWarning - df.loc[(1, 'z')] + In [4]: df.loc[(1, 'z')] + Out[4]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] # lexically sorting - df2 = df.sort_index() - df2 - df2.index.lexsort_depth - df2.loc[(1,'z')] + In [5]: df2 = df.sort_index() + + In [6]: df2 + Out[6]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 y 0.897237 + z 0.260476 + + [4 rows x 1 columns] + + In [7]: df2.index.lexsort_depth + Out[7]: 2 + + In [8]: df2.loc[(1,'z')] + Out[8]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] - Bug in unique of Series with ``category`` dtype, which returned all categories regardless whether they were "used" or not (see :issue:`8559` for the discussion). diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index a05b9bb1a88ef..02e47553cd184 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -837,9 +837,24 @@ Previously New API -.. ipython:: python +.. code-block:: ipython - s.resample('M').ffill() + In [91]: s.resample('M').ffill() + Out[91]: + 2010-03-31 0 + 2010-04-30 0 + 2010-05-31 0 + 2010-06-30 1 + 2010-07-31 1 + 2010-08-31 1 + 2010-09-30 2 + 2010-10-31 2 + 2010-11-30 2 + 2010-12-31 3 + 2011-01-31 3 + 2011-02-28 3 + 2011-03-31 4 + Freq: M, Length: 13, dtype: int64 .. note:: diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 7d9008fdbdecd..ee6a60144bc35 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -77,9 +77,52 @@ Previously you would have to do this to get a rolling window mean per-group: df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df -.. ipython:: python +.. code-block:: ipython - df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + Out[1]: + A + 1 0 NaN + 1 NaN + 2 NaN + 3 1.5 + 4 2.5 + 5 3.5 + 6 4.5 + 7 5.5 + 8 6.5 + 9 7.5 + 10 8.5 + 11 9.5 + 12 10.5 + 13 11.5 + 14 12.5 + 15 13.5 + 16 14.5 + 17 15.5 + 18 16.5 + 19 17.5 + 2 20 NaN + 21 NaN + 22 NaN + 23 21.5 + 24 22.5 + 25 23.5 + 26 24.5 + 27 25.5 + 28 26.5 + 29 27.5 + 30 28.5 + 31 29.5 + 3 32 NaN + 33 NaN + 34 NaN + 35 33.5 + 36 34.5 + 37 35.5 + 38 36.5 + 39 37.5 + Name: B, dtype: float64 Now you can do: @@ -101,15 +144,53 @@ For ``.resample(..)`` type of operations, previously you would have to: df -.. ipython:: python +.. code-block:: ipython - df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + In[1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 Now you can do: -.. ipython:: python +.. 
code-block:: ipython - df.groupby("group").resample("1D").ffill() + In[1]: df.groupby("group").resample("1D").ffill() + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 .. _whatsnew_0181.enhancements.method_chain: diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index d4b879f137698..32b3ced4f9d39 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -498,8 +498,26 @@ Other enhancements ), ) df - df.resample("M", on="date")[["a"]].sum() - df.resample("M", level="d")[["a"]].sum() + + .. code-block:: ipython + + In [74]: df.resample("M", on="date")[["a"]].sum() + Out[74]: + a + date + 2015-01-31 6 + 2015-02-28 4 + + [2 rows x 1 columns] + + In [75]: df.resample("M", level="d")[["a"]].sum() + Out[75]: + a + d + 2015-01-31 6 + 2015-02-28 4 + + [2 rows x 1 columns] - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 38c6e1123aaae..b013d03b2d68c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -286,6 +286,7 @@ value. (:issue:`17054`) .. ipython:: python + from io import StringIO result = pd.read_html(StringIO(""" @@ -1377,18 +1378,22 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). *New behavior*: -.. ipython:: python - :okexcept: - :okwarning: +.. code-block:: ipython + + In [108]: ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) + + In[109]: ts + 2 * ts.freq + Out[109]: Timestamp('1994-05-06 14:15:16', freq='H') + + In [110]: tdi = pd.timedelta_range('1D', periods=2) - ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) - ts + 2 * ts.freq + In [111]: tdi - np.array([2 * tdi.freq, 1 * tdi.freq]) + Out[111]: TimedeltaIndex(['-1 days', '1 days'], dtype='timedelta64[ns]', freq=None) - tdi = pd.timedelta_range('1D', periods=2) - tdi - np.array([2 * tdi.freq, 1 * tdi.freq]) + In [112]: dti = pd.date_range('2001-01-01', periods=2, freq='7D') - dti = pd.date_range('2001-01-01', periods=2, freq='7D') - dti + pd.Index([1 * dti.freq, 2 * dti.freq]) + In [113]: dti + pd.Index([1 * dti.freq, 2 * dti.freq]) + Out[113]: DatetimeIndex(['2001-01-08', '2001-01-22'], dtype='datetime64[ns]', freq=None) .. _whatsnew_0240.deprecations.integer_tz: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7d26005315c33..be63da09846c8 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -76,7 +76,7 @@ Below is a possibly non-exhaustive list of changes: .. 
ipython:: python - idx = pd.date_range(start='1/1/2018', periods=3, freq='M') + idx = pd.date_range(start='1/1/2018', periods=3, freq='ME') idx.array.year idx.year diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 43a64a79e691b..51b4c4f297b07 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -1,6 +1,6 @@ .. _whatsnew_210: -What's new in 2.1.0 (Aug 11, 2023) +What's new in 2.1.0 (Aug 30, 2023) -------------------------------------- These are the changes in pandas 2.1.0. See :ref:`release` for a full changelog @@ -21,7 +21,7 @@ PyArrow will become a required dependency with pandas 3.0 `PyArrow `_ will become a required dependency of pandas starting with pandas 3.0. This decision was made based on -`PDEP 12 `_. +`PDEP 10 `_. This will enable more changes that are hugely beneficial to pandas users, including but not limited to: @@ -39,9 +39,15 @@ We are collecting feedback on this decision `here `__. +This engine supports Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) and OpenDocument spreadsheets (``.ods``) (:issue:`50395`). + +There are two advantages of this engine: + +1. Calamine is often faster than other engines, some benchmarks show results up to 5x faster than 'openpyxl', 20x - 'odf', 4x - 'pyxlsb', and 1.5x - 'xlrd'. + But, 'openpyxl' and 'pyxlsb' are faster in reading a few rows from large files because of lazy iteration over rows. +2. Calamine supports the recognition of datetime in ``.xlsb`` files, unlike 'pyxlsb' which is the only other engine in pandas that can read ``.xlsb`` files. + +.. code-block:: python + + pd.read_excel("path_to_file.xlsb", engine="calamine") + + +For more, see :ref:`io.calamine` in the user guide on IO tools. + +.. _whatsnew_220.enhancements.struct_accessor: + +Series.struct accessor to with PyArrow structured data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.struct`` accessor provides attributes and methods for processing +data with ``struct[pyarrow]`` dtype Series. For example, +:meth:`Series.struct.explode` converts PyArrow structured data to a pandas +DataFrame. (:issue:`54938`) + +.. ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + {"project": "pandas", "version": "2.2.0"}, + {"project": "numpy", "version": "1.25.2"}, + {"project": "pyarrow", "version": "13.0.0"}, + ], + dtype=pd.ArrowDtype( + pa.struct([ + ("project", pa.string()), + ("version", pa.string()), + ]) + ), + ) + series.struct.explode() .. _whatsnew_220.enhancements.enhancement2: @@ -28,7 +73,8 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) +- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - .. --------------------------------------------------------------------------- @@ -39,10 +85,49 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. -.. _whatsnew_220.notable_bug_fixes.notable_bug_fix1: +.. 
_whatsnew_220.notable_bug_fixes.merge_sort_behavior: -notable_bug_fix1 -^^^^^^^^^^^^^^^^ +:func:`merge` and :meth:`DataFrame.join` now consistently follow documented sort behavior +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not +always return a result that followed the documented sort behavior. pandas now +follows the documented sort behavior in merge and join operations (:issue:`54611`). + +As documented, ``sort=True`` sorts the join keys lexicographically in the resulting +:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the +join type (``how`` keyword): + +- ``how="left"``: preserve the order of the left keys +- ``how="right"``: preserve the order of the right keys +- ``how="inner"``: preserve the order of the left keys +- ``how="outer"``: sort keys lexicographically + +One example with changing behavior is inner joins with non-unique left join keys +and ``sort=False``: + +.. ipython:: python + + left = pd.DataFrame({"a": [1, 2, 1]}) + right = pd.DataFrame({"a": [1, 2]}) + result = pd.merge(left, right, how="inner", on="a", sort=False) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + a + 0 1 + 1 1 + 2 2 + +*New Behavior* + +.. ipython:: python + + result .. _whatsnew_220.notable_bug_fixes.notable_bug_fix2: @@ -92,20 +177,67 @@ Other API changes Deprecations ~~~~~~~~~~~~ + +Deprecate alias ``M`` in favour of ``ME`` for offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The alias ``M`` is deprecated in favour of ``ME`` for offsets, please use ``ME`` for "month end" instead of ``M`` (:issue:`9586`) + +For example: + +*Previous behavior*: + +.. code-block:: ipython + + In [7]: pd.date_range('2020-01-01', periods=3, freq='M') + Out [7]: + DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31'], + dtype='datetime64[ns]', freq='M') + +*Future behavior*: + +.. ipython:: python + + pd.date_range('2020-01-01', periods=3, freq='ME') + +Other Deprecations +^^^^^^^^^^^^^^^^^^ +- Changed :meth:`Timedelta.resolution_string` to return ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer``. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf``. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf``. (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path``. 
(:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf``. (:issue:`54229`) -- +- Deprecated automatic downcasting of object-dtype results in :meth:`Series.replace` and :meth:`DataFrame.replace`, explicitly call ``result = result.infer_objects(copy=False)`` instead. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54710`) +- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) +- Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) +- Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) +- Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) +- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) +- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) +- Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) +- Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) +- Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`) .. --------------------------------------------------------------------------- .. _whatsnew_220.performance: Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) +- Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) +- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) +- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) +- Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement when indexing with more than 4 keys (:issue:`54550`) - .. 
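Returning to the downcasting deprecations listed above, a short sketch of the opt-in (assuming a pandas version where the ``future.no_silent_downcasting`` option named in those entries is available):

.. code-block:: python

    import pandas as pd

    pd.set_option("future.no_silent_downcasting", True)

    s = pd.Series([1, 2, None], dtype=object)
    filled = s.fillna(0)

    # With the option enabled the object dtype is kept rather than being
    # silently downcast; request the inference explicitly instead.
    filled.dtype                            # object
    filled.infer_objects(copy=False).dtype  # int64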
--------------------------------------------------------------------------- @@ -113,10 +245,14 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) +- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) Categorical ^^^^^^^^^^^ -- +- :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) - Datetimelike @@ -136,7 +272,7 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) - Conversion @@ -151,12 +287,15 @@ Strings Interval ^^^^^^^^ -- +- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) +- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) +- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) - Indexing ^^^^^^^^ -- +- Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) +- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) - Missing @@ -171,8 +310,9 @@ MultiIndex I/O ^^^ -- -- +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) +- Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) Period ^^^^^^ @@ -181,7 +321,7 @@ Period Plotting ^^^^^^^^ -- +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) - Groupby/resample/rolling @@ -191,8 +331,8 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- -- +- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) +- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) Sparse ^^^^^^ @@ -211,6 +351,7 @@ Styler Other ^^^^^ +- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) .. 
***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index 1a9dffb55bca7..db6138c34f37c 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - cython=0.29.33 - - meson[ninja]=1.0.1 + - meson[ninja]=1.2.1 - meson-python=0.13.1 # test dependencies @@ -36,7 +36,7 @@ dependencies: - ipython - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1 + - matplotlib>=3.6.1, <3.8 - numba>=0.55.2 - numexpr>=2.8.0 - openpyxl>=3.0.10 @@ -47,6 +47,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.5 - pytables>=3.7.0 + - python-calamine>=0.1.6 - pyxlsb>=1.0.9 - s3fs>=2022.05.0 - scipy>=1.8.1 @@ -84,7 +85,7 @@ dependencies: - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme + - pydata-sphinx-theme=0.13 - pytest-cython # doctest - sphinx - sphinx-design @@ -105,7 +106,7 @@ dependencies: - ipykernel # web - - jinja2 # in optional dependencies, but documented here as needed + # - jinja2 # already listed in optional dependencies, but documented here for reference - markdown - feedparser - pyyaml diff --git a/generate_version.py b/generate_version.py index 46e9f52bfc5de..06e38ce0fd978 100644 --- a/generate_version.py +++ b/generate_version.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Note: This file has to live next to setup.py or versioneer will not work import argparse import os diff --git a/meson.build b/meson.build index a927b59abeaf9..68018046c081f 100644 --- a/meson.build +++ b/meson.build @@ -2,24 +2,17 @@ project( 'pandas', 'c', 'cpp', 'cython', - version: run_command(['python', 'generate_version.py', '--print'], check: true).stdout().strip(), + version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', - meson_version: '>=1.0.1', + meson_version: '>=1.2.1', default_options: [ - # TODO: investigate, does meson try to compile against debug Python - # when buildtype = debug, this seems to be causing problems on CI - # where provided Python is not compiled in debug mode 'buildtype=release', - # TODO: Reactivate werror, some warnings on Windows - #'werror=true', 'c_std=c99' ] ) -py_mod = import('python') fs = import('fs') -py = py_mod.find_installation('python') -py_dep = py.dependency() +py = import('python').find_installation(pure: false) tempita = files('generate_pxi.py') versioneer = files('generate_version.py') @@ -35,7 +28,7 @@ add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c if fs.exists('_version_meson.py') - py.install_sources('_version_meson.py', pure: false, subdir: 'pandas') + py.install_sources('_version_meson.py', subdir: 'pandas') else custom_target('write_version_file', output: '_version_meson.py', @@ -45,11 +38,15 @@ else build_by_default: true, build_always_stale: true, install: true, - install_dir: py.get_install_dir(pure: false) / 'pandas' + install_dir: py.get_install_dir() / 'pandas' ) meson.add_dist_script(py, versioneer, '-o', '_version_meson.py') endif # Needed by pandas.test() when it looks for the pytest ini options -py.install_sources('pyproject.toml', pure: false, subdir: 'pandas') +py.install_sources( + 'pyproject.toml', + subdir: 'pandas' +) + subdir('pandas') diff --git a/pandas/__init__.py b/pandas/__init__.py index d11a429987ac4..41e34309232ee 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -1,5 +1,8 @@ from __future__ import annotations +import os +import warnings + __docformat__ = "restructuredtext" # Let users know if they're missing any of our hard 
dependencies @@ -190,6 +193,17 @@ __git_version__ = v.get("full-revisionid") del get_versions, v +# GH#55043 - deprecation of the data_manager option +if "PANDAS_DATA_MANAGER" in os.environ: + warnings.warn( + "The env variable PANDAS_DATA_MANAGER is set. The data_manager option is " + "deprecated and will be removed in a future version. Only the BlockManager " + "will be available. Unset this environment variable to silence this warning.", + FutureWarning, + stacklevel=2, + ) +# Don't allow users to use pandas.os or pandas.warnings +del os, warnings # module level doc-string __doc__ = """ diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index 954b5c3cce082..2569aa4fded08 100644 --- a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -16,7 +16,7 @@ The full license is in the LICENSE file, distributed with this software. #endif // GH-23516 - works around locale perf issues -// from MUSL libc, MIT Licensed - see LICENSES +// from MUSL libc, licence at LICENSES/MUSL_LICENSE #define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u) #define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : default) #define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5)) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash.h b/pandas/_libs/include/pandas/vendored/klib/khash.h index 95b25d053a9df..758f089cc11a5 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash.h @@ -1,27 +1,4 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ +//Licence at LICENSES/KLIB_LICENSE /* An example: diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 76b212a4b4660..1009671adc56b 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -1,3 +1,5 @@ +//Licence at LICENSES/KLIB_LICENSE + #include #include diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e974b5d0eec46..5c7680bc6fb6c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -375,7 +375,7 @@ cdef class IndexEngine: # map each starget to its position in the index if ( stargets and - len(stargets) < 5 and + len(stargets) < (n / (2 * n.bit_length())) and not na_in_stargets and self.is_monotonic_increasing ): diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 7a9a3b84fd69f..3b1a6bc7436c3 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -897,6 +897,11 @@ cdef class BlockValuesRefs: else: self.referenced_blocks = [] + def _clear_dead_references(self) -> None: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. @@ -905,6 +910,7 @@ cdef class BlockValuesRefs: blk : Block The block that the new references should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(blk)) def add_index_reference(self, index: object) -> None: @@ -915,6 +921,7 @@ cdef class BlockValuesRefs: index : Index The index that the new reference should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(index)) def has_reference(self) -> bool: @@ -927,8 +934,6 @@ cdef class BlockValuesRefs: ------- bool """ - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + self._clear_dead_references() # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1 diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index e07d80dd04b31..82f69c1dedd53 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -478,31 +478,16 @@ cdef class Interval(IntervalMixin): args = (self.left, self.right, self.closed) return (type(self), args) - def _repr_base(self): - left = self.left - right = self.right - - # TODO: need more general formatting methodology here - if isinstance(left, _Timestamp) and isinstance(right, _Timestamp): - left = left._short_repr - right = right._short_repr - - return left, right - def __repr__(self) -> str: - - left, right = self._repr_base() - disp = str if isinstance(left, np.generic) else repr + disp = str if isinstance(self.left, (np.generic, _Timestamp)) else repr name = type(self).__name__ - repr_str = f"{name}({disp(left)}, {disp(right)}, closed={repr(self.closed)})" + repr_str = f"{name}({disp(self.left)}, {disp(self.right)}, closed={repr(self.closed)})" # noqa: E501 return repr_str def __str__(self) -> str: - - left, right = self._repr_base() start_symbol = "[" if self.closed_left else "(" end_symbol = "]" if self.closed_right else ")" - return f"{start_symbol}{left}, {right}{end_symbol}" + return f"{start_symbol}{self.left}, {self.right}{end_symbol}" def __add__(self, y): if ( diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi index 7ee649a55fd8f..c7761cbf8aba9 100644 --- a/pandas/_libs/join.pyi +++ b/pandas/_libs/join.pyi @@ -6,6 +6,7 
@@ def inner_join( left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, + sort: bool = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def left_outer_join( left: np.ndarray, # const intp_t[:] diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 5929647468785..385a089a8c59d 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -23,7 +23,7 @@ from pandas._libs.dtypes cimport ( @cython.wraparound(False) @cython.boundscheck(False) def inner_join(const intp_t[:] left, const intp_t[:] right, - Py_ssize_t max_groups): + Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 intp_t[::1] left_sorter, right_sorter @@ -70,7 +70,20 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, _get_result_indexer(left_sorter, left_indexer) _get_result_indexer(right_sorter, right_indexer) - return np.asarray(left_indexer), np.asarray(right_indexer) + if not sort: + # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid groupsort_indexer + # otherwise, the `else` path also works in this case + rev = np.empty(len(left), dtype=np.intp) + rev.put(np.asarray(left_sorter), np.arange(len(left))) + else: + rev, _ = groupsort_indexer(left_indexer, len(left)) + + return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev) + else: + return np.asarray(left_indexer), np.asarray(right_indexer) @cython.wraparound(False) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 32641319a6b96..15bd5a7379105 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -195,6 +195,7 @@ def array_equivalent_object( right: npt.NDArray[np.object_], ) -> bool: ... def has_infs(arr: np.ndarray) -> bool: ... # const floating[:] +def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... 
# const floating[:] def get_reverse_indexer( indexer: np.ndarray, # const intp_t[:] length: int, diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2681115bbdcfb..23c8066b973f8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -530,6 +530,22 @@ def has_infs(floating[:] arr) -> bool: return ret +@cython.boundscheck(False) +@cython.wraparound(False) +def has_only_ints_or_nan(floating[:] arr) -> bool: + cdef: + floating val + intp_t i + + for i in range(len(arr)): + val = arr[i] + if (val != val) or (val == <int64_t>val): + continue + else: + return False + return True + + def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) @@ -760,6 +776,7 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) bint already_copied = True + ndarray[object] newarr if hasattr(arr, "to_numpy"): @@ -784,8 +801,30 @@ cpdef ndarray[object] ensure_string_array( # short-circuit, all elements are str return result + if arr.dtype.kind == "f": # non-optimized path + for i in range(n): + val = arr[i] + + if not already_copied: + result = result.copy() + already_copied = True + + if not checknull(val): + # f"{val}" is not always equivalent to str(val) for floats + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = f"{val}" + + return result + + newarr = np.asarray(arr, dtype=object) for i in range(n): - val = arr[i] + val = newarr[i] if isinstance(val, str): continue @@ -2682,11 +2721,9 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.str_: if using_pyarrow_string_dtype() and is_string_array(objects, skipna=True): - import pyarrow as pa - - from pandas.core.dtypes.dtypes import ArrowDtype + from pandas.core.arrays.string_ import StringDtype - dtype = ArrowDtype(pa.string()) + dtype = StringDtype(storage="pyarrow_numpy") return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True @@ -2817,9 +2854,14 @@ NoDefault = Literal[_NoDefault.no_default] @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, - object na_value=no_default, cnp.dtype dtype=np.dtype(object) - ) -> np.ndarray: +def map_infer_mask( + ndarray[object] arr, + object f, + const uint8_t[:] mask, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) +) -> np.ndarray: """ Substitute for np.vectorize with pandas-friendly dtype inference.
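For context (not part of the patch): the new has_only_ints_or_nan helper in lib.pyx returns True only when every element is NaN or an integer-valued float. A minimal pure-Python sketch of the same check, for illustration:

    # Illustrative sketch only; mirrors the Cython helper added above.
    import numpy as np

    def has_only_ints_or_nan_sketch(arr) -> bool:
        for val in arr:
            if val != val:                    # NaN is the only value unequal to itself
                continue
            if not float(val).is_integer():   # fractional part present
                return False
        return True

    print(has_only_ints_or_nan_sketch(np.array([1.0, np.nan, 3.0])))  # True
    print(has_only_ints_or_nan_sketch(np.array([1.0, 2.5])))          # False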
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index f302c649bc7bd..fd632790546f6 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -69,7 +69,8 @@ libs_sources = { 'index': {'sources': ['index.pyx', _index_class_helper]}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, - 'interval': {'sources': ['interval.pyx', _intervaltree_helper]}, + 'interval': {'sources': ['interval.pyx', _intervaltree_helper], + 'deps': _khash_primitive_helper_dep}, 'join': {'sources': ['join.pyx', _khash_primitive_helper], 'deps': _khash_primitive_helper_dep}, 'lib': {'sources': ['lib.pyx', 'src/parser/tokenizer.c']}, @@ -113,8 +114,40 @@ foreach ext_name, ext_dict : libs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs') +# Basically just __init__.py and the .pyi files +sources_to_install = [ + '__init__.py', + 'algos.pyi', + 'arrays.pyi', + 'byteswap.pyi', + 'groupby.pyi', + 'hashing.pyi', + 'hashtable.pyi', + 'index.pyi', + 'indexing.pyi', + 'internals.pyi', + 'interval.pyi', + 'join.pyi', + 'json.pyi', + 'lib.pyi', + 'missing.pyi', + 'ops.pyi', + 'ops_dispatch.pyi', + 'parsers.pyi', + 'properties.pyi', + 'reshape.pyi', + 'sas.pyi', + 'sparse.pyi', + 'testing.pyi', + 'tslib.pyi', + 'writers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs' + ) +endforeach subdir('window') diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6d66e21ce49f5..5f51f48b43ca9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -6,7 +6,6 @@ from csv import ( QUOTE_NONE, QUOTE_NONNUMERIC, ) -import sys import time import warnings @@ -880,9 +879,15 @@ cdef class TextReader: cdef _check_tokenize_status(self, int status): if self.parser.warn_msg != NULL: - print(PyUnicode_DecodeUTF8( - self.parser.warn_msg, strlen(self.parser.warn_msg), - self.encoding_errors), file=sys.stderr) + warnings.warn( + PyUnicode_DecodeUTF8( + self.parser.warn_msg, + strlen(self.parser.warn_msg), + self.encoding_errors + ), + ParserWarning, + stacklevel=find_stack_level() + ) free(self.parser.warn_msg) self.parser.warn_msg = NULL diff --git a/pandas/_libs/sas.pyx b/pandas/_libs/sas.pyx index cf2e0e4d80bb5..9e1af2cb9c3e7 100644 --- a/pandas/_libs/sas.pyx +++ b/pandas/_libs/sas.pyx @@ -62,6 +62,7 @@ cdef buf_free(Buffer buf): # algorithm. It is partially documented here: # # https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf +# Licence at LICENSES/SAS7BDAT_LICENSE cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 
0: cdef: diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index abd3fb9e1fef3..ce8a38df172ef 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -664,7 +664,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) ((c == delimiter) || (delim_whitespace && isblank(c))) +#define IS_DELIMITER(c) \ + ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 7e5cb53cf8f62..49016f79de5b9 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -14,6 +14,8 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt */ +// Licence at LICENSES/NUMPY_LICENSE + #define NO_IMPORT #ifndef NPY_NO_DEPRECATED_API diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c index 629d88ca6f589..9646183fa1786 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c @@ -19,6 +19,8 @@ This file implements string parsing and creation for NumPy datetime. */ +// LICENSES/NUMPY_LICENSE + #define PY_SSIZE_T_CLEAN #define NO_IMPORT diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c index 9ec12cb242728..70794786016fb 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c @@ -38,6 +38,8 @@ Numeric decoder derived from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ +// Licence at LICENSES/ULTRAJSON_LICENSE + #include #include #include diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index 726676799af65..942bd0b518144 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -38,10 +38,13 @@ Numeric decoder derived from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ +// Licence at LICENSES/ULTRAJSON_LICENSE + #include #include #include #include +#include #include #include #include @@ -761,7 +764,12 @@ void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { char *wstr; - JSUINT64 uvalue = (value < 0) ? -value : value; + JSUINT64 uvalue; + if (value == INT64_MIN) { + uvalue = INT64_MAX + UINT64_C(1); + } else { + uvalue = (value < 0) ? -value : value; + } wstr = enc->offset; // Conversion. Number is reversed. diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index f4055fcedcfa6..d230642b0632d 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -35,6 +35,8 @@ Numeric decoder derived from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. 
*/ +// Licence at LICENSES/ULTRAJSON_LICENSE + #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY #define NO_IMPORT_ARRAY #define PY_SSIZE_T_CLEAN diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 4a22de886742c..8c55505f61b51 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -36,6 +36,8 @@ Numeric decoder derived from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ +// Licence at LICENSES/ULTRAJSON_LICENSE + #define PY_SSIZE_T_CLEAN #include #include @@ -262,21 +264,6 @@ static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { return ret; } -static int is_simple_frame(PyObject *obj) { - PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); - if (!mgr) { - return 0; - } - int ret; - if (PyObject_HasAttrString(mgr, "blocks")) { - ret = (get_attr_length(mgr, "blocks") <= 1); - } else { - ret = 0; - } - - Py_DECREF(mgr); - return ret; -} static npy_int64 get_long_attr(PyObject *o, const char *attr) { // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT @@ -1140,15 +1127,8 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - if (is_simple_frame(obj)) { - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - Py_INCREF(obj); - GET_TC(tc)->itemValue = obj; - } + Py_INCREF(obj); + GET_TC(tc)->itemValue = obj; } else { return 0; } @@ -1756,22 +1736,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } - if (is_simple_frame(obj)) { - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetName = NpyArr_iterGetName; - - pc->newObj = PyObject_GetAttrString(obj, "values"); - if (!pc->newObj) { - goto INVALID; - } - } else { - pc->iterBegin = PdBlock_iterBegin; - pc->iterEnd = PdBlock_iterEnd; - pc->iterNext = PdBlock_iterNext; - pc->iterGetName = PdBlock_iterGetName; - } + pc->iterBegin = PdBlock_iterBegin; + pc->iterEnd = PdBlock_iterEnd; + pc->iterNext = PdBlock_iterNext; + pc->iterGetName = PdBlock_iterGetName; pc->iterGetValue = NpyArr_iterGetValue; if (enc->outputFormat == VALUES) { diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c index 6a6a1bdcbe0e0..12dbb33f2874f 100644 --- a/pandas/_libs/src/vendored/ujson/python/ujson.c +++ b/pandas/_libs/src/vendored/ujson/python/ujson.c @@ -35,6 +35,8 @@ Numeric decoder derived from TCL library * Copyright (c) 1994 Sun Microsystems, Inc. */ +// Licence at LICENSES/ULTRAJSON_LICENSE + #include "pandas/vendored/ujson/python/version.h" #define PY_SSIZE_T_CLEAN #include diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index a8dd88c763c14..e050ac5a6c7b7 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -11,6 +11,9 @@ cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? 
-1 cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso) +cpdef freq_to_period_freqstr(freq_n, freq_name) +cdef dict c_OFFSET_TO_PERIOD_FREQSTR +cdef dict c_DEPR_ABBREVS cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index bea3e18273318..72a8fa8ff0b38 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -1,9 +1,13 @@ from enum import Enum +from pandas._libs.tslibs.timedeltas import UnitChoices + # These are not public API, but are exposed in the .pyi file because they # are imported in tests. _attrname_to_abbrevs: dict[str, str] _period_code_map: dict[str, int] +OFFSET_TO_PERIOD_FREQSTR: dict[str, str] +DEPR_ABBREVS: dict[str, UnitChoices] def periods_per_day(reso: int) -> int: ... def periods_per_second(reso: int) -> int: ... @@ -11,6 +15,7 @@ def is_supported_unit(reso: int) -> bool: ... def npy_unit_to_abbrev(reso: int) -> str: ... def get_supported_reso(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str) -> int: ... +def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... class PeriodDtypeBase: _dtype_code: int # PeriodDtypeCode diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 19f4c83e6cecf..cca379c620aeb 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -1,6 +1,9 @@ # period frequency constants corresponding to scikits timeseries # originals from enum import Enum +import warnings + +from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, @@ -10,7 +13,6 @@ from pandas._libs.tslibs.np_datetime cimport ( import_pandas_datetime() - cdef class PeriodDtypeBase: """ Similar to an actual dtype, this contains all of the information @@ -141,11 +143,11 @@ _period_code_map = { "B": PeriodDtypeCode.B, # Business days "D": PeriodDtypeCode.D, # Daily "H": PeriodDtypeCode.H, # Hourly - "T": PeriodDtypeCode.T, # Minutely - "S": PeriodDtypeCode.S, # Secondly - "L": PeriodDtypeCode.L, # Millisecondly - "U": PeriodDtypeCode.U, # Microsecondly - "N": PeriodDtypeCode.N, # Nanosecondly + "min": PeriodDtypeCode.T, # Minutely + "s": PeriodDtypeCode.S, # Secondly + "ms": PeriodDtypeCode.L, # Millisecondly + "us": PeriodDtypeCode.U, # Microsecondly + "ns": PeriodDtypeCode.N, # Nanosecondly } _reverse_period_code_map = { @@ -174,15 +176,68 @@ _attrname_to_abbrevs = { "month": "M", "day": "D", "hour": "H", - "minute": "T", - "second": "S", - "millisecond": "L", - "microsecond": "U", - "nanosecond": "N", + "minute": "min", + "second": "s", + "millisecond": "ms", + "microsecond": "us", + "nanosecond": "ns", } cdef dict attrname_to_abbrevs = _attrname_to_abbrevs cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()} +OFFSET_TO_PERIOD_FREQSTR: dict = { + "WEEKDAY": "D", + "EOM": "M", + "BM": "M", + "BQS": "Q", + "QS": "Q", + "BQ": "Q", + "BA": "A", + "AS": "A", + "BAS": "A", + "MS": "M", + "D": "D", + "B": "B", + "min": "min", + "s": "s", + "ms": "ms", + "us": "us", + "ns": "ns", + "H": "H", + "Q": "Q", + "A": "A", + "W": "W", + "ME": "M", + "Y": "A", + "BY": "A", + "YS": "A", + "BYS": "A", +} +cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR + +cpdef freq_to_period_freqstr(freq_n, freq_name): + if freq_n == 1: + freqstr = f"""{c_OFFSET_TO_PERIOD_FREQSTR.get( + freq_name, freq_name)}""" + else: + freqstr = 
f"""{freq_n}{c_OFFSET_TO_PERIOD_FREQSTR.get( + freq_name, freq_name)}""" + return freqstr + +# Map deprecated resolution abbreviations to correct resolution abbreviations +DEPR_ABBREVS: dict[str, str]= { + "T": "min", + "t": "min", + "S": "s", + "L": "ms", + "l": "ms", + "U": "us", + "u": "us", + "N": "ns", + "n": "ns", +} +cdef dict c_DEPR_ABBREVS = DEPR_ABBREVS + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file @@ -273,6 +328,15 @@ class Resolution(Enum): True """ try: + if freq in DEPR_ABBREVS: + warnings.warn( + f"\'{freq}\' is deprecated and will be removed in a future " + f"version. Please use \'{DEPR_ABBREVS.get(freq)}\' " + "instead of \'{freq}\'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + freq = DEPR_ABBREVS[freq] attr_name = _abbrev_to_attrnames[freq] except KeyError: # For quarterly and yearly resolutions, we need to chop off @@ -283,6 +347,15 @@ class Resolution(Enum): if split_freq[1] not in _month_names: # i.e. we want e.g. "Q-DEC", not "Q-INVALID" raise + if split_freq[0] in DEPR_ABBREVS: + warnings.warn( + f"\'{split_freq[0]}\' is deprecated and will be removed in a " + f"future version. Please use \'{DEPR_ABBREVS.get(split_freq[0])}\' " + f"instead of \'{split_freq[0]}\'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + split_freq[0] = DEPR_ABBREVS[split_freq[0]] attr_name = _abbrev_to_attrnames[split_freq[0]] return cls.from_attrname(attr_name) @@ -425,7 +498,6 @@ cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil: return NPY_DATETIMEUNIT.NPY_FR_D -# TODO: use in _matplotlib.converter? cpdef int64_t periods_per_day( NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns ) except? -1: diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index 14d2eef46da20..a1b0c54d1f48c 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -31,6 +31,28 @@ foreach ext_name, ext_dict : tslibs_sources ) endforeach -py.install_sources('__init__.py', - pure: false, - subdir: 'pandas/_libs/tslibs') +sources_to_install = [ + '__init__.py', + 'ccalendar.pyi', + 'conversion.pyi', + 'dtypes.pyi', + 'fields.pyi', + 'nattype.pyi', + 'np_datetime.pyi', + 'offsets.pyi', + 'parsing.pyi', + 'period.pyi', + 'strptime.pyi', + 'timedeltas.pyi', + 'timestamps.pyi', + 'timezones.pyi', + 'tzconversion.pyi', + 'vectorized.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/tslibs' + ) +endforeach diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 7d75fa3114d2b..bb497f2e17b93 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1003,23 +1003,23 @@ timedelta}, default 'raise' >>> ts.round(freq='H') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='T') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='S') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='L') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.round(freq='5T') + >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1H30min' (i.e. 
1 hour and 30 minutes): - >>> ts.round(freq='1H30T') + >>> ts.round(freq='1H30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1092,23 +1092,23 @@ timedelta}, default 'raise' >>> ts.floor(freq='H') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='T') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='S') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='N') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.floor(freq='5T') + >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): - >>> ts.floor(freq='1H30T') + >>> ts.floor(freq='1H30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1181,23 +1181,23 @@ timedelta}, default 'raise' >>> ts.ceil(freq='H') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='T') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='S') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='U') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.ceil(freq='5T') + >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): - >>> ts.ceil(freq='1H30T') + >>> ts.ceil(freq='1H30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 7b2ee68c73ad2..c3ee68e14a8d4 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,4 +1,3 @@ -cimport cython from cpython.datetime cimport ( PyDateTime_CheckExact, PyDateTime_DATE_GET_HOUR, @@ -18,6 +17,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from libc.stdint cimport INT64_MAX import_datetime() PandasDateTime_IMPORT @@ -545,7 +545,6 @@ cdef ndarray astype_round_check( return iresult -@cython.overflowcheck(True) cdef int64_t get_conversion_factor( NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit @@ -553,6 +552,7 @@ cdef int64_t get_conversion_factor( """ Find the factor by which we need to multiply to convert from from_unit to to_unit. 
""" + cdef int64_t value, overflow_limit, factor if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -565,28 +565,44 @@ cdef int64_t get_conversion_factor( return 1 if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: - return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + factor = 7 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: - return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + factor = 24 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + factor = 1000 else: raise ValueError("Converting from M or Y units is not supported.") + overflow_limit = INT64_MAX // factor + if value > overflow_limit or value < -overflow_limit: + raise OverflowError("result would overflow") + + return factor * value + cdef int64_t convert_reso( int64_t value, @@ -595,7 +611,7 @@ cdef int64_t convert_reso( bint round_ok, ) except? -1: cdef: - int64_t res_value, mult, div, mod + int64_t res_value, mult, div, mod, overflow_limit if from_reso == to_reso: return value @@ -624,9 +640,12 @@ cdef int64_t convert_reso( else: # e.g. 
ns -> us, risk of overflow, but no risk of lossy rounding mult = get_conversion_factor(from_reso, to_reso) - with cython.overflowcheck(True): + overflow_limit = INT64_MAX // mult + if value > overflow_limit or value < -overflow_limit: # Note: caller is responsible for re-raising as OutOfBoundsTimedelta - res_value = value * mult + raise OverflowError("result would overflow") + + res_value = value * mult return res_value diff --git a/pandas/_libs/tslibs/offsets.pxd b/pandas/_libs/tslibs/offsets.pxd index 215c3f849281f..cda6825de35be 100644 --- a/pandas/_libs/tslibs/offsets.pxd +++ b/pandas/_libs/tslibs/offsets.pxd @@ -1,7 +1,7 @@ from numpy cimport int64_t -cpdef to_offset(object obj) +cpdef to_offset(object obj, bint is_period=*) cdef bint is_offset_object(object obj) cdef bint is_tick_object(object obj) diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index 1a4742111db89..1f0fad6abb8b4 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -103,11 +103,11 @@ class SingleConstructorOffset(BaseOffset): def __reduce__(self): ... @overload -def to_offset(freq: None) -> None: ... +def to_offset(freq: None, is_period: bool = ...) -> None: ... @overload -def to_offset(freq: _BaseOffsetT) -> _BaseOffsetT: ... +def to_offset(freq: _BaseOffsetT, is_period: bool = ...) -> _BaseOffsetT: ... @overload -def to_offset(freq: timedelta | str) -> BaseOffset: ... +def to_offset(freq: timedelta | str, is_period: bool = ...) -> BaseOffset: ... class Tick(SingleConstructorOffset): _creso: int diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 958fe1181d309..74398eb0e2405 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,5 +1,8 @@ import re import time +import warnings + +from pandas.util._exceptions import find_stack_level cimport cython from cpython.datetime cimport ( @@ -12,6 +15,7 @@ from cpython.datetime cimport ( time as dt_time, timedelta, ) +import warnings import_datetime() @@ -42,6 +46,8 @@ from pandas._libs.tslibs.ccalendar import ( int_to_weekday, weekday_to_int, ) +from pandas.util._exceptions import find_stack_level + from pandas._libs.tslibs.ccalendar cimport ( dayofweek, @@ -50,7 +56,10 @@ from pandas._libs.tslibs.ccalendar cimport ( get_lastbday, ) from pandas._libs.tslibs.conversion cimport localize_pydatetime -from pandas._libs.tslibs.dtypes cimport periods_per_day +from pandas._libs.tslibs.dtypes cimport ( + c_DEPR_ABBREVS, + periods_per_day, +) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, @@ -621,10 +630,10 @@ cdef class BaseOffset: '2BH' >>> pd.offsets.Nano().freqstr - 'N' + 'ns' >>> pd.offsets.Nano(-3).freqstr - '-3N' + '-3ns' """ try: code = self.rule_code @@ -1191,7 +1200,7 @@ cdef class Minute(Tick): Timestamp('2022-12-09 14:50:00') """ _nanos_inc = 60 * 1_000_000_000 - _prefix = "T" + _prefix = "min" _period_dtype_code = PeriodDtypeCode.T _creso = NPY_DATETIMEUNIT.NPY_FR_m @@ -1227,28 +1236,28 @@ cdef class Second(Tick): Timestamp('2022-12-09 14:59:50') """ _nanos_inc = 1_000_000_000 - _prefix = "S" + _prefix = "s" _period_dtype_code = PeriodDtypeCode.S _creso = NPY_DATETIMEUNIT.NPY_FR_s cdef class Milli(Tick): _nanos_inc = 1_000_000 - _prefix = "L" + _prefix = "ms" _period_dtype_code = PeriodDtypeCode.L _creso = NPY_DATETIMEUNIT.NPY_FR_ms cdef class Micro(Tick): _nanos_inc = 1000 - _prefix = "U" + _prefix = "us" _period_dtype_code = PeriodDtypeCode.U _creso = NPY_DATETIMEUNIT.NPY_FR_us cdef class Nano(Tick): _nanos_inc = 1 - 
_prefix = "N" + _prefix = "ns" _period_dtype_code = PeriodDtypeCode.N _creso = NPY_DATETIMEUNIT.NPY_FR_ns @@ -1487,6 +1496,28 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. + weekday : int {0, 1, ..., 6}, default 0 + + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday + + Instead Weekday type from dateutil.relativedelta can be used. + + - MO is Monday + - TU is Tuesday + - WE is Wednesday + - TH is Thursday + - FR is Friday + - SA is Saturday + - SU is Sunday. + **kwds Temporal parameter that add to or replace the offset value. @@ -1621,6 +1652,8 @@ cdef class BusinessDay(BusinessMixin): The number of days represented. normalize : bool, default False Normalize start/end dates to midnight. + offset : timedelta, default timedelta(0) + Time offset to apply. Examples -------- @@ -2821,7 +2854,7 @@ cdef class MonthEnd(MonthOffset): Timestamp('2022-01-31 00:00:00') """ _period_dtype_code = PeriodDtypeCode.M - _prefix = "M" + _prefix = "ME" _day_opt = "end" @@ -3148,6 +3181,10 @@ cdef class Week(SingleConstructorOffset): Parameters ---------- + n : int, default 1 + The number of weeks represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int or None, default None Always generate specific day of week. 0 for Monday and 6 for Sunday. @@ -3398,6 +3435,9 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): Parameters ---------- n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int {0, 1, ..., 6}, default 0 A specific integer for the day of the week. @@ -3540,6 +3580,9 @@ cdef class FY5253(FY5253Mixin): Parameters ---------- n : int + The number of fiscal years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int {0, 1, ..., 6}, default 0 A specific integer for the day of the week. @@ -3562,11 +3605,31 @@ cdef class FY5253(FY5253Mixin): - "nearest" means year end is **weekday** closest to last day of month in year. - "last" means year end is final **weekday** of the final month in fiscal year. + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + Examples -------- + In the example below the default parameters give the next 52-53 week fiscal year. + >>> ts = pd.Timestamp(2022, 1, 1) >>> ts + pd.offsets.FY5253() Timestamp('2022-01-31 00:00:00') + + By the parameter ``startingMonth`` we can specify + the month in which fiscal years end. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253(startingMonth=3) + Timestamp('2022-03-28 00:00:00') + + 52-53 week fiscal year can be specified by + ``weekday`` and ``variation`` parameters. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253(weekday=5, startingMonth=12, variation="last") + Timestamp('2022-12-31 00:00:00') """ _prefix = "RE" @@ -3720,6 +3783,9 @@ cdef class FY5253Quarter(FY5253Mixin): Parameters ---------- n : int + The number of business quarters represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int {0, 1, ..., 6}, default 0 A specific integer for the day of the week. 
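Illustrative only (assumes the renames above are applied): the Tick prefixes move from the single-letter aliases to lowercase unit names, and MonthEnd's prefix becomes "ME", so user-facing frequency strings read:

    # Illustrative sketch; assumes the alias renames in this patch.
    import pandas as pd

    ts = pd.Timestamp("2020-03-14 15:32:52.192548651")
    print(ts.round("5min"))   # previously spelled "5T"
    print(ts.floor("s"))      # previously spelled "S"

    # Month-end ranges now use "ME"; plain "M" is steered towards "ME"
    # by the to_offset() changes further below.
    print(pd.date_range("2023-01-31", periods=3, freq="ME"))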
@@ -3745,11 +3811,32 @@ cdef class FY5253Quarter(FY5253Mixin): - "nearest" means year end is **weekday** closest to last day of month in year. - "last" means year end is final **weekday** of the final month in fiscal year. + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + Examples -------- + In the example below the default parameters give + the next business quarter for 52-53 week fiscal year. + >>> ts = pd.Timestamp(2022, 1, 1) >>> ts + pd.offsets.FY5253Quarter() Timestamp('2022-01-31 00:00:00') + + By the parameter ``startingMonth`` we can specify + the month in which fiscal years end. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253Quarter(startingMonth=3) + Timestamp('2022-03-28 00:00:00') + + Business quarters for 52-53 week fiscal year can be specified by + ``weekday`` and ``variation`` parameters. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253Quarter(weekday=5, startingMonth=12, variation="last") + Timestamp('2022-04-02 00:00:00') """ _prefix = "REQ" @@ -4150,6 +4237,8 @@ cdef class CustomBusinessHour(BusinessHour): Start time of your custom business hour in 24h format. end : str, time, or list of str/time, default: "17:00" End time of your custom business hour in 24h format. + offset : timedelta, default timedelta(0) + Time offset to apply. Examples -------- @@ -4371,18 +4460,18 @@ prefix_mapping = { CustomBusinessMonthEnd, # 'CBM' CustomBusinessMonthBegin, # 'CBMS' CustomBusinessHour, # 'CBH' - MonthEnd, # 'M' + MonthEnd, # 'ME' MonthBegin, # 'MS' - Nano, # 'N' + Nano, # 'ns' SemiMonthEnd, # 'SM' SemiMonthBegin, # 'SMS' Week, # 'W' - Second, # 'S' - Minute, # 'T' - Micro, # 'U' + Second, # 's' + Minute, # 'min' + Micro, # 'us' QuarterEnd, # 'Q' QuarterBegin, # 'QS' - Milli, # 'L' + Milli, # 'ms' Hour, # 'H' Day, # 'D' WeekOfMonth, # 'WOM' @@ -4409,14 +4498,15 @@ _lite_rule_alias = { "BAS": "BAS-JAN", # BYearBegin(month=1), "BYS": "BAS-JAN", - "Min": "T", - "min": "T", - "ms": "L", - "us": "U", - "ns": "N", + "Min": "min", + "min": "min", + "ms": "ms", + "us": "us", + "ns": "ns", } -_dont_uppercase = {"MS", "ms"} +_dont_uppercase = {"MS", "ms", "s", "me"} + INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4457,7 +4547,7 @@ def _get_offset(name: str) -> BaseOffset: return _offset_map[name] -cpdef to_offset(freq): +cpdef to_offset(freq, bint is_period=False): """ Return DateOffset object from string or datetime.timedelta object. @@ -4525,6 +4615,22 @@ cpdef to_offset(freq): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): + if is_period is False and name == "M": + warnings.warn( + "\'M\' will be deprecated, please use \'ME\' " + "for \'month end\'", + UserWarning, + stacklevel=find_stack_level(), + ) + name = "ME" + if is_period is True and name == "ME": + raise ValueError( + r"for Period, please use \'M\' " + "instead of \'ME\'" + ) + elif is_period is True and name == "M": + name = "ME" + if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = _lite_rule_alias.get(name) or name @@ -4533,7 +4639,16 @@ cpdef to_offset(freq): if not stride: stride = 1 - if prefix in {"D", "H", "T", "S", "L", "U", "N"}: + if prefix in c_DEPR_ABBREVS: + warnings.warn( + f"\'{prefix}\' is deprecated and will be removed in a " + f"future version. 
Please use \'{c_DEPR_ABBREVS.get(prefix)}\' " + f"instead of \'{prefix}\'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + prefix = c_DEPR_ABBREVS[prefix] + if prefix in {"D", "H", "min", "s", "ms", "us", "ns"}: # For these prefixes, we have something like "3H" or # "2.5T", so we can construct a Timedelta with the # matching unit and get our offset from delta_to_tick diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 3643c840a50a6..71be1d213437a 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -576,7 +576,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default, # e.g. if "Q" is not in date_string and .index raised pass - if date_len == 6 and freq == "M": + if date_len == 6 and freq == "ME": year = int(date_string[:4]) month = int(date_string[4:6]) try: @@ -796,7 +796,7 @@ def try_parse_year_month_day( # is not practical. In fact, using this class issues warnings (xref gh-21322). # Thus, we port the class over so that both issues are resolved. # -# Copyright (c) 2017 - dateutil contributors +# Licence at LICENSES/DATEUTIL_LICENSE class _timelex: def __init__(self, instream): if getattr(instream, "decode", None) is not None: diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 8826757e31c32..a4aecd2ce0a09 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -89,8 +89,8 @@ class Period(PeriodMixin): @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... @classmethod - def now(cls, freq: BaseOffset = ...) -> Period: ... - def strftime(self, fmt: str) -> str: ... + def now(cls, freq: Frequency = ...) -> Period: ... + def strftime(self, fmt: str | None) -> str: ... def to_timestamp( self, freq: str | BaseOffset | None = ..., diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c37e9cd7ef1f3..5fecc77044b4b 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -38,6 +38,11 @@ from libc.time cimport ( tm, ) +from pandas._libs.tslibs.dtypes cimport ( + c_OFFSET_TO_PERIOD_FREQSTR, + freq_to_period_freqstr, +) + # import datetime C API import_datetime() @@ -91,6 +96,9 @@ from pandas._libs.tslibs.dtypes cimport ( attrname_to_abbrevs, freq_group_code_to_npy_unit, ) + +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr + from pandas._libs.tslibs.parsing cimport quarter_to_myear from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso @@ -1507,7 +1515,7 @@ def from_ordinals(const int64_t[:] values, freq): int64_t[::1] result = np.empty(len(values), dtype="i8") int64_t val - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) if not isinstance(freq, BaseOffset): raise ValueError("freq not specified and cannot be inferred") @@ -1539,7 +1547,7 @@ def extract_ordinals(ndarray values, freq) -> np.ndarray: # if we don't raise here, we'll segfault later! 
raise TypeError("extract_ordinals values must be object-dtype") - freqstr = Period._maybe_convert_freq(freq).freqstr + freqstr = freq_to_period_freqstr(freq.n, freq.name) for i in range(n): # Analogous to: p = values[i] @@ -1712,10 +1720,12 @@ cdef class PeriodMixin: condition = self.freq != other_freq if condition: + freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) + other_freqstr = freq_to_period_freqstr(other_freq.n, other_freq.name) msg = DIFFERENT_FREQ.format( cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other_freq.freqstr, + own_freq=freqstr, + other_freq=other_freqstr, ) raise IncompatibleFrequency(msg) @@ -1759,7 +1769,7 @@ cdef class _Period(PeriodMixin): elif isinstance(freq, PeriodDtypeBase): freq = freq._freqstr - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) if freq.n <= 0: raise ValueError("Frequency must be positive, because it " @@ -2487,7 +2497,8 @@ cdef class _Period(PeriodMixin): >>> pd.Period('2020-01', 'D').freqstr 'D' """ - return self._dtype._freqstr + freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) + return freqstr def __repr__(self) -> str: base = self._dtype._dtype_code @@ -2511,11 +2522,12 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def strftime(self, fmt: str) -> str: + def strftime(self, fmt: str | None) -> str: r""" Returns a formatted string representation of the :class:`Period`. - ``fmt`` must be a string containing one or several directives. + ``fmt`` must be ``None`` or a string containing one or several directives. + When ``None``, the format will be determined from the frequency of the Period. The method recognizes the same directives as the :func:`time.strftime` function of the standard Python distribution, as well as the specific additional directives ``%f``, ``%F``, ``%q``, ``%l``, ``%u``, ``%n``. @@ -2788,7 +2800,8 @@ class Period(_Period): if freq is None and ordinal != NPY_NAT: # Skip NaT, since it doesn't have a resolution freq = attrname_to_abbrevs[reso] - freq = to_offset(freq) + freq = c_OFFSET_TO_PERIOD_FREQSTR.get(freq, freq) + freq = to_offset(freq, is_period=True) elif PyDateTime_Check(value): dt = value @@ -2881,7 +2894,7 @@ cdef _parse_weekly_str(value, BaseOffset freq): if freq is None: day_name = end.day_name()[:3].upper() freqstr = f"W-{day_name}" - freq = to_offset(freqstr) + freq = to_offset(freqstr, is_period=True) # We _should_ have freq.is_on_offset(end) return end, freq diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 542afa9315a60..e6924a4e24dff 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -3,6 +3,7 @@ TimeRE, _calc_julian_from_U_or_W are vendored from the standard library, see https://github.com/python/cpython/blob/main/Lib/_strptime.py +Licence at LICENSES/PSF_LICENSE The original module-level docstring follows. Strptime-related classes and functions. 
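Illustrative only (assumes the patch is applied): offsets adopt the new "ME" spelling while Period keeps "M", and freq_to_period_freqstr / OFFSET_TO_PERIOD_FREQSTR translate between the two:

    # Illustrative sketch; assumes the offset/period alias split in this patch.
    import pandas as pd

    p = pd.Period("2023-01", freq="M")        # the Period API keeps "M"
    print(p.freqstr)                          # "M"

    rng = pd.date_range("2023-01-31", periods=2, freq="ME")  # offsets use "ME"
    print(rng.to_period().freqstr)            # mapped back to "M"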
@@ -672,7 +673,8 @@ cdef tzinfo parse_timezone_directive(str z): Notes ----- This is essentially similar to the cpython implementation - https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479 + https://github.com/python/cpython/blob/546cab84448b892c92e68d9c1a3d3b58c13b3463/Lib/_strptime.py#L437-L454 + Licence at LICENSES/PSF_LICENSE """ cdef: diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index aba9b25b23154..6d993722ce1d4 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -14,6 +14,7 @@ from pandas._libs.tslibs import ( Tick, ) from pandas._typing import ( + Frequency, Self, npt, ) @@ -117,9 +118,9 @@ class Timedelta(timedelta): @property def asm8(self) -> np.timedelta64: ... # TODO: round/floor/ceil could return NaT? - def round(self, freq: str) -> Self: ... - def floor(self, freq: str) -> Self: ... - def ceil(self, freq: str) -> Self: ... + def round(self, freq: Frequency) -> Self: ... + def floor(self, freq: Frequency) -> Self: ... + def ceil(self, freq: Frequency) -> Self: ... @property def resolution_string(self) -> str: ... def __add__(self, other: timedelta) -> Timedelta: ... diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index ffa9a67542e21..2f6fa35cae070 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,6 +1,8 @@ import collections import warnings +from pandas.util._exceptions import find_stack_level + cimport cython from cpython.object cimport ( Py_EQ, @@ -41,6 +43,7 @@ from pandas._libs.tslibs.conversion cimport ( precision_from_unit, ) from pandas._libs.tslibs.dtypes cimport ( + c_DEPR_ABBREVS, get_supported_reso, is_supported_unit, npy_unit_to_abbrev, @@ -124,7 +127,6 @@ cdef dict timedelta_abbrevs = { "minute": "m", "min": "m", "minutes": "m", - "t": "m", "s": "s", "seconds": "s", "sec": "s", @@ -134,20 +136,17 @@ cdef dict timedelta_abbrevs = { "millisecond": "ms", "milli": "ms", "millis": "ms", - "l": "ms", "us": "us", "microseconds": "us", "microsecond": "us", "µs": "us", "micro": "us", "micros": "us", - "u": "us", "ns": "ns", "nanoseconds": "ns", "nano": "ns", "nanos": "ns", "nanosecond": "ns", - "n": "ns", } _no_input = object() @@ -725,6 +724,15 @@ cpdef inline str parse_timedelta_unit(str unit): return "ns" elif unit == "M": return unit + elif unit in c_DEPR_ABBREVS: + warnings.warn( + f"\'{unit}\' is deprecated and will be removed in a " + f"future version. Please use \'{c_DEPR_ABBREVS.get(unit)}\' " + f"instead of \'{unit}\'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + unit = c_DEPR_ABBREVS[unit] try: return timedelta_abbrevs[unit.lower()] except KeyError: @@ -901,7 +909,7 @@ cdef int64_t parse_iso_format_string(str ts) except? -1: elif c == ".": # append any seconds if len(number): - r = timedelta_from_spec(number, "0", "S") + r = timedelta_from_spec(number, "0", "s") result += timedelta_as_neg(r, neg) unit, number = [], [] have_dot = 1 @@ -918,7 +926,7 @@ cdef int64_t parse_iso_format_string(str ts) except? 
-1: r = timedelta_from_spec(number, "0", dec_unit) result += timedelta_as_neg(r, neg) else: # seconds - r = timedelta_from_spec(number, "0", "S") + r = timedelta_from_spec(number, "0", "s") result += timedelta_as_neg(r, neg) else: raise ValueError(err_msg) @@ -1435,11 +1443,11 @@ cdef class _Timedelta(timedelta): * Days: 'D' * Hours: 'H' - * Minutes: 'T' - * Seconds: 'S' - * Milliseconds: 'L' - * Microseconds: 'U' - * Nanoseconds: 'N' + * Minutes: 'min' + * Seconds: 's' + * Milliseconds: 'ms' + * Microseconds: 'us' + * Nanoseconds: 'ns' Returns ------- @@ -1450,31 +1458,31 @@ cdef class _Timedelta(timedelta): -------- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') >>> td.resolution_string - 'N' + 'ns' >>> td = pd.Timedelta('1 days 2 min 3 us') >>> td.resolution_string - 'U' + 'us' >>> td = pd.Timedelta('2 min 3 s') >>> td.resolution_string - 'S' + 's' >>> td = pd.Timedelta(36, unit='us') >>> td.resolution_string - 'U' + 'us' """ self._ensure_components() if self._ns: - return "N" + return "ns" elif self._us: - return "U" + return "us" elif self._ms: - return "L" + return "ms" elif self._s: - return "S" + return "s" elif self._m: - return "T" + return "min" elif self._h: return "H" else: @@ -1706,15 +1714,20 @@ class Timedelta(_Timedelta): Possible values: - * 'W', 'D', 'T', 'S', 'L', 'U', or 'N' - * 'days' or 'day' + * 'W', or 'D' + * 'days', or 'day' * 'hours', 'hour', 'hr', or 'h' * 'minutes', 'minute', 'min', or 'm' - * 'seconds', 'second', or 'sec' - * 'milliseconds', 'millisecond', 'millis', or 'milli' - * 'microseconds', 'microsecond', 'micros', or 'micro' + * 'seconds', 'second', 'sec', or 's' + * 'milliseconds', 'millisecond', 'millis', 'milli', or 'ms' + * 'microseconds', 'microsecond', 'micros', 'micro', or 'us' * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'. + .. deprecated:: 2.2.0 + + Values `T`, `S`, `L`, `U`, and `N` are deprecated in favour of the values + `min`, `s`, `ms`, `us`, and `ns`. + **kwargs Available kwargs: {days, seconds, microseconds, milliseconds, minutes, hours, weeks}. @@ -1929,6 +1942,7 @@ class Timedelta(_Timedelta): ---------- freq : str Frequency string indicating the rounding resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. Returns ------- @@ -1956,6 +1970,7 @@ class Timedelta(_Timedelta): ---------- freq : str Frequency string indicating the flooring resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. Examples -------- @@ -1975,6 +1990,7 @@ class Timedelta(_Timedelta): ---------- freq : str Frequency string indicating the ceiling resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. Examples -------- diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 36ae2d6d892f1..e23f01b800874 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -8,6 +8,8 @@ from datetime import ( from time import struct_time from typing import ( ClassVar, + Literal, + TypeAlias, TypeVar, overload, ) @@ -27,6 +29,7 @@ from pandas._typing import ( ) _DatetimeT = TypeVar("_DatetimeT", bound=datetime) +_TimeZones: TypeAlias = str | _tzinfo | None | int def integer_op_not_supported(obj: object) -> TypeError: ... @@ -51,13 +54,13 @@ class Timestamp(datetime): tzinfo: _tzinfo | None = ..., *, nanosecond: int | None = ..., - tz: str | _tzinfo | None | int = ..., + tz: _TimeZones = ..., unit: str | int | None = ..., fold: int | None = ..., ) -> _DatetimeT | NaTType: ... 
@classmethod def _from_value_and_reso( - cls, value: int, reso: int, tz: _tzinfo | None + cls, value: int, reso: int, tz: _TimeZones ) -> Timestamp: ... @property def value(self) -> int: ... # np.int64 @@ -84,19 +87,19 @@ class Timestamp(datetime): @property def fold(self) -> int: ... @classmethod - def fromtimestamp(cls, ts: float, tz: _tzinfo | None = ...) -> Self: ... + def fromtimestamp(cls, ts: float, tz: _TimeZones = ...) -> Self: ... @classmethod def utcfromtimestamp(cls, ts: float) -> Self: ... @classmethod - def today(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def today(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def fromordinal( cls, ordinal: int, - tz: _tzinfo | str | None = ..., + tz: _TimeZones = ..., ) -> Self: ... @classmethod - def now(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def now(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def utcnow(cls) -> Self: ... # error: Signature of "combine" incompatible with supertype "datetime" @@ -131,7 +134,7 @@ class Timestamp(datetime): fold: int | None = ..., ) -> Self: ... # LSP violation: datetime.datetime.astimezone has a default value for tz - def astimezone(self, tz: _tzinfo | None) -> Self: ... # type: ignore[override] + def astimezone(self, tz: _TimeZones) -> Self: ... # type: ignore[override] def ctime(self) -> str: ... def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... @classmethod @@ -184,12 +187,12 @@ class Timestamp(datetime): def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... - def tz_convert(self, tz: _tzinfo | str | None) -> Self: ... + def tz_convert(self, tz: _TimeZones) -> Self: ... # TODO: could return NaT? def tz_localize( self, - tz: _tzinfo | str | None, - ambiguous: str = ..., + tz: _TimeZones, + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def normalize(self) -> Self: ... @@ -197,19 +200,19 @@ class Timestamp(datetime): def round( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def floor( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def ceil( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def day_name(self, locale: str | None = ...) -> str: ... 
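Illustrative only: the timestamps.pyi changes above collect the accepted tz values behind a _TimeZones alias and narrow ambiguous to bool, "raise", or "NaT":

    # Illustrative sketch of the annotated signatures; the timestamps are arbitrary examples.
    import pandas as pd
    from zoneinfo import ZoneInfo

    ts = pd.Timestamp("2023-11-05 01:30:00")  # a wall time that occurs twice in US Eastern
    print(ts.tz_localize("America/New_York", ambiguous=True))             # resolve to the DST side
    print(ts.tz_localize(ZoneInfo("America/New_York"), ambiguous="NaT"))  # ambiguous -> NaT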
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 844fc8f0ed187..1b4332c2d26cf 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1078,18 +1078,6 @@ cdef class _Timestamp(ABCTimestamp): return result - @property - def _short_repr(self) -> str: - # format a Timestamp with only _date_repr if possible - # otherwise _repr_base - if (self.hour == 0 and - self.minute == 0 and - self.second == 0 and - self.microsecond == 0 and - self.nanosecond == 0): - return self._date_repr - return self._repr_base - # ----------------------------------------------------------------- # Conversion Methods @@ -1907,7 +1895,7 @@ class Timestamp(_Timestamp): cdef: int64_t nanos - freq = to_offset(freq) + freq = to_offset(freq, is_period=False) freq.nanos # raises on non-fixed freq nanos = delta_to_nanoseconds(freq, self._creso) if nanos == 0: @@ -1997,23 +1985,23 @@ timedelta}, default 'raise' >>> ts.round(freq='H') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='T') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='S') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='L') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.round(freq='5T') + >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): - >>> ts.round(freq='1H30T') + >>> ts.round(freq='1H30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2088,23 +2076,23 @@ timedelta}, default 'raise' >>> ts.floor(freq='H') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='T') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='S') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='N') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.floor(freq='5T') + >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): - >>> ts.floor(freq='1H30T') + >>> ts.floor(freq='1H30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2177,23 +2165,23 @@ timedelta}, default 'raise' >>> ts.ceil(freq='H') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='T') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='S') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='U') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 
5 minutes): - >>> ts.ceil(freq='5T') + >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): - >>> ts.ceil(freq='1H30T') + >>> ts.ceil(freq='1H30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 9c151b8269a52..6365c030b695b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -987,9 +987,8 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, # ---------------------------------------------------------------------- -# Moving maximum / minimum code taken from Bottleneck under the terms -# of its Simplified BSD license -# https://github.com/pydata/bottleneck +# Moving maximum / minimum code taken from Bottleneck +# Licence at LICENSES/BOTTLENECK_LICENCE cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil: diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 02934346130a5..7b306c5e681e0 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -138,6 +138,8 @@ def calculate_variable_window_bounds( break # end bound is previous end # or current index + elif index[end[i - 1]] == end_bound and not right_closed: + end[i] = end[i - 1] + 1 elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build index b83d8730f9447..ad15644f73a0c 100644 --- a/pandas/_libs/window/meson.build +++ b/pandas/_libs/window/meson.build @@ -3,7 +3,6 @@ py.extension_module( ['aggregations.pyx'], cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], - dependencies: [py_dep], subdir: 'pandas/_libs/window', override_options : ['cython_language=cpp'], install: true @@ -14,7 +13,19 @@ py.extension_module( ['indexers.pyx'], cython_args: ['-X always_allow_keywords=true'], include_directories: [inc_np, inc_pd], - dependencies: [py_dep], subdir: 'pandas/_libs/window', install: true ) + +sources_to_install = [ + '__init__.py', + 'aggregations.pyi', + 'indexers.pyi' +] + +foreach source: sources_to_install + py.install_sources( + source, + subdir: 'pandas/_libs/window' + ) +endforeach diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index bd5c0290b2879..48eedd1fd1af6 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -1,4 +1,5 @@ cimport cython +from cython cimport Py_ssize_t import numpy as np from cpython cimport ( diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index edbba9452b50a..95977edb600ad 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -118,7 +118,7 @@ def round_trip_localpath(writer, reader, path: str | None = None): return obj -def write_to_compressed(compression, path, data, dest: str = "test"): +def write_to_compressed(compression, path, data, dest: str = "test") -> None: """ Write data to a compressed file. 
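[Editor's note] The doctest rewrites in the ``round``/``floor``/``ceil`` hunks above all track the retirement of the single-letter offset aliases ('T', 'S', 'L', 'U', 'N') in favour of 'min', 's', 'ms', 'us', 'ns'. A condensed restatement, using the timestamp value the surrounding doctests round from:

>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
>>> ts.round(freq='min')    # previously freq='T'
Timestamp('2020-03-14 15:33:00')
>>> ts.floor(freq='s')      # previously freq='S'
Timestamp('2020-03-14 15:32:52')
>>> ts.ceil(freq='us')      # previously freq='U'
Timestamp('2020-03-14 15:32:52.192549')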
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 0591394f5d9ed..d4e7e196dc2d4 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -416,7 +416,8 @@ def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None: def assert_is_valid_plot_return_object(objs) -> None: - import matplotlib.pyplot as plt + from matplotlib.artist import Artist + from matplotlib.axes import Axes if isinstance(objs, (Series, np.ndarray)): for el in objs.ravel(): @@ -424,14 +425,14 @@ def assert_is_valid_plot_return_object(objs) -> None: "one of 'objs' is not a matplotlib Axes instance, " f"type encountered {repr(type(el).__name__)}" ) - assert isinstance(el, (plt.Axes, dict)), msg + assert isinstance(el, (Axes, dict)), msg else: msg = ( "objs is neither an ndarray of Artist instances nor a single " "ArtistArtist instance, tuple, or dict, 'objs' is a " f"{repr(type(objs).__name__)}" ) - assert isinstance(objs, (plt.Artist, tuple, dict)), msg + assert isinstance(objs, (Artist, tuple, dict)), msg def assert_is_sorted(seq) -> None: diff --git a/pandas/_typing.py b/pandas/_typing.py index 743815b91210d..c2bbebfbe2857 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -112,7 +112,7 @@ # Cannot use `Sequence` because a string is a sequence, and we don't want to # accept that. Could refine if https://github.com/python/typing/issues/256 is # resolved to differentiate between Sequence[str] and str -ListLike = Union[AnyArrayLike, list, range] +ListLike = Union[AnyArrayLike, list, tuple, range] # scalars diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 32e2afc0eef52..a11755275d00e 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -36,7 +36,7 @@ ] -def __getattr__(name: str): +def __getattr__(name: str) -> type[NumpyExtensionArray]: if name == "PandasArray": # GH#53694 import warnings diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index be0a762642e46..684e9dccdc0f9 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -30,6 +30,7 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, ) if TYPE_CHECKING: @@ -186,6 +187,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under9p0", "pa_version_under11p0", "pa_version_under13p0", + "pa_version_under14p0", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c5792fa1379fe..fa0e9e974ea39 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -37,6 +37,7 @@ "pyarrow": "7.0.0", "pyreadstat": "1.1.5", "pytest": "7.3.2", + "python-calamine": "0.1.6", "pyxlsb": "1.0.9", "s3fs": "2022.05.0", "scipy": "1.8.1", @@ -62,6 +63,7 @@ "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", + "python_calamine": "python-calamine", "sqlalchemy": "SQLAlchemy", "tables": "pytables", } diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 8282ec25c1d58..4f067d958b799 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -26,7 +26,7 @@ from collections.abc import Generator -def load_reduce(self): +def load_reduce(self) -> None: stack = self.stack args = stack.pop() func = stack[-1] diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 049ce50920e28..12f58be109d98 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -15,6 +15,7 @@ pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = 
_palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") + pa_version_under14p0 = _palv < Version("14.0.0") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -23,3 +24,4 @@ pa_version_under11p0 = True pa_version_under12p0 = True pa_version_under13p0 = True + pa_version_under14p0 = True diff --git a/pandas/conftest.py b/pandas/conftest.py index 5210e727aeb3c..62f22921f0482 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -49,6 +49,8 @@ utc, ) +from pandas._config.config import _get_option + import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -71,6 +73,7 @@ Index, MultiIndex, ) +from pandas.util.version import Version if TYPE_CHECKING: from collections.abc import ( @@ -142,6 +145,7 @@ def pytest_collection_modifyitems(items, config) -> None: ("is_sparse", "is_sparse is deprecated"), ("NDFrame.replace", "The 'method' keyword"), ("NDFrame.replace", "Series.replace without 'value'"), + ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), ("Series.idxmax", "The behavior of Series.idxmax"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), @@ -190,6 +194,10 @@ def pytest_collection_modifyitems(items, config) -> None: item.add_marker(pytest.mark.arraymanager) +hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] +if Version(hypothesis.__version__) >= Version("6.83.2"): + hypothesis_health_checks.append(hypothesis.HealthCheck.differing_executors) + # Hypothesis hypothesis.settings.register_profile( "ci", @@ -201,7 +209,7 @@ def pytest_collection_modifyitems(items, config) -> None: # 2022-02-09: Changed deadline from 500 -> None. Deadline leads to # non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969) deadline=None, - suppress_health_check=(hypothesis.HealthCheck.too_slow,), + suppress_health_check=tuple(hypothesis_health_checks), ) hypothesis.settings.load_profile("ci") @@ -491,7 +499,7 @@ def box_with_array(request): @pytest.fixture -def dict_subclass(): +def dict_subclass() -> type[dict]: """ Fixture for a dictionary subclass. """ @@ -504,7 +512,7 @@ def __init__(self, *args, **kwargs) -> None: @pytest.fixture -def non_dict_mapping_subclass(): +def non_dict_mapping_subclass() -> type[abc.Mapping]: """ Fixture for a non-mapping dictionary subclass. """ @@ -1321,6 +1329,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1329,6 +1338,7 @@ def string_storage(request): * 'python' * 'pyarrow' + * 'pyarrow_numpy' """ return request.param @@ -1380,6 +1390,7 @@ def object_dtype(request): "object", "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] ) def any_string_dtype(request): @@ -1974,7 +1985,7 @@ def using_array_manager() -> bool: """ Fixture to check if the array manager is being used. """ - return pd.options.mode.data_manager == "array" + return _get_option("mode.data_manager", silent=True) == "array" @pytest.fixture @@ -1982,7 +1993,10 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. 
""" - return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" + return ( + pd.options.mode.copy_on_write + and _get_option("mode.data_manager", silent=True) == "block" + ) warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] @@ -2000,4 +2014,4 @@ def warsaw(request) -> str: @pytest.fixture() def arrow_string_storage(): - return ("pyarrow",) + return ("pyarrow", "pyarrow_numpy") diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 5cd4779907146..0a26acb7df60a 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -15,6 +15,45 @@ from pandas.compat._optional import import_optional_dependency +@functools.cache +def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + nb_compat_func = numba.extending.register_jitable(func) + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def nb_looper(values, axis): + # Operate on the first row/col in order to get + # the output shape + if axis == 0: + first_elem = values[:, 0] + dim0 = values.shape[1] + else: + first_elem = values[0] + dim0 = values.shape[0] + res0 = nb_compat_func(first_elem) + # Use np.asarray to get shape for + # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 + buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape + if axis == 0: + buf_shape = buf_shape[::-1] + buff = np.empty(buf_shape) + + if axis == 1: + buff[0] = res0 + for i in numba.prange(1, values.shape[0]): + buff[i] = nb_compat_func(values[i]) + else: + buff[:, 0] = res0 + for j in numba.prange(1, values.shape[1]): + buff[:, j] = nb_compat_func(values[:, j]) + return buff + + return nb_looper + + @functools.cache def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel): if TYPE_CHECKING: diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 1b36659944561..698abb2ec4989 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -187,7 +187,7 @@ def add_delegate_accessors(cls): return add_delegate_accessors -# Ported with modifications from xarray +# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE # https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py # 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors # 2. We use a UserWarning instead of a custom Warning diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 14dee202a9d8d..1d74bb8b83e4e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,6 +55,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( + ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -517,9 +518,9 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: return isin(np.asarray(comps_array), np.asarray(values)) # GH16012 - # Ensure np.in1d doesn't get object types or it *may* throw an exception + # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. 
O(logn) in sorted array), - # in1d is faster for small sizes + # isin is faster for small sizes if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 @@ -530,10 +531,10 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: if isna(values).any(): def f(c, v): - return np.logical_or(np.in1d(c, v), np.isnan(c)) + return np.logical_or(np.isin(c, v).ravel(), np.isnan(c)) else: - f = np.in1d + f = lambda a, b: np.isin(a, b).ravel() else: common = np_find_common_type(values.dtype, comps_array.dtype) @@ -877,7 +878,9 @@ def value_counts_internal( if bins is not None: from pandas.core.reshape.tile import cut - values = Series(values, copy=False) + if isinstance(values, Series): + values = values._values + try: ii = cut(values, bins, include_lowest=True) except TypeError as err: @@ -996,9 +999,13 @@ def duplicated( ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) + if hasattr(values, "dtype"): + if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": + values = values._to_masked() # type: ignore[union-attr] + + if isinstance(values.dtype, BaseMaskedDtype): + values = cast("BaseMaskedArray", values) + return htable.duplicated(values._data, keep=keep, mask=values._mask) values = _ensure_data(values) return htable.duplicated(values, keep=keep) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6ab2f958b8730..1525e316f345f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -49,14 +49,15 @@ ABCSeries, ) +from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike if TYPE_CHECKING: from collections.abc import ( + Generator, Hashable, Iterable, - Iterator, Sequence, ) @@ -80,6 +81,8 @@ def frame_apply( raw: bool = False, result_type: str | None = None, by_row: Literal[False, "compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args=None, kwargs=None, ) -> FrameApply: @@ -100,6 +103,8 @@ def frame_apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -253,7 +258,7 @@ def transform(self) -> DataFrame | Series: return result - def transform_dict_like(self, func): + def transform_dict_like(self, func) -> DataFrame: """ Compute transform in the case of a dict-like func """ @@ -315,7 +320,7 @@ def compute_list_like( op_name: Literal["agg", "apply"], selected_obj: Series | DataFrame, kwargs: dict[str, Any], - ) -> tuple[list[Hashable], list[Any]]: + ) -> tuple[list[Hashable] | Index, list[Any]]: """ Compute agg/apply results for like-like input. @@ -330,7 +335,7 @@ def compute_list_like( Returns ------- - keys : list[hashable] + keys : list[Hashable] or Index Index labels for result. results : list Data for result. 
When aggregating with a Series, this can contain any @@ -370,12 +375,14 @@ def compute_list_like( new_res = getattr(colg, op_name)(func, *args, **kwargs) results.append(new_res) indices.append(index) - keys = selected_obj.columns.take(indices) + # error: Incompatible types in assignment (expression has type "Any | + # Index", variable has type "list[Any | Callable[..., Any] | str]") + keys = selected_obj.columns.take(indices) # type: ignore[assignment] return keys, results def wrap_results_list_like( - self, keys: list[Hashable], results: list[Series | DataFrame] + self, keys: Iterable[Hashable], results: list[Series | DataFrame] ): from pandas.core.reshape.concat import concat @@ -434,7 +441,13 @@ def compute_dict_like( Data for result. When aggregating with a Series, this can contain any Python object. """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) + obj = self.obj + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) func = cast(AggFuncTypeDict, self.func) func = self.normalize_dictlike_arg(op_name, selected_obj, func) @@ -448,7 +461,7 @@ def compute_dict_like( colg = obj._gotitem(selection, ndim=1) results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()] keys = list(func.keys()) - elif is_non_unique_col: + elif not is_groupby and is_non_unique_col: # key used for column selection and output # GH#51099 results = [] @@ -748,11 +761,15 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat"] = False, + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") + self.engine = engine + self.engine_kwargs = engine_kwargs super().__init__( obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs ) @@ -772,7 +789,7 @@ def result_columns(self) -> Index: @property @abc.abstractmethod - def series_generator(self) -> Iterator[Series]: + def series_generator(self) -> Generator[Series, None, None]: pass @abc.abstractmethod @@ -797,6 +814,12 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" + + if self.engine == "numba" and not self.raw: + raise ValueError( + "The numba engine in DataFrame.apply can only be used when raw=True" + ) + # dispatch to handle list-like or dict-like if is_list_like(self.func): return self.apply_list_or_dict_like() @@ -826,7 +849,7 @@ def apply(self) -> DataFrame | Series: # raw elif self.raw: - return self.apply_raw() + return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs) return self.apply_standard() @@ -899,7 +922,7 @@ def apply_empty_result(self): else: return self.obj.copy() - def apply_raw(self): + def apply_raw(self, engine="python", engine_kwargs=None): """apply to the values as a numpy array""" def wrap_function(func): @@ -917,7 +940,27 @@ def wrapper(*args, **kwargs): return wrapper - result = np.apply_along_axis(wrap_function(self.func), self.axis, self.values) + if engine == "numba": + engine_kwargs = {} if engine_kwargs is None else engine_kwargs + + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + self.func, **engine_kwargs # type: ignore[arg-type] + ) + result = nb_looper(self.values, self.axis) + # If we made the result 2-D, 
squeeze it back to 1-D + result = np.squeeze(result) + else: + result = np.apply_along_axis( + wrap_function(self.func), + self.axis, + self.values, + *self.args, + **self.kwargs, + ) # TODO: mixed type case if result.ndim == 2: @@ -1014,7 +1057,7 @@ class FrameRowApply(FrameApply): axis: AxisInt = 0 @property - def series_generator(self): + def series_generator(self) -> Generator[Series, None, None]: return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) @property @@ -1075,7 +1118,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: return result.T @property - def series_generator(self): + def series_generator(self) -> Generator[Series, None, None]: values = self.values values = ensure_wrapped_if_datetimelike(values) assert len(values) > 0 @@ -1283,14 +1326,6 @@ def curried(x): ) if len(mapped) and isinstance(mapped[0], ABCSeries): - warnings.warn( - "Returning a DataFrame from Series.apply when the supplied function " - "returns a Series is deprecated and will be removed in a future " - "version.", - FutureWarning, - stacklevel=find_stack_level(), - ) # GH52116 - # GH#43986 Need to do list(mapped) in order to get treated as nested # See also GH#25959 regarding EA support return obj._constructor_expanddim(list(mapped), index=obj.index) @@ -1824,12 +1859,12 @@ def warn_alias_replacement( full_alias = alias else: full_alias = f"{type(obj).__name__}.{alias}" - alias = f"'{alias}'" + alias = f'"{alias}"' warnings.warn( f"The provided callable {func} is currently using " f"{full_alias}. In a future version of pandas, " f"the provided callable will be used directly. To keep current " - f"behavior pass {alias} instead.", + f"behavior pass the string {alias} instead.", category=FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 8ea70e2694d92..ac674e31586e7 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -66,8 +66,7 @@ def take_nd( """ Specialized Cython take which sets NaN values in one pass - This dispatches to ``take`` defined on ExtensionArrays. It does not - currently dispatch to ``SparseArray.take`` for sparse ``arr``. + This dispatches to ``take`` defined on ExtensionArrays. Note: this function assumes that the indexer is a valid(ated) indexer with no out of bound indices. 
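[Editor's note] The apply.py changes above thread an ``engine``/``engine_kwargs`` pair down to ``apply_raw`` and compile the user function into a numba looper via ``generate_apply_looper``; per the new guard in ``FrameApply.apply``, the numba path is only reachable with ``raw=True``. A hedged sketch of the resulting call pattern, assuming ``DataFrame.apply`` forwards these keywords to ``frame_apply`` (that wiring sits outside this hunk) and that numba is installed:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
>>> df.apply(lambda row: row.sum(), axis=1, raw=True, engine="numba")  # doctest: +SKIP
0    5.0
1    7.0
2    9.0
dtype: float64
>>> df.apply(lambda row: row.sum(), axis=1, engine="numba")
Traceback (most recent call last):
...
ValueError: The numba engine in DataFrame.apply can only be used when raw=True

The float64 result in the first call reflects the ``np.empty`` buffer allocated inside ``nb_looper``; the error message in the second is quoted from the check added in ``FrameApply.apply``.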
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 359e0161e763c..6d21f4a1ac8a2 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -296,13 +296,8 @@ def _fill_mask_inplace( func = missing.get_fill_func(method, ndim=self.ndim) func(self._ndarray.T, limit=limit, mask=mask.T) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: mask = self.isna() if mask.any(): diff --git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index 58b268cbdd221..a3d33f91f597d 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -1,3 +1,4 @@ +from pandas.core.arrays.arrow.accessors import StructAccessor from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray"] +__all__ = ["ArrowExtensionArray", "StructAccessor"] diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py new file mode 100644 index 0000000000000..e4ed255476e8e --- /dev/null +++ b/pandas/core/arrays/arrow/accessors.py @@ -0,0 +1,196 @@ +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pandas.compat import pa_version_under7p0 + +if not pa_version_under7p0: + import pyarrow as pa + import pyarrow.compute as pc + + from pandas.core.dtypes.dtypes import ArrowDtype + +if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) + + +class StructAccessor: + """ + Accessor object for structured data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow struct data. + """ + + _validation_msg = ( + "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}." + ) + + def __init__(self, data=None) -> None: + self._parent = data + self._validate(data) + + def _validate(self, data): + dtype = data.dtype + if not isinstance(dtype, ArrowDtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + if not pa.types.is_struct(dtype.pyarrow_dtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + @property + def dtypes(self) -> Series: + """ + Return the dtype object of each child field of the struct. + + Returns + ------- + pandas.Series + The data type of each child field. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.dtypes + version int64[pyarrow] + project string[pyarrow] + dtype: object + """ + from pandas import ( + Index, + Series, + ) + + pa_type = self._parent.dtype.pyarrow_dtype + types = [ArrowDtype(struct.type) for struct in pa_type] + names = [struct.name for struct in pa_type] + return Series(types, index=Index(names)) + + def field(self, name_or_index: str | int) -> Series: + """ + Extract a child field of a struct as a Series. 
+ + Parameters + ---------- + name_or_index : str | int + Name or index of the child field to extract. + + Returns + ------- + pandas.Series + The data corresponding to the selected child field. + + See Also + -------- + Series.struct.explode : Return all child fields as a DataFrame. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. + + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + Extract by field index. + + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: int64[pyarrow] + """ + from pandas import Series + + pa_arr = self._parent.array._pa_array + if isinstance(name_or_index, int): + index = name_or_index + elif isinstance(name_or_index, str): + index = pa_arr.type.get_field_index(name_or_index) + else: + raise ValueError( + "name_or_index must be an int or str, " + f"got {type(name_or_index).__name__}" + ) + + pa_field = pa_arr.type[index] + field_arr = pc.struct_field(pa_arr, [index]) + return Series( + field_arr, + dtype=ArrowDtype(field_arr.type), + index=self._parent.index, + name=pa_field.name, + ) + + def explode(self) -> DataFrame: + """ + Extract all child fields of a struct as a DataFrame. + + Returns + ------- + pandas.DataFrame + The data corresponding to all child fields. + + See Also + -------- + Series.struct.field : Return a single child field as a Series. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... 
) + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + """ + from pandas import concat + + pa_type = self._parent.dtype.pyarrow_dtype + return concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3c65e6b4879e2..4b79d0dbb683e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import ( Timedelta, Timestamp, + timezones, ) from pandas.compat import ( pa_version_under7p0, @@ -29,18 +30,21 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, is_integer, is_list_like, - is_object_dtype, is_scalar, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna -from pandas.core import roperator +from pandas.core import ( + missing, + roperator, +) from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.base import ( @@ -84,6 +88,15 @@ "rxor": lambda x, y: pc.xor(y, x), } + ARROW_BIT_WISE_FUNCS = { + "and_": pc.bit_wise_and, + "rand_": lambda x, y: pc.bit_wise_and(y, x), + "or_": pc.bit_wise_or, + "ror_": lambda x, y: pc.bit_wise_or(y, x), + "xor": pc.bit_wise_xor, + "rxor": lambda x, y: pc.bit_wise_xor(y, x), + } + def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar ) -> pa.ChunkedArray: @@ -372,8 +385,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: elif isna(value): pa_scalar = pa.scalar(None, type=pa_type) else: - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 if isinstance(value, Timedelta): if pa_type is None: pa_type = pa.duration(value.unit) @@ -439,8 +451,7 @@ def _box_pa_array( and pa.types.is_duration(pa_type) and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi") ): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) @@ -453,8 +464,7 @@ def _box_pa_array( pa_array = pa.array(value, from_pandas=True) if pa_type is None and pa.types.is_duration(pa_array.type): - # GH 53171: pyarrow does not yet handle pandas non-nano correctly - # see https://github.com/apache/arrow/issues/33321 + # Workaround https://github.com/apache/arrow/issues/37291 from pandas.core.tools.timedeltas import to_timedelta value = to_timedelta(value) @@ -506,7 +516,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -582,7 +595,11 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: return self.to_numpy(dtype=dtype) def __invert__(self) -> Self: - return 
type(self)(pc.invert(self._pa_array)) + # This is a bit wise op for integer types + if pa.types.is_integer(self._pa_array.type): + return type(self)(pc.bit_wise_not(self._pa_array)) + else: + return type(self)(pc.invert(self._pa_array)) def __neg__(self) -> Self: return type(self)(pc.negate_checked(self._pa_array)) @@ -610,20 +627,22 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - try: + if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): result = pc_func(self._pa_array, self._box_pa(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - if is_scalar(other): + elif is_scalar(other): + try: + result = pc_func(self._pa_array, self._box_pa(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") result[valid] = op(np.array(self)[valid], other) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) - else: - raise NotImplementedError( - f"{op.__name__} not implemented for {type(other)}" - ) + else: + raise NotImplementedError( + f"{op.__name__} not implemented for {type(other)}" + ) return ArrowExtensionArray(result) def _evaluate_op_method(self, other, op, arrow_funcs): @@ -657,7 +676,12 @@ def _evaluate_op_method(self, other, op, arrow_funcs): return type(self)(result) def _logical_method(self, other, op): - return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) + # For integer types `^`, `|`, `&` are bitwise operators and return + # integer types. Otherwise these are boolean ops. + if pa.types.is_integer(self._pa_array.type): + return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS) + else: + return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) def _arith_method(self, other, op): return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) @@ -903,23 +927,30 @@ def dropna(self) -> Self: """ return type(self)(pc.drop_null(self._pa_array)) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: if not self._hasna: # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() + if limit is None: + method = missing.clean_fill_method(method) + try: + if method == "pad": + return type(self)(pc.fill_null_forward(self._pa_array)) + elif method == "backfill": + return type(self)(pc.fill_null_backward(self._pa_array)) + except pa.ArrowNotImplementedError: + # ArrowNotImplementedError: Function 'coalesce' has no kernel + # matching input types (duration[ns], duration[ns]) + # TODO: remove try/except wrapper if/when pyarrow implements + # a kernel for duration types. + pass + # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. 
- return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) @doc(ExtensionArray.fillna) def fillna( @@ -935,9 +966,12 @@ def fillna( # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is not None or method is not None: + if limit is not None: return super().fillna(value=value, method=method, limit=limit, copy=copy) + if method is not None: + return super().fillna(method=method, limit=limit, copy=copy) + if isinstance(value, (np.ndarray, ExtensionArray)): # Similar to check_value_size, but we do not mask here since we may # end up passing it to the super() method. @@ -947,31 +981,14 @@ def fillna( f" expected {len(self)}" ) - def convert_fill_value(value, pa_type, dtype): - if value is None: - return value - if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): - return value - if is_array_like(value): - pa_box = pa.array - else: - pa_box = pa.scalar - try: - value = pa_box(value, type=pa_type, from_pandas=True) - except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {dtype}" - raise TypeError(msg) from err - return value - - fill_value = convert_fill_value(value, self._pa_array.type, self.dtype) + try: + fill_value = self._box_pa(value, pa_type=self._pa_array.type) + except pa.ArrowTypeError as err: + msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + raise TypeError(msg) from err try: - if method is None: - return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) - elif method == "pad": - return type(self)(pc.fill_null_forward(self._pa_array)) - elif method == "backfill": - return type(self)(pc.fill_null_backward(self._pa_array)) + return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value)) except pa.ArrowNotImplementedError: # ArrowNotImplementedError: Function 'coalesce' has no kernel # matching input types (duration[ns], duration[ns]) @@ -1029,13 +1046,15 @@ def factorize( indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) else: - pa_indices = encoded.combine_chunks().indices + # GH 54844 + combined = encoded.combine_chunks() + pa_indices = combined.indices if pa_indices.null_count > 0: pa_indices = pc.fill_null(pa_indices, -1) indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype( np.intp, copy=False ) - uniques = type(self)(encoded.chunk(0).dictionary) + uniques = type(self)(combined.dictionary) if pa_version_under11p0 and pa.types.is_duration(pa_type): uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) @@ -1224,46 +1243,50 @@ def to_numpy( ) -> np.ndarray: if dtype is not None: dtype = np.dtype(dtype) - elif self._hasna: - dtype = np.dtype(object) if na_value is lib.no_default: na_value = self.dtype.na_value pa_type = self._pa_array.type + if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): + data = self + else: + data = self.fillna(na_value) + copy = False + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): - result = self._maybe_convert_datelike_array() + result = data._maybe_convert_datelike_array() if dtype is None or dtype.kind == "O": result = result.to_numpy(dtype=object, na_value=na_value) else: result = result.to_numpy(dtype=dtype) - return result elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): # convert to list of python datetime.time objects before # wrapping in ndarray - result = np.array(list(self), dtype=dtype) 
- elif is_object_dtype(dtype) and self._hasna: - result = np.empty(len(self), dtype=object) - mask = ~self.isna() - result[mask] = np.asarray(self[mask]._pa_array) - elif pa.types.is_null(self._pa_array.type): - fill_value = None if isna(na_value) else na_value - return np.full(len(self), fill_value=fill_value, dtype=dtype) - elif self._hasna: - data = self.fillna(na_value) + result = np.array(list(data), dtype=dtype) + if data._hasna: + result[data.isna()] = na_value + elif pa.types.is_null(pa_type): + if dtype is not None and isna(na_value): + na_value = None + result = np.full(len(data), fill_value=na_value, dtype=dtype) + elif not data._hasna or (pa.types.is_floating(pa_type) and na_value is np.nan): result = data._pa_array.to_numpy() - if dtype is not None: - result = result.astype(dtype, copy=False) - return result - else: - result = self._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) if copy: result = result.copy() - return result - if self._hasna: - result[self.isna()] = na_value + else: + if dtype is None: + empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False) + if can_hold_element(empty, na_value): + dtype = empty.dtype + else: + dtype = np.object_ + result = np.empty(len(data), dtype=dtype) + mask = data.isna() + result[mask] = na_value + result[~mask] = data[~mask]._pa_array.to_numpy() return result def unique(self) -> Self: @@ -1588,6 +1611,10 @@ def _explode(self): """ See Series.explode.__doc__. """ + # child class explode method supports only list types; return + # default implementation for non list types. + if not pa.types.is_list(self.dtype.pyarrow_dtype): + return super()._explode() values = self counts = pa.compute.list_value_length(values._pa_array) counts = counts.fill_null(1).to_numpy() @@ -1972,9 +1999,17 @@ def _groupby_op( **kwargs, ) - masked = self._to_masked() + # maybe convert to a compatible dtype optimized for groupby + values: ExtensionArray + pa_type = self._pa_array.type + if pa.types.is_timestamp(pa_type): + values = self._to_datetimearray() + elif pa.types.is_duration(pa_type): + values = self._to_timedeltaarray() + else: + values = self._to_masked() - result = masked._groupby_op( + result = values._groupby_op( how=how, has_dropped_na=has_dropped_na, min_count=min_count, @@ -2172,11 +2207,11 @@ def _str_rstrip(self, to_strip=None): return type(self)(result) def _str_removeprefix(self, prefix: str): - # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed - # starts_with = pc.starts_with(self._pa_array, pattern=prefix) - # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - # result = pc.if_else(starts_with, removed, self._pa_array) - # return type(self)(result) + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) predicate = lambda val: val.removeprefix(prefix) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) @@ -2405,7 +2440,7 @@ def _dt_time(self): @property def _dt_tz(self): - return self.dtype.pyarrow_dtype.tz + return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz) @property def _dt_unit(self): @@ -2441,11 +2476,11 @@ def _round_temporally( "W": "week", "D": "day", "H": "hour", - "T": "minute", - "S": "second", - "L": "millisecond", - "U": "microsecond", - "N": "nanosecond", + "min": "minute", + "s": "second", + "ms": 
"millisecond", + "us": "microsecond", + "ns": "nanosecond", } unit = pa_supported_unit.get(offset._prefix, None) if unit is None: diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 3249c1c829546..7814a77a1cdc5 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -48,7 +48,7 @@ def __ne__(self, other) -> bool: def __hash__(self) -> int: return hash((str(self), self.freq)) - def to_pandas_dtype(self): + def to_pandas_dtype(self) -> PeriodDtype: return PeriodDtype(freq=self.freq) @@ -105,7 +105,7 @@ def __ne__(self, other) -> bool: def __hash__(self) -> int: return hash((str(self), str(self.subtype), self.closed)) - def to_pandas_dtype(self): + def to_pandas_dtype(self) -> IntervalDtype: return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3a6db34b0e8b5..933944dbd4632 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -132,7 +132,6 @@ class ExtensionArray: interpolate isin isna - pad_or_backfill ravel repeat searchsorted @@ -143,11 +142,13 @@ class ExtensionArray: view _accumulate _concat_same_type + _explode _formatter _from_factorized _from_sequence _from_sequence_of_strings _hash_pandas_object + _pad_or_backfill _reduce _values_for_argsort _values_for_factorize @@ -183,7 +184,7 @@ class ExtensionArray: methods: * fillna - * pad_or_backfill + * _pad_or_backfill * dropna * unique * factorize / _values_for_factorize @@ -918,13 +919,8 @@ def interpolate( f"{type(self).__name__} does not implement interpolate" ) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: """ Pad or backfill values, used by Series/DataFrame ffill and bfill. @@ -960,26 +956,26 @@ def pad_or_backfill( Examples -------- >>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan]) - >>> arr.pad_or_backfill(method="backfill", limit=1) + >>> arr._pad_or_backfill(method="backfill", limit=1) [, 2, 2, 3, , ] Length: 6, dtype: Int64 """ # If a 3rd-party EA has implemented this functionality in fillna, - # we warn that they need to implement pad_or_backfill instead. + # we warn that they need to implement _pad_or_backfill instead. if ( type(self).fillna is not ExtensionArray.fillna - and type(self).pad_or_backfill is ExtensionArray.pad_or_backfill + and type(self)._pad_or_backfill is ExtensionArray._pad_or_backfill ): - # Check for pad_or_backfill here allows us to call - # super().pad_or_backfill without getting this warning + # Check for _pad_or_backfill here allows us to call + # super()._pad_or_backfill without getting this warning warnings.warn( "ExtensionArray.fillna 'method' keyword is deprecated. " - "In a future version. arr.pad_or_backfill will be called " + "In a future version. arr._pad_or_backfill will be called " "instead. 3rd-party ExtensionArray authors need to implement " - "pad_or_backfill.", - FutureWarning, + "_pad_or_backfill.", + DeprecationWarning, stacklevel=find_stack_level(), ) return self.fillna(method=method, limit=limit) @@ -1063,8 +1059,7 @@ def fillna( if method is not None: warnings.warn( f"The 'method' keyword in {type(self).__name__}.fillna is " - "deprecated and will be removed in a future version. 
" - "Use pad_or_backfill instead.", + "deprecated and will be removed in a future version.", FutureWarning, stacklevel=find_stack_level(), ) @@ -1930,6 +1925,41 @@ def _hash_pandas_object( values, encoding=encoding, hash_key=hash_key, categorize=categorize ) + def _explode(self) -> tuple[Self, npt.NDArray[np.uint64]]: + """ + Transform each element of list-like to a row. + + For arrays that do not contain list-like elements the default + implementation of this method just returns a copy and an array + of ones (unchanged index). + + Returns + ------- + ExtensionArray + Array with the exploded values. + np.ndarray[uint64] + The original lengths of each list-like for determining the + resulting index. + + See Also + -------- + Series.explode : The method on the ``Series`` object that this + extension array method is meant to support. + + Examples + -------- + >>> import pyarrow as pa + >>> a = pd.array([[1, 2, 3], [4], [5, 6]], + ... dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + >>> a._explode() + ( + [1, 2, 3, 4, 5, 6] + Length: 6, dtype: int64[pyarrow], array([3, 1, 2], dtype=int32)) + """ + values = self.copy() + counts = np.ones(shape=(len(self),), dtype=np.uint64) + return values, counts + def tolist(self) -> list: """ Return a list of the values. @@ -2042,6 +2072,7 @@ def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: result[~mask] = val return result + # TODO(3.0): this can be removed once GH#33302 deprecation is enforced def _fill_mask_inplace( self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] ) -> None: diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 43344f04085ae..04e6f0a0bcdde 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -3,6 +3,7 @@ import numbers from typing import ( TYPE_CHECKING, + ClassVar, cast, ) @@ -60,7 +61,7 @@ class BooleanDtype(BaseMaskedDtype): BooleanDtype """ - name = "boolean" + name: ClassVar[str] = "boolean" # https://github.com/python/mypy/issues/4125 # error: Signature of "type" incompatible with supertype "BaseMaskedDtype" diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index aefc94ebd665c..8d2633c10b428 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2410,7 +2410,7 @@ def _mode(self, dropna: bool = True) -> Categorical: # ------------------------------------------------------------------ # ExtensionArray Interface - def unique(self): + def unique(self) -> Self: """ Return the ``Categorical`` which ``categories`` and ``codes`` are unique. @@ -2597,7 +2597,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) - code_values = self.categories.get_indexer(values) + code_values = self.categories.get_indexer_for(values) code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) @@ -2898,17 +2898,15 @@ def _delegate_method(self, name: str, *args, **kwargs): # utility routines -def _get_codes_for_values(values, categories: Index) -> np.ndarray: +def _get_codes_for_values( + values: Index | Series | ExtensionArray | np.ndarray, + categories: Index, +) -> np.ndarray: """ utility routine to turn values into codes given the specified categories If `values` is known to be a Categorical, use recode_for_categories instead. 
""" - if values.ndim > 1: - flat = values.ravel() - codes = _get_codes_for_values(flat, categories) - return codes.reshape(values.shape) - codes = categories.get_indexer_for(values) return coerce_indexer_dtype(codes, categories) @@ -2950,7 +2948,7 @@ def recode_for_categories( return codes indexer = coerce_indexer_dtype( - new_categories.get_indexer(old_categories), new_categories + new_categories.get_indexer_for(old_categories), new_categories ) new_codes = take_nd(indexer, codes, fill_value=-1) return new_codes diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c3b8d1c0e79e8..52596f29ffc0c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -107,6 +107,7 @@ algorithms, missing, nanops, + ops, ) from pandas.core.algorithms import ( checked_add_with_arr, @@ -625,15 +626,19 @@ def _validation_error_message(self, value, allow_listlike: bool = False) -> str: ------- str """ + if hasattr(value, "dtype") and getattr(value, "ndim", 0) > 0: + msg_got = f"{value.dtype} array" + else: + msg_got = f"'{type(value).__name__}'" if allow_listlike: msg = ( f"value should be a '{self._scalar_type.__name__}', 'NaT', " - f"or array of those. Got '{type(value).__name__}' instead." + f"or array of those. Got {msg_got} instead." ) else: msg = ( f"value should be a '{self._scalar_type.__name__}' or 'NaT'. " - f"Got '{type(value).__name__}' instead." + f"Got {msg_got} instead." ) return msg @@ -943,8 +948,12 @@ def _cmp_method(self, other, op): dtype = getattr(other, "dtype", None) if is_object_dtype(dtype): - return op(np.asarray(self, dtype=object), other) - + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would raise when comparing to None + result = ops.comp_method_OBJECT_ARRAY( + op, np.asarray(self.astype(object)), other + ) + return result if other is NaT: if op is operator.ne: result = np.ones(self.shape, dtype=bool) @@ -1813,7 +1822,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq='T') + dtype='datetime64[ns]', freq='min') """ _round_example = """>>> rng.round('H') diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8ad51e4a90027..b520f9f4a6deb 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -39,7 +39,10 @@ tz_convert_from_utc, tzconversion, ) -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.dtypes import ( + abbrev_to_npy_unit, + freq_to_period_freqstr, +) from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -1207,6 +1210,8 @@ def to_period(self, freq=None) -> PeriodArray: if freq is None: freq = self.freqstr or self.inferred_freq + if isinstance(self.freq, BaseOffset): + freq = freq_to_period_freqstr(self.freq.n, self.freq.name) if freq is None: raise ValueError( @@ -1220,7 +1225,6 @@ def to_period(self, freq=None) -> PeriodArray: res = freq freq = res - return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz) # ----------------------------------------------------------------- @@ -1245,7 +1249,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: Examples -------- - >>> s = pd.Series(pd.date_range(start='2018-01', freq='M', periods=3)) + >>> s = pd.Series(pd.date_range(start='2018-01', freq='ME', periods=3)) 
>>> s 0 2018-01-31 1 2018-02-28 @@ -1257,10 +1261,10 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: 2 March dtype: object - >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') >>> idx.month_name() Index(['January', 'February', 'March'], dtype='object') @@ -1268,10 +1272,10 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month names in Brazilian Portuguese language. - >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ @@ -1520,7 +1524,7 @@ def isocalendar(self) -> DataFrame: Examples -------- >>> datetime_series = pd.Series( - ... pd.date_range("2000-01-01", periods=3, freq="M") + ... pd.date_range("2000-01-01", periods=3, freq="ME") ... ) >>> datetime_series 0 2000-01-31 @@ -1589,7 +1593,7 @@ def isocalendar(self) -> DataFrame: Examples -------- >>> datetime_series = pd.Series( - ... pd.date_range("2000-01-01", periods=3, freq="T") + ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) >>> datetime_series 0 2000-01-01 00:00:00 diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index bd3c03f9218d4..40c8347c6f1a3 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import ClassVar + import numpy as np from pandas.core.dtypes.base import register_extension_dtype @@ -156,14 +158,14 @@ class FloatingArray(NumericArray): @register_extension_dtype class Float32Dtype(FloatingDtype): type = np.float32 - name = "Float32" + name: ClassVar[str] = "Float32" __doc__ = _dtype_docstring.format(dtype="float32") @register_extension_dtype class Float64Dtype(FloatingDtype): type = np.float64 - name = "Float64" + name: ClassVar[str] = "Float64" __doc__ = _dtype_docstring.format(dtype="float64") diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0e6e7a484bbb7..f9384e25ba9d9 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import ClassVar + import numpy as np from pandas.core.dtypes.base import register_extension_dtype @@ -205,56 +207,56 @@ class IntegerArray(NumericArray): @register_extension_dtype class Int8Dtype(IntegerDtype): type = np.int8 - name = "Int8" + name: ClassVar[str] = "Int8" __doc__ = _dtype_docstring.format(dtype="int8") @register_extension_dtype class Int16Dtype(IntegerDtype): type = np.int16 - name = "Int16" + name: ClassVar[str] = "Int16" __doc__ = _dtype_docstring.format(dtype="int16") @register_extension_dtype class Int32Dtype(IntegerDtype): type = np.int32 - name = "Int32" + name: ClassVar[str] = "Int32" __doc__ = _dtype_docstring.format(dtype="int32") @register_extension_dtype class Int64Dtype(IntegerDtype): type = np.int64 - name = "Int64" + name: ClassVar[str] = "Int64" __doc__ = _dtype_docstring.format(dtype="int64") @register_extension_dtype class UInt8Dtype(IntegerDtype): type = np.uint8 - name = "UInt8" 
+ name: ClassVar[str] = "UInt8" __doc__ = _dtype_docstring.format(dtype="uint8") @register_extension_dtype class UInt16Dtype(IntegerDtype): type = np.uint16 - name = "UInt16" + name: ClassVar[str] = "UInt16" __doc__ = _dtype_docstring.format(dtype="uint16") @register_extension_dtype class UInt32Dtype(IntegerDtype): type = np.uint32 - name = "UInt32" + name: ClassVar[str] = "UInt32" __doc__ = _dtype_docstring.format(dtype="uint32") @register_extension_dtype class UInt64Dtype(IntegerDtype): type = np.uint64 - name = "UInt64" + name: ClassVar[str] = "UInt64" __doc__ = _dtype_docstring.format(dtype="uint64") diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 3263dd73fe4dc..58ade1ee935ec 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -110,7 +110,7 @@ ) -IntervalSideT = Union[TimeArrayLike, np.ndarray] +IntervalSide = Union[TimeArrayLike, np.ndarray] IntervalOrNA = Union[Interval, float] _interval_shared_docs: dict[str, str] = {} @@ -219,8 +219,8 @@ def ndim(self) -> Literal[1]: return 1 # To make mypy recognize the fields - _left: IntervalSideT - _right: IntervalSideT + _left: IntervalSide + _right: IntervalSide _dtype: IntervalDtype # --------------------------------------------------------------------- @@ -237,8 +237,8 @@ def __new__( data = extract_array(data, extract_numpy=True) if isinstance(data, cls): - left: IntervalSideT = data._left - right: IntervalSideT = data._right + left: IntervalSide = data._left + right: IntervalSide = data._right closed = closed or data.closed dtype = IntervalDtype(left.dtype, closed=closed) else: @@ -280,8 +280,8 @@ def __new__( @classmethod def _simple_new( cls, - left: IntervalSideT, - right: IntervalSideT, + left: IntervalSide, + right: IntervalSide, dtype: IntervalDtype, ) -> Self: result = IntervalMixin.__new__(cls) @@ -299,7 +299,7 @@ def _ensure_simple_new_inputs( closed: IntervalClosedType | None = None, copy: bool = False, dtype: Dtype | None = None, - ) -> tuple[IntervalSideT, IntervalSideT, IntervalDtype]: + ) -> tuple[IntervalSide, IntervalSide, IntervalDtype]: """Ensure correctness of input parameters for cls._simple_new.""" from pandas.core.indexes.base import ensure_index @@ -764,7 +764,7 @@ def _cmp_method(self, other, op): if self.closed != other.categories.closed: return invalid_comparison(self, other, op) - other = other.categories.take( + other = other.categories._values.take( other.codes, allow_fill=True, fill_value=other.categories._na_value ) @@ -890,19 +890,12 @@ def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOr indexer = obj.argsort()[-1] return obj[indexer] - def pad_or_backfill( # pylint: disable=useless-parent-delegation - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. 
- return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True @@ -1038,8 +1031,8 @@ def _concat_same_type(cls, to_concat: Sequence[IntervalArray]) -> Self: raise ValueError("Intervals must all be closed on the same side.") closed = closed_set.pop() - left = np.concatenate([interval.left for interval in to_concat]) - right = np.concatenate([interval.right for interval in to_concat]) + left: IntervalSide = np.concatenate([interval.left for interval in to_concat]) + right: IntervalSide = np.concatenate([interval.right for interval in to_concat]) left, right, dtype = cls._ensure_simple_new_inputs(left, right, closed=closed) @@ -1290,7 +1283,7 @@ def _format_space(self) -> str: # Vectorized Interval Properties/Attributes @property - def left(self): + def left(self) -> Index: """ Return the left endpoints of each Interval in the IntervalArray as an Index. @@ -1310,7 +1303,7 @@ def left(self): return Index(self._left, copy=False) @property - def right(self): + def right(self) -> Index: """ Return the right endpoints of each Interval in the IntervalArray as an Index. @@ -1844,14 +1837,14 @@ def isin(self, values) -> npt.NDArray[np.bool_]: # complex128 ndarray is much more performant. left = self._combined.view("complex128") right = values._combined.view("complex128") - # error: Argument 1 to "in1d" has incompatible type + # error: Argument 1 to "isin" has incompatible type # "Union[ExtensionArray, ndarray[Any, Any], # ndarray[Any, dtype[Any]]]"; expected # "Union[_SupportsArray[dtype[Any]], # _NestedSequence[_SupportsArray[dtype[Any]]], bool, # int, float, complex, str, bytes, _NestedSequence[ # Union[bool, int, float, complex, str, bytes]]]" - return np.in1d(left, right) # type: ignore[arg-type] + return np.isin(left, right).ravel() # type: ignore[arg-type] elif needs_i8_conversion(self.left.dtype) ^ needs_i8_conversion( values.left.dtype @@ -1862,11 +1855,17 @@ def isin(self, values) -> npt.NDArray[np.bool_]: return isin(self.astype(object), values.astype(object)) @property - def _combined(self) -> IntervalSideT: - left = self.left._values.reshape(-1, 1) - right = self.right._values.reshape(-1, 1) + def _combined(self) -> IntervalSide: + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "reshape" [union-attr] + left = self.left._values.reshape(-1, 1) # type: ignore[union-attr] + right = self.right._values.reshape(-1, 1) # type: ignore[union-attr] if needs_i8_conversion(left.dtype): - comb = left._concat_same_type([left, right], axis=1) + # error: Item "ndarray[Any, Any]" of "Any | ndarray[Any, Any]" has + # no attribute "_concat_same_type" + comb = left._concat_same_type( # type: ignore[union-attr] + [left, right], axis=1 + ) else: comb = np.concatenate([left, right], axis=1) return comb diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bec875f2bbfa1..fdcbe67bbc371 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -190,13 +190,8 @@ def __getitem__(self, item: PositionalIndexer) -> Self | Any: return self._simple_new(self._data[item], newmask) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool 
= True ) -> Self: mask = self._mask @@ -1492,6 +1487,9 @@ def _groupby_op( else: result_mask = np.zeros(ngroups, dtype=bool) + if how == "rank" and kwargs.get("na_option") in ["top", "bottom"]: + result_mask[:] = False + res_values = op._cython_op_ndim_compat( self._data, min_count=min_count, diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 99a3586871d10..efe0c0df45e00 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -240,7 +240,10 @@ def _values_for_factorize(self) -> tuple[np.ndarray, float | None]: fv = np.nan return self._ndarray, fv - def pad_or_backfill( + # Base EA class (and all other EA classes) don't have limit_area keyword + # This can be removed here as well when the interpolate ffill/bfill method + # deprecation is enforced + def _pad_or_backfill( self, *, method: FillnaOptions, diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 024cd6d4aad14..4532e5bffe7a9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -33,7 +33,10 @@ period as libperiod, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + freq_to_period_freqstr, +) from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.offsets import ( Tick, @@ -319,6 +322,8 @@ def _from_datetime64(cls, data, freq, tz=None) -> Self: ------- PeriodArray[freq] """ + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) data, freq = dt64arr_to_periodarr(data, freq, tz) dtype = PeriodDtype(freq) return cls(data, dtype=dtype) @@ -386,6 +391,10 @@ def freq(self) -> BaseOffset: """ return self.dtype.freq + @property + def freqstr(self) -> str: + return freq_to_period_freqstr(self.freq.n, self.freq.name) + def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: if dtype == "i8": return self.asi8 @@ -717,7 +726,8 @@ def asfreq(self, freq=None, how: str = "E") -> Self: '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -791,20 +801,13 @@ def searchsorted( m8arr = self._ndarray.view("M8[ns]") return m8arr.searchsorted(npvalue, side=side, sorter=sorter) - def pad_or_backfill( - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta.pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) if copy: return cast("Self", result.view(self.dtype)) else: @@ -955,7 +958,7 @@ def _check_timedeltalike_freq_compat(self, other): return lib.item_from_zerodim(delta) -def raise_on_incompatible(left, right): +def raise_on_incompatible(left, right) -> IncompatibleFrequency: """ Helper function to render a consistent error message when raising IncompatibleFrequency. 
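(Aside, not part of the patch: a rough illustration of the frequency-alias split that the "ME" doctest updates earlier in this diff and the PeriodArray.freqstr changes above point at, assuming pandas >= 2.2 semantics.)

import pandas as pd

# Datetime-based ranges adopt the new month-end alias "ME" ...
dti = pd.date_range("2018-01", periods=3, freq="ME")

# ... while period frequencies keep the "M" spelling, being resolved
# through the period-aware offset parsing touched in this file.
pi = pd.period_range("2018-01", periods=3, freq="M")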
@@ -973,13 +976,16 @@ def raise_on_incompatible(left, right): # GH#24283 error message format depends on whether right is scalar if isinstance(right, (np.ndarray, ABCTimedeltaArray)) or right is None: other_freq = None - elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, BaseOffset)): + elif isinstance(right, BaseOffset): + other_freq = freq_to_period_freqstr(right.n, right.name) + elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period)): other_freq = right.freqstr else: other_freq = delta_to_tick(Timedelta(right)).freqstr + own_freq = freq_to_period_freqstr(left.freq.n, left.freq.name) msg = DIFFERENT_FREQ.format( - cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq + cls=type(left).__name__, own_freq=own_freq, other_freq=other_freq ) return IncompatibleFrequency(msg) @@ -1096,7 +1102,7 @@ def validate_dtype_freq(dtype, freq: timedelta | str | None) -> BaseOffset: def validate_dtype_freq( - dtype, freq: BaseOffsetT | timedelta | str | None + dtype, freq: BaseOffsetT | BaseOffset | timedelta | str | None ) -> BaseOffsetT: """ If both a dtype and a freq are available, ensure they match. If only @@ -1117,10 +1123,7 @@ def validate_dtype_freq( IncompatibleFrequency : mismatch between dtype and freq """ if freq is not None: - # error: Incompatible types in assignment (expression has type - # "BaseOffset", variable has type "Union[BaseOffsetT, timedelta, - # str, None]") - freq = to_offset(freq) # type: ignore[assignment] + freq = to_offset(freq, is_period=True) if dtype is not None: dtype = pandas_dtype(dtype) @@ -1183,7 +1186,7 @@ def _get_ordinal_range(start, end, periods, freq, mult: int = 1): ) if freq is not None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) mult = freq.n if start is not None: @@ -1247,10 +1250,10 @@ def _range_from_fields( if quarter is not None: if freq is None: - freq = to_offset("Q") + freq = to_offset("Q", is_period=True) base = FreqGroup.FR_QTR.value else: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) base = libperiod.freq_to_dtype_code(freq) if base != FreqGroup.FR_QTR.value: raise AssertionError("base must equal FR_QTR") @@ -1264,7 +1267,7 @@ def _range_from_fields( ) ordinals.append(val) else: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) base = libperiod.freq_to_dtype_code(freq) arrays = _make_field_arrays(year, month, day, hour, minute, second) for y, mth, d, h, mn, s in zip(*arrays): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d32c98535d7cb..00cbe1286c195 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -702,7 +702,9 @@ def npoints(self) -> int: """ return self.sp_index.npoints - def isna(self): + # error: Return type "SparseArray" of "isna" incompatible with return type + # "ndarray[Any, Any] | ExtensionArraySupportsAnyAll" in supertype "ExtensionArray" + def isna(self) -> Self: # type: ignore[override] # If null fill value, we want SparseDtype[bool, true] # to preserve the same memory usage. 
dtype = SparseDtype(bool, self._null_fill_value) @@ -712,19 +714,12 @@ def isna(self): mask[self.sp_index.indices] = isna(self.sp_values) return type(self)(mask, fill_value=False, dtype=dtype) - def pad_or_backfill( # pylint: disable=useless-parent-delegation - self, - *, - method: FillnaOptions, - limit: int | None = None, - limit_area: Literal["inside", "outside"] | None = None, - copy: bool = True, + def _pad_or_backfill( # pylint: disable=useless-parent-delegation + self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True ) -> Self: # TODO(3.0): We can remove this method once deprecation for fillna method # keyword is enforced. - return super().pad_or_backfill( - method=method, limit=limit, limit_area=limit_area, copy=copy - ) + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) def fillna( self, @@ -1428,7 +1423,7 @@ def all(self, axis=None, *args, **kwargs): return values.all() - def any(self, axis: AxisInt = 0, *args, **kwargs): + def any(self, axis: AxisInt = 0, *args, **kwargs) -> bool: """ Tests whether at least one of elements evaluate True diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d74fb3fc99e22..693ebad0ca16f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, + ClassVar, Literal, ) @@ -14,6 +15,7 @@ missing as libmissing, ) from pandas._libs.arrays import NDArrayBacked +from pandas._libs.lib import ensure_string_array from pandas.compat import pa_version_under7p0 from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -57,6 +59,7 @@ NumpySorter, NumpyValueArrayLike, Scalar, + Self, npt, type_t, ) @@ -76,7 +79,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -96,23 +99,34 @@ class StringDtype(StorageExtensionDtype): string[pyarrow] """ - name = "string" + # error: Cannot override instance variable (previously declared on + # base class "StorageExtensionDtype") with class variable + name: ClassVar[str] = "string" # type: ignore[misc] - #: StringDtype().na_value uses pandas.NA + #: StringDtype().na_value uses pandas.NA except the implementation that + # follows NumPy semantics, which uses nan. @property - def na_value(self) -> libmissing.NAType: - return libmissing.NA + def na_value(self) -> libmissing.NAType | float: # type: ignore[override] + if self.storage == "pyarrow_numpy": + return np.nan + else: + return libmissing.NA _metadata = ("storage",) def __init__(self, storage=None) -> None: if storage is None: - storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + infer_string = get_option("future.infer_string") + if infer_string: + storage = "pyarrow_numpy" + else: + storage = get_option("mode.string_storage") + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( - f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." + f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " + f"Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under7p0: + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." 
) @@ -123,7 +137,7 @@ def type(self) -> type[str]: return str @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string) -> Self: """ Construct a StringDtype from a string. @@ -160,6 +174,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[pyarrow_numpy]": + return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -176,12 +192,17 @@ def construct_array_type( # type: ignore[override] ------- type """ - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, + ) if self.storage == "python": return StringArray - else: + elif self.storage == "pyarrow": return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -193,6 +214,10 @@ def __from_arrow__( from pandas.core.arrays.string_arrow import ArrowStringArray return ArrowStringArray(array) + elif self.storage == "pyarrow_numpy": + from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + + return ArrowStringArrayNumpySemantics(array) else: import pyarrow @@ -206,7 +231,7 @@ def __from_arrow__( arr = np.array([], dtype=object) else: arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) - arr = lib.convert_nans_to_NA(arr) + arr = ensure_string_array(arr, na_value=libmissing.NA) # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 76f0d0699b272..6262055827428 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import re from typing import ( TYPE_CHECKING, @@ -14,7 +15,10 @@ lib, missing as libmissing, ) -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + pa_version_under7p0, + pa_version_under13p0, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -27,6 +31,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype @@ -45,12 +50,16 @@ if TYPE_CHECKING: + from collections.abc import Sequence + from pandas._typing import ( Dtype, Scalar, npt, ) + from pandas import Series + ArrowStringScalarOrNAT = Union[str, libmissing.NAType] @@ -113,10 +122,11 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] + _storage = "pyarrow" def __init__(self, values) -> None: super().__init__(values) - self._dtype = StringDtype(storage="pyarrow") + self._dtype = StringDtype(storage=self._storage) if not pa.types.is_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -144,7 +154,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if dtype and not (isinstance(dtype, str) 
and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ) if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -178,6 +191,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) + @classmethod + def _result_converter(cls, values, na=None): + return BooleanDtype().__from_arrow__(values) + def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" if is_scalar(value): @@ -313,7 +330,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -322,19 +339,13 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_replace( self, @@ -353,6 +364,12 @@ def _str_replace( result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) return type(self)(result) + def _str_repeat(self, repeats: int | Sequence[int]): + if not isinstance(repeats, int): + return super()._str_repeat(repeats) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None ): @@ -367,45 +384,58 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if stop is None: + return super()._str_slice(start, stop, step) + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isspace(self): 
result = pc.utf8_is_space(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_len(self): result = pc.utf8_length(self._pa_array) - return Int64Dtype().__from_arrow__(result) + return self._convert_int_dtype(result) def _str_lower(self): return type(self)(pc.utf8_lower(self._pa_array)) @@ -433,3 +463,153 @@ def _str_rstrip(self, to_strip=None): else: result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + return super()._str_removeprefix(prefix) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat) + return self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result) + + def _convert_int_dtype(self, result): + return Int64Dtype().__from_arrow__(result) + + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _storage = "pyarrow_numpy" + + def __init__(self, values) -> None: + _chk_pyarrow_available() + + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( + values.type + ): + values = pc.cast(values, pa.string()) + super().__init__(values) + + @classmethod + def _result_converter(cls, values, na=None): + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + + def __getattribute__(self, item): + # ArrowStringArray and we both inherit from ArrowExtensionArray, which + # creates inheritance problems (Diamond inheritance) + if item in ArrowStringArrayMixin.__dict__ and item not in ( + "_pa_array", + "__dict__", + ): + return partial(getattr(ArrowStringArrayMixin, item), self) + return super().__getattribute__(item) + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + if dtype is None: + dtype = self.dtype + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + 
mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), # type: ignore[arg-type] + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _convert_int_dtype(self, result): + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + return result.to_numpy(np.bool_, na_value=False) + + def value_counts(self, dropna: bool = True) -> Series: + from pandas import Series + + result = super().value_counts(dropna) + return Series( + result._values.to_numpy(), index=result.index, name=result.name, copy=False + ) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + arr = pc.and_kleene( + pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "") + ) + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a81609e1bb618..b7b81b8271106 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -854,7 +854,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='S')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='s')) >>> ser 0 0 days 00:00:01 1 0 days 00:00:02 @@ -868,7 +868,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: For TimedeltaIndex: - >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='S') + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='s') >>> tdelta_idx TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03'], dtype='timedelta64[ns]', freq=None) @@ -888,7 +888,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='U')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='us')) >>> ser 0 0 days 00:00:00.000001 1 0 days 00:00:00.000002 @@ -902,7 +902,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: For TimedeltaIndex: - >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='U') + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='us') >>> tdelta_idx TimedeltaIndex(['0 days 00:00:00.000001', '0 days 00:00:00.000002', '0 days 00:00:00.000003'], @@ -923,7 +923,7 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='N')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='ns')) >>> ser 0 0 days 00:00:00.000000001 1 0 days 00:00:00.000000002 @@ -937,7 +937,7 @@ def 
to_pytimedelta(self) -> npt.NDArray[np.object_]: For TimedeltaIndex: - >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='N') + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='ns') >>> tdelta_idx TimedeltaIndex(['0 days 00:00:00.000000001', '0 days 00:00:00.000000002', '0 days 00:00:00.000000003'], diff --git a/pandas/core/base.py b/pandas/core/base.py index d973f8f5fe35a..3026189e747bb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -485,8 +485,8 @@ def array(self) -> ExtensionArray: types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. - ``.array`` differs ``.values`` which may require converting the - data to a different form. + ``.array`` differs from ``.values``, which may require converting + the data to a different form. See Also -------- diff --git a/pandas/core/common.py b/pandas/core/common.py index 1679e01ff9fe1..8fd8b10c6fc32 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -138,7 +138,7 @@ def is_bool_indexer(key: Any) -> bool: elif isinstance(key, list): # check if np.array(key).dtype would be bool if len(key) > 0: - if type(key) is not list: + if type(key) is not list: # noqa: E721 # GH#42461 cython will raise TypeError if we pass a subclass key = list(key) return lib.is_bool_list(key) diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 2f94856702465..4770f403b1bdb 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -12,6 +12,7 @@ import tokenize from typing import ( Callable, + ClassVar, TypeVar, ) @@ -349,8 +350,8 @@ class BaseExprVisitor(ast.NodeVisitor): preparser : callable """ - const_type: type[Term] = Constant - term_type = Term + const_type: ClassVar[type[Term]] = Constant + term_type: ClassVar[type[Term]] = Term binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS binary_op_nodes = ( @@ -540,7 +541,7 @@ def visit_UnaryOp(self, node, **kwargs): operand = self.visit(node.operand) return op(operand) - def visit_Name(self, node, **kwargs): + def visit_Name(self, node, **kwargs) -> Term: return self.term_type(node.id, self.env, **kwargs) # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min @@ -555,11 +556,11 @@ def visit_Constant(self, node, **kwargs) -> Term: return self.const_type(node.value, self.env) # TODO(py314): deprecated since Python 3.8. 
Remove after Python 3.14 is min - def visit_Str(self, node, **kwargs): + def visit_Str(self, node, **kwargs) -> Term: name = self.env.add_tmp(node.s) return self.term_type(name, self.env) - def visit_List(self, node, **kwargs): + def visit_List(self, node, **kwargs) -> Term: name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) return self.term_type(name, self.env) @@ -569,7 +570,7 @@ def visit_Index(self, node, **kwargs): """df.index[4]""" return self.visit(node.value) - def visit_Subscript(self, node, **kwargs): + def visit_Subscript(self, node, **kwargs) -> Term: from pandas import eval as pd_eval value = self.visit(node.value) @@ -589,7 +590,7 @@ def visit_Subscript(self, node, **kwargs): name = self.env.add_tmp(v) return self.term_type(name, env=self.env) - def visit_Slice(self, node, **kwargs): + def visit_Slice(self, node, **kwargs) -> slice: """df.index[slice(4,6)]""" lower = node.lower if lower is not None: diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 852bfae1cc79a..95ac20ba39edc 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -617,5 +617,5 @@ def __init__(self, name: str) -> None: self.name = name self.func = getattr(np, name) - def __call__(self, *args): + def __call__(self, *args) -> MathCall: return MathCall(self, args) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 77d8d79506258..138a3ee42f686 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -10,6 +10,7 @@ from typing import ( TYPE_CHECKING, Any, + ClassVar, ) import numpy as np @@ -40,7 +41,10 @@ ) if TYPE_CHECKING: - from pandas._typing import npt + from pandas._typing import ( + Self, + npt, + ) class PyTablesScope(_scope.Scope): @@ -283,7 +287,7 @@ def __repr__(self) -> str: return "Filter: Not Initialized" return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]") - def invert(self): + def invert(self) -> Self: """invert the filter""" if self.filter is not None: self.filter = ( @@ -297,7 +301,8 @@ def format(self): """return the actual filter format""" return [self.filter] - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self | None: # type: ignore[override] if not self.is_valid: raise ValueError(f"query term is not valid [{self}]") @@ -336,7 +341,8 @@ class JointFilterBinOp(FilterBinOp): def format(self): raise NotImplementedError("unable to collapse Joint Filters") - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self: # type: ignore[override] return self @@ -357,7 +363,8 @@ def format(self): """return the actual ne format""" return self.condition - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self | None: # type: ignore[override] if not self.is_valid: raise ValueError(f"query term is not valid [{self}]") @@ -385,7 +392,8 @@ def evaluate(self): class JointConditionBinOp(ConditionBinOp): - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self: # type: ignore[override] self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})" return self @@ -410,8 +418,8 @@ def prune(self, klass): class PyTablesExprVisitor(BaseExprVisitor): - const_type = Constant - term_type = Term + const_type: ClassVar[type[ops.Term]] = Constant + term_type: 
ClassVar[type[Term]] = Term def __init__(self, env, engine, parser, **kwargs) -> None: super().__init__(env, engine, parser) @@ -423,13 +431,15 @@ def __init__(self, env, engine, parser, **kwargs) -> None: lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs), ) - def visit_UnaryOp(self, node, **kwargs): + def visit_UnaryOp(self, node, **kwargs) -> ops.Term | UnaryOp | None: if isinstance(node.op, (ast.Not, ast.Invert)): return UnaryOp("~", self.visit(node.operand)) elif isinstance(node.op, ast.USub): return self.const_type(-self.visit(node.operand).value, self.env) elif isinstance(node.op, ast.UAdd): raise NotImplementedError("Unary addition not supported") + # TODO: return None might never be reached + return None def visit_Index(self, node, **kwargs): return self.visit(node.value).value @@ -440,7 +450,7 @@ def visit_Assign(self, node, **kwargs): ) return self.visit(cmpr) - def visit_Subscript(self, node, **kwargs): + def visit_Subscript(self, node, **kwargs) -> ops.Term: # only allow simple subscripts value = self.visit(node.value) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..4652acdcae287 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -265,7 +265,7 @@ def use_numba_cb(key) -> None: """ pc_max_info_rows_doc = """ -: int or None +: int df.info() will usually show null-counts for each column. For large frames this can be quite slow. max_info_rows and max_info_cols limit this null check only to frames with smaller dimensions than @@ -322,7 +322,7 @@ def is_terminal() -> bool: "max_info_rows", 1690785, pc_max_info_rows_doc, - validator=is_instance_factory((int, type(None))), + validator=is_int, ) cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int) cf.register_option( @@ -420,6 +420,7 @@ def is_terminal() -> bool: def use_inf_as_na_cb(key) -> None: + # TODO(3.0): enforcing this deprecation will close GH#52501 from pandas.core.dtypes.missing import _use_inf_as_na _use_inf_as_na(key) @@ -453,6 +454,13 @@ def use_inf_as_na_cb(key) -> None: validator=is_one_of_factory(["block", "array"]), ) +cf.deprecate_option( + # GH#55043 + "mode.data_manager", + "data_manager option is deprecated and will be removed in a future " + "version. Only the BlockManager will be available.", +) + # TODO better name? copy_on_write_doc = """ @@ -492,7 +500,8 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc = """ : string - The default storage for StringDtype. + The default storage for StringDtype. This option is ignored if + ``future.infer_string`` is set to True. """ with cf.config_prefix("mode"): @@ -500,7 +509,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) @@ -511,11 +520,11 @@ def use_inf_as_na_cb(key) -> None: auto, {others}. 
""" -_xls_options = ["xlrd"] -_xlsm_options = ["xlrd", "openpyxl"] -_xlsx_options = ["xlrd", "openpyxl"] -_ods_options = ["odf"] -_xlsb_options = ["pyxlsb"] +_xls_options = ["xlrd", "calamine"] +_xlsm_options = ["xlrd", "openpyxl", "calamine"] +_xlsx_options = ["xlrd", "openpyxl", "calamine"] +_ods_options = ["odf", "calamine"] +_xlsb_options = ["pyxlsb", "calamine"] with cf.config_prefix("io.excel.xls"): @@ -900,3 +909,14 @@ def register_converter_cb(key) -> None: "(at which point this option will be deprecated).", validator=is_one_of_factory([True, False]), ) + + cf.register_option( + "no_silent_downcasting", + False, + "Whether to opt-in to the future behavior which will *not* silently " + "downcast results from Series and DataFrame `where`, `mask`, and `clip` " + "methods. " + "Silent downcasting will be removed in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False]), + ) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5757c69bb6ec7..aaac0dc73486f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -19,6 +19,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas._libs.tslibs import ( Period, @@ -589,6 +591,11 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype(storage="pyarrow_numpy") + subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) if subarr is data and copy: subarr = subarr.copy() diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index a055afe6ec0ae..6567ca7155b0d 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -418,6 +418,33 @@ def index_class(self) -> type_t[Index]: return Index + @property + def _supports_2d(self) -> bool: + """ + Do ExtensionArrays with this dtype support 2D arrays? + + Historically ExtensionArrays were limited to 1D. By returning True here, + authors can indicate that their arrays support 2D instances. This can + improve performance in some cases, particularly operations with `axis=1`. + + Arrays that support 2D values should: + + - implement Array.reshape + - subclass the Dim2CompatTests in tests.extension.base + - _concat_same_type should support `axis` keyword + - _reduce and reductions should support `axis` keyword + """ + return False + + @property + def _can_fast_transpose(self) -> bool: + """ + Is transposing an array with this dtype zero-copy? + + Only relevant for cases where _supports_2d is True. 
+ """ + return False + class StorageExtensionDtype(ExtensionDtype): """ExtensionDtype that may be backed by more than one implementation.""" diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 657cbce40087a..74e785be06356 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -68,6 +68,7 @@ PeriodDtype, ) from pandas.core.dtypes.generic import ( + ABCExtensionArray, ABCIndex, ABCSeries, ) @@ -799,10 +800,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: dtype = _dtype_obj if using_pyarrow_string_dtype(): - import pyarrow as pa + from pandas.core.arrays.string_ import StringDtype - pa_dtype = pa.string() - dtype = ArrowDtype(pa_dtype) + dtype = StringDtype(storage="pyarrow_numpy") elif isinstance(val, (np.datetime64, dt.datetime)): try: @@ -1707,8 +1707,6 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: arr._validate_setitem_value(element) return True except (ValueError, TypeError): - # TODO: re-use _catch_deprecated_value_error to ensure we are - # strict about what exceptions we allow through here. return False # This is technically incorrect, but maintains the behavior of @@ -1775,6 +1773,23 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: return casted raise LossySetitemError + elif isinstance(element, ABCExtensionArray) and isinstance( + element.dtype, CategoricalDtype + ): + # GH#52927 setting Categorical value into non-EA frame + # TODO: general-case for EAs? + try: + casted = element.astype(dtype) + except (ValueError, TypeError): + raise LossySetitemError + # Check for cases of either + # a) lossy overflow/rounding or + # b) semantic changes like dt64->int64 + comp = casted == element + if not comp.all(): + raise LossySetitemError + return casted + # Anything other than integer we cannot hold raise LossySetitemError if ( diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c2e498e75b7d3..9da4eac6a42c8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -498,7 +498,7 @@ def is_categorical_dtype(arr_or_dtype) -> bool: # GH#52527 warnings.warn( "is_categorical_dtype is deprecated and will be removed in a future " - "version. Use isinstance(dtype, CategoricalDtype) instead", + "version. Use isinstance(dtype, pd.CategoricalDtype) instead", FutureWarning, stacklevel=find_stack_level(), ) @@ -1256,13 +1256,7 @@ def is_1d_only_ea_dtype(dtype: DtypeObj | None) -> bool: """ Analogue to is_extension_array_dtype but excluding DatetimeTZDtype. """ - # Note: if other EA dtypes are ever held in HybridBlock, exclude those - # here too. - # NB: need to check DatetimeTZDtype and not is_datetime64tz_dtype - # to exclude ArrowTimestampUSDtype - return isinstance(dtype, ExtensionDtype) and not isinstance( - dtype, (DatetimeTZDtype, PeriodDtype) - ) + return isinstance(dtype, ExtensionDtype) and not dtype._supports_2d def is_extension_array_dtype(arr_or_dtype) -> bool: @@ -1614,6 +1608,15 @@ def pandas_dtype(dtype) -> DtypeObj: # registered extension types result = registry.find(dtype) if result is not None: + if isinstance(result, type): + # GH 31356, GH 54592 + warnings.warn( + f"Instantiating {result.__name__} without any arguments." 
+ f"Pass a {result.__name__} instance to silence this warning.", + UserWarning, + stacklevel=find_stack_level(), + ) + result = result() return result # try a numpy dtype diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a9618963e0a51..beff71d5e9dd9 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -61,6 +61,8 @@ is_list_like, ) +from pandas.util import capitalize_first_letter + if not pa_version_under7p0: import pyarrow as pa @@ -68,13 +70,14 @@ from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: F811, TCH004 + import pyarrow as pa # noqa: TCH004 from pandas._typing import ( Dtype, DtypeObj, IntervalClosedType, Ordered, + Self, npt, type_t, ) @@ -210,6 +213,8 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): base = np.dtype("O") _metadata = ("categories", "ordered") _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _supports_2d = False + _can_fast_transpose = False def __init__(self, categories=None, ordered: Ordered = False) -> None: self._finalize(categories, ordered, fastpath=False) @@ -727,6 +732,8 @@ class DatetimeTZDtype(PandasExtensionDtype): _metadata = ("unit", "tz") _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _supports_2d = True + _can_fast_transpose = True @property def na_value(self) -> NaTType: @@ -970,8 +977,10 @@ class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype): _cache_dtypes: dict[BaseOffset, int] = {} # type: ignore[assignment] __hash__ = PeriodDtypeBase.__hash__ _freq: BaseOffset + _supports_2d = True + _can_fast_transpose = True - def __new__(cls, freq): + def __new__(cls, freq) -> PeriodDtype: # noqa: PYI034 """ Parameters ---------- @@ -985,6 +994,7 @@ def __new__(cls, freq): if isinstance(freq, BDay): # GH#53446 + # TODO(3.0): enforcing this will close GH#10575 warnings.warn( "PeriodDtype[B] is deprecated and will be removed in a future " "version. Use a DatetimeIndex with freq='B' instead", @@ -1001,11 +1011,11 @@ def __new__(cls, freq): u._freq = freq return u - def __reduce__(self): + def __reduce__(self) -> tuple[type_t[Self], tuple[str_type]]: return type(self), (self.name,) @property - def freq(self): + def freq(self) -> BaseOffset: """ The frequency object of this PeriodDtype. @@ -1025,7 +1035,7 @@ def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset: if m is not None: freq = m.group("freq") - freq_offset = to_offset(freq) + freq_offset = to_offset(freq, is_period=True) if freq_offset is not None: return freq_offset @@ -1070,7 +1080,7 @@ def na_value(self) -> NaTType: def __eq__(self, other: object) -> bool: if isinstance(other, str): - return other in [self.name, self.name.title()] + return other in [self.name, capitalize_first_letter(self.name)] return super().__eq__(other) @@ -1431,6 +1441,8 @@ class NumpyEADtype(ExtensionDtype): """ _metadata = ("_dtype",) + _supports_2d = False + _can_fast_transpose = False def __init__(self, dtype: npt.DTypeLike | NumpyEADtype | None) -> None: if isinstance(dtype, NumpyEADtype): @@ -1516,7 +1528,6 @@ class BaseMaskedDtype(ExtensionDtype): Base class for dtypes for BaseMaskedArray subclasses. """ - name: str base = None type: type @@ -1727,7 +1738,7 @@ def fill_value(self): """ return self._fill_value - def _check_fill_value(self): + def _check_fill_value(self) -> None: if not lib.is_scalar(self._fill_value): raise ValueError( f"fill_value must be a scalar. 
Got {self._fill_value} instead" @@ -2137,6 +2148,8 @@ def type(self): return CategoricalDtypeType elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): return list + elif pa.types.is_fixed_size_list(pa_type): + return list elif pa.types.is_map(pa_type): return list elif pa.types.is_struct(pa_type): diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 9c04e57be36fc..f551716772f61 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -401,7 +401,7 @@ def is_sequence(obj) -> bool: return False -def is_dataclass(item): +def is_dataclass(item) -> bool: """ Checks if the object is a data-class instance diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index de99f828d604f..7117e34b23ca4 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -285,7 +285,7 @@ def _isna_array(values: ArrayLike, inf_as_na: bool = False): # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has # type "ndarray[Any, dtype[bool_]]") result = values.isna() # type: ignore[assignment] - elif isinstance(values, np.recarray): + elif isinstance(values, np.rec.recarray): # GH 48526 result = _isna_recarray_dtype(values, inf_as_na=inf_as_na) elif is_string_or_object_np_dtype(values.dtype): @@ -332,7 +332,9 @@ def _has_record_inf_value(record_as_array: np.ndarray) -> np.bool_: return np.any(is_inf_in_record) -def _isna_recarray_dtype(values: np.recarray, inf_as_na: bool) -> npt.NDArray[np.bool_]: +def _isna_recarray_dtype( + values: np.rec.recarray, inf_as_na: bool +) -> npt.NDArray[np.bool_]: result = np.zeros(values.shape, dtype=bool) for i, record in enumerate(values): record_as_array = np.array(record.tolist()) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 25fc5bd6664f5..3e32a6d93b023 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -43,6 +43,7 @@ get_option, using_copy_on_write, ) +from pandas._config.config import _get_option from pandas._libs import ( algos as libalgos, @@ -418,10 +419,10 @@ lkey value_x rkey value_y 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2 with specified left and right suffixes appended to any overlapping columns. @@ -431,10 +432,10 @@ lkey value_left rkey value_right 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2, but raise an exception if the DataFrames have any overlapping columns. @@ -694,7 +695,7 @@ def __init__( NDFrame.__init__(self, data) return - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) # GH47215 if isinstance(index, set): @@ -1926,13 +1927,22 @@ def to_dict( self, orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., into: type[dict] = ..., + index: bool = ..., ) -> dict: ... @overload - def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]: + def to_dict( + self, + orient: Literal["records"], + into: type[dict] = ..., + index: bool = ..., + ) -> list[dict]: ... 
+ @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "orient"], name="to_dict" + ) def to_dict( self, orient: Literal[ @@ -2045,6 +2055,9 @@ def to_dict( return to_dict(self, orient, into, index) + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" + ) def to_gbq( self, destination_table: str, @@ -2399,14 +2412,14 @@ def maybe_reorder( columns = columns.drop(exclude) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) return cls(mgr) def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None - ) -> np.recarray: + ) -> np.rec.recarray: """ Convert DataFrame to a NumPy record array. @@ -2431,7 +2444,7 @@ def to_records( Returns ------- - numpy.recarray + numpy.rec.recarray NumPy ndarray with the DataFrame labels as fields and each row of the DataFrame as entries. @@ -2439,7 +2452,7 @@ def to_records( -------- DataFrame.from_records: Convert structured or record ndarray to DataFrame. - numpy.recarray: An ndarray that allows field access using + numpy.rec.recarray: An ndarray that allows field access using attributes, analogous to typed columns in a spreadsheet. @@ -2600,7 +2613,7 @@ def _from_arrays( if dtype is not None: dtype = pandas_dtype(dtype) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(columns) must match len(arrays)") @@ -2878,6 +2891,9 @@ def to_parquet( ) -> None: ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path"], name="to_parquet" + ) @doc(storage_options=_shared_docs["storage_options"]) def to_parquet( self, @@ -3134,6 +3150,9 @@ def to_html( ) -> str: ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_html" + ) @Substitution( header_type="bool", header="Whether to print column labels, default True", @@ -4897,7 +4916,9 @@ def insert( column : str, number, or hashable object Label of the inserted column. value : Scalar, Series, or array-like + Content of the inserted column. allow_duplicates : bool, optional, default lib.no_default + Allow duplicate column labels to be created. See Also -------- @@ -5624,10 +5645,14 @@ def shift( ) -> DataFrame: if freq is not None and fill_value is not lib.no_default: # GH#53832 - raise ValueError( - "Cannot pass both 'freq' and 'fill_value' to " - f"{type(self).__name__}.shift" + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. 
This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), ) + fill_value = lib.no_default axis = self._get_axis_number(axis) @@ -7267,7 +7292,7 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna).grouper.size() + counts = self.groupby(subset, dropna=dropna, observed=False).grouper.size() counts.name = name if sort: @@ -7936,6 +7961,7 @@ def to_series(right): ) elif isinstance(right, Series): # axis=1 is default for DataFrame-with-Series op + axis = axis if axis is not None else 1 if not flex: if not left.axes[axis].equals(right.index): raise ValueError( @@ -8055,43 +8081,49 @@ def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None): return self._construct_result(new_data) @Appender(ops.make_flex_doc("eq", "dataframe")) - def eq(self, other, axis: Axis = "columns", level=None): + def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.eq, axis=axis, level=level) @Appender(ops.make_flex_doc("ne", "dataframe")) - def ne(self, other, axis: Axis = "columns", level=None): + def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.ne, axis=axis, level=level) @Appender(ops.make_flex_doc("le", "dataframe")) - def le(self, other, axis: Axis = "columns", level=None): + def le(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.le, axis=axis, level=level) @Appender(ops.make_flex_doc("lt", "dataframe")) - def lt(self, other, axis: Axis = "columns", level=None): + def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.lt, axis=axis, level=level) @Appender(ops.make_flex_doc("ge", "dataframe")) - def ge(self, other, axis: Axis = "columns", level=None): + def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.ge, axis=axis, level=level) @Appender(ops.make_flex_doc("gt", "dataframe")) - def gt(self, other, axis: Axis = "columns", level=None): + def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.gt, axis=axis, level=level) @Appender(ops.make_flex_doc("add", "dataframe")) - def add(self, other, axis: Axis = "columns", level=None, fill_value=None): + def add( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.add, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("radd", "dataframe")) - def radd(self, other, axis: Axis = "columns", level=None, fill_value=None): + def radd( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.radd, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("sub", "dataframe")) - def sub(self, other, axis: Axis = "columns", level=None, fill_value=None): + def sub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.sub, level=level, fill_value=fill_value, axis=axis ) @@ -8099,13 +8131,17 @@ def sub(self, other, axis: Axis = "columns", level=None, fill_value=None): subtract = sub @Appender(ops.make_flex_doc("rsub", "dataframe")) - def rsub(self, other, axis: Axis = "columns", level=None, 
fill_value=None): + def rsub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rsub, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("mul", "dataframe")) - def mul(self, other, axis: Axis = "columns", level=None, fill_value=None): + def mul( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.mul, level=level, fill_value=fill_value, axis=axis ) @@ -8113,13 +8149,17 @@ def mul(self, other, axis: Axis = "columns", level=None, fill_value=None): multiply = mul @Appender(ops.make_flex_doc("rmul", "dataframe")) - def rmul(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rmul( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rmul, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("truediv", "dataframe")) - def truediv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def truediv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.truediv, level=level, fill_value=fill_value, axis=axis ) @@ -8128,7 +8168,9 @@ def truediv(self, other, axis: Axis = "columns", level=None, fill_value=None): divide = truediv @Appender(ops.make_flex_doc("rtruediv", "dataframe")) - def rtruediv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rtruediv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis ) @@ -8136,37 +8178,49 @@ def rtruediv(self, other, axis: Axis = "columns", level=None, fill_value=None): rdiv = rtruediv @Appender(ops.make_flex_doc("floordiv", "dataframe")) - def floordiv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def floordiv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.floordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rfloordiv", "dataframe")) - def rfloordiv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rfloordiv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("mod", "dataframe")) - def mod(self, other, axis: Axis = "columns", level=None, fill_value=None): + def mod( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.mod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rmod", "dataframe")) - def rmod(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rmod( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rmod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("pow", "dataframe")) - def pow(self, other, axis: Axis = "columns", level=None, fill_value=None): + def pow( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.pow, level=level, 
fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rpow", "dataframe")) - def rpow(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rpow( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rpow, level=level, fill_value=fill_value, axis=axis ) @@ -8818,20 +8872,20 @@ def update( >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df.groupby("Animal", group_keys=True).apply(lambda x: x) - Animal Max Speed + >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) + Max Speed Animal - Falcon 0 Falcon 380.0 - 1 Falcon 370.0 - Parrot 2 Parrot 24.0 - 3 Parrot 26.0 - - >>> df.groupby("Animal", group_keys=False).apply(lambda x: x) - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 + Falcon 0 380.0 + 1 370.0 + Parrot 2 24.0 + 3 26.0 + + >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) + Max Speed + 0 380.0 + 1 370.0 + 2 24.0 + 3 26.0 """ ) ) @@ -9778,12 +9832,12 @@ def _gotitem( -------- DataFrame.apply : Perform any type of operations. DataFrame.transform : Perform transformation type operations. - core.groupby.GroupBy : Perform operations over groups. - core.resample.Resampler : Perform operations over resampled bins. - core.window.Rolling : Perform operations over rolling window. - core.window.Expanding : Perform operations over expanding window. - core.window.ExponentialMovingWindow : Perform operation over exponential weighted - window. + pandas.DataFrame.groupby : Perform operations over groups. + pandas.DataFrame.resample : Perform operations over resampled bins. + pandas.DataFrame.rolling : Perform operations over rolling window. + pandas.DataFrame.expanding : Perform operations over expanding window. + pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential + weighted window. """ ) @@ -9874,6 +9928,8 @@ def apply( result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): """ @@ -9933,6 +9989,35 @@ def apply( If False, the funcs will be passed the whole Series at once. .. versionadded:: 2.1.0 + + engine : {'python', 'numba'}, default 'python' + Choose between the python (default) engine or the numba engine in apply. + + The numba engine will attempt to JIT compile the passed function, + which may result in speedups for large DataFrames. + It also supports the following engine_kwargs : + + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) + + Note: The numba compiler only supports a subset of + valid Python/numpy operations. + + Please read more about the `supported python features + `_ + and `supported numpy features + `_ + in numba to learn what you can or cannot use in the passed function. + + As of right now, the numba engine can only be used with raw=True. + + .. versionadded:: 2.2.0 + + engine_kwargs : dict + Pass keyword arguments to the engine. + This is currently only used by the numba engine, + see the documentation for the engine argument for more information. **kwargs Additional keyword arguments to pass as keywords arguments to `func`. 
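# Sketch of the new `engine` / `engine_kwargs` keywords documented above.
# Assumes numba is installed; per the added docstring the numba engine
# currently requires raw=True, so the function receives NumPy arrays.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.default_rng(0).standard_normal((1_000, 4)))

def row_range(values):  # `values` is a 1-D ndarray because raw=True
    return values.max() - values.min()

out = df.apply(row_range, axis=1, raw=True, engine="numba",
               engine_kwargs={"nopython": True})
print(out.head())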
@@ -10033,6 +10118,8 @@ def apply( raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -11252,7 +11339,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: def any( # type: ignore[override] self, *, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11267,7 +11354,7 @@ def any( # type: ignore[override] @doc(make_doc("all", ndim=2)) def all( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11666,6 +11753,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series: ... @@ -11676,6 +11764,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... @@ -11686,6 +11775,7 @@ def quantile( axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... @@ -11785,11 +11875,10 @@ def quantile( if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here - # error: List item 0 has incompatible type "Union[float, Union[Union[ - # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]"; - # expected "float" - res_df = self.quantile( # type: ignore[call-overload] - [q], + # error: List item 0 has incompatible type "float | ExtensionArray | + # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float" + res_df = self.quantile( + [q], # type: ignore[list-item] axis=axis, numeric_only=numeric_only, interpolation=interpolation, @@ -12202,8 +12291,7 @@ def _to_dict_of_blocks(self, copy: bool = True): """ mgr = self._mgr # convert to BlockManager if needed -> this way support ArrayManager as well - mgr = mgr_to_mgr(mgr, "block") - mgr = cast(BlockManager, mgr) + mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) for k, v, in mgr.to_dict(copy=copy).items() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7da41b890598d..c75c6c546aad0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -39,6 +39,7 @@ Timestamp, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import ( AlignJoin, AnyArrayLike, @@ -255,8 +256,6 @@ class NDFrame(PandasObject, indexing.IndexingMixin): "_is_copy", "_name", "_metadata", - "__array_struct__", - "__array_interface__", "_flags", ] _internal_names_set: set[str] = set(_internal_names) @@ -1528,7 +1527,8 @@ def bool(self) -> bool_t: .. deprecated:: 2.1.0 - bool is deprecated and will be removed in future version of pandas + bool is deprecated and will be removed in future version of pandas. + For ``Series`` use ``pandas.Series.item``. This must be a boolean scalar value, either True or False. 
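# Sketch of the `method` keyword now spelled out in the quantile overloads
# above: "single" (default) treats each column independently, while "table"
# ranks whole rows and is used with the "lower"/"higher"/"nearest"
# interpolations.  Values are illustrative.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 40, 20, 30]})

print(df.quantile(0.5))                                         # per column
print(df.quantile(0.5, method="table", interpolation="lower"))  # whole rows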
It will raise a ValueError if the Series or DataFrame does not have exactly 1 element, or that @@ -1558,6 +1558,14 @@ def bool(self) -> bool_t: True >>> pd.DataFrame({'col': [False]}).bool() # doctest: +SKIP False + + This is an alternative method and will only work + for single element objects with a boolean value: + + >>> pd.Series([True]).item() # doctest: +SKIP + True + >>> pd.Series([False]).item() # doctest: +SKIP + False """ warnings.warn( @@ -2189,6 +2197,9 @@ def _repr_data_resource_(self): # I/O Methods @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "excel_writer"], name="to_excel" + ) @doc( klass="object", storage_options=_shared_docs["storage_options"], @@ -2800,7 +2811,7 @@ def to_hdf( @final @deprecate_nonkeyword_arguments( - version="3.0", allowed_args=["self", "name"], name="to_sql" + version="3.0", allowed_args=["self", "name", "con"], name="to_sql" ) def to_sql( self, @@ -2846,7 +2857,7 @@ def to_sql( index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column - name in the table. + name in the table. Creates a table index for this column. index_label : str or sequence, default None Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. @@ -3098,6 +3109,9 @@ def to_pickle( ) @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="to_clipboard" + ) def to_clipboard( self, excel: bool_t = True, sep: str | None = None, **kwargs ) -> None: @@ -3728,6 +3742,9 @@ def to_csv( ... @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv" + ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buf", @@ -5711,10 +5728,12 @@ def filter( if items is not None: name = self._get_axis_name(axis) + items = Index(items).intersection(labels) + if len(items) == 0: + # Keep the dtype of labels when we are empty + items = items.astype(labels.dtype) # error: Keywords must be strings - return self.reindex( # type: ignore[misc] - **{name: labels.intersection(items)} - ) + return self.reindex(**{name: items}) # type: ignore[misc] elif like: def f(x) -> bool_t: @@ -6970,6 +6989,9 @@ def _pad_or_backfill( method = clean_fill_method(method) if not self._mgr.is_single_block and axis == 1: + # e.g. test_align_fill_method + # TODO(3.0): once downcast is removed, we can do the .T + # in all axis=1 cases, and remove axis kward from mgr.pad_or_backfill. if inplace: raise NotImplementedError() result = self.T._pad_or_backfill(method=method, limit=limit).T @@ -7089,6 +7111,8 @@ def fillna( See Also -------- + ffill : Fill values by propagating the last valid observation to next valid. + bfill : Fill values by using the next valid observation to fill the gap. interpolate : Fill NaN values using interpolation. reindex : Conform object to new index. asfreq : Convert TimeSeries to specified frequency. @@ -7348,7 +7372,10 @@ def ffill( ... @final - @doc(klass=_shared_doc_kwargs["klass"]) + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) def ffill( self, *, @@ -7358,7 +7385,28 @@ def ffill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + Fill NA/NaN values by propagating the last valid observation to next valid. 
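# The hunks above deprecate extra positional arguments for to_excel, to_sql,
# to_clipboard and to_csv beyond those whitelisted in `allowed_args`.  A sketch
# of the spellings that remain valid; the file name and connection are made up.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

df.to_csv("out.csv", sep=";", index=False)   # path_or_buf may stay positional
# df.to_sql("my_table", conn, if_exists="replace")   # name/con positional, rest by keyword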
+ + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). Returns ------- @@ -7427,7 +7475,7 @@ def pad( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. + Fill NA/NaN values by propagating the last valid observation to next valid. .. deprecated:: 2.0 @@ -7484,7 +7532,10 @@ def bfill( ... @final - @doc(klass=_shared_doc_kwargs["klass"]) + @doc( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) def bfill( self, *, @@ -7494,7 +7545,28 @@ def bfill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + Fill NA/NaN values by using the next valid observation to fill the gap. + + Parameters + ---------- + axis : {axes_single_arg} + Axis along which to fill missing values. For `Series` + this parameter is unused and defaults to 0. + inplace : bool, default False + If True, fill in-place. Note: this will modify any + other views on this object (e.g., a no-copy slice for a column in a + DataFrame). + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. Must be greater than 0 if not None. + downcast : dict, default is None + A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible). Returns ------- @@ -7573,7 +7645,7 @@ def backfill( downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ - Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. + Fill NA/NaN values by using the next valid observation to fill the gap. .. deprecated:: 2.0 @@ -7878,6 +7950,51 @@ def replace( else: return result.__finalize__(self, method="replace") + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[False] = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self: + ... 
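# Quick sketch of the behaviour described by the rewritten ffill/bfill
# docstrings above: ffill carries the last valid value forward, bfill pulls
# the next valid value backward, and `limit` caps how much of a gap is filled.
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0])

print(s.ffill())           # 1.0, 1.0, 1.0, 4.0
print(s.ffill(limit=1))    # 1.0, 1.0, NaN, 4.0
print(s.bfill(limit=1))    # 1.0, NaN, 4.0, 4.0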
+ + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[True], + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> None: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: bool_t = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self | None: + ... + @final def interpolate( self, @@ -8120,10 +8237,11 @@ def interpolate( stacklevel=find_stack_level(), ) - if "fill_value" in kwargs: + if method in fillna_methods and "fill_value" in kwargs: raise ValueError( "'fill_value' is not a valid keyword for " - f"{type(self).__name__}.interpolate" + f"{type(self).__name__}.interpolate with method from " + f"{fillna_methods}" ) if isinstance(obj.index, MultiIndex) and method != "linear": @@ -8215,8 +8333,6 @@ def asof(self, where, subset=None): * DataFrame : when `self` is a DataFrame and `where` is an array-like - Return scalar, Series, or DataFrame. - See Also -------- merge_asof : Perform an asof merge. Similar to left join. @@ -8547,6 +8663,42 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # GH 40420 return self.where(subset, threshold, axis=axis, inplace=inplace) + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[False] = ..., + **kwargs, + ) -> Self: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: bool_t = ..., + **kwargs, + ) -> Self | None: + ... + @final def clip( self, @@ -8784,7 +8936,7 @@ def asfreq( -------- Start by creating a series with 4 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=4, freq='T') + >>> index = pd.date_range('1/1/2000', periods=4, freq='min') >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index) >>> df = pd.DataFrame({{'s': series}}) >>> df @@ -8796,7 +8948,7 @@ def asfreq( Upsample the series into 30 second bins. - >>> df.asfreq(freq='30S') + >>> df.asfreq(freq='30s') s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -8808,7 +8960,7 @@ def asfreq( Upsample again, providing a ``fill value``. - >>> df.asfreq(freq='30S', fill_value=9.0) + >>> df.asfreq(freq='30s', fill_value=9.0) s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 9.0 @@ -8820,7 +8972,7 @@ def asfreq( Upsample again, providing a ``method``. - >>> df.asfreq(freq='30S', method='bfill') + >>> df.asfreq(freq='30s', method='bfill') s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -9020,11 +9172,11 @@ def resample( Use frame.T.resample(...) instead. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' - for all frequency offsets except for 'M', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. 
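# The interpolate overloads added above encode that inplace=True returns None
# while the default returns a new, filled object.  Minimal sketch:
import numpy as np
import pandas as pd

s = pd.Series([0.0, np.nan, 2.0])

filled = s.interpolate()            # new Series: 0.0, 1.0, 2.0
ret = s.interpolate(inplace=True)   # fills `s` itself and returns None
assert ret is None and s.equals(filled)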
The default is 'left' - for all frequency offsets except for 'M', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. convention : {{'start', 'end', 's', 'e'}}, default 'start' For `PeriodIndex` only, controls whether to use the start or @@ -9096,7 +9248,7 @@ def resample( -------- Start by creating a series with 9 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=9, freq='T') + >>> index = pd.date_range('1/1/2000', periods=9, freq='min') >>> series = pd.Series(range(9), index=index) >>> series 2000-01-01 00:00:00 0 @@ -9108,16 +9260,16 @@ def resample( 2000-01-01 00:06:00 6 2000-01-01 00:07:00 7 2000-01-01 00:08:00 8 - Freq: T, dtype: int64 + Freq: min, dtype: int64 Downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> series.resample('3T').sum() + >>> series.resample('3min').sum() 2000-01-01 00:00:00 3 2000-01-01 00:03:00 12 2000-01-01 00:06:00 21 - Freq: 3T, dtype: int64 + Freq: 3min, dtype: int64 Downsample the series into 3 minute bins as above, but label each bin using the right edge instead of the left. Please note that the @@ -9129,64 +9281,64 @@ def resample( To include this value close the right side of the bin interval as illustrated in the example below this one. - >>> series.resample('3T', label='right').sum() + >>> series.resample('3min', label='right').sum() 2000-01-01 00:03:00 3 2000-01-01 00:06:00 12 2000-01-01 00:09:00 21 - Freq: 3T, dtype: int64 + Freq: 3min, dtype: int64 Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> series.resample('3T', label='right', closed='right').sum() + >>> series.resample('3min', label='right', closed='right').sum() 2000-01-01 00:00:00 0 2000-01-01 00:03:00 6 2000-01-01 00:06:00 15 2000-01-01 00:09:00 15 - Freq: 3T, dtype: int64 + Freq: 3min, dtype: int64 Upsample the series into 30 second bins. - >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows + >>> series.resample('30s').asfreq()[0:5] # Select first 5 rows 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN 2000-01-01 00:01:00 1.0 2000-01-01 00:01:30 NaN 2000-01-01 00:02:00 2.0 - Freq: 30S, dtype: float64 + Freq: 30s, dtype: float64 Upsample the series into 30 second bins and fill the ``NaN`` values using the ``ffill`` method. - >>> series.resample('30S').ffill()[0:5] + >>> series.resample('30s').ffill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 0 2000-01-01 00:01:00 1 2000-01-01 00:01:30 1 2000-01-01 00:02:00 2 - Freq: 30S, dtype: int64 + Freq: 30s, dtype: int64 Upsample the series into 30 second bins and fill the ``NaN`` values using the ``bfill`` method. - >>> series.resample('30S').bfill()[0:5] + >>> series.resample('30s').bfill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 1 2000-01-01 00:01:00 1 2000-01-01 00:01:30 2 2000-01-01 00:02:00 2 - Freq: 30S, dtype: int64 + Freq: 30s, dtype: int64 Pass a custom function via ``apply`` >>> def custom_resampler(arraylike): ... return np.sum(arraylike) + 5 ... - >>> series.resample('3T').apply(custom_resampler) + >>> series.resample('3min').apply(custom_resampler) 2000-01-01 00:00:00 8 2000-01-01 00:03:00 17 2000-01-01 00:06:00 26 - Freq: 3T, dtype: int64 + Freq: 3min, dtype: int64 For a Series with a PeriodIndex, the keyword `convention` can be used to control whether to use the start or end of `rule`. 
@@ -9256,7 +9408,7 @@ def resample( 5 18 100 2018-02-11 6 17 40 2018-02-18 7 19 50 2018-02-25 - >>> df.resample('M', on='week_starting').mean() + >>> df.resample('ME', on='week_starting').mean() price volume week_starting 2018-01-31 10.75 62.5 @@ -9306,7 +9458,7 @@ def resample( 2000-10-02 00:12:00 18 2000-10-02 00:19:00 21 2000-10-02 00:26:00 24 - Freq: 7T, dtype: int64 + Freq: 7min, dtype: int64 >>> ts.resample('17min').sum() 2000-10-01 23:14:00 0 @@ -9314,7 +9466,7 @@ def resample( 2000-10-01 23:48:00 21 2000-10-02 00:05:00 54 2000-10-02 00:22:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.resample('17min', origin='epoch').sum() 2000-10-01 23:18:00 0 @@ -9322,7 +9474,7 @@ def resample( 2000-10-01 23:52:00 27 2000-10-02 00:09:00 39 2000-10-02 00:26:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.resample('17W', origin='2000-01-01').sum() 2000-01-02 0 @@ -9339,14 +9491,14 @@ def resample( 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.resample('17min', offset='23h30min').sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 If you want to take the largest Timestamp as the end of the bins: @@ -9355,7 +9507,7 @@ def resample( 2000-10-01 23:52:00 18 2000-10-02 00:09:00 27 2000-10-02 00:26:00 63 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 In contrast with the `start_day`, you can use `end_day` to take the ceiling midnight of the largest Timestamp as the end of the bins and drop the bins @@ -9366,7 +9518,7 @@ def resample( 2000-10-01 23:55:00 15 2000-10-02 00:12:00 45 2000-10-02 00:29:00 45 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 """ from pandas.core.resample import get_resampler @@ -9416,7 +9568,7 @@ def first(self, offset) -> Self: ---------- offset : str, DateOffset or dateutil.relativedelta The offset length of the data that will be selected. For instance, - '1M' will display all the rows having their index within the first month. + '1ME' will display all the rows having their index within the first month. Returns ------- @@ -10251,7 +10403,14 @@ def _where( # make sure we are boolean fill_value = bool(inplace) - cond = cond.fillna(fill_value) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + cond = cond.fillna(fill_value) + cond = cond.infer_objects(copy=False) msg = "Boolean array expected for the condition, not {dtype}" @@ -10762,10 +10921,14 @@ def shift( if freq is not None and fill_value is not lib.no_default: # GH#53832 - raise ValueError( - "Cannot pass both 'freq' and 'fill_value' to " - f"{type(self).__name__}.shift" + warnings.warn( + "Passing a 'freq' together with a 'fill_value' silently ignores " + "the fill_value and is deprecated. 
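# The docstring updates above move the month-end alias from "M" to "ME".
# Sketch of resampling with the new spelling; the data is illustrative.
import pandas as pd

idx = pd.date_range("2018-01-01", periods=8, freq="W")
df = pd.DataFrame({"price": range(8)}, index=idx)

print(df.resample("ME").mean())   # one row per month end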
This will raise in a future " + "version.", + FutureWarning, + stacklevel=find_stack_level(), ) + fill_value = lib.no_default if periods == 0: return self.copy(deep=None) @@ -10804,15 +10967,17 @@ def _shift_with_freq(self, periods: int, axis: int, freq) -> Self: raise ValueError(msg) elif isinstance(freq, str): - freq = to_offset(freq) + is_period = isinstance(index, PeriodIndex) + freq = to_offset(freq, is_period=is_period) if isinstance(index, PeriodIndex): orig_freq = to_offset(index.freq) if freq != orig_freq: assert orig_freq is not None # for mypy raise ValueError( - f"Given freq {freq.rule_code} does not match " - f"PeriodIndex freq {orig_freq.rule_code}" + f"Given freq {freq_to_period_freqstr(freq.n, freq.name)} " + f"does not match PeriodIndex freq " + f"{freq_to_period_freqstr(orig_freq.n, orig_freq.name)}" ) new_ax = index.shift(periods) else: @@ -11535,7 +11700,7 @@ def pct_change( .. deprecated:: 2.1 freq : DateOffset, timedelta, or str, optional - Increment to use from time series API (e.g. 'M' or BDay()). + Increment to use from time series API (e.g. 'ME' or BDay()). **kwargs Additional keyword arguments are passed into `DataFrame.shift` or `Series.shift`. @@ -11645,15 +11810,21 @@ def pct_change( stacklevel=find_stack_level(), ) if fill_method is lib.no_default: - if self.isna().values.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will be " - "removed in a future version. Call ffill before calling " - "pct_change to retain current behavior and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + cols = self.items() if self.ndim == 2 else [(None, self)] + for _, col in cols: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and will be " + "removed in a future version. 
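# The reworked check above only emits the pct_change fill_method deprecation
# when a column has internal NaNs.  Sketch of the future-proof spelling the
# warning text recommends (fill explicitly, then pass fill_method=None):
import numpy as np
import pandas as pd

s = pd.Series([100.0, np.nan, 110.0])

print(s.ffill().pct_change(fill_method=None))   # NaN, 0.0, 0.1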
Call ffill before calling " + "pct_change to retain current behavior and silence this " + "warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None @@ -11679,7 +11850,7 @@ def _logical_func( self, name: str, func, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, @@ -11692,7 +11863,10 @@ def _logical_func( res = self._logical_func( name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs ) - return res._logical_func(name, func, skipna=skipna, **kwargs) + # error: Item "bool" of "Series | bool" has no attribute "_logical_func" + return res._logical_func( # type: ignore[union-attr] + name, func, skipna=skipna, **kwargs + ) elif axis is None: axis = 0 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8bef167b747e2..dbabd04a87c36 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -721,8 +721,10 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: return self._reindex_output(result, fill_value=0) @doc(Series.describe) - def describe(self, **kwargs): - return super().describe(**kwargs) + def describe(self, percentiles=None, include=None, exclude=None) -> Series: + return super().describe( + percentiles=percentiles, include=include, exclude=exclude + ) def value_counts( self, @@ -770,6 +772,7 @@ def value_counts( mask = ids != -1 ids, val = ids[mask], val[mask] + lab: Index | np.ndarray if bins is None: lab, lev = algorithms.factorize(val, sort=True) llab = lambda lab, inc: lab[inc] @@ -1152,7 +1155,7 @@ def alt(obj): @property @doc(Series.plot.__doc__) - def plot(self): + def plot(self) -> GroupByPlot: result = GroupByPlot(self) return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5a7f42a535951..d607baf18d6cb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -107,6 +107,7 @@ class providing the base-class of operations. IntegerArray, SparseArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import ( PandasObject, SelectionMixin, @@ -142,6 +143,7 @@ class providing the base-class of operations. if TYPE_CHECKING: from typing import Any + from pandas.core.resample import Resampler from pandas.core.window import ( ExpandingGroupby, ExponentialMovingWindowGroupby, @@ -178,6 +180,19 @@ class providing the base-class of operations. A callable that takes a {input} as its first argument, and returns a dataframe, a series or a scalar. In addition the callable may take positional and keyword arguments. + include_groups : bool, default True + When True, will attempt to apply ``func`` to the groupings in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + args, kwargs : tuple and dict Optional positional and keyword arguments to pass to ``func``. @@ -270,7 +285,7 @@ class providing the base-class of operations. 
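# SeriesGroupBy.describe, given an explicit signature above, forwards
# percentiles/include/exclude by keyword.  A small illustrative sketch:
import pandas as pd

s = pd.Series([1, 2, 3, 10, 20, 30], index=["x", "x", "x", "y", "y", "y"])
print(s.groupby(level=0).describe(percentiles=[0.5]))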
each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min()) + >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) A a 5 b 2 @@ -1033,7 +1048,7 @@ def get_group(self, name, obj=None) -> DataFrame | Series: owl 1 2 3 toucan 1 5 6 eagle 7 8 9 - >>> df.groupby(by=["a"]).get_group(1) + >>> df.groupby(by=["a"]).get_group((1,)) a b c owl 1 2 3 toucan 1 5 6 @@ -1053,12 +1068,33 @@ def get_group(self, name, obj=None) -> DataFrame | Series: 2023-01-15 2 dtype: int64 """ + keys = self.keys + level = self.level + # mypy doesn't recognize level/keys as being sized when passed to len + if (is_list_like(level) and len(level) == 1) or ( # type: ignore[arg-type] + is_list_like(keys) and len(keys) == 1 # type: ignore[arg-type] + ): + # GH#25971 + if isinstance(name, tuple) and len(name) == 1: + # Allow users to pass tuples of length 1 to silence warning + name = name[0] + elif not isinstance(name, tuple): + warnings.warn( + "When grouping with a length-1 list-like, " + "you will need to pass a length-1 tuple to get_group in a future " + "version of pandas. Pass `(name,)` instead of `name` to silence " + "this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + inds = self._get_index(name) if not len(inds): raise KeyError(name) if obj is None: - return self._selected_obj.iloc[inds] + indexer = inds if self.axis == 0 else (slice(None), inds) + return self._selected_obj.iloc[indexer] else: warnings.warn( "obj is deprecated and will be removed in a future version. " @@ -1725,7 +1761,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): input="dataframe", examples=_apply_docs["dataframe_examples"] ) ) - def apply(self, func, *args, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: orig_func = func func = com.is_builtin_func(func) if orig_func != func: @@ -1758,10 +1794,25 @@ def f(g): else: f = func + if not include_groups: + return self._python_apply_general(f, self._obj_with_exclusions) + # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f, self._selected_obj) + if ( + not isinstance(self.obj, Series) + and self._selection is None + and self._selected_obj.shape != self._obj_with_exclusions.shape + ): + warnings.warn( + message=_apply_groupings_depr.format( + type(self).__name__, "apply" + ), + category=FutureWarning, + stacklevel=find_stack_level(), + ) except TypeError: # gh-20949 # try again, with .apply acting as a filtering @@ -2073,7 +2124,7 @@ def _obj_1d_constructor(self) -> Callable: @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def any(self, skipna: bool = True): + def any(self, skipna: bool = True) -> NDFrameT: """ Return True if any value in the group is truthful, else False. @@ -2129,7 +2180,7 @@ def any(self, skipna: bool = True): @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def all(self, skipna: bool = True): + def all(self, skipna: bool = True) -> NDFrameT: """ Return True if all values in the group are truthful, else False. 
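# Sketch of the get_group change above: when grouping by a length-1 list-like,
# pass a length-1 tuple as the key; include_groups=False keeps the grouping
# column out of what apply() sees.  Data mirrors the docstring examples.
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 7], "b": [2, 5, 8], "c": [3, 6, 9]})

print(df.groupby(by=["a"]).get_group((1,)))   # tuple key, no FutureWarning
print(df.groupby("a").apply(lambda g: g.c.max() - g.b.min(),
                            include_groups=False))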
@@ -2261,7 +2312,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: return IntegerArray( counted[0], mask=np.zeros(counted.shape[1], dtype=np.bool_) ) - elif isinstance(bvalues, ArrowExtensionArray): + elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( + bvalues.dtype, StringDtype + ): return type(bvalues)._from_sequence(counted[0]) if is_series: assert counted.ndim == 2 @@ -2376,7 +2429,7 @@ def mean( return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False): + def median(self, numeric_only: bool = False) -> NDFrameT: """ Compute median of groups, excluding missing values. @@ -2805,7 +2858,7 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False): + def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -3096,7 +3149,7 @@ def sum( 2 30 72""" ), ) - def prod(self, numeric_only: bool = False, min_count: int = 0): + def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -3238,7 +3291,7 @@ def max( ) @final - def first(self, numeric_only: bool = False, min_count: int = -1): + def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: """ Compute the first non-null entry of each column. @@ -3308,7 +3361,7 @@ def first(x: Series): ) @final - def last(self, numeric_only: bool = False, min_count: int = -1): + def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: """ Compute the last non-null entry of each column. @@ -3495,7 +3548,7 @@ def describe( return result @final - def resample(self, rule, *args, **kwargs): + def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3509,7 +3562,23 @@ def resample(self, rule, *args, **kwargs): ---------- rule : str or DateOffset The offset string or object representing target grouper conversion. - *args, **kwargs + *args + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + include_groups : bool, default True + When True, will attempt to include the groupings in the operation in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and `on`, and other arguments of `TimeGrouper`. @@ -3530,7 +3599,7 @@ def resample(self, rule, *args, **kwargs): Examples -------- - >>> idx = pd.date_range('1/1/2000', periods=4, freq='T') + >>> idx = pd.date_range('1/1/2000', periods=4, freq='min') >>> df = pd.DataFrame(data=4 * [range(2)], ... index=idx, ... columns=['a', 'b']) @@ -3545,59 +3614,71 @@ def resample(self, rule, *args, **kwargs): Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. 
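# The aggregations annotated above (prod, first, last, sem, median) keep the
# NDFrame type they are called on.  Quick sketch of `min_count`, which turns
# under-populated groups into NaN; data is illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["x", "x", "y"], "val": [2.0, 3.0, np.nan]})

print(df.groupby("key")["val"].prod(min_count=1))   # x -> 6.0, y -> NaN
print(df.groupby("key")["val"].first())             # first non-null per group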
- >>> df.groupby('a').resample('3T').sum() - a b + >>> df.groupby('a').resample('3min', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 2 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:00:00 5 1 + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 Upsample the series into 30 second bins. - >>> df.groupby('a').resample('30S').sum() - a b + >>> df.groupby('a').resample('30s', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:00:30 0 0 - 2000-01-01 00:01:00 0 1 - 2000-01-01 00:01:30 0 0 - 2000-01-01 00:02:00 0 0 - 2000-01-01 00:02:30 0 0 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:02:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 Resample by month. Values are assigned to the month of the period. - >>> df.groupby('a').resample('M').sum() - a b + >>> df.groupby('a').resample('ME', include_groups=False).sum() + b a - 0 2000-01-31 0 3 - 5 2000-01-31 5 1 + 0 2000-01-31 3 + 5 2000-01-31 1 Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> df.groupby('a').resample('3T', closed='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', include_groups=False) + ... .sum() + ... ) + b a - 0 1999-12-31 23:57:00 0 1 - 2000-01-01 00:00:00 0 2 - 5 2000-01-01 00:00:00 5 1 + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 Downsample the series into 3 minute bins and close the right side of the bin interval, but label each bin using the right edge instead of the left. - >>> df.groupby('a').resample('3T', closed='right', label='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', label='right', include_groups=False) + ... .sum() + ... ) + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:03:00 0 2 - 5 2000-01-01 00:03:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 """ from pandas.core.resample import get_resampler_for_grouping - return get_resampler_for_grouping(self, rule, *args, **kwargs) + # mypy flags that include_groups could be specified via `*args` or `**kwargs` + # GH#54961 would resolve. + return get_resampler_for_grouping( # type: ignore[misc] + self, rule, *args, include_groups=include_groups, **kwargs + ) @final def rolling(self, *args, **kwargs) -> RollingGroupby: @@ -5703,3 +5784,13 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi + + +# GH#7155 +_apply_groupings_depr = ( + "{}.{} operated on the grouping columns. This behavior is deprecated, " + "and in a future version of pandas the grouping columns will be excluded " + "from the operation. Either pass `include_groups=False` to exclude the " + "groupings or explicitly select the grouping columns after groupby to silence " + "this warning." 
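# The deprecation message defined above suggests two ways to keep grouping
# columns out of the applied operation; both are sketched with made-up data.
import pandas as pd

df = pd.DataFrame({"a": [0, 0, 5], "b": [1, 2, 3]})

print(df.groupby("a").apply(lambda g: g.sum(), include_groups=False))
print(df.groupby("a")[["b"]].apply(lambda g: g.sum()))   # equivalent explicit selection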
+) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ea92fbae9566d..c51c17e04796a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -189,7 +189,7 @@ class Grouper: 2000-10-02 00:12:00 18 2000-10-02 00:19:00 21 2000-10-02 00:26:00 24 - Freq: 7T, dtype: int64 + Freq: 7min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17min')).sum() 2000-10-01 23:14:00 0 @@ -197,7 +197,7 @@ class Grouper: 2000-10-01 23:48:00 21 2000-10-02 00:05:00 54 2000-10-02 00:22:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() 2000-10-01 23:18:00 0 @@ -205,7 +205,7 @@ class Grouper: 2000-10-01 23:52:00 27 2000-10-02 00:09:00 39 2000-10-02 00:26:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum() 2000-01-02 0 @@ -222,14 +222,14 @@ class Grouper: 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: @@ -240,7 +240,7 @@ class Grouper: 2000-10-01 23:50:00 36 2000-10-02 00:07:00 39 2000-10-02 00:24:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 """ sort: bool @@ -289,12 +289,12 @@ def __init__( self.dropna = dropna self._grouper_deprecated = None - self._indexer_deprecated = None + self._indexer_deprecated: npt.NDArray[np.intp] | None = None self._obj_deprecated = None self._gpr_index = None self.binner = None self._grouper = None - self._indexer = None + self._indexer: npt.NDArray[np.intp] | None = None def _get_grouper( self, obj: NDFrameT, validate: bool = True @@ -329,8 +329,8 @@ def _get_grouper( @final def _set_grouper( - self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None - ): + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: """ given an object and the specifications, setup the internal grouper for this particular specification @@ -350,8 +350,6 @@ def _set_grouper( """ assert obj is not None - indexer = None - if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") @@ -398,6 +396,7 @@ def _set_grouper( raise ValueError(f"The level {level} is not valid") # possibly sort + indexer: npt.NDArray[np.intp] | None = None if (self.sort or sort) and not ax.is_monotonic_increasing: # use stable sort to support first, last, nth # TODO: why does putting na_position="first" fix datetimelike cases? @@ -441,6 +440,8 @@ def indexer(self): @final @property def obj(self): + # TODO(3.0): enforcing these deprecations on Grouper should close + # GH#25564, GH#41930 warnings.warn( f"{type(self).__name__}.obj is deprecated and will be removed " "in a future version. 
Use GroupBy.indexer instead.", diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 71525c8c1a223..607059e5183ec 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -77,7 +77,7 @@ from pandas.core.generic import NDFrame -def check_result_array(obj, dtype): +def check_result_array(obj, dtype) -> None: # Our operation is supposed to be an aggregation/reduction. If # it returns an ndarray, this likely means an invalid operation has # been passed. See test_apply_without_aggregation, test_agg_must_agg diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 694a420ad2494..c13ec51ff3851 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -262,7 +262,9 @@ def get_window_bounds( # end bound is previous end # or current index end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign - if end_diff <= zero: + if end_diff == zero and not right_closed: + end[i] = end[i - 1] + 1 + elif end_diff <= zero: end[i] = i + 1 else: end[i] = end[i - 1] diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index d972983532e3c..af8fa441f8b3f 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -227,7 +227,7 @@ def to_pydatetime(self): ) return cast(ArrowExtensionArray, self._parent.array)._dt_to_pydatetime() - def isocalendar(self): + def isocalendar(self) -> DataFrame: from pandas import DataFrame result = ( @@ -414,7 +414,7 @@ class TimedeltaProperties(Properties): Examples -------- >>> seconds_series = pd.Series( - ... pd.timedelta_range(start="1 second", periods=3, freq="S") + ... pd.timedelta_range(start="1 second", periods=3, freq="s") ... ) >>> seconds_series 0 0 days 00:00:01 @@ -528,7 +528,7 @@ class PeriodProperties(Properties): 1 2000-01-01 00:00:01 2 2000-01-01 00:00:02 3 2000-01-01 00:00:03 - dtype: period[S] + dtype: period[s] >>> seconds_series.dt.second 0 0 1 1 diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 781dfae7fef64..877b8edb32520 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -239,8 +239,12 @@ def _unique_indices(inds, dtype) -> Index: Index """ if all(isinstance(ind, Index) for ind in inds): - result = inds[0].append(inds[1:]).unique() - result = result.astype(dtype, copy=False) + inds = [ind.astype(dtype, copy=False) for ind in inds] + result = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[result.get_indexer_for(other) == -1] + if len(diff): + result = result.append(diff.unique()) if sort: result = result.sort_values() return result @@ -288,7 +292,6 @@ def _find_common_index_dtype(inds): raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if len(dtis) == len(indexes): - sort = True result = indexes[0] elif len(dtis) > 1: @@ -377,5 +380,5 @@ def all_indexes_same(indexes) -> bool: def default_index(n: int) -> RangeIndex: - rng = range(0, n) + rng = range(n) return RangeIndex._simple_new(rng, name=None) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ee36a3515c4b3..8703fef1e5940 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -124,6 +124,7 @@ from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, + ABCIntervalIndex, ABCMultiIndex, ABCPeriodIndex, ABCSeries, @@ -373,9 +374,6 @@ def _left_indexer_unique(self, other: Self) -> npt.NDArray[np.intp]: # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = 
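# The accessor docstrings above switch to the lowercase "s" alias for seconds.
# Minimal sketch of the .dt accessors they document:
import pandas as pd

seconds = pd.Series(pd.timedelta_range(start="1 second", periods=3, freq="s"))
print(seconds.dt.seconds)          # 1, 2, 3

stamps = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s"))
print(stamps.dt.isocalendar())     # DataFrame with year/week/day columns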
other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) # similar but not identical to ov.searchsorted(sv) return libjoin.left_join_indexer_unique(sv, ov) @@ -386,9 +384,6 @@ def _left_indexer( # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -400,9 +395,6 @@ def _inner_indexer( # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -414,9 +406,6 @@ def _outer_indexer( # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -1132,7 +1121,7 @@ def take( allow_fill: bool = True, fill_value=None, **kwargs, - ): + ) -> Self: if kwargs: nv.validate_take((), kwargs) if is_scalar(indices): @@ -1217,7 +1206,7 @@ def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: """ @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis: None = None) -> Self: repeats = ensure_platform_int(repeats) nv.validate_repeat((), {"axis": axis}) res_values = self._values.repeat(repeats) @@ -1408,8 +1397,8 @@ def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t] values = self._values - if is_object_dtype(values.dtype): - values = cast(np.ndarray, values) + if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): + values = np.asarray(values) values = lib.maybe_convert_objects(values, safe=True) result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] @@ -2184,11 +2173,6 @@ def _drop_level_numbers(self, levnums: list[int]): @final def _can_hold_na(self) -> bool: if isinstance(self.dtype, ExtensionDtype): - if isinstance(self.dtype, IntervalDtype): - # FIXME(GH#45720): this is inaccurate for integer-backed - # IntervalArray, but without it other.categories.take raises - # in IntervalArray._cmp_method - return True return self.dtype._can_hold_na if self.dtype.kind in "iub": return False @@ -3354,6 +3338,7 @@ def _union(self, other: Index, sort: bool | None): and other.is_monotonic_increasing and not (self.has_duplicates and other.has_duplicates) and self._can_use_libjoin + and other._can_use_libjoin ): # Both are monotonic and at least one is unique, so can use outer join # (actually don't need either unique, but without this restriction @@ -3452,7 +3437,7 @@ def intersection(self, other, sort: bool = False): self, other = self._dti_setop_align_tzs(other, "intersection") if self.equals(other): - if self.has_duplicates: + if not self.is_unique: result = self.unique()._get_reconciled_name_object(other) else: result = self._get_reconciled_name_object(other) @@ 
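# take() and repeat(), annotated above to return the calling Index subclass,
# in a quick sketch:
import pandas as pd

idx = pd.Index(["a", "b", "c"])
print(idx.take([2, 0]))   # Index(['c', 'a'], dtype='object')
print(idx.repeat(2))      # Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')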
-3507,7 +3492,7 @@ def _intersection(self, other: Index, sort: bool = False): self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin - and not isinstance(self, ABCMultiIndex) + and other._can_use_libjoin ): try: res_indexer, indexer, _ = self._inner_indexer(other) @@ -3542,7 +3527,7 @@ def _intersection_via_get_indexer( Returns ------- - np.ndarray or ExtensionArray + np.ndarray or ExtensionArray or MultiIndex The returned array will be unique. """ left_unique = self.unique() @@ -3559,6 +3544,7 @@ def _intersection_via_get_indexer( # unnecessary in the case with sort=None bc we will sort later taker = np.sort(taker) + result: MultiIndex | ExtensionArray | np.ndarray if isinstance(left_unique, ABCMultiIndex): result = left_unique.take(taker) else: @@ -3612,14 +3598,14 @@ def difference(self, other, sort=None): if len(other) == 0: # Note: we do not (yet) sort even if sort=None GH#24959 - result = self.rename(result_name) + result = self.unique().rename(result_name) if sort is True: return result.sort_values() return result if not self._should_compare(other): # Nothing matches -> difference is everything - result = self.rename(result_name) + result = self.unique().rename(result_name) if sort is True: return result.sort_values() return result @@ -3629,21 +3615,10 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex - - this = self.unique() - - indexer = this.get_indexer_for(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) - - label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - - the_diff: MultiIndex | ArrayLike - if isinstance(this, ABCMultiIndex): - the_diff = this.take(label_diff) - else: - the_diff = this._values.take(label_diff) + other = other.unique() + the_diff = self[other.get_indexer_for(self) == -1] + the_diff = the_diff if self.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) - return the_diff def _wrap_difference_result(self, other, result): @@ -3791,9 +3766,15 @@ def get_loc(self, key): self._check_indexing_error(key) raise - _index_shared_docs[ - "get_indexer" - ] = """ + @final + def get_indexer( + self, + target, + method: ReindexMethod | None = None, + limit: int | None = None, + tolerance=None, + ) -> npt.NDArray[np.intp]: + """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the @@ -3801,7 +3782,7 @@ def get_loc(self, key): Parameters ---------- - target : %(target_klass)s + target : Index method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -3828,7 +3809,7 @@ def get_loc(self, key): Integers from 0 to n - 1 indicating that the index at these positions matches the corresponding target values. Missing values in the target are marked by -1. - %(raises_section)s + Notes ----- Returns -1 for unmatched values, for further explanation see the @@ -3843,16 +3824,6 @@ def get_loc(self, key): Notice that the return value is an array of locations in ``index`` and ``x`` is marked by -1, as it is not in ``index``. 
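# The difference() hunks above route the empty-other and non-comparable cases
# through unique(), so duplicated labels no longer leak into the result.
# Sketch (exact dtype display omitted):
import pandas as pd

idx = pd.Index([1, 1, 2, 3])
print(idx.difference(pd.Index([])))      # Index([1, 2, 3])
print(idx.difference(pd.Index([4, 5])))  # Index([1, 2, 3])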
""" - - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) - @final - def get_indexer( - self, - target, - method: ReindexMethod | None = None, - limit: int | None = None, - tolerance=None, - ) -> npt.NDArray[np.intp]: method = clean_reindex_fill_method(method) orig_target = target target = self._maybe_cast_listlike_indexer(target) @@ -3907,7 +3878,7 @@ def get_indexer( return ensure_platform_int(indexer) - pself, ptarget = self._maybe_promote(target) + pself, ptarget = self._maybe_downcast_for_indexing(target) if pself is not self or ptarget is not target: return pself.get_indexer( ptarget, method=method, limit=limit, tolerance=tolerance @@ -3967,12 +3938,8 @@ def _should_partial_index(self, target: Index) -> bool: if isinstance(self.dtype, IntervalDtype): if isinstance(target.dtype, IntervalDtype): return False - # See https://github.com/pandas-dev/pandas/issues/47772 the commented - # out code can be restored (instead of hardcoding `return True`) - # once that issue is fixed # "Index" has no attribute "left" - # return self.left._should_compare(target) # type: ignore[attr-defined] - return True + return self.left._should_compare(target) # type: ignore[attr-defined] return False @final @@ -4464,7 +4431,7 @@ def _reindex_non_unique( indexer, missing = self.get_indexer_non_unique(target) check = indexer != -1 - new_labels = self.take(indexer[check]) + new_labels: Index | np.ndarray = self.take(indexer[check]) new_indexer = None if len(missing): @@ -4575,7 +4542,7 @@ def join( ------- join_index, (left_indexer, right_indexer) - Examples + Examples -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx2 = pd.Index([4, 5, 6]) @@ -4591,7 +4558,7 @@ def join( if not self._is_multi and not other._is_multi: # We have specific handling for MultiIndex below - pself, pother = self._maybe_promote(other) + pself, pother = self._maybe_downcast_for_indexing(other) if pself is not self or pother is not other: return pself.join( pother, how=how, level=level, return_indexers=True, sort=sort @@ -4650,24 +4617,13 @@ def join( _validate_join_method(how) - if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how) - elif not self.is_unique or not other.is_unique: - if self.is_monotonic_increasing and other.is_monotonic_increasing: - if not isinstance(self.dtype, IntervalDtype): - # otherwise we will fall through to _join_via_get_indexer - # GH#39133 - # go through object dtype for ea till engine is supported properly - return self._join_monotonic(other, how=how) - else: - return self._join_non_unique(other, how=how) - elif ( - # GH48504: exclude MultiIndex to avoid going through MultiIndex._values - self.is_monotonic_increasing + if ( + not isinstance(self.dtype, CategoricalDtype) + and self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin - and not isinstance(self, ABCMultiIndex) - and not isinstance(self.dtype, CategoricalDtype) + and other._can_use_libjoin + and (self.is_unique or other.is_unique) ): # Categorical is monotonic if data are ordered as categories, but join can # not handle this in case of not lexicographically monotonic GH#38502 @@ -4676,6 +4632,8 @@ def join( except TypeError: # object dtype; non-comparable objects pass + elif not self.is_unique or not other.is_unique: + return self._join_non_unique(other, how=how, sort=sort) return self._join_via_get_indexer(other, how, sort) @@ -4693,15 +4651,13 @@ def _join_via_get_indexer( elif how == "right": join_index = other elif how == "inner": - # TODO: sort=False here for 
backwards compat. It may - # be better to use the sort parameter passed into join - join_index = self.intersection(other, sort=False) + join_index = self.intersection(other, sort=sort) elif how == "outer": # TODO: sort=True here for backwards compat. It may # be better to use the sort parameter passed into join join_index = self.union(other) - if sort: + if sort and how in ["left", "right"]: join_index = join_index.sort_values() if join_index is self: @@ -4798,7 +4754,7 @@ def _join_multi(self, other: Index, how: JoinHow): @final def _join_non_unique( - self, other: Index, how: JoinHow = "left" + self, other: Index, how: JoinHow = "left", sort: bool = False ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]: from pandas.core.reshape.merge import get_join_indexers @@ -4806,13 +4762,16 @@ def _join_non_unique( assert self.dtype == other.dtype left_idx, right_idx = get_join_indexers( - [self._values], [other._values], how=how, sort=True + [self._values], [other._values], how=how, sort=sort ) mask = left_idx == -1 join_idx = self.take(left_idx) right = other.take(right_idx) join_index = join_idx.putmask(mask, right) + if isinstance(join_index, ABCMultiIndex) and how == "outer": + # test_join_index_levels + join_index = join_index._sort_levels_monotonic() return join_index, left_idx, right_idx @final @@ -4970,6 +4929,7 @@ def _join_monotonic( ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: # We only get here with matching dtypes and both monotonic increasing assert other.dtype == self.dtype + assert self._can_use_libjoin and other._can_use_libjoin if self.equals(other): # This is a convenient place for this check, but its correctness @@ -5031,27 +4991,36 @@ def _wrap_joined_index( # expected "Self") mask = lidx == -1 join_idx = self.take(lidx) - right = other.take(ridx) + right = cast("MultiIndex", other.take(ridx)) join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() return join_index.set_names(name) # type: ignore[return-value] else: name = get_op_result_name(self, other) return self._constructor._with_infer(joined, name=name, dtype=self.dtype) + @final @cache_readonly def _can_use_libjoin(self) -> bool: """ - Whether we can use the fastpaths implement in _libs.join + Whether we can use the fastpaths implemented in _libs.join. + + This is driven by whether (in monotonic increasing cases that are + guaranteed not to have NAs) we can convert to a np.ndarray without + making a copy. If we cannot, this negates the performance benefit + of using libjoin. """ if type(self) is Index: # excludes EAs, but include masks, we get here with monotonic # values only, meaning no NA return ( isinstance(self.dtype, np.dtype) - or isinstance(self.values, BaseMaskedArray) - or isinstance(self._values, ArrowExtensionArray) + or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) + or self.dtype == "string[python]" ) - return not isinstance(self.dtype, IntervalDtype) + # Exclude index types where the conversion to numpy converts to object dtype, + # which negates the performance benefit of libjoin + # TODO: exclude RangeIndex? 
Seems to break test_concat_datetime_timezone + return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) # -------------------------------------------------------------------- # Uncategorized Methods @@ -5172,7 +5141,8 @@ def _get_engine_target(self) -> ArrayLike: return self._values.astype(object) return vals - def _get_join_target(self) -> ArrayLike: + @final + def _get_join_target(self) -> np.ndarray: """ Get the ndarray or ExtensionArray that we can pass to the join functions. @@ -5184,7 +5154,12 @@ def _get_join_target(self) -> ArrayLike: # This is only used if our array is monotonic, so no missing values # present return self._values.to_numpy() - return self._get_engine_target() + + # TODO: exclude ABCRangeIndex case here as it copies + target = self._get_engine_target() + if not isinstance(target, np.ndarray): + raise ValueError("_can_use_libjoin should return False.") + return target def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ @@ -5193,7 +5168,7 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ if isinstance(self.values, BaseMaskedArray): return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_)) - elif isinstance(self.values, ArrowExtensionArray): + elif isinstance(self.values, (ArrowExtensionArray, StringArray)): return type(self.values)._from_sequence(result) return result @@ -6034,7 +6009,7 @@ def get_indexer_non_unique( # that can be matched to Interval scalars. return self._get_indexer_non_comparable(target, method=None, unique=False) - pself, ptarget = self._maybe_promote(target) + pself, ptarget = self._maybe_downcast_for_indexing(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) @@ -6050,8 +6025,8 @@ def get_indexer_non_unique( # TODO: get_indexer has fastpaths for both Categorical-self and # Categorical-target. Can we do something similar here? - # Note: _maybe_promote ensures we never get here with MultiIndex - # self and non-Multi target + # Note: _maybe_downcast_for_indexing ensures we never get here + # with MultiIndex self and non-Multi target tgt_values = target._get_engine_target() if self._is_multi and target._is_multi: engine = self._engine @@ -6225,7 +6200,7 @@ def _index_as_unique(self) -> bool: _requires_unique_msg = "Reindexing only valid with uniquely valued Index objects" @final - def _maybe_promote(self, other: Index) -> tuple[Index, Index]: + def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: """ When dealing with an object-dtype Index and a non-object Index, see if we can upcast the object-dtype one to improve performance. @@ -6266,7 +6241,7 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]: if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses - other, self = other._maybe_promote(self) + other, self = other._maybe_downcast_for_indexing(self) return self, other @@ -7001,7 +6976,7 @@ def infer_objects(self, copy: bool = True) -> Index: result._references.add_index_reference(result) return result - def diff(self, periods: int = 1): + def diff(self, periods: int = 1) -> Self: """ Computes the difference between consecutive values in the Index object. @@ -7029,7 +7004,7 @@ def diff(self, periods: int = 1): """ return self._constructor(self.to_series().diff(periods)) - def round(self, decimals: int = 0): + def round(self, decimals: int = 0) -> Self: """ Round each value in the Index to the given number of decimals. 
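A minimal sketch (hypothetical values) of the ``Index.join`` change earlier in this file: for ``how="inner"`` the result is now built with ``self.intersection(other, sort=sort)``, so the ``sort`` argument is honoured instead of being hard-coded to ``False``. Expected result noted in the comment.

>>> import pandas as pd
>>> left = pd.Index([3, 1, 2])
>>> right = pd.Index([1, 2, 4])
>>> left.join(right, how="inner", sort=True)  # sorted intersection, roughly Index([1, 2], dtype='int64')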
@@ -7192,11 +7167,12 @@ def any(self, *args, **kwargs): """ nv.validate_any(args, kwargs) self._maybe_disable_logical_methods("any") - # error: Argument 1 to "any" has incompatible type "ArrayLike"; expected - # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int, - # float, complex, str, bytes, generic]], Sequence[Sequence[Any]], - # _SupportsArray]" - return np.any(self.values) # type: ignore[arg-type] + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. EA, call _reduce instead of "any" to get TypeError instead + # of AttributeError + return vals._reduce("any") + return np.any(vals) def all(self, *args, **kwargs): """ @@ -7239,11 +7215,12 @@ def all(self, *args, **kwargs): """ nv.validate_all(args, kwargs) self._maybe_disable_logical_methods("all") - # error: Argument 1 to "all" has incompatible type "ArrayLike"; expected - # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int, - # float, complex, str, bytes, generic]], Sequence[Sequence[Any]], - # _SupportsArray]" - return np.all(self.values) # type: ignore[arg-type] + vals = self._values + if not isinstance(vals, np.ndarray): + # i.e. EA, call _reduce instead of "all" to get TypeError instead + # of AttributeError + return vals._reduce("all") + return np.all(vals) @final def _maybe_disable_logical_methods(self, opname: str_t) -> None: @@ -7252,9 +7229,9 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None: """ if ( isinstance(self, ABCMultiIndex) - or needs_i8_conversion(self.dtype) - or isinstance(self.dtype, (IntervalDtype, CategoricalDtype)) - or is_float_dtype(self.dtype) + # TODO(3.0): PeriodArray and DatetimeArray any/all will raise, + # so checking needs_i8_conversion will be unnecessary + or (needs_i8_conversion(self.dtype) and self.dtype.kind != "m") ): # This call will raise make_invalid_op(opname)(self) @@ -7528,7 +7505,7 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: index_like = list(index_like) if isinstance(index_like, list): - if type(index_like) is not list: + if type(index_like) is not list: # noqa: E721 # must check for exactly list here because of strict type # check in clean_index_list index_like = list(index_like) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d5a292335a5f6..1b08596c99591 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -31,6 +31,7 @@ parsing, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas.compat.numpy import function as nv from pandas.errors import ( InvalidIndexError, @@ -108,8 +109,16 @@ def asi8(self) -> npt.NDArray[np.int64]: @property @doc(DatetimeLikeArrayMixin.freqstr) - def freqstr(self) -> str | None: - return self._data.freqstr + def freqstr(self) -> str: + from pandas import PeriodIndex + + if self._data.freqstr is not None and isinstance( + self._data, (PeriodArray, PeriodIndex) + ): + freq = freq_to_period_freqstr(self._data.freq.n, self._data.freq.name) + return freq + else: + return self._data.freqstr # type: ignore[return-value] @cache_readonly @abstractmethod diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index dcb5f8caccd3e..ae0feba1f9bcf 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -65,6 +65,8 @@ PeriodIndex, ) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR + def _new_DatetimeIndex(cls, d): """ @@ -198,8 +200,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): 
timetz dayofyear day_of_year - weekofyear - week dayofweek day_of_week weekday @@ -535,7 +535,8 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: dt.datetime): ------- lower, upper: pd.Timestamp """ - per = Period(parsed, freq=reso.attr_abbrev) + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + per = Period(parsed, freq=freq) start, end = per.start_time, per.end_time # GH 24076 @@ -946,26 +947,26 @@ def date_range( **Other Parameters** - Changed the `freq` (frequency) to ``'M'`` (month end frequency). + Changed the `freq` (frequency) to ``'ME'`` (month end frequency). - >>> pd.date_range(start='1/1/2018', periods=5, freq='M') + >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') Multiples are allowed - >>> pd.date_range(start='1/1/2018', periods=5, freq='3M') + >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], - dtype='datetime64[ns]', freq='3M') + dtype='datetime64[ns]', freq='3ME') `freq` can also be specified as an Offset object. >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], - dtype='datetime64[ns]', freq='3M') + dtype='datetime64[ns]', freq='3ME') Specify `tz` to set the timezone. diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 2d1d2a81a8a71..9d528d34e3684 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -8,12 +8,18 @@ """ from __future__ import annotations -from typing import NoReturn +from typing import ( + TYPE_CHECKING, + NoReturn, +) from pandas.core.base import PandasObject from pandas.io.formats.printing import pprint_thing +if TYPE_CHECKING: + from pandas._typing import Self + class FrozenList(PandasObject, list): """ @@ -72,7 +78,7 @@ def __getitem__(self, n): return type(self)(super().__getitem__(n)) return super().__getitem__(n) - def __radd__(self, other): + def __radd__(self, other) -> Self: if isinstance(other, tuple): other = list(other) return type(self)(other + list(self)) @@ -84,7 +90,7 @@ def __eq__(self, other: object) -> bool: __req__ = __eq__ - def __mul__(self, other): + def __mul__(self, other) -> Self: return type(self)(super().__mul__(other)) __imul__ = __mul__ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b36672df32e61..eb8d25bcea592 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1039,8 +1039,9 @@ def interval_range( >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... end=pd.Timestamp('2017-01-04')) - IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], - (2017-01-03, 2017-01-04]], + IntervalIndex([(2017-01-01 00:00:00, 2017-01-02 00:00:00], + (2017-01-02 00:00:00, 2017-01-03 00:00:00], + (2017-01-03 00:00:00, 2017-01-04 00:00:00]], dtype='interval[datetime64[ns], right]') The ``freq`` parameter specifies the frequency between the left and right. @@ -1056,8 +1057,9 @@ def interval_range( >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... 
periods=3, freq='MS') - IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01], - (2017-03-01, 2017-04-01]], + IntervalIndex([(2017-01-01 00:00:00, 2017-02-01 00:00:00], + (2017-02-01 00:00:00, 2017-03-01 00:00:00], + (2017-03-01 00:00:00, 2017-04-01 00:00:00]], dtype='interval[datetime64[ns], right]') Specify ``start``, ``end``, and ``periods``; the frequency is generated diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ae0ab66dd5f3e..144045d40a086 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1030,7 +1030,7 @@ def levshape(self) -> Shape: # Codes Methods @property - def codes(self): + def codes(self) -> FrozenList: return self._codes def _set_codes( @@ -1074,7 +1074,9 @@ def _set_codes( self._reset_cache() - def set_codes(self, codes, *, level=None, verify_integrity: bool = True): + def set_codes( + self, codes, *, level=None, verify_integrity: bool = True + ) -> MultiIndex: """ Set new codes on MultiIndex. Defaults to returning new index. @@ -1199,7 +1201,7 @@ def copy( # type: ignore[override] names=None, deep: bool = False, name=None, - ): + ) -> Self: """ Make a copy of this object. @@ -1262,7 +1264,7 @@ def __array__(self, dtype=None) -> np.ndarray: """the array interface, return my values""" return self.values - def view(self, cls=None): + def view(self, cls=None) -> Self: """this is defined as a copy with the same identity""" result = self.copy() result._id = self._id @@ -1659,7 +1661,8 @@ def _get_level_values(self, level: int, unique: bool = False) -> Index: filled = algos.take_nd(lev._values, level_codes, fill_value=lev._na_value) return lev._shallow_copy(filled, name=name) - def get_level_values(self, level): + # error: Signature of "get_level_values" incompatible with supertype "Index" + def get_level_values(self, level) -> Index: # type: ignore[override] """ Return vector of label values for requested level. 
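The ``get_level_values`` hunk above only tightens the return annotation to ``Index``; behaviour is unchanged. A small usage sketch for reference, with expected outputs shown approximately in comments:

>>> import pandas as pd
>>> mi = pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]], names=["x", "y"])
>>> mi.get_level_values(0)    # Index([1, 2], dtype='int64', name='x')
>>> mi.get_level_values("y")  # Index(['a', 'b'], dtype='object', name='y')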
@@ -2208,17 +2211,9 @@ def append(self, other): def argsort( self, *args, na_position: str = "last", **kwargs ) -> npt.NDArray[np.intp]: - if len(args) == 0 and len(kwargs) == 0: - # lexsort is significantly faster than self._values.argsort() - target = self._sort_levels_monotonic(raise_if_incomparable=True) - return lexsort_indexer( - # error: Argument 1 to "lexsort_indexer" has incompatible type - # "List[Categorical]"; expected "Union[List[Union[ExtensionArray, - # ndarray[Any, Any]]], List[Series]]" - target._get_codes_for_sorting(), # type: ignore[arg-type] - na_position=na_position, - ) - return self._values.argsort(*args, **kwargs) + target = self._sort_levels_monotonic(raise_if_incomparable=True) + keys = [lev.codes for lev in target._get_codes_for_sorting()] + return lexsort_indexer(keys, na_position=na_position, codes_given=True) @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats: int, axis=None) -> MultiIndex: @@ -2461,12 +2456,12 @@ def _reorder_ilevels(self, order) -> MultiIndex: def _recode_for_new_levels( self, new_levels, copy: bool = True ) -> Generator[np.ndarray, None, None]: - if len(new_levels) != self.nlevels: + if len(new_levels) > self.nlevels: raise AssertionError( f"Length of new_levels ({len(new_levels)}) " - f"must be same as self.nlevels ({self.nlevels})" + f"must be <= self.nlevels ({self.nlevels})" ) - for i in range(self.nlevels): + for i in range(len(new_levels)): yield recode_for_categories( self.codes[i], self.levels[i], new_levels[i], copy=copy ) @@ -3304,7 +3299,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): raise KeyError(key) return slice(start, end) - def get_locs(self, seq): + def get_locs(self, seq) -> npt.NDArray[np.intp]: """ Get location for a sequence of labels. 
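The ``MultiIndex.argsort`` rewrite above now always lexsorts the level codes via ``lexsort_indexer`` instead of falling back to ``self._values.argsort()``; the observable result is still the positions that sort the index lexicographically. A small sketch (result shown as a comment, assuming a 64-bit platform):

>>> import pandas as pd
>>> mi = pd.MultiIndex.from_arrays([["b", "a", "b"], [2, 1, 1]])
>>> mi.argsort()  # array([1, 2, 0]) -> ('a', 1), ('b', 1), ('b', 2)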
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index d05694c00d2a4..b1023febe813d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -16,6 +16,7 @@ Resolution, Tick, ) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR from pandas.util._decorators import ( cache_readonly, doc, @@ -453,7 +454,8 @@ def _maybe_cast_slice_bound(self, label, side: str): return super()._maybe_cast_slice_bound(label, side) def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): - iv = Period(parsed, freq=reso.attr_abbrev) + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + iv = Period(parsed, freq=freq) return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end")) @doc(DatetimeIndexOpsMixin.shift) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 9576997af6641..aeb7bb1813fef 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -407,7 +407,7 @@ def inferred_type(self) -> str: # Indexing Methods @doc(Index.get_loc) - def get_loc(self, key): + def get_loc(self, key) -> int: if is_integer(key) or (is_float(key) and key.is_integer()): new_key = int(key) try: @@ -1107,14 +1107,16 @@ def _arith_method(self, other, op): # test_arithmetic_explicit_conversions return super()._arith_method(other, op) - def take( + # error: Return type "Index" of "take" incompatible with return type + # "RangeIndex" in supertype "Index" + def take( # type: ignore[override] self, indices, axis: Axis = 0, allow_fill: bool = True, fill_value=None, **kwargs, - ): + ) -> Index: if kwargs: nv.validate_take((), kwargs) if is_scalar(indices): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ca0b1705b5c38..871e5817fdf0d 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -82,6 +82,7 @@ Axis, AxisInt, Self, + npt, ) from pandas import ( @@ -604,12 +605,12 @@ def at(self) -> _AtIndexer: Raises ------ KeyError - * If getting a value and 'label' does not exist in a DataFrame or - Series. + If getting a value and 'label' does not exist in a DataFrame or Series. + ValueError - * If row/column label pair is not a tuple or if any label from - the pair is not a scalar for DataFrame. - * If label is list-like (*excluding* NamedTuple) for Series. + If row/column label pair is not a tuple or if any label + from the pair is not a scalar for DataFrame. + If label is list-like (*excluding* NamedTuple) for Series. 
See Also -------- @@ -985,8 +986,9 @@ def _getitem_tuple_same_dim(self, tup: tuple): """ retval = self.obj # Selecting columns before rows is signficiantly faster + start_val = (self.ndim - len(tup)) + 1 for i, key in enumerate(reversed(tup)): - i = self.ndim - i - 1 + i = self.ndim - i - start_val if com.is_null_slice(key): continue @@ -1383,7 +1385,7 @@ def _getitem_axis(self, key, axis: AxisInt): # nested tuple slicing if is_nested_tuple(key, labels): locs = labels.get_locs(key) - indexer = [slice(None)] * self.ndim + indexer: list[slice | npt.NDArray[np.intp]] = [slice(None)] * self.ndim indexer[axis] = locs return self.obj.iloc[tuple(indexer)] diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 14969425e75a7..b4e3fdb78b77b 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1103,7 +1103,7 @@ def _verify_integrity(self) -> None: def _normalize_axis(axis): return axis - def make_empty(self, axes=None) -> SingleArrayManager: + def make_empty(self, axes=None) -> Self: """Return an empty ArrayManager with index/array of length 0""" if axes is None: axes = [Index([], dtype=object)] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index eca0df67ff054..30f6507d02484 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -15,9 +15,13 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + get_option, + using_copy_on_write, +) from pandas._libs import ( + NaT, internals as libinternals, lib, writers, @@ -27,7 +31,6 @@ BlockValuesRefs, ) from pandas._libs.missing import NA -from pandas._libs.tslibs import IncompatibleFrequency from pandas._typing import ( ArrayLike, AxisInt, @@ -60,7 +63,10 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_float_dtype, + is_integer_dtype, is_list_like, + is_scalar, is_string_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -454,6 +460,31 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: and will receive the same block """ new_dtype = find_result_type(self.values.dtype, other) + if new_dtype == self.dtype: + # GH#52927 avoid RecursionError + raise AssertionError( + "Something has gone wrong, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" + ) + + # In a future version of pandas, the default will be that + # setting `nan` into an integer series won't raise. + if ( + is_scalar(other) + and is_integer_dtype(self.values.dtype) + and isna(other) + and other is not NaT + ): + warn_on_upcast = False + elif ( + isinstance(other, np.ndarray) + and other.ndim == 1 + and is_integer_dtype(self.values.dtype) + and is_float_dtype(other.dtype) + and lib.has_only_ints_or_nan(other) + ): + warn_on_upcast = False + if warn_on_upcast: warnings.warn( f"Setting an item of incompatible dtype is deprecated " @@ -473,7 +504,11 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: @final def _maybe_downcast( - self, blocks: list[Block], downcast=None, using_cow: bool = False + self, + blocks: list[Block], + downcast, + using_cow: bool, + caller: str, ) -> list[Block]: if downcast is False: return blocks @@ -485,14 +520,63 @@ def _maybe_downcast( # but ATM it breaks too much existing code. 
# split and convert the blocks - return extend_blocks( + if caller == "fillna" and get_option("future.no_silent_downcasting"): + return blocks + + nbs = extend_blocks( [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] ) + if caller == "fillna": + if len(nbs) != len(blocks) or not all( + x.dtype == y.dtype for x, y in zip(nbs, blocks) + ): + # GH#54261 + warnings.warn( + "Downcasting object dtype arrays on .fillna, .ffill, .bfill " + "is deprecated and will change in a future version. " + "Call result.infer_objects(copy=False) instead. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) - if downcast is None: + return nbs + + elif downcast is None: + return blocks + elif caller == "where" and get_option("future.no_silent_downcasting") is True: return blocks + else: + nbs = extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) + + # When _maybe_downcast is called with caller="where", it is either + # a) with downcast=False, which is a no-op (the desired future behavior) + # b) with downcast="infer", which is _not_ passed by the user. + # In the latter case the future behavior is to stop doing inference, + # so we issue a warning if and only if some inference occurred. + if caller == "where": + # GH#53656 + if len(blocks) != len(nbs) or any( + left.dtype != right.dtype for left, right in zip(blocks, nbs) + ): + # In this case _maybe_downcast was _not_ a no-op, so the behavior + # will change, so we issue a warning. + warnings.warn( + "Downcasting behavior in Series and DataFrame methods 'where', " + "'mask', and 'clip' is deprecated. In a future " + "version this will not infer object dtypes or cast all-round " + "floats to integers. Instead call " + "result.infer_objects(copy=False) for object inference, " + "or cast round floats explicitly. To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) - return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) + return nbs @final @maybe_split @@ -707,7 +791,23 @@ def replace( if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN - blocks = blk.convert(copy=False, using_cow=using_cow) + if get_option("future.no_silent_downcasting") is True: + blocks = [blk] + else: + blocks = blk.convert(copy=False, using_cow=using_cow) + if len(blocks) > 1 or blocks[0].dtype != blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) else: blocks = [blk] return blocks @@ -782,7 +882,21 @@ def _replace_regex( replace_regex(block.values, rx, value, mask) - return block.convert(copy=False, using_cow=using_cow) + nbs = block.convert(copy=False, using_cow=using_cow) + opt = get_option("future.no_silent_downcasting") + if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call `result.infer_objects(copy=False)`. 
" + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return nbs @final def replace_list( @@ -848,6 +962,7 @@ def replace_list( else: rb = [self if inplace else self.copy()] + opt = get_option("future.no_silent_downcasting") for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -885,14 +1000,33 @@ def replace_list( b.refs.referenced_blocks.index(ref) ) - if convert and blk.is_object and not all(x is None for x in dest_list): + if ( + not opt + and convert + and blk.is_object + and not all(x is None for x in dest_list) + ): # GH#44498 avoid unwanted cast-back - result = extend_blocks( - [ - b.convert(copy=True and not using_cow, using_cow=using_cow) - for b in result - ] - ) + nbs = [] + for res_blk in result: + converted = res_blk.convert( + copy=True and not using_cow, using_cow=using_cow + ) + if len(converted) > 1 or converted[0].dtype != res_blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated " + "and will be removed in a future version. To " + "retain the old behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + nbs.extend(converted) + result = nbs new_rb.extend(result) rb = new_rb return rb @@ -1133,7 +1267,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: # length checking check_setitem_lengths(indexer, value, values) - value = extract_array(value, extract_numpy=True) + if self.dtype != _dtype_obj: + # GH48933: extract_array would convert a pd.Series value to np.ndarray + value = extract_array(value, extract_numpy=True) try: casted = np_can_hold_element(values.dtype, value) except LossySetitemError: @@ -1284,7 +1420,7 @@ def where( block = self.coerce_to_target_dtype(other) blocks = block.where(orig_other, cond, using_cow=using_cow) return self._maybe_downcast( - blocks, downcast=_downcast, using_cow=using_cow + blocks, downcast=_downcast, using_cow=using_cow, caller="where" ) else: @@ -1380,7 +1516,9 @@ def fillna( else: # GH#45423 consistent downcasting on no-ops. nb = self.copy(deep=not using_cow) - nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow) + nbs = nb._maybe_downcast( + [nb], downcast=downcast, using_cow=using_cow, caller="fillna" + ) return nbs if limit is not None: @@ -1398,7 +1536,9 @@ def fillna( # different behavior in _maybe_downcast. 
return extend_blocks( [ - blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow) + blk._maybe_downcast( + [blk], downcast=downcast, using_cow=using_cow, caller="fillna" + ) for blk in nbs ] ) @@ -1427,7 +1567,7 @@ def pad_or_backfill( vals = cast(NumpyExtensionArray, self.array_values) if axis == 1: vals = vals.T - new_values = vals.pad_or_backfill( + new_values = vals._pad_or_backfill( method=method, limit=limit, limit_area=limit_area, @@ -1439,7 +1579,7 @@ def pad_or_backfill( data = extract_array(new_values, extract_numpy=True) nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow) + return nb._maybe_downcast([nb], downcast, using_cow, caller="fillna") @final def interpolate( @@ -1492,7 +1632,7 @@ def interpolate( data = extract_array(new_values, extract_numpy=True) nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow) + return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate") @final def diff(self, n: int) -> list[Block]: @@ -1731,9 +1871,7 @@ def setitem(self, indexer, value, using_cow: bool = False): try: values[indexer] = value - except (ValueError, TypeError) as err: - _catch_deprecated_value_error(err) - + except (ValueError, TypeError): if isinstance(self.dtype, IntervalDtype): # see TestSetitemFloatIntervalWithIntIntervalValues nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) @@ -1776,16 +1914,14 @@ def where( try: res_values = arr._where(cond, other).T - except (ValueError, TypeError) as err: - _catch_deprecated_value_error(err) - + except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow + nbs, downcast=_downcast, using_cow=using_cow, caller="where" ) elif isinstance(self, NDArrayBackedExtensionBlock): @@ -1794,7 +1930,7 @@ def where( blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow + nbs, downcast=_downcast, using_cow=using_cow, caller="where" ) else: @@ -1847,9 +1983,7 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) - except (TypeError, ValueError) as err: - _catch_deprecated_value_error(err) - + except (TypeError, ValueError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # Discussion about what we want to support in the general @@ -1930,9 +2064,9 @@ def pad_or_backfill( if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T.pad_or_backfill(method=method, limit=limit).T + new_values = values.T._pad_or_backfill(method=method, limit=limit).T else: - new_values = values.pad_or_backfill(method=method, limit=limit) + new_values = values._pad_or_backfill(method=method, limit=limit) return [self.make_block_same_class(new_values)] @@ -1990,12 +2124,12 @@ def fillna( "need to implement this keyword or an exception will be " "raised. 
In the interim, the keyword is ignored by " f"{type(self.values).__name__}.", - FutureWarning, + DeprecationWarning, stacklevel=find_stack_level(), ) nb = self.make_block_same_class(new_values, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow=using_cow) + return nb._maybe_downcast([nb], downcast, using_cow=using_cow, caller="fillna") @cache_readonly def shape(self) -> Shape: @@ -2108,8 +2242,9 @@ def is_view(self) -> bool: """Extension arrays are never treated as views.""" return False + # error: Cannot override writeable attribute with read-only property @cache_readonly - def is_numeric(self): + def is_numeric(self) -> bool: # type: ignore[override] return self.values.dtype._is_numeric def _slice( @@ -2256,19 +2391,6 @@ def is_view(self) -> bool: return self.values._ndarray.base is not None -def _catch_deprecated_value_error(err: Exception) -> None: - """ - We catch ValueError for now, but only a specific one raised by DatetimeArray - which will no longer be raised in version 2.0. - """ - if isinstance(err, ValueError): - if isinstance(err, IncompatibleFrequency): - pass - elif "'value.closed' is" in str(err): - # IntervalDtype mismatched 'closed' - pass - - class DatetimeLikeBlock(NDArrayBackedExtensionBlock): """Block for datetime64[ns], timedelta64[ns].""" diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 4d33f0137d3c4..b2d463a8c6c26 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -177,7 +177,7 @@ def concatenate_managers( values = np.concatenate(vals, axis=1) # type: ignore[arg-type] elif is_1d_only_ea_dtype(blk.dtype): # TODO(EA2D): special-casing not needed with 2D EAs - values = concat_compat(vals, axis=1, ea_compat_axis=True) + values = concat_compat(vals, axis=0, ea_compat_axis=True) values = ensure_block_shape(values, ndim=2) else: values = concat_compat(vals, axis=1) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2290cd86f35e6..6f30bc650aa36 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,6 +13,8 @@ import numpy as np from numpy import ma +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.core.dtypes.astype import astype_is_view @@ -65,6 +67,7 @@ from pandas.core.internals.blocks import ( BlockPlacement, ensure_block_shape, + new_block, new_block_2d, ) from pandas.core.internals.managers import ( @@ -156,7 +159,7 @@ def arrays_to_mgr( def rec_array_to_mgr( - data: np.recarray | np.ndarray, + data: np.rec.recarray | np.ndarray, index, columns, dtype: DtypeObj | None, @@ -190,7 +193,7 @@ def rec_array_to_mgr( return mgr -def mgr_to_mgr(mgr, typ: str, copy: bool = True): +def mgr_to_mgr(mgr, typ: str, copy: bool = True) -> Manager: """ Convert to specific type of Manager. Does not copy if the type is already correct. Does not guarantee a copy otherwise. 
`copy` keyword only controls @@ -372,6 +375,19 @@ def ndarray_to_mgr( bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) block_values = [nb] + elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype(): + dtype = StringDtype(storage="pyarrow_numpy") + + obj_columns = list(values) + block_values = [ + new_block( + dtype.construct_array_type()._from_sequence(data, dtype=dtype), + BlockPlacement(slice(i, i + 1)), + ndim=1, + ) + for i, data in enumerate(obj_columns) + ] + else: bp = BlockPlacement(slice(len(columns))) nb = new_block_2d(values, placement=bp, refs=refs) @@ -903,7 +919,7 @@ def _list_of_dict_to_arrays( # assure that they are of the base dict class and not of derived # classes - data = [d if type(d) is dict else dict(d) for d in data] + data = [d if type(d) is dict else dict(d) for d in data] # noqa: E721 content = lib.dicts_to_array(data, list(columns)) return content, columns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 98d2934678546..b1db2d2e708e8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -93,6 +93,8 @@ npt, ) + from pandas.api.extensions import ExtensionArray + class BaseBlockManager(DataManager): """ @@ -966,19 +968,14 @@ def fast_xs(self, loc: int) -> SingleBlockManager: n = len(self) - # GH#46406 - immutable_ea = isinstance(dtype, ExtensionDtype) and dtype._is_immutable - - if isinstance(dtype, ExtensionDtype) and not immutable_ea: - cls = dtype.construct_array_type() - result = cls._empty((n,), dtype=dtype) + if isinstance(dtype, ExtensionDtype): + # TODO: use object dtype as workaround for non-performant + # EA.__setitem__ methods. (primarily ArrowExtensionArray.__setitem__ + # when iteratively setting individual values) + # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918 + result = np.empty(n, dtype=object) else: - # error: Argument "dtype" to "empty" has incompatible type - # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected - # "None" - result = np.empty( - n, dtype=object if immutable_ea else dtype # type: ignore[arg-type] - ) + result = np.empty(n, dtype=dtype) result = ensure_wrapped_if_datetimelike(result) for blk in self.blocks: @@ -987,9 +984,9 @@ def fast_xs(self, loc: int) -> SingleBlockManager: for i, rl in enumerate(blk.mgr_locs): result[rl] = blk.iget((i, loc)) - if immutable_ea: - dtype = cast(ExtensionDtype, dtype) - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + if isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + result = cls._from_sequence(result, dtype=dtype) bp = BlockPlacement(slice(0, len(result))) block = new_block(result, placement=bp, ndim=1) @@ -1055,7 +1052,7 @@ def iset( value: ArrayLike, inplace: bool = False, refs: BlockValuesRefs | None = None, - ): + ) -> None: """ Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items @@ -1879,7 +1876,7 @@ def __getstate__(self): # compatibility with 0.13.1. return axes_array, block_values, block_items, extra_state - def __setstate__(self, state): + def __setstate__(self, state) -> None: def unpickle_block(values, mgr_locs, ndim: int) -> Block: # TODO(EA2D): ndim would be unnecessary with 2D EAs # older pickles may store e.g. 
DatetimeIndex instead of DatetimeArray @@ -1968,7 +1965,7 @@ def internal_values(self): """The array that Series._values returns""" return self._block.values - def array_values(self): + def array_values(self) -> ExtensionArray: """The array that Series.array returns""" return self._block.array_values diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index e89f641e17296..f4e0dcddcd34a 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -106,13 +106,13 @@ def to_dict( return into_c((k, v.to_dict(into)) for k, v in df.items()) elif orient == "list": - object_dtype_indices_as_set = set(box_native_indices) + object_dtype_indices_as_set: set[int] = set(box_native_indices) return into_c( ( k, - list(map(maybe_box_native, v.tolist())) + list(map(maybe_box_native, v.to_numpy().tolist())) if i in object_dtype_indices_as_set - else v.tolist(), + else v.to_numpy().tolist(), ) for i, (k, v) in enumerate(df.items()) ) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 58b0e2907b8ce..d275445983b6f 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -12,6 +12,7 @@ Any, Literal, cast, + overload, ) import numpy as np @@ -124,9 +125,33 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: return mask -def clean_fill_method(method: str, allow_nearest: bool = False): +@overload +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill"], + *, + allow_nearest: Literal[False] = ..., +) -> Literal["pad", "backfill"]: + ... + + +@overload +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill", "nearest"], + *, + allow_nearest: Literal[True], +) -> Literal["pad", "backfill", "nearest"]: + ... + + +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill", "nearest"], + *, + allow_nearest: bool = False, +) -> Literal["pad", "backfill", "nearest"]: if isinstance(method, str): - method = method.lower() + # error: Incompatible types in assignment (expression has type "str", variable + # has type "Literal['ffill', 'pad', 'bfill', 'backfill', 'nearest']") + method = method.lower() # type: ignore[assignment] if method == "ffill": method = "pad" elif method == "bfill": @@ -252,7 +277,9 @@ def validate_limit_area(limit_area: str | None) -> Literal["inside", "outside"] return limit_area # type: ignore[return-value] -def infer_limit_direction(limit_direction, method): +def infer_limit_direction( + limit_direction: Literal["backward", "forward", "both"] | None, method: str +) -> Literal["backward", "forward", "both"]: # Set `limit_direction` depending on `method` if limit_direction is None: if method in ("backfill", "bfill"): diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9b8d1c870091d..30d654078bd05 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -24,6 +24,7 @@ Timestamp, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import NDFrameT from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -32,7 +33,10 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -57,6 +61,7 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, + _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -163,6 +168,7 @@ def __init__( gpr_index: Index, group_keys: bool = 
False, selection=None, + include_groups: bool = True, ) -> None: self._timegrouper = timegrouper self.keys = None @@ -171,6 +177,7 @@ def __init__( self.kind = kind self.group_keys = group_keys self.as_index = True + self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -296,7 +303,7 @@ def pipe( 2013-01-01 00:00:02 3 2013-01-01 00:00:03 4 2013-01-01 00:00:04 5 - Freq: S, dtype: int64 + Freq: s, dtype: int64 >>> r = s.resample('2s') @@ -304,7 +311,7 @@ def pipe( 2013-01-01 00:00:00 3 2013-01-01 00:00:02 7 2013-01-01 00:00:04 5 - Freq: 2S, dtype: int64 + Freq: 2s, dtype: int64 >>> r.agg(['sum', 'mean', 'max']) sum mean max @@ -444,7 +451,9 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -456,15 +465,21 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) return self._wrap_result(result) - def _get_resampler_for_grouping(self, groupby: GroupBy, key): + def _get_resampler_for_grouping( + self, groupby: GroupBy, key, include_groups: bool = True + ): """ Return the correct class for resampling with groupby. """ - return self._resampler_for_grouping(groupby=groupby, key=key, parent=self) + return self._resampler_for_grouping( + groupby=groupby, key=key, parent=self, include_groups=include_groups + ) def _wrap_result(self, result): """ @@ -605,7 +620,7 @@ def nearest(self, limit: int | None = None): 2018-01-01 00:30:00 2 2018-01-01 00:45:00 2 2018-01-01 01:00:00 2 - Freq: 15T, dtype: int64 + Freq: 15min, dtype: int64 Limit the number of upsampled values imputed by the nearest: @@ -615,7 +630,7 @@ def nearest(self, limit: int | None = None): 2018-01-01 00:30:00 NaN 2018-01-01 00:45:00 2.0 2018-01-01 01:00:00 2.0 - Freq: 15T, dtype: float64 + Freq: 15min, dtype: float64 """ return self._upsample("nearest", limit=limit) @@ -674,7 +689,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 01:00:00 2 2018-01-01 01:30:00 3 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 >>> s.resample('15min').bfill(limit=2) 2018-01-01 00:00:00 1.0 @@ -686,7 +701,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 01:30:00 3.0 2018-01-01 01:45:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 15T, dtype: float64 + Freq: 15min, dtype: float64 Resampling a DataFrame that has missing values: @@ -787,7 +802,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 01:00:00 2.0 2018-01-01 01:30:00 NaN 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 >>> s.resample('30min').fillna("backfill") 2018-01-01 00:00:00 1 @@ -795,7 +810,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 01:00:00 2 2018-01-01 01:30:00 3 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 >>> s.resample('15min').fillna("backfill", limit=2) 2018-01-01 00:00:00 1.0 @@ -807,7 +822,7 @@ def fillna(self, method, limit: int | None = None): 
2018-01-01 01:30:00 3.0 2018-01-01 01:45:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 15T, dtype: float64 + Freq: 15min, dtype: float64 >>> s.resample('30min').fillna("pad") 2018-01-01 00:00:00 1 @@ -815,7 +830,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 01:00:00 2 2018-01-01 01:30:00 2 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 >>> s.resample('30min').fillna("nearest") 2018-01-01 00:00:00 1 @@ -823,7 +838,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 01:00:00 2 2018-01-01 01:30:00 3 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 Missing values present before the upsampling are not affected. @@ -841,7 +856,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 01:00:00 NaN 2018-01-01 01:30:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 >>> sm.resample('30min').fillna('pad') 2018-01-01 00:00:00 1.0 @@ -849,7 +864,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 01:00:00 NaN 2018-01-01 01:30:00 NaN 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 >>> sm.resample('30min').fillna('nearest') 2018-01-01 00:00:00 1.0 @@ -857,7 +872,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 01:00:00 NaN 2018-01-01 01:30:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 DataFrame resampling is done column-wise. All the same options are available. @@ -1018,7 +1033,7 @@ def interpolate( 2023-03-01 07:00:00 1 2023-03-01 07:00:02 2 2023-03-01 07:00:04 3 - Freq: 2S, dtype: int64 + Freq: 2s, dtype: int64 Downsample the dataframe to 2Hz by providing the period time of 500ms. @@ -1032,7 +1047,7 @@ def interpolate( 2023-03-01 07:00:03.000 1.0 2023-03-01 07:00:03.500 2.0 2023-03-01 07:00:04.000 3.0 - Freq: 500L, dtype: float64 + Freq: 500ms, dtype: float64 Internal reindexing with ``as_freq()`` prior to interpolation leads to an interpolated timeseries on the basis the reindexed timestamps (anchors). @@ -1051,7 +1066,7 @@ def interpolate( 2023-03-01 07:00:03.200 2.6 2023-03-01 07:00:03.600 2.8 2023-03-01 07:00:04.000 3.0 - Freq: 400L, dtype: float64 + Freq: 400ms, dtype: float64 Note that the series erroneously increases between two anchors ``07:00:00`` and ``07:00:02``. 
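The docstring updates above switch to the new-style frequency aliases (``min``, ``s``, ``ms``, ``ME`` in place of ``T``, ``S``, ``L``, ``M``). A short resampling sketch using the lowercase second alias; output is approximate:

>>> import pandas as pd
>>> s = pd.Series(range(4), index=pd.date_range("2023-01-01", periods=4, freq="s"))
>>> s.resample("2s").sum()
2023-01-01 00:00:00    1
2023-01-01 00:00:02    5
Freq: 2s, dtype: int64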
@@ -1590,6 +1605,7 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, + include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1612,6 +1628,7 @@ def __init__( self.ax = parent.ax self.obj = parent.obj + self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1628,7 +1645,7 @@ def func(x): return x.apply(f, *args, **kwargs) - result = self._groupby.apply(func) + result = _apply(self._groupby, func, include_groups=self.include_groups) return self._wrap_result(result) _upsample = _apply @@ -1676,6 +1693,8 @@ def _gotitem(self, key, ndim, subset=None): class DatetimeIndexResampler(Resampler): + ax: DatetimeIndex + @property def _resampler_for_grouping(self): return DatetimeIndexResamplerGroupby @@ -1807,7 +1826,11 @@ def _wrap_result(self, result): return result -class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible +# with definition in base class "DatetimeIndexResampler" +class DatetimeIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, DatetimeIndexResampler +): """ Provides a resample of a groupby implementation """ @@ -1818,6 +1841,10 @@ def _resampler_cls(self): class PeriodIndexResampler(DatetimeIndexResampler): + # error: Incompatible types in assignment (expression has type "PeriodIndex", base + # class "DatetimeIndexResampler" defined the type as "DatetimeIndex") + ax: PeriodIndex # type: ignore[assignment] + @property def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby @@ -1924,7 +1951,11 @@ def _upsample(self, method, limit: int | None = None, fill_value=None): return self._wrap_result(new_obj) -class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with +# definition in base class "PeriodIndexResampler" +class PeriodIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, PeriodIndexResampler +): """ Provides a resample of a groupby implementation. """ @@ -1935,6 +1966,10 @@ def _resampler_cls(self): class TimedeltaIndexResampler(DatetimeIndexResampler): + # error: Incompatible types in assignment (expression has type "TimedeltaIndex", + # base class "DatetimeIndexResampler" defined the type as "DatetimeIndex") + ax: TimedeltaIndex # type: ignore[assignment] + @property def _resampler_for_grouping(self): return TimedeltaIndexResamplerGroupby @@ -1952,7 +1987,11 @@ def _adjust_binner_for_upsample(self, binner): return binner -class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with +# definition in base class "DatetimeIndexResampler" +class TimedeltaIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, TimedeltaIndexResampler +): """ Provides a resample of a groupby implementation. """ @@ -1966,7 +2005,7 @@ def get_resampler(obj: Series | DataFrame, kind=None, **kwds) -> Resampler: """ Create a TimeGrouper and return our resampler. 
""" - tg = TimeGrouper(**kwds) + tg = TimeGrouper(obj, **kwds) # type: ignore[arg-type] return tg._get_resampler(obj, kind=kind) @@ -1981,6 +2020,7 @@ def get_resampler_for_grouping( limit: int | None = None, kind=None, on=None, + include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -1989,7 +2029,9 @@ def get_resampler_for_grouping( # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) - return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) + return resampler._get_resampler_for_grouping( + groupby=groupby, include_groups=include_groups, key=tg.key + ) class TimeGrouper(Grouper): @@ -2019,7 +2061,9 @@ class TimeGrouper(Grouper): def __init__( self, + obj: Grouper | None = None, freq: Frequency = "Min", + key: str | None = None, closed: Literal["left", "right"] | None = None, label: Literal["left", "right"] | None = None, how: str = "mean", @@ -2043,9 +2087,21 @@ def __init__( if convention not in {None, "start", "end", "e", "s"}: raise ValueError(f"Unsupported value {convention} for `convention`") - freq = to_offset(freq) + if ( + key is None + and obj is not None + and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + or ( + key is not None + and obj is not None + and getattr(obj[key], "dtype", None) == "period" # type: ignore[index] + ) + ): + freq = to_offset(freq, is_period=True) + else: + freq = to_offset(freq) - end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"} + end_types = {"ME", "A", "Q", "BM", "BA", "BQ", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2107,7 +2163,7 @@ def __init__( # always sort time groupers kwargs["sort"] = True - super().__init__(freq=freq, axis=axis, **kwargs) + super().__init__(freq=freq, key=key, axis=axis, **kwargs) def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: """ @@ -2129,7 +2185,6 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: """ _, ax, indexer = self._set_grouper(obj, gpr_index=None) - if isinstance(ax, DatetimeIndex): return DatetimeIndexResampler( obj, @@ -2688,6 +2743,9 @@ def asfreq( if how is None: how = "E" + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) + new_obj = obj.copy() new_obj.index = obj.index.asfreq(freq, how=how) @@ -2767,3 +2825,18 @@ def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None: category=FutureWarning, stacklevel=find_stack_level(), ) + + +def _apply( + grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs +) -> DataFrame: + # GH#7155 - rewrite warning to appear as if it came from `.resample` + target_message = "DataFrameGroupBy.apply operated on the grouping columns" + new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") + with rewrite_warning( + target_message=target_message, + target_category=FutureWarning, + new_message=new_message, + ): + result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) + return result diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index ffa7199921298..1bc548de91f01 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -76,7 +76,7 @@ def concat( axis: Literal[0, "index"] = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -93,7 +93,7 
@@ def concat( axis: Literal[0, "index"] = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -110,7 +110,7 @@ def concat( axis: Literal[0, "index"] = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -127,7 +127,7 @@ def concat( axis: Literal[1, "columns"], join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -144,7 +144,7 @@ def concat( axis: Axis = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -160,7 +160,7 @@ def concat( axis: Axis = 0, join: str = "outer", ignore_index: bool = False, - keys=None, + keys: Iterable[Hashable] | None = None, levels=None, names: list[HashableT] | None = None, verify_integrity: bool = False, @@ -405,7 +405,7 @@ def __init__( objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], axis: Axis = 0, join: str = "outer", - keys=None, + keys: Iterable[Hashable] | None = None, levels=None, names: list[HashableT] | None = None, ignore_index: bool = False, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6987a0ac7bf6b..6d1ff07e07c76 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -53,6 +53,7 @@ ensure_object, is_bool, is_bool_dtype, + is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -90,6 +91,7 @@ ExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -103,6 +105,7 @@ from pandas import DataFrame from pandas.core import groupby from pandas.core.arrays import DatetimeArray + from pandas.core.indexes.frozen import FrozenList _factorizers = { np.int64: libhashtable.Int64Factorizer, @@ -1269,12 +1272,7 @@ def _get_merge_keys( # work-around for merge_asof(right_index=True) right_keys.append(right.index._values) if lk is not None and lk == rk: # FIXME: what about other NAs? 
- # avoid key upcast in corner case (length-0) - lk = cast(Hashable, lk) - if len(left) > 0: - right_drop.append(rk) - else: - left_drop.append(lk) + right_drop.append(rk) else: rk = cast(ArrayLike, rk) right_keys.append(rk) @@ -1384,6 +1382,21 @@ def _maybe_coerce_merge_keys(self) -> None: if lk.dtype.kind == rk.dtype.kind: continue + if is_extension_array_dtype(lk.dtype) and not is_extension_array_dtype( + rk.dtype + ): + ct = find_common_type([lk.dtype, rk.dtype]) + if is_extension_array_dtype(ct): + rk = ct.construct_array_type()._from_sequence(rk) # type: ignore[union-attr] # noqa: E501 + else: + rk = rk.astype(ct) # type: ignore[arg-type] + elif is_extension_array_dtype(rk.dtype): + ct = find_common_type([lk.dtype, rk.dtype]) + if is_extension_array_dtype(ct): + lk = ct.construct_array_type()._from_sequence(lk) # type: ignore[union-attr] # noqa: E501 + else: + lk = lk.astype(ct) # type: ignore[arg-type] + # check whether ints and floats if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype): # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int @@ -1678,6 +1691,9 @@ def get_join_indexers( elif not sort and how in ["left", "outer"]: return _get_no_sort_one_missing_indexer(left_n, False) + if not sort and how == "outer": + sort = True + # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) @@ -1696,7 +1712,7 @@ def get_join_indexers( lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = {} - if how in ("left", "right"): + if how in ("inner", "left", "right"): kwargs["sort"] = sort join_func = { "inner": libjoin.inner_join, @@ -1718,7 +1734,7 @@ def restore_dropped_levels_multijoin( join_index: Index, lindexer: npt.NDArray[np.intp], rindexer: npt.NDArray[np.intp], -) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]: +) -> tuple[FrozenList, FrozenList, FrozenList]: """ *this is an internal non-public method* @@ -1794,7 +1810,7 @@ def _convert_to_multiindex(index: Index) -> MultiIndex: # error: Cannot determine type of "__add__" join_levels = join_levels + [restore_levels] # type: ignore[has-type] - join_codes = join_codes + [restore_codes] + join_codes = join_codes + [restore_codes] # type: ignore[has-type] join_names = join_names + [dropped_level_name] return join_levels, join_codes, join_names @@ -2399,21 +2415,10 @@ def _factorize_keys( rk = ensure_int64(rk.codes) elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: - if not isinstance(lk, BaseMaskedArray) and not ( - # exclude arrow dtypes that would get cast to object - isinstance(lk.dtype, ArrowDtype) - and ( - is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort - ) + if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( + isinstance(lk.dtype, StringDtype) + and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] ): - lk, _ = lk._values_for_factorize() - - # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute - # "_values_for_factorize" - rk, _ = rk._values_for_factorize() # type: ignore[union-attr] - elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype): import pyarrow as pa import pyarrow.compute as pc @@ -2428,14 +2433,33 @@ def _factorize_keys( length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(), - pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(), + 
pc.fill_null(dc.indices[slice(len_lk)], length) + .to_numpy() + .astype(np.intp, copy=False), + pc.fill_null(dc.indices[slice(len_lk, None)], length) + .to_numpy() + .astype(np.intp, copy=False), len(dc.dictionary), ) if how == "right": return rlab, llab, count return llab, rlab, count + if not isinstance(lk, BaseMaskedArray) and not ( + # exclude arrow dtypes that would get cast to object + isinstance(lk.dtype, ArrowDtype) + and ( + is_numeric_dtype(lk.dtype.numpy_dtype) + or is_string_dtype(lk.dtype) + and not sort + ) + ): + lk, _ = lk._values_for_factorize() + + # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute + # "_values_for_factorize" + rk, _ = rk._values_for_factorize() # type: ignore[union-attr] + if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: # GH#23917 TODO: Needs tests for non-matching dtypes # GH#23917 TODO: needs tests for case where lk is integer-dtype diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 71e3ea5b2588e..79354fdd12a2d 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -7,6 +7,7 @@ from typing import ( TYPE_CHECKING, Callable, + Literal, cast, ) @@ -449,7 +450,7 @@ def _all_key(): return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -467,7 +468,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) @@ -522,6 +523,7 @@ def pivot( cols + columns_listlike, append=append # type: ignore[operator] ) else: + index_list: list[Index] | list[Series] if index is lib.no_default: if isinstance(data.index, MultiIndex): # GH 23955 @@ -568,7 +570,7 @@ def crosstab( margins: bool = False, margins_name: Hashable = "All", dropna: bool = True, - normalize: bool = False, + normalize: bool | Literal[0, 1, "all", "index", "columns"] = False, ) -> DataFrame: """ Compute a simple cross tabulation of two (or more) factors. 
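# Usage sketch for the `normalize` values spelled out in the crosstab
# annotation above (public pd.crosstab API; the data is illustrative):
import pandas as pd

a = pd.Series(["x", "x", "y", "y"])
b = pd.Series([1, 2, 1, 1])
# normalize="index" divides each row by its row total; "columns", "all",
# 0, 1 and False are the other accepted values
pd.crosstab(a, b, normalize="index")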
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fc8d827cd31bb..bf7c7a1ee4dc7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -908,7 +908,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: data = frame.copy() else: # Take the data from frame corresponding to this idx value - if not isinstance(idx, tuple): + if len(level) == 1: idx = (idx,) gen = iter(idx) column_indexer = tuple( diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 43eea7c669ce7..980e8aa41669f 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -17,10 +17,8 @@ Timestamp, lib, ) -from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( - DT64NS_DTYPE, ensure_platform_int, is_bool_dtype, is_integer, @@ -43,7 +41,6 @@ to_datetime, to_timedelta, ) -from pandas.core import nanops import pandas.core.algorithms as algos if TYPE_CHECKING: @@ -243,66 +240,33 @@ def cut( # NOTE: this binning code is changed a bit from histogram for var(x) == 0 original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, _ = _coerce_to_type(x_idx) if not np.iterable(bins): - if is_scalar(bins) and bins < 1: - raise ValueError("`bins` should be a positive integer.") - - sz = x.size - - if sz == 0: - raise ValueError("Cannot cut empty array") - - rng = (nanops.nanmin(x), nanops.nanmax(x)) - mn, mx = (mi + 0.0 for mi in rng) - - if np.isinf(mn) or np.isinf(mx): - # GH 24314 - raise ValueError( - "cannot specify integer `bins` when input data contains infinity" - ) - if mn == mx: # adjust end points before binning - mn -= 0.001 * abs(mn) if mn != 0 else 0.001 - mx += 0.001 * abs(mx) if mx != 0 else 0.001 - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - else: # adjust end points after binning - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - adj = (mx - mn) * 0.001 # 0.1% of the range - if right: - bins[0] -= adj - else: - bins[-1] += adj + bins = _nbins_to_bins(x_idx, bins, right) elif isinstance(bins, IntervalIndex): if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") else: - if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype): - bins = np.asarray(bins, dtype=DT64NS_DTYPE) - else: - bins = np.asarray(bins) - bins = _convert_bin_to_numeric_type(bins, dtype) - - # GH 26045: cast to float64 to avoid an overflow - if (np.diff(bins.astype("float64")) < 0).any(): + bins = Index(bins) + if not bins.is_monotonic_increasing: raise ValueError("bins must increase monotonically.") fac, bins = _bins_to_cuts( - x, + x_idx, bins, right=right, labels=labels, precision=precision, include_lowest=include_lowest, - dtype=dtype, duplicates=duplicates, ordered=ordered, ) - return _postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) def qcut( @@ -367,36 +331,90 @@ def qcut( array([0, 0, 1, 2, 3]) """ original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, _ = _coerce_to_type(x_idx) quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q - x_np = np.asarray(x) - x_np = x_np[~np.isnan(x_np)] - bins = np.quantile(x_np, quantiles) + bins = x_idx.to_series().dropna().quantile(quantiles) fac, bins = _bins_to_cuts( - x, - bins, + x_idx, + Index(bins), labels=labels, precision=precision, include_lowest=True, - dtype=dtype, duplicates=duplicates, ) - return 
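# Usage sketch for the cut/qcut paths being refactored here, using the public
# pd.cut / pd.qcut API (the sample values are illustrative):
import numpy as np
import pandas as pd

values = np.array([1.0, 7.0, 5.0, 4.0, 6.0, 3.0])
# An integer `bins` is expanded to equal-width edges (see _nbins_to_bins below)
equal_width = pd.cut(values, bins=3)
# qcut builds the edges from quantiles of the data instead
equal_freq, edges = pd.qcut(values, q=4, retbins=True)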
_postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) + + +def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: + """ + If a user passed an integer N for bins, convert this to a sequence of N + equal(ish)-sized bins. + """ + if is_scalar(nbins) and nbins < 1: + raise ValueError("`bins` should be a positive integer.") + + if x_idx.size == 0: + raise ValueError("Cannot cut empty array") + + rng = (x_idx.min(), x_idx.max()) + mn, mx = rng + + is_dt_or_td = lib.is_np_dtype(x_idx.dtype, "mM") or isinstance( + x_idx.dtype, DatetimeTZDtype + ) + + if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)): + # GH#24314 + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + + if mn == mx: # adjust end points before binning + if is_dt_or_td: + # using seconds=1 is pretty arbitrary here + td = Timedelta(seconds=1) + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn - td, end=mx + td, periods=nbins + 1, freq=None + ) + else: + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + else: # adjust end points after binning + if is_dt_or_td: + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn, end=mx, periods=nbins + 1, freq=None + ) + else: + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + return Index(bins) def _bins_to_cuts( - x, - bins: np.ndarray, + x_idx: Index, + bins: Index, right: bool = True, labels=None, precision: int = 3, include_lowest: bool = False, - dtype: DtypeObj | None = None, duplicates: str = "raise", ordered: bool = True, ): @@ -408,9 +426,11 @@ def _bins_to_cuts( "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) + result: Categorical | np.ndarray + if isinstance(bins, IntervalIndex): # we have a fast-path here - ids = bins.get_indexer(x) + ids = bins.get_indexer(x_idx) cat_dtype = CategoricalDtype(bins, ordered=True) result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False) return result, bins @@ -425,12 +445,29 @@ def _bins_to_cuts( bins = unique_bins side: Literal["left", "right"] = "left" if right else "right" - ids = ensure_platform_int(bins.searchsorted(x, side=side)) + + try: + ids = bins.searchsorted(x_idx, side=side) + except TypeError as err: + # e.g. 
test_datetime_nan_error if bins are DatetimeArray and x_idx + # is integers + if x_idx.dtype.kind == "m": + raise ValueError("bins must be of timedelta64 dtype") from err + elif x_idx.dtype.kind == bins.dtype.kind == "M": + raise ValueError( + "Cannot use timezone-naive bins with timezone-aware values, " + "or vice-versa" + ) from err + elif x_idx.dtype.kind == "M": + raise ValueError("bins must be of datetime64 dtype") from err + else: + raise + ids = ensure_platform_int(ids) if include_lowest: - ids[np.asarray(x) == bins[0]] = 1 + ids[x_idx == bins[0]] = 1 - na_mask = isna(x) | (ids == len(bins)) | (ids == 0) + na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: @@ -442,7 +479,7 @@ def _bins_to_cuts( if labels is None: labels = _format_labels( - bins, precision, right=right, include_lowest=include_lowest, dtype=dtype + bins, precision, right=right, include_lowest=include_lowest ) elif ordered and len(set(labels)) != len(labels): raise ValueError( @@ -474,7 +511,7 @@ def _bins_to_cuts( return result, bins -def _coerce_to_type(x): +def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: """ if the passed data is of datetime/timedelta, bool or nullable int type, this method converts it to numeric so that cut or qcut method can @@ -498,91 +535,31 @@ def _coerce_to_type(x): # https://github.com/pandas-dev/pandas/pull/31290 # https://github.com/pandas-dev/pandas/issues/31389 elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype): - x = x.to_numpy(dtype=np.float64, na_value=np.nan) - - if dtype is not None: - # GH 19768: force NaT to NaN during integer conversion - x = np.where(x.notna(), x.view(np.int64), np.nan) - - return x, dtype - - -def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None): - """ - if the passed bin is of datetime/timedelta type, - this method converts it to integer - - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data + x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) + x = Index(x_arr) - Raises - ------ - ValueError if bins are not of a compat dtype to dtype - """ - bins_dtype = infer_dtype(bins, skipna=False) - if lib.is_np_dtype(dtype, "m"): - if bins_dtype in ["timedelta", "timedelta64"]: - bins = to_timedelta(bins).view(np.int64) - else: - raise ValueError("bins must be of timedelta64 dtype") - elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype): - if bins_dtype in ["datetime", "datetime64"]: - bins = to_datetime(bins) - if lib.is_np_dtype(bins.dtype, "M"): - # As of 2.0, to_datetime may give non-nano, so we need to convert - # here until the rest of this file recognizes non-nano - bins = bins.astype("datetime64[ns]", copy=False) - bins = bins.view(np.int64) - else: - raise ValueError("bins must be of datetime64 dtype") - - return bins - - -def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None): - """ - Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is - datelike - - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data - - Returns - ------- - bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is - datelike - """ - if isinstance(dtype, DatetimeTZDtype): - bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) - elif lib.is_np_dtype(dtype, "mM"): - bins = Index(bins.astype(np.int64), dtype=dtype) - return bins + return Index(x), dtype def _format_labels( - bins, + bins: Index, precision: int, right: bool = True, include_lowest: bool = False, - dtype: 
DtypeObj | None = None, ): """based on the dtype, return our labels""" closed: IntervalLeftRight = "right" if right else "left" formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] - if isinstance(dtype, DatetimeTZDtype): - formatter = lambda x: Timestamp(x, tz=dtype.tz) + if isinstance(bins.dtype, DatetimeTZDtype): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "M"): - formatter = Timestamp + elif lib.is_np_dtype(bins.dtype, "M"): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "m"): - formatter = Timedelta + elif lib.is_np_dtype(bins.dtype, "m"): + formatter = lambda x: x adjust = lambda x: x - Timedelta("1ns") else: precision = _infer_precision(precision, bins) @@ -597,7 +574,7 @@ def _format_labels( return IntervalIndex.from_breaks(breaks, closed=closed) -def _preprocess_for_cut(x): +def _preprocess_for_cut(x) -> Index: """ handles preprocessing for cut where we convert passed input to array, strip the index information and store it @@ -611,10 +588,10 @@ def _preprocess_for_cut(x): if x.ndim != 1: raise ValueError("Input array must be 1 dimensional") - return x + return Index(x) -def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original): +def _postprocess_for_cut(fac, bins, retbins: bool, original): """ handles post processing for the cut method where we combine the index information if the originally passed @@ -626,7 +603,8 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi if not retbins: return fac - bins = _convert_bin_to_datelike_type(bins, dtype) + if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): + bins = bins._values return fac, bins @@ -646,7 +624,7 @@ def _round_frac(x, precision: int): return np.around(x, digits) -def _infer_precision(base_precision: int, bins) -> int: +def _infer_precision(base_precision: int, bins: Index) -> int: """ Infer an appropriate precision for _round_frac """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 564c799d7ab66..d3a2bb1745cd1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -26,10 +26,8 @@ import numpy as np -from pandas._config import ( - get_option, - using_copy_on_write, -) +from pandas._config import using_copy_on_write +from pandas._config.config import _get_option from pandas._libs import ( lib, @@ -76,10 +74,7 @@ pandas_dtype, validate_all_hashable, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( @@ -101,6 +96,7 @@ from pandas.core.accessor import CachedAccessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.arrow import StructAccessor from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor from pandas.core.construction import ( @@ -166,6 +162,7 @@ DtypeBackend, DtypeObj, FilePath, + Frequency, IgnoreRaise, IndexKeyFunc, IndexLabel, @@ -405,7 +402,7 @@ def __init__( if fastpath: # data is a ndarray, index is defined if not isinstance(data, (SingleBlockManager, SingleArrayManager)): - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "block": data = SingleBlockManager.from_array(data, index) elif manager == 
"array": @@ -511,7 +508,7 @@ def __init__( else: data = sanitize_array(data, index, dtype, copy) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "block": data = SingleBlockManager.from_array(data, index, refs=refs) elif manager == "array": @@ -3985,6 +3982,8 @@ def argsort( mask = isna(values) if mask.any(): + # TODO(3.0): once this deprecation is enforced we can call + # self.array.argsort directly, which will close GH#43840 warnings.warn( "The behavior of Series.argsort in the presence of NA values is " "deprecated. In a future version, NA values will be ordered " @@ -4386,7 +4385,7 @@ def explode(self, ignore_index: bool = False) -> Series: 3 4 dtype: object """ - if isinstance(self.dtype, ArrowDtype) and self.dtype.type == list: + if isinstance(self.dtype, ExtensionDtype): values, counts = self._values._explode() elif len(self) and is_object_dtype(self.dtype): values, counts = reshape.explode(np.asarray(self._values)) @@ -4395,7 +4394,7 @@ def explode(self, ignore_index: bool = False) -> Series: return result.reset_index(drop=True) if ignore_index else result if ignore_index: - index = default_index(len(values)) + index: Index = default_index(len(values)) else: index = self.index.repeat(counts) @@ -4634,11 +4633,6 @@ def apply( """ Invoke function on values of Series. - .. deprecated:: 2.1.0 - - If the result from ``func`` is a ``Series``, wrapping the output in a - ``DataFrame`` instead of a ``Series`` has been deprecated. - Can be ufunc (a NumPy function that applies to the entire Series) or a Python function that only works on single values. @@ -5199,6 +5193,7 @@ def info( show_counts=show_counts, ) + # TODO(3.0): this can be removed once GH#33302 deprecation is enforced def _replace_single(self, to_replace, method: str, inplace: bool, limit): """ Replaces values in a Series using the fill method specified when no @@ -5627,7 +5622,7 @@ def dropna( def to_timestamp( self, - freq=None, + freq: Frequency | None = None, how: Literal["s", "e", "start", "end"] = "start", copy: bool | None = None, ) -> Series: @@ -5750,7 +5745,6 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series See Also -------- Series.reindex : Conform Series to new index. - Series.set_index : Set Series as DataFrame index. Index : The base pandas index type. 
Notes @@ -5784,6 +5778,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series cat = CachedAccessor("cat", CategoricalAccessor) plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) sparse = CachedAccessor("sparse", SparseAccessor) + struct = CachedAccessor("struct", StructAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series @@ -5943,55 +5938,61 @@ def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0 return op(self, other) @Appender(ops.make_flex_doc("eq", "series")) - def eq(self, other, level=None, fill_value=None, axis: Axis = 0): + def eq( + self, + other, + level: Level | None = None, + fill_value: float | None = None, + axis: Axis = 0, + ) -> Series: return self._flex_method( other, operator.eq, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("ne", "series")) - def ne(self, other, level=None, fill_value=None, axis: Axis = 0): + def ne(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.ne, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("le", "series")) - def le(self, other, level=None, fill_value=None, axis: Axis = 0): + def le(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.le, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("lt", "series")) - def lt(self, other, level=None, fill_value=None, axis: Axis = 0): + def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.lt, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("ge", "series")) - def ge(self, other, level=None, fill_value=None, axis: Axis = 0): + def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.ge, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("gt", "series")) - def gt(self, other, level=None, fill_value=None, axis: Axis = 0): + def gt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.gt, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("add", "series")) - def add(self, other, level=None, fill_value=None, axis: Axis = 0): + def add(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.add, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("radd", "series")) - def radd(self, other, level=None, fill_value=None, axis: Axis = 0): + def radd(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.radd, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("sub", "series")) - def sub(self, other, level=None, fill_value=None, axis: Axis = 0): + def sub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.sub, level=level, fill_value=fill_value, axis=axis ) @@ -5999,7 +6000,7 @@ def sub(self, other, level=None, fill_value=None, axis: Axis = 0): subtract = sub @Appender(ops.make_flex_doc("rsub", "series")) - def rsub(self, other, level=None, fill_value=None, axis: Axis = 0): + def rsub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rsub, level=level, 
fill_value=fill_value, axis=axis ) @@ -6011,7 +6012,7 @@ def mul( level: Level | None = None, fill_value: float | None = None, axis: Axis = 0, - ): + ) -> Series: return self._flex_method( other, operator.mul, level=level, fill_value=fill_value, axis=axis ) @@ -6019,13 +6020,13 @@ def mul( multiply = mul @Appender(ops.make_flex_doc("rmul", "series")) - def rmul(self, other, level=None, fill_value=None, axis: Axis = 0): + def rmul(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rmul, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("truediv", "series")) - def truediv(self, other, level=None, fill_value=None, axis: Axis = 0): + def truediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.truediv, level=level, fill_value=fill_value, axis=axis ) @@ -6034,7 +6035,7 @@ def truediv(self, other, level=None, fill_value=None, axis: Axis = 0): divide = truediv @Appender(ops.make_flex_doc("rtruediv", "series")) - def rtruediv(self, other, level=None, fill_value=None, axis: Axis = 0): + def rtruediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis ) @@ -6042,49 +6043,49 @@ def rtruediv(self, other, level=None, fill_value=None, axis: Axis = 0): rdiv = rtruediv @Appender(ops.make_flex_doc("floordiv", "series")) - def floordiv(self, other, level=None, fill_value=None, axis: Axis = 0): + def floordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.floordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rfloordiv", "series")) - def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0): + def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("mod", "series")) - def mod(self, other, level=None, fill_value=None, axis: Axis = 0): + def mod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.mod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rmod", "series")) - def rmod(self, other, level=None, fill_value=None, axis: Axis = 0): + def rmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rmod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("pow", "series")) - def pow(self, other, level=None, fill_value=None, axis: Axis = 0): + def pow(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.pow, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rpow", "series")) - def rpow(self, other, level=None, fill_value=None, axis: Axis = 0): + def rpow(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rpow, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("divmod", "series")) - def divmod(self, other, level=None, fill_value=None, axis: Axis = 0): + def divmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, divmod, level=level, fill_value=fill_value, axis=axis ) 
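# Usage sketch for the flex arithmetic methods annotated above (public
# Series API; the data and fill value are illustrative):
import pandas as pd

s1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
s2 = pd.Series([10, 20], index=["b", "c"])
# Unlike `s1 + s2`, the method form can fill missing alignment slots first
s1.add(s2, fill_value=0)   # -> Series, as the new return annotations state
s1.rsub(s2, fill_value=0)  # reflected variant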
@Appender(ops.make_flex_doc("rdivmod", "series")) - def rdivmod(self, other, level=None, fill_value=None, axis: Axis = 0): + def rdivmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rdivmod, level=level, fill_value=fill_value, axis=axis ) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 7579f816d0ace..ec219941a3afc 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -34,8 +34,6 @@ * scalar : when Series.agg is called with single function * Series : when DataFrame.agg is called with a single function * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. {see_also} Notes ----- @@ -113,6 +111,12 @@ axis : {0 or 'index', 1 or 'columns'}, default 0 Split along rows (0) or columns (1). For `Series` this parameter is unused and defaults to 0. + + .. deprecated:: 2.1.0 + + Will be removed and behave like axis=0 in a future version. + For ``axis=1``, do ``frame.T.groupby(...)`` instead. + level : int, level name, or sequence of such, default None If the axis is a MultiIndex (hierarchical), group by a particular level or levels. Do not specify both ``by`` and ``level``. @@ -213,7 +217,7 @@ Name to use for the 'variable' column. If None it uses ``frame.columns.name`` or 'variable'. value_name : scalar, default 'value' - Name to use for the 'value' column. + Name to use for the 'value' column, can't be an existing column label. col_level : int or str, optional If columns are a MultiIndex then use this level to melt. ignore_index : bool, default True diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index bc6e29c3c7fbf..d96fc02e16d0d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -98,17 +98,17 @@ def get_indexer_indexer( sort_remaining=sort_remaining, na_position=na_position, ) + elif (ascending and target.is_monotonic_increasing) or ( + not ascending and target.is_monotonic_decreasing + ): + # Check monotonic-ness before sort an index (GH 11080) + return None elif isinstance(target, ABCMultiIndex): + codes = [lev.codes for lev in target._get_codes_for_sorting()] indexer = lexsort_indexer( - target.codes, orders=ascending, na_position=na_position, codes_given=True + codes, orders=ascending, na_position=na_position, codes_given=True ) else: - # Check monotonic-ness before sort an index (GH 11080) - if (ascending and target.is_monotonic_increasing) or ( - not ascending and target.is_monotonic_decreasing - ): - return None - # ascending can only be a Sequence for MultiIndex indexer = nargsort( target, @@ -298,22 +298,8 @@ def decons_obs_group_ids( return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] -def indexer_from_factorized( - labels, shape: Shape, compress: bool = True -) -> npt.NDArray[np.intp]: - ids = get_group_index(labels, shape, sort=True, xnull=False) - - if not compress: - ngroups = (ids.size and ids.max()) + 1 - else: - ids, obs = compress_group_index(ids, sort=True) - ngroups = len(obs) - - return get_group_index_sorter(ids, ngroups) - - def lexsort_indexer( - keys: list[ArrayLike] | list[Series], + keys: Sequence[ArrayLike | Index | Series], orders=None, na_position: str = "last", key: Callable | None = None, @@ -324,9 +310,9 @@ def lexsort_indexer( Parameters ---------- - keys : list[ArrayLike] | list[Series] - Sequence of ndarrays to be sorted by the indexer - list[Series] is only if key is not None. 
+ keys : Sequence[ArrayLike | Index | Series] + Sequence of arrays to be sorted by the indexer + Sequence[Series] is only if key is not None. orders : bool or list of booleans, optional Determines the sorting order for each element in keys. If a list, it must be the same length as keys. This determines whether the @@ -346,83 +332,38 @@ def lexsort_indexer( """ from pandas.core.arrays import Categorical - labels = [] - shape = [] + if na_position not in ["last", "first"]: + raise ValueError(f"invalid na_position: {na_position}") + if isinstance(orders, bool): orders = [orders] * len(keys) elif orders is None: orders = [True] * len(keys) - # error: Incompatible types in assignment (expression has type - # "List[Union[ExtensionArray, ndarray[Any, Any], Index, Series]]", variable - # has type "Union[List[Union[ExtensionArray, ndarray[Any, Any]]], List[Series]]") - keys = [ensure_key_mapped(k, key) for k in keys] # type: ignore[assignment] + labels = [] for k, order in zip(keys, orders): - if na_position not in ["last", "first"]: - raise ValueError(f"invalid na_position: {na_position}") - + k = ensure_key_mapped(k, key) if codes_given: - mask = k == -1 - codes = k.copy() - n = len(codes) - mask_n = n - # error: Item "ExtensionArray" of "Union[Any, ExtensionArray, - # ndarray[Any, Any]]" has no attribute "any" - if mask.any(): # type: ignore[union-attr] - n -= 1 - + codes = cast(np.ndarray, k) + n = codes.max() + 1 if len(codes) else 0 else: cat = Categorical(k, ordered=True) + codes = cat.codes n = len(cat.categories) - codes = cat.codes.copy() - mask = cat.codes == -1 - if mask.any(): - mask_n = n + 1 - else: - mask_n = n - - if order: # ascending - if na_position == "last": - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where(mask, n, codes) # type: ignore[arg-type] - elif na_position == "first": - # error: Incompatible types in assignment (expression has type - # "Union[Any, int, ndarray[Any, dtype[signedinteger[Any]]]]", - # variable has type "Union[Series, ExtensionArray, ndarray[Any, Any]]") - # error: Unsupported operand types for + ("ExtensionArray" and "int") - codes += 1 # type: ignore[operator,assignment] - else: # not order means descending - if na_position == "last": - # error: Unsupported operand types for - ("int" and "ExtensionArray") - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where( - mask, n, n - codes - 1 # type: ignore[operator,arg-type] - ) - elif na_position == "first": - # error: Unsupported operand types for - ("int" and "ExtensionArray") - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where(mask, 0, n - codes) # type: ignore[operator,arg-type] - - shape.append(mask_n) + + mask = codes == -1 + + if na_position == "last" and mask.any(): + 
codes = np.where(mask, n, codes) + + # not order means descending + if not order: + codes = np.where(mask, codes, n - codes - 1) + labels.append(codes) - return indexer_from_factorized(labels, tuple(shape)) + return np.lexsort(labels[::-1]) def nargsort( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index becf9b47b3af1..71d6f9c58e2c2 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -145,7 +145,9 @@ def _map_and_wrap(name: str | None, docstring: str | None): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result) + return self._wrap_result( + result, returns_string=name not in ("isnumeric", "isdecimal") + ) wrapper.__doc__ = docstring return wrapper @@ -370,7 +372,7 @@ def cons_row(x): if expand: result = list(result) - out = MultiIndex.from_tuples(result, names=name) + out: Index = MultiIndex.from_tuples(result, names=name) if out.nlevels == 1: # We had all tuples of length-one, which are # better represented as a regular Index. diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 3f2f832c08dc6..a9abc0714baa3 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -7,7 +7,6 @@ TYPE_CHECKING, overload, ) -import warnings import numpy as np @@ -20,7 +19,6 @@ Timedelta, parse_timedelta_unit, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.dtypes import ArrowDtype @@ -115,15 +113,17 @@ def to_timedelta( * 'D' / 'days' / 'day' * 'hours' / 'hour' / 'hr' / 'h' * 'm' / 'minute' / 'min' / 'minutes' / 'T' - * 'S' / 'seconds' / 'sec' / 'second' + * 's' / 'seconds' / 'sec' / 'second' / 'S' * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' Must not be specified when `arg` context strings and ``errors="raise"``. - .. deprecated:: 2.1.0 - Units 'T' and 'L' are deprecated and will be removed in a future version. + .. deprecated:: 2.2.0 + Units 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed + in a future version. Please use 'min', 's', 'ms', 'us', and 'ns' instead of + 'T', 'S', 'L', 'U' and 'N'. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. @@ -176,13 +176,6 @@ def to_timedelta( TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ - if unit in {"T", "t", "L", "l"}: - warnings.warn( - f"Unit '{unit}' is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if unit is not None: unit = parse_timedelta_unit(unit) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index becbba703f92c..72e94d049a9de 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -937,6 +937,11 @@ class Window(BaseWindow): For `Series` this parameter is unused and defaults to 0. + .. deprecated:: 2.1.0 + + The axis keyword is deprecated. For ``axis=1``, + transpose the DataFrame first instead. + closed : str, default None If ``'right'``, the first point in the window is excluded from calculations. 
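# Usage sketch for the unit spellings deprecated above (public
# pd.to_timedelta API; the values are illustrative):
import pandas as pd

pd.to_timedelta([1, 2, 3], unit="min")  # preferred over the deprecated "T"
pd.to_timedelta(90, unit="s")           # preferred over the deprecated "S"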
diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 27316b3ab0af0..3b2ae5daffdba 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -28,4 +28,7 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return {pa.string(): pd.ArrowDtype(pa.string())}.get + return { + pa.string(): pd.StringDtype(storage="pyarrow_numpy"), + pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), + }.get diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index c07f51d875d4d..6491849925e86 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -4,7 +4,7 @@ A cross-platform clipboard module for Python, with copy & paste functions for plain text. By Al Sweigart al@inventwithpython.com -BSD License +Licence at LICENSES/PYPERCLIP_LICENSE Usage: import pyperclip @@ -17,9 +17,12 @@ On Windows, no additional modules are needed. On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli commands. (These commands should come with OS X.). -On Linux, install xclip or xsel via package manager. For example, in Debian: +On Linux, install xclip, xsel, or wl-clipboard (for "wayland" sessions) via +package manager. +For example, in Debian: sudo apt-get install xclip sudo apt-get install xsel + sudo apt-get install wl-clipboard Otherwise on Linux, you will need the PyQt5 modules installed. @@ -28,12 +31,11 @@ Cygwin is currently not supported. Security Note: This module runs programs with these names: - - which - - where - pbcopy - pbpaste - xclip - xsel + - wl-copy/wl-paste - klipper - qdbus A malicious user could rename or add programs with these names, tricking @@ -41,7 +43,7 @@ """ -__version__ = "1.7.0" +__version__ = "1.8.2" import contextlib @@ -55,7 +57,7 @@ ) import os import platform -from shutil import which +from shutil import which as _executable_exists import subprocess import time import warnings @@ -74,25 +76,14 @@ EXCEPT_MSG = """ Pyperclip could not find a copy/paste mechanism for your system. For more information, please visit - https://pyperclip.readthedocs.io/en/latest/#not-implemented-error + https://pyperclip.readthedocs.io/en/latest/index.html#not-implemented-error """ ENCODING = "utf-8" -# The "which" unix command finds where a command is. -if platform.system() == "Windows": - WHICH_CMD = "where" -else: - WHICH_CMD = "which" - -def _executable_exists(name): - return ( - subprocess.call( - [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - == 0 - ) +class PyperclipTimeoutException(PyperclipException): + pass def _stringifyText(text) -> str: @@ -229,6 +220,32 @@ def paste_xsel(primary=False): return copy_xsel, paste_xsel +def init_wl_clipboard(): + PRIMARY_SELECTION = "-p" + + def copy_wl(text, primary=False): + text = _stringifyText(text) # Converts non-str values to str. 
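# Usage sketch for the pyarrow-string mapping above: with the 2.1-era
# ``future.infer_string`` option enabled, string data lands in the new
# string[pyarrow_numpy] dtype (pyarrow must be installed; option name
# assumed from the 2.1 release):
import pandas as pd

pd.set_option("future.infer_string", True)
s = pd.Series(["low", "high"])
s.dtype  # string[pyarrow_numpy]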
+ args = ["wl-copy"] + if primary: + args.append(PRIMARY_SELECTION) + if not text: + args.append("--clear") + subprocess.check_call(args, close_fds=True) + else: + p = subprocess.Popen(args, stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode(ENCODING)) + + def paste_wl(primary=False): + args = ["wl-paste", "-n"] + if primary: + args.append(PRIMARY_SELECTION) + p = subprocess.Popen(args, stdout=subprocess.PIPE, close_fds=True) + stdout, _stderr = p.communicate() + return stdout.decode(ENCODING) + + return copy_wl, paste_wl + + def init_klipper_clipboard(): def copy_klipper(text): text = _stringifyText(text) # Converts non-str values to str. @@ -534,7 +551,7 @@ def determine_clipboard(): return init_windows_clipboard() if platform.system() == "Linux": - if which("wslconfig.exe"): + if _executable_exists("wslconfig.exe"): return init_wsl_clipboard() # Setup for the macOS platform: @@ -549,6 +566,8 @@ def determine_clipboard(): # Setup for the LINUX platform: if HAS_DISPLAY: + if os.environ.get("WAYLAND_DISPLAY") and _executable_exists("wl-copy"): + return init_wl_clipboard() if _executable_exists("xsel"): return init_xsel_clipboard() if _executable_exists("xclip"): @@ -602,6 +621,7 @@ def set_clipboard(clipboard): "qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5' "xclip": init_xclip_clipboard, "xsel": init_xsel_clipboard, + "wl-clipboard": init_wl_clipboard, "klipper": init_klipper_clipboard, "windows": init_windows_clipboard, "no": init_no_clipboard, @@ -671,7 +691,56 @@ def is_available() -> bool: copy, paste = lazy_load_stub_copy, lazy_load_stub_paste -__all__ = ["copy", "paste", "set_clipboard", "determine_clipboard"] +def waitForPaste(timeout=None): + """This function call blocks until a non-empty text string exists on the + clipboard. It returns this text. + + This function raises PyperclipTimeoutException if timeout was set to + a number of seconds that has elapsed without non-empty text being put on + the clipboard.""" + startTime = time.time() + while True: + clipboardText = paste() + if clipboardText != "": + return clipboardText + time.sleep(0.01) + + if timeout is not None and time.time() > startTime + timeout: + raise PyperclipTimeoutException( + "waitForPaste() timed out after " + str(timeout) + " seconds." + ) + + +def waitForNewPaste(timeout=None): + """This function call blocks until a new text string exists on the + clipboard that is different from the text that was there when the function + was first called. It returns this text. + + This function raises PyperclipTimeoutException if timeout was set to + a number of seconds that has elapsed without non-empty text being put on + the clipboard.""" + startTime = time.time() + originalText = paste() + while True: + currentText = paste() + if currentText != originalText: + return currentText + time.sleep(0.01) + + if timeout is not None and time.time() > startTime + timeout: + raise PyperclipTimeoutException( + "waitForNewPaste() timed out after " + str(timeout) + " seconds." + ) + + +__all__ = [ + "copy", + "paste", + "waitForPaste", + "waitForNewPaste", + "set_clipboard", + "determine_clipboard", +] # pandas aliases clipboard_get = paste diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9ffbfb9f1149f..073115cab8695 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,6 +1,5 @@ from __future__ import annotations -import abc from collections.abc import ( Hashable, Iterable, @@ -160,13 +159,15 @@ of dtype conversion. 
engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine". Engine compatibility : - "xlrd" supports old-style Excel files (.xls). - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + - "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 The engine `xlrd `_ @@ -395,7 +396,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -434,7 +435,7 @@ def read_excel( | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -473,7 +474,7 @@ def read_excel( | Callable[[str], bool] | None = None, dtype: DtypeArg | None = None, - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, @@ -549,7 +550,7 @@ def read_excel( _WorkbookT = TypeVar("_WorkbookT") -class BaseExcelReader(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class BaseExcelReader(Generic[_WorkbookT]): book: _WorkbookT def __init__( @@ -589,13 +590,11 @@ def __init__( ) @property - @abc.abstractmethod def _workbook_class(self) -> type[_WorkbookT]: - pass + raise NotImplementedError - @abc.abstractmethod def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT: - pass + raise NotImplementedError def close(self) -> None: if hasattr(self, "book"): @@ -611,21 +610,17 @@ def close(self) -> None: self.handles.close() @property - @abc.abstractmethod def sheet_names(self) -> list[str]: - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_name(self, name: str): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_index(self, index: int): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_data(self, sheet, rows: int | None = None): - pass + raise NotImplementedError def raise_if_bad_sheet_by_index(self, index: int) -> None: n_sheets = len(self.sheet_names) @@ -940,7 +935,7 @@ def parse( @doc(storage_options=_shared_docs["storage_options"]) -class ExcelWriter(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class ExcelWriter(Generic[_WorkbookT]): """ Class for writing DataFrame objects into excel sheets. @@ -1178,20 +1173,19 @@ def engine(self) -> str: return self._engine @property - @abc.abstractmethod def sheets(self) -> dict[str, Any]: """Mapping of sheet names to sheet objects.""" + raise NotImplementedError @property - @abc.abstractmethod def book(self) -> _WorkbookT: """ Book instance. Class type will depend on the engine used. 
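# Usage sketch for the "calamine" engine added above (requires the optional
# python_calamine dependency; the file name is illustrative):
import pandas as pd

df = pd.read_excel("report.xlsx", engine="calamine")
# The same engine also reads .xls, .xlsb and .ods files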
This attribute can be used to access engine-specific features. """ + raise NotImplementedError - @abc.abstractmethod def _write_cells( self, cells, @@ -1214,12 +1208,13 @@ def _write_cells( freeze_panes: int tuple of length 2 contains the bottom-most row and right-most column to freeze """ + raise NotImplementedError - @abc.abstractmethod def _save(self) -> None: """ Save workbook to disk. """ + raise NotImplementedError def __init__( self, @@ -1463,13 +1458,15 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine`` Engine compatibility : - ``xlrd`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 @@ -1505,6 +1502,7 @@ class ExcelFile: ... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP """ + from pandas.io.excel._calamine import CalamineReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1515,6 +1513,7 @@ class ExcelFile: "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, + "calamine": CalamineReader, } def __init__( diff --git a/pandas/io/excel/_calamine.py b/pandas/io/excel/_calamine.py new file mode 100644 index 0000000000000..d61a9fc664164 --- /dev/null +++ b/pandas/io/excel/_calamine.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +from datetime import ( + date, + datetime, + time, + timedelta, +) +from typing import ( + TYPE_CHECKING, + Any, + Union, + cast, +) + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc + +import pandas as pd +from pandas.core.shared_docs import _shared_docs + +from pandas.io.excel._base import BaseExcelReader + +if TYPE_CHECKING: + from python_calamine import ( + CalamineSheet, + CalamineWorkbook, + ) + + from pandas._typing import ( + FilePath, + ReadBuffer, + StorageOptions, + ) + +_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta] + + +class CalamineReader(BaseExcelReader["CalamineWorkbook"]): + @doc(storage_options=_shared_docs["storage_options"]) + def __init__( + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + storage_options: StorageOptions | None = None, + engine_kwargs: dict | None = None, + ) -> None: + """ + Reader using calamine engine (xlsx/xls/xlsb/ods). + + Parameters + ---------- + filepath_or_buffer : str, path to be parsed or + an open readable stream. + {storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. 
+ """ + import_optional_dependency("python_calamine") + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) + + @property + def _workbook_class(self) -> type[CalamineWorkbook]: + from python_calamine import CalamineWorkbook + + return CalamineWorkbook + + def load_workbook( + self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any + ) -> CalamineWorkbook: + from python_calamine import load_workbook + + return load_workbook( + filepath_or_buffer, **engine_kwargs # type: ignore[arg-type] + ) + + @property + def sheet_names(self) -> list[str]: + from python_calamine import SheetTypeEnum + + return [ + sheet.name + for sheet in self.book.sheets_metadata + if sheet.typ == SheetTypeEnum.WorkSheet + ] + + def get_sheet_by_name(self, name: str) -> CalamineSheet: + self.raise_if_bad_sheet_by_name(name) + return self.book.get_sheet_by_name(name) + + def get_sheet_by_index(self, index: int) -> CalamineSheet: + self.raise_if_bad_sheet_by_index(index) + return self.book.get_sheet_by_index(index) + + def get_sheet_data( + self, sheet: CalamineSheet, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: + def _convert_cell(value: _CellValueT) -> Scalar: + if isinstance(value, float): + val = int(value) + if val == value: + return val + else: + return value + elif isinstance(value, date): + return pd.Timestamp(value) + elif isinstance(value, timedelta): + return pd.Timedelta(value) + elif isinstance(value, time): + # cast needed here because Scalar doesn't include datetime.time + return cast(Scalar, value) + + return value + + rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False) + data: list[list[Scalar]] = [] + + for row in rows: + data.append([_convert_cell(cell) for cell in row]) + if file_rows_needed is not None and len(data) >= file_rows_needed: + break + + return data diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 8016dbbaf7f42..277f64f636731 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -150,7 +150,7 @@ def get_sheet_data( max_row_len = len(table_row) row_repeat = self._get_row_repeat(sheet_row) - if self._is_empty_row(sheet_row): + if len(table_row) == 0: empty_rows += row_repeat else: # add blank rows to our table @@ -182,16 +182,6 @@ def _get_column_repeat(self, cell) -> int: return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) - def _is_empty_row(self, row) -> bool: - """ - Helper function to find empty rows - """ - for column in row.childNodes: - if len(column.childNodes) > 0: - return False - - return True - def _get_cell_value(self, cell) -> Scalar | NaTType: from odf.namespaces import OFFICENS diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 74cbe90acdae8..bc7dca2d95b6b 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -192,7 +192,15 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: if isinstance(val, bool): value = str(val).lower() pvalue = str(val).upper() - if isinstance(val, datetime.datetime): + return ( + pvalue, + TableCell( + valuetype="boolean", + booleanvalue=value, + attributes=attributes, + ), + ) + elif isinstance(val, datetime.datetime): # Fast formatting value = val.isoformat() # Slow but locale-dependent @@ -210,17 +218,20 @@ def _make_table_cell(self, cell) -> tuple[object, Any]: pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), ) + elif isinstance(val, str): + return ( + pvalue, + 
TableCell( + valuetype="string", + stringvalue=value, + attributes=attributes, + ), + ) else: - class_to_cell_type = { - str: "string", - int: "float", - float: "float", - bool: "boolean", - } return ( pvalue, TableCell( - valuetype=class_to_cell_type[type(val)], + valuetype="float", value=value, attributes=attributes, ), diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index c68a0ab516e05..a444970792e6e 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,6 +1,7 @@ from __future__ import annotations from datetime import time +import math from typing import TYPE_CHECKING import numpy as np @@ -120,9 +121,11 @@ def _parse_cell(cell_contents, cell_typ): elif cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less surprising - val = int(cell_contents) - if val == cell_contents: - cell_contents = val + if math.isfinite(cell_contents): + # GH54564 - don't attempt to convert NaN/Inf + val = int(cell_contents) + if val == cell_contents: + cell_contents = val return cell_contents data = [] diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 9970d465ced9d..b344d9849f16c 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -941,9 +941,7 @@ def write( if isinstance(writer, ExcelWriter): need_save = False else: - # error: Cannot instantiate abstract class 'ExcelWriter' with abstract - # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' - writer = ExcelWriter( # type: ignore[abstract] + writer = ExcelWriter( writer, engine=engine, storage_options=storage_options, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index ff26abd5cc26c..2297f7945a264 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1830,8 +1830,8 @@ def get_format_datetime64_from_values( class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" + ido = is_dates_only(self.values) values = self.values.astype(object) - ido = is_dates_only(values) formatter = self.formatter or get_format_datetime64( ido, date_format=self.date_format ) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index ce59985b8f352..b1a3504d46b27 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -633,7 +633,7 @@ def write_style(self) -> None: else: element_props.append(("thead th", "text-align", "right")) template_mid = "\n\n".join(template_select % t for t in element_props) - template = dedent("\n".join((template_first, template_mid, template_last))) + template = dedent(f"{template_first}\n{template_mid}\n{template_last}") self.write(template) def render(self) -> list[str]: diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 769f9dee1c31a..cdad388592717 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -28,7 +28,7 @@ def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None) -> No def to_string(self) -> str: text = self._get_string_representation() if self.fmt.should_show_dimensions: - text = "".join([text, self.fmt.dimensions_info]) + text = f"{text}{self.fmt.dimensions_info}" return text def _get_strcols(self) -> list[list[str]]: diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 76b938755755a..a6ee8407988ec 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -9,6 +9,7 @@ TYPE_CHECKING, Any, ) +import warnings from pandas.errors import AbstractMethodError 
from pandas.util._decorators import doc @@ -202,7 +203,13 @@ def process_dataframe(self) -> dict[int | str, dict[str, Any]]: df = df.reset_index() if self.na_rep is not None: - df = df.fillna(self.na_rep) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + df = df.fillna(self.na_rep) return df.to_dict(orient="index") diff --git a/pandas/io/html.py b/pandas/io/html.py index 8a73c786825e3..68d30fe5ba681 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -23,6 +23,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -32,6 +33,7 @@ from pandas.core.indexes.base import Index from pandas.core.indexes.multi import MultiIndex from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( file_exists, @@ -363,13 +365,13 @@ def _parse_tfoot_tr(self, table): """ raise AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): """ Return all tables from the parsed DOM. Parameters ---------- - doc : the DOM from which to parse the table element. + document : the DOM from which to parse the table element. match : str or regular expression The text to search for in the DOM tree. @@ -594,9 +596,9 @@ def __init__(self, *args, **kwargs) -> None: self._strainer = SoupStrainer("table") - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): element_name = self._strainer.name - tables = doc.find_all(element_name, attrs=attrs) + tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") @@ -726,7 +728,7 @@ def _parse_td(self, row): # or (see _parse_thead_tr). return row.xpath("./td|./th") - def _parse_tables(self, doc, match, kwargs): + def _parse_tables(self, document, match, kwargs): pattern = match.pattern # 1. check all descendants for the given pattern and only search tables @@ -738,7 +740,7 @@ def _parse_tables(self, doc, match, kwargs): if kwargs: xpath_expr += _build_xpath_expr(kwargs) - tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + tables = document.xpath(xpath_expr, namespaces=_re_namespace) tables = self._handle_hidden_tables(tables, "attrib") if self.displayed_only: @@ -1026,11 +1028,12 @@ def _parse( return ret +@doc(storage_options=_shared_docs["storage_options"]) def read_html( io: FilePath | ReadBuffer[str], *, match: str | Pattern = ".+", - flavor: str | None = None, + flavor: str | Sequence[str] | None = None, header: int | Sequence[int] | None = None, index_col: int | Sequence[int] | None = None, skiprows: int | Sequence[int] | slice | None = None, @@ -1071,11 +1074,11 @@ def read_html( This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str, optional - The parsing engine to use. 'bs4' and 'html5lib' are synonymous with - each other, they are both there for backwards compatibility. The - default of ``None`` tries to use ``lxml`` to parse and if that fails it - falls back on ``bs4`` + ``html5lib``. + flavor : str or list-like, optional + The parsing engine (or list of parsing engines) to use. 'bs4' and + 'html5lib' are synonymous with each other, they are both there for + backwards compatibility. 
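# Usage sketch for the list-valued `flavor` accepted above (public
# pd.read_html API; the URL is illustrative):
import pandas as pd

tables = pd.read_html(
    "https://example.com/stats.html",
    flavor=["lxml", "bs4"],  # the listed parsers are tried in turn
)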
The default of ``None`` tries to use ``lxml`` + to parse and if that fails it falls back on ``bs4`` + ``html5lib``. header : int or list-like, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to @@ -1096,13 +1099,13 @@ def read_html( passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - attrs = {'id': 'table'} + attrs = {{'id': 'table'}} is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - attrs = {'asdf': 'table'} + attrs = {{'asdf': 'table'}} is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 @@ -1144,13 +1147,13 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_links : {None, "all", "header", "body", "footer"} + extract_links : {{None, "all", "header", "body", "footer"}} Table elements in the specified section(s) with tags will have their href extracted. .. versionadded:: 1.5.0 - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -1161,6 +1164,10 @@ def read_html( .. versionadded:: 2.0 + {storage_options} + + .. versionadded:: 2.1.0 + Returns ------- dfs diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 833f4986b6da6..d596891197aaa 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -82,6 +82,7 @@ JSONEngine, JSONSerializable, ReadBuffer, + Self, StorageOptions, WriteBuffer, ) @@ -174,7 +175,7 @@ def to_json( if mode == "a" and (not lines or orient != "records"): msg = ( - "mode='a' (append) is only supported when" + "mode='a' (append) is only supported when " "lines is True and orient is 'records'" ) raise ValueError(msg) @@ -1056,7 +1057,7 @@ def close(self) -> None: if self.handles is not None: self.handles.close() - def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]: + def __iter__(self) -> Self: return self @overload @@ -1099,7 +1100,7 @@ def __next__(self) -> DataFrame | Series: else: return obj - def __enter__(self) -> JsonReader[FrameSeriesStrT]: + def __enter__(self) -> Self: return self def __exit__( @@ -1216,7 +1217,16 @@ def _try_convert_data( if not self.dtype: if all(notna(data)): return data, False - return data.fillna(np.nan), True + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + filled = data.fillna(np.nan) + + return filled, True elif self.dtype is True: pass diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f51b98a929440..ed254191d2736 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -13,6 +13,7 @@ from warnings import catch_warnings from pandas._config import using_pyarrow_string_dtype +from pandas._config.config import _get_option from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -258,7 +259,7 @@ def read( elif using_pyarrow_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "array": to_pandas_kwargs["split_blocks"] = True # type: 
ignore[assignment] diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 71bfb00a95b50..bb6bcd3c4d6a0 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -223,5 +223,8 @@ def read(self) -> DataFrame: elif using_pyarrow_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) else: - frame = table.to_pandas() + if isinstance(self.kwds.get("dtype"), dict): + frame = table.to_pandas(types_mapper=self.kwds["dtype"].get) + else: + frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 520d2193e1c04..43fb4ec3b55fc 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -13,7 +13,6 @@ import csv from io import StringIO import re -import sys from typing import ( IO, TYPE_CHECKING, @@ -21,6 +20,7 @@ Literal, cast, ) +import warnings import numpy as np @@ -28,8 +28,10 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool_dtype, @@ -778,8 +780,11 @@ def _alert_malformed(self, msg: str, row_num: int) -> None: if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) if self.on_bad_lines == self.BadLineHandleMethod.WARN: - base = f"Skipping line {row_num}: " - sys.stderr.write(base + msg + "\n") + warnings.warn( + f"Skipping line {row_num}: {msg}\n", + ParserWarning, + stacklevel=find_stack_level(), + ) def _next_iter_line(self, row_num: int) -> list[Scalar] | None: """ @@ -1176,17 +1181,17 @@ def _set_no_thousand_columns(self) -> set[int]: ) if self.columns and self.dtype: assert self._col_indices is not None - for i in self._col_indices: + for i, col in zip(self._col_indices, self.columns): if not isinstance(self.dtype, dict) and not is_numeric_dtype( self.dtype ): no_thousands_columns.add(i) if ( isinstance(self.dtype, dict) - and self.columns[i] in self.dtype + and col in self.dtype and ( - not is_numeric_dtype(self.dtype[self.columns[i]]) - or is_bool_dtype(self.dtype[self.columns[i]]) + not is_numeric_dtype(self.dtype[col]) + or is_bool_dtype(self.dtype[col]) ) ): no_thousands_columns.add(i) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 50dee463a06eb..acf35ebd6afe5 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -65,6 +65,7 @@ if TYPE_CHECKING: from collections.abc import ( Hashable, + Iterable, Mapping, Sequence, ) @@ -237,7 +238,8 @@ default False The behavior is as follows: - * ``bool``. If ``True`` -> try parsing the index. + * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to + ``True`` if ``date_format`` or ``date_parser`` arguments have been passed. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse @@ -279,11 +281,20 @@ Use ``date_format`` instead, or read in as ``object`` and then apply :func:`~pandas.to_datetime` as-needed. date_format : str or dict of column -> format, optional - Format to use for parsing dates when used in conjunction with ``parse_dates``. - For anything more complex, please read in as ``object`` and then apply - :func:`~pandas.to_datetime` as-needed. - - .. 
versionadded:: 2.0.0 + Format to use for parsing dates when used in conjunction with ``parse_dates``. + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. + + .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. cache_dates : bool, default True @@ -628,7 +639,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -638,8 +652,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., na_filter: bool = ..., verbose: bool = ..., skip_blank_lines: bool = ..., @@ -685,7 +701,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -695,7 +714,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -742,7 +764,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -752,7 +777,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -799,7 +827,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., 
index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -809,7 +840,10 @@ def read_csv( skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., verbose: bool = ..., @@ -867,7 +901,10 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -879,7 +916,10 @@ def read_csv( skipfooter: int = 0, nrows: int | None = None, # NA and Missing Data Handling - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = None, keep_default_na: bool = True, na_filter: bool = True, verbose: bool = False, @@ -959,7 +999,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1016,7 +1059,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1073,7 +1119,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1130,7 +1179,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1200,7 
+1252,10 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: list[HashableT] + | tuple[HashableT] + | Callable[[Hashable], bool] + | None = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1283,6 +1338,51 @@ def read_table( return _read(filepath_or_buffer, kwds) +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[True], + chunksize: int | None = ..., + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: bool = ..., + chunksize: int, + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds, +) -> DataFrame: + ... + + def read_fwf( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -1290,6 +1390,8 @@ def read_fwf( widths: Sequence[int] | None = None, infer_nrows: int = 100, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + iterator: bool = False, + chunksize: int | None = None, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1388,6 +1490,8 @@ def read_fwf( kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" + kwds["iterator"] = iterator + kwds["chunksize"] = chunksize check_dtype_backend(dtype_backend) kwds["dtype_backend"] = dtype_backend diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 89c3f7bbc4f84..c0a27ecfec803 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -68,7 +68,6 @@ ) from pandas.core.dtypes.missing import array_equivalent -import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -2824,7 +2823,7 @@ def read( "cannot read on an abstract storer: subclasses should implement" ) - def write(self, **kwargs): + def write(self, obj, **kwargs) -> None: raise NotImplementedError( "cannot write on an abstract storer: subclasses should implement" ) @@ -2938,8 +2937,7 @@ def get_attrs(self) -> None: for n in self.attributes: setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) - # error: Signature of "write" incompatible with supertype "Fixed" - def write(self, obj, **kwargs) -> None: # type: ignore[override] + def write(self, obj, **kwargs) -> None: self.set_attrs() def read_array(self, key: str, start: int | None = None, stop: int | None = None): @@ -3224,13 +3222,10 @@ def read( values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - result = 
result.astype(pd.ArrowDtype(pa.string())) + result = result.astype("string[pyarrow_numpy]") return result - # error: Signature of "write" incompatible with supertype "Fixed" - def write(self, obj, **kwargs) -> None: # type: ignore[override] + def write(self, obj, **kwargs) -> None: super().write(obj, **kwargs) self.write_index("index", obj.index) self.write_array("values", obj) @@ -3296,9 +3291,7 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = df.astype("string[pyarrow_numpy]") dfs.append(df) if len(dfs) > 0: @@ -3308,8 +3301,7 @@ def read( return DataFrame(columns=axes[0], index=axes[1]) - # error: Signature of "write" incompatible with supertype "Fixed" - def write(self, obj, **kwargs) -> None: # type: ignore[override] + def write(self, obj, **kwargs) -> None: super().write(obj, **kwargs) # TODO(ArrayManager) HDFStore relies on accessing the blocks @@ -4360,7 +4352,7 @@ def read( """ raise NotImplementedError("WORMTable needs to implement read") - def write(self, **kwargs) -> None: + def write(self, obj, **kwargs) -> None: """ write in a format that we can search later on (but cannot append to): write out the indices and the values using _write_array @@ -4686,9 +4678,7 @@ def read( values, # type: ignore[arg-type] skipna=True, ): - import pyarrow as pa - - df = df.astype(pd.ArrowDtype(pa.string())) + df = df.astype("string[pyarrow_numpy]") frames.append(df) if len(frames) == 1: @@ -4719,12 +4709,13 @@ def is_transposed(self) -> bool: def get_object(cls, obj, transposed: bool): return obj - def write(self, obj, data_columns=None, **kwargs): + # error: Signature of "write" incompatible with supertype "Fixed" + def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override] """we are going to write this as a frame table""" if not isinstance(obj, DataFrame): name = obj.name or "values" obj = obj.to_frame(name) - return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) + super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) def read( self, @@ -4757,7 +4748,8 @@ class AppendableMultiSeriesTable(AppendableSeriesTable): pandas_kind = "series_table" table_type = "appendable_multiseries" - def write(self, obj, **kwargs): + # error: Signature of "write" incompatible with supertype "Fixed" + def write(self, obj, **kwargs) -> None: # type: ignore[override] """we are going to write this as a frame table""" name = obj.name or "values" newobj, self.levels = self.validate_multiindex(obj) @@ -4765,7 +4757,7 @@ def write(self, obj, **kwargs): cols = list(self.levels) cols.append(name) newobj.columns = Index(cols) - return super().write(obj=newobj, **kwargs) + super().write(obj=newobj, **kwargs) class GenericTable(AppendableFrameTable): @@ -4830,7 +4822,8 @@ def indexables(self): return _indexables - def write(self, **kwargs): + # error: Signature of "write" incompatible with supertype "AppendableTable" + def write(self, **kwargs) -> None: # type: ignore[override] raise NotImplementedError("cannot write on an generic table") @@ -4846,7 +4839,8 @@ class AppendableMultiFrameTable(AppendableFrameTable): def table_type_short(self) -> str: return "appendable_multi" - def write(self, obj, data_columns=None, **kwargs): + # error: Signature of "write" incompatible with supertype "Fixed" + def write(self, obj, data_columns=None, **kwargs) 
-> None: # type: ignore[override] if data_columns is None: data_columns = [] elif data_columns is True: @@ -4856,7 +4850,7 @@ def write(self, obj, data_columns=None, **kwargs): for n in self.levels: if n not in data_columns: data_columns.insert(0, n) - return super().write(obj=obj, data_columns=data_columns, **kwargs) + super().write(obj=obj, data_columns=data_columns, **kwargs) def read( self, diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 7669d5aa4cea5..0788d9da06eb9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -138,7 +138,7 @@ def _parse_date_columns(data_frame, parse_dates): if isinstance(df_col.dtype, DatetimeTZDtype) or col_name in parse_dates: try: fmt = parse_dates[col_name] - except TypeError: + except (KeyError, TypeError): fmt = None data_frame.isetitem(i, _handle_date_column(df_col, format=fmt)) @@ -2091,13 +2091,11 @@ def _adapt_time(t) -> str: adapt_date_iso = lambda val: val.isoformat() adapt_datetime_iso = lambda val: val.isoformat() - adapt_datetime_epoch = lambda val: int(val.timestamp()) sqlite3.register_adapter(time, _adapt_time) sqlite3.register_adapter(date, adapt_date_iso) sqlite3.register_adapter(datetime, adapt_datetime_iso) - sqlite3.register_adapter(datetime, adapt_datetime_epoch) convert_date = lambda val: date.fromisoformat(val.decode()) convert_datetime = lambda val: datetime.fromisoformat(val.decode()) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c5648a022d4a9..d630a5ff8a41c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1773,7 +1773,7 @@ def read( self._data_read = True # if necessary, swap the byte order to native here if self._byteorder != self._native_byteorder: - raw_data = raw_data.byteswap().newbyteorder() + raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder()) if convert_categoricals: self._read_value_labels() @@ -2963,7 +2963,7 @@ def _convert_strls(self, data: DataFrame) -> DataFrame: """No-op, future compatibility""" return data - def _prepare_data(self) -> np.recarray: + def _prepare_data(self) -> np.rec.recarray: data = self.data typlist = self.typlist convert_dates = self._convert_dates @@ -2983,7 +2983,14 @@ def _prepare_data(self) -> np.recarray: for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + dc = data[col].fillna("") + data[col] = dc.apply(_pad_bytes, args=(typ,)) stype = f"S{typ}" dtypes[col] = stype data[col] = data[col].astype(stype) @@ -2995,7 +3002,7 @@ def _prepare_data(self) -> np.recarray: return data.to_records(index=False, column_dtypes=dtypes) - def _write_data(self, records: np.recarray) -> None: + def _write_data(self, records: np.rec.recarray) -> None: self._write_bytes(records.tobytes()) @staticmethod diff --git a/pandas/meson.build b/pandas/meson.build index 1dc9955aa4ff6..435103a954d86 100644 --- a/pandas/meson.build +++ b/pandas/meson.build @@ -26,7 +26,6 @@ subdir('_libs') subdirs_list = [ '_config', - '_libs', '_testing', 'api', 'arrays', @@ -40,8 +39,9 @@ subdirs_list = [ 'util' ] foreach subdir: subdirs_list - install_subdir(subdir, install_dir: py.get_install_dir(pure: false) / 'pandas') + install_subdir(subdir, install_dir: py.get_install_dir() / 'pandas') endforeach + top_level_py_list = [ '__init__.py', '_typing.py', @@ -49,8 +49,4 @@ top_level_py_list = [ 'conftest.py', 'testing.py' ] -foreach file: top_level_py_list 
- py.install_sources(file, - pure: false, - subdir: 'pandas') -endforeach +py.install_sources(top_level_py_list, subdir: 'pandas') diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 07c77ec4f3e0a..0eb8ab2412e0e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1573,7 +1573,7 @@ def area( ... 'signups': [5, 5, 6, 12, 14, 13], ... 'visits': [20, 42, 28, 62, 81, 50], ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01', - ... freq='M')) + ... freq='ME')) >>> ax = df.plot.area() Area plots are stacked by default. To produce an unstacked plot, diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 83cb8a6ab67dd..5fcea796a9c6e 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -40,6 +40,24 @@ from pandas._typing import MatplotlibColor +def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> None: + """Set the tick labels of a given axis. + + Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the + case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of + labels. + """ + ticks = ax.get_xticks() if is_vertical else ax.get_yticks() + if len(ticks) != len(labels): + i, remainder = divmod(len(ticks), len(labels)) + assert remainder == 0, remainder + labels *= i + if is_vertical: + ax.set_xticklabels(labels, **kwargs) + else: + ax.set_yticklabels(labels, **kwargs) + + class BoxPlot(LinePlot): @property def _kind(self) -> Literal["box"]: @@ -193,7 +211,9 @@ def _make_plot(self) -> None: ) self.maybe_color_bp(bp) self._return_obj[label] = ret - self._set_ticklabels(ax, ticklabels) + _set_ticklabels( + ax=ax, labels=ticklabels, is_vertical=self.orientation == "vertical" + ) else: y = self.data.values.T ax = self._get_ax(0) @@ -209,13 +229,9 @@ def _make_plot(self) -> None: labels = [pprint_thing(left) for left in labels] if not self.use_index: labels = [pprint_thing(key) for key in range(len(labels))] - self._set_ticklabels(ax, labels) - - def _set_ticklabels(self, ax: Axes, labels: list[str]) -> None: - if self.orientation == "vertical": - ax.set_xticklabels(labels) - else: - ax.set_yticklabels(labels) + _set_ticklabels( + ax=ax, labels=labels, is_vertical=self.orientation == "vertical" + ) def _make_legend(self) -> None: pass @@ -382,16 +398,9 @@ def plot_group(keys, values, ax: Axes, **kwds): ax.tick_params(axis="both", labelsize=fontsize) # GH 45465: x/y are flipped when "vert" changes - is_vertical = kwds.get("vert", True) - ticks = ax.get_xticks() if is_vertical else ax.get_yticks() - if len(ticks) != len(keys): - i, remainder = divmod(len(ticks), len(keys)) - assert remainder == 0, remainder - keys *= i - if is_vertical: - ax.set_xticklabels(keys, rotation=rot) - else: - ax.set_yticklabels(keys, rotation=rot) + _set_ticklabels( + ax=ax, labels=keys, is_vertical=kwds.get("vert", True), rotation=rot + ) maybe_color_bp(bp, **kwds) # Return axes in multiplot case, maybe revisit later # 985 diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index cd7823ba15e44..e6408a0eae841 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -11,7 +11,6 @@ from typing import ( TYPE_CHECKING, Any, - Final, cast, ) @@ -30,8 +29,14 @@ Timestamp, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup -from pandas._typing import F +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + 
periods_per_day, +) +from pandas._typing import ( + F, + npt, +) from pandas.core.dtypes.common import ( is_float, @@ -60,15 +65,6 @@ from pandas._libs.tslibs.offsets import BaseOffset -# constants -HOURS_PER_DAY: Final = 24.0 -MIN_PER_HOUR: Final = 60.0 -SEC_PER_MIN: Final = 60.0 - -SEC_PER_HOUR: Final = SEC_PER_MIN * MIN_PER_HOUR -SEC_PER_DAY: Final = SEC_PER_HOUR * HOURS_PER_DAY - -MUSEC_PER_DAY: Final = 10**6 * SEC_PER_DAY _mpl_units = {} # Cache for units overwritten by us @@ -412,7 +408,7 @@ def __call__(self): ) interval = self._get_interval() - freq = f"{interval}L" + freq = f"{interval}ms" tz = self.tz.tzname(None) st = dmin.replace(tzinfo=None) ed = dmin.replace(tzinfo=None) @@ -495,7 +491,7 @@ def _get_default_annual_spacing(nyears) -> tuple[int, int]: return (min_spacing, maj_spacing) -def period_break(dates: PeriodIndex, period: str) -> np.ndarray: +def _period_break(dates: PeriodIndex, period: str) -> npt.NDArray[np.intp]: """ Returns the indices where the given period changes. @@ -506,12 +502,17 @@ def period_break(dates: PeriodIndex, period: str) -> np.ndarray: period : str Name of the period to monitor. """ + mask = _period_break_mask(dates, period) + return np.nonzero(mask)[0] + + +def _period_break_mask(dates: PeriodIndex, period: str) -> npt.NDArray[np.bool_]: current = getattr(dates, period) previous = getattr(dates - 1 * dates.freq, period) - return np.nonzero(current - previous)[0] + return current != previous -def has_level_label(label_flags: np.ndarray, vmin: float) -> bool: +def has_level_label(label_flags: npt.NDArray[np.intp], vmin: float) -> bool: """ Returns true if the ``label_flags`` indicate there is at least one label for this level. @@ -527,54 +528,59 @@ def has_level_label(label_flags: np.ndarray, vmin: float) -> bool: return True -def _daily_finder(vmin, vmax, freq: BaseOffset): +def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] freq_group = FreqGroup.from_period_dtype_code(dtype_code) - periodsperday = -1 + ppd = -1 # placeholder for above-day freqs if dtype_code >= FreqGroup.FR_HR.value: - if freq_group == FreqGroup.FR_NS: - periodsperday = 24 * 60 * 60 * 1000000000 - elif freq_group == FreqGroup.FR_US: - periodsperday = 24 * 60 * 60 * 1000000 - elif freq_group == FreqGroup.FR_MS: - periodsperday = 24 * 60 * 60 * 1000 - elif freq_group == FreqGroup.FR_SEC: - periodsperday = 24 * 60 * 60 - elif freq_group == FreqGroup.FR_MIN: - periodsperday = 24 * 60 - elif freq_group == FreqGroup.FR_HR: - periodsperday = 24 - else: # pragma: no cover - raise ValueError(f"unexpected frequency: {dtype_code}") - periodsperyear = 365 * periodsperday - periodspermonth = 28 * periodsperday - + # error: "BaseOffset" has no attribute "_creso" + ppd = periods_per_day(freq._creso) # type: ignore[attr-defined] + ppm = 28 * ppd + ppy = 365 * ppd elif freq_group == FreqGroup.FR_BUS: - periodsperyear = 261 - periodspermonth = 19 + ppm = 19 + ppy = 261 elif freq_group == FreqGroup.FR_DAY: - periodsperyear = 365 - periodspermonth = 28 + ppm = 28 + ppy = 365 elif freq_group == FreqGroup.FR_WK: - periodsperyear = 52 - periodspermonth = 3 - else: # pragma: no cover - raise ValueError("unexpected frequency") + ppm = 3 + ppy = 52 + elif freq_group == FreqGroup.FR_MTH: + ppm = 1 + ppy = 12 + elif freq_group == FreqGroup.FR_QTR: + ppm = -1 # placerholder + ppy = 4 + elif freq_group == FreqGroup.FR_ANN: + ppm = -1 # placeholder + ppy = 1 + else: + 
raise NotImplementedError(f"Unsupported frequency: {dtype_code}") + + return ppd, ppm, ppy + + +def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + # error: "BaseOffset" has no attribute "_period_dtype_code" + dtype_code = freq._period_dtype_code # type: ignore[attr-defined] + + periodsperday, periodspermonth, periodsperyear = _get_periods_per_ymd(freq) # save this for later usage vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 - (vmin, vmax) = ( - Period(ordinal=int(vmin), freq=freq), - Period(ordinal=int(vmax), freq=freq), + dates_ = period_range( + start=Period(ordinal=vmin, freq=freq), + end=Period(ordinal=vmax, freq=freq), + freq=freq, ) - assert isinstance(vmin, Period) - assert isinstance(vmax, Period) - span = vmax.ordinal - vmin.ordinal + 1 - dates_ = period_range(start=vmin, end=vmax, freq=freq) + # Initialize the output info = np.zeros( span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] @@ -595,45 +601,38 @@ def first_label(label_flags): # Case 1. Less than a month if span <= periodspermonth: - day_start = period_break(dates_, "day") - month_start = period_break(dates_, "month") + day_start = _period_break(dates_, "day") + month_start = _period_break(dates_, "month") + year_start = _period_break(dates_, "year") - def _hour_finder(label_interval, force_year_start) -> None: - _hour = dates_.hour - _prev_hour = (dates_ - 1 * dates_.freq).hour - hour_start = (_hour - _prev_hour) != 0 + def _hour_finder(label_interval: int, force_year_start: bool) -> None: + target = dates_.hour + mask = _period_break_mask(dates_, "hour") info_maj[day_start] = True - info_min[hour_start & (_hour % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt[hour_start & (_hour % label_interval == 0)] = "%H:%M" + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" info_fmt[day_start] = "%H:%M\n%d-%b" info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" if force_year_start and not has_level_label(year_start, vmin_orig): info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y" - def _minute_finder(label_interval) -> None: - hour_start = period_break(dates_, "hour") - _minute = dates_.minute - _prev_minute = (dates_ - 1 * dates_.freq).minute - minute_start = (_minute - _prev_minute) != 0 + def _minute_finder(label_interval: int) -> None: + target = dates_.minute + hour_start = _period_break(dates_, "hour") + mask = _period_break_mask(dates_, "minute") info_maj[hour_start] = True - info_min[minute_start & (_minute % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] - info_fmt[minute_start & (_minute % label_interval == 0)] = "%H:%M" + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" info_fmt[day_start] = "%H:%M\n%d-%b" info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" - def _second_finder(label_interval) -> None: - minute_start = period_break(dates_, "minute") - _second = dates_.second - _prev_second = (dates_ - 1 * dates_.freq).second - second_start = (_second - _prev_second) != 0 - info["maj"][minute_start] = True - info["min"][second_start & (_second % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] - info_fmt[second_start & (_second % label_interval == 0)] = "%H:%M:%S" + def _second_finder(label_interval: int) -> None: + target = dates_.second + minute_start = _period_break(dates_, "minute") + mask = 
_period_break_mask(dates_, "second") + info_maj[minute_start] = True + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M:%S" info_fmt[day_start] = "%H:%M:%S\n%d-%b" info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y" @@ -672,8 +671,6 @@ def _second_finder(label_interval) -> None: else: info_maj[month_start] = True info_min[day_start] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] info_fmt[day_start] = "%d" info_fmt[month_start] = "%d\n%b" info_fmt[year_start] = "%d\n%b\n%Y" @@ -685,15 +682,15 @@ def _second_finder(label_interval) -> None: # Case 2. Less than three months elif span <= periodsperyear // 4: - month_start = period_break(dates_, "month") + month_start = _period_break(dates_, "month") info_maj[month_start] = True if dtype_code < FreqGroup.FR_HR.value: info["min"] = True else: - day_start = period_break(dates_, "day") + day_start = _period_break(dates_, "day") info["min"][day_start] = True - week_start = period_break(dates_, "week") - year_start = period_break(dates_, "year") + week_start = _period_break(dates_, "week") + year_start = _period_break(dates_, "year") info_fmt[week_start] = "%d" info_fmt[month_start] = "\n\n%b" info_fmt[year_start] = "\n\n%b\n%Y" @@ -704,9 +701,9 @@ def _second_finder(label_interval) -> None: info_fmt[first_label(month_start)] = "\n\n%b\n%Y" # Case 3. Less than 14 months ............... elif span <= 1.15 * periodsperyear: - year_start = period_break(dates_, "year") - month_start = period_break(dates_, "month") - week_start = period_break(dates_, "week") + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") + week_start = _period_break(dates_, "week") info_maj[month_start] = True info_min[week_start] = True info_min[year_start] = False @@ -717,17 +714,17 @@ def _second_finder(label_interval) -> None: info_fmt[first_label(month_start)] = "%b\n%Y" # Case 4. Less than 2.5 years ............... elif span <= 2.5 * periodsperyear: - year_start = period_break(dates_, "year") - quarter_start = period_break(dates_, "quarter") - month_start = period_break(dates_, "month") + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") + month_start = _period_break(dates_, "month") info_maj[quarter_start] = True info_min[month_start] = True info_fmt[quarter_start] = "%b" info_fmt[year_start] = "%b\n%Y" # Case 4. Less than 4 years ................. elif span <= 4 * periodsperyear: - year_start = period_break(dates_, "year") - month_start = period_break(dates_, "month") + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") info_maj[year_start] = True info_min[month_start] = True info_min[year_start] = False @@ -738,15 +735,15 @@ def _second_finder(label_interval) -> None: info_fmt[year_start] = "%b\n%Y" # Case 5. Less than 11 years ................ elif span <= 11 * periodsperyear: - year_start = period_break(dates_, "year") - quarter_start = period_break(dates_, "quarter") + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") info_maj[year_start] = True info_min[quarter_start] = True info_min[year_start] = False info_fmt[year_start] = "%Y" # Case 6. More than 12 years ................ 
else: - year_start = period_break(dates_, "year") + year_start = _period_break(dates_, "year") year_break = dates_[year_start].year nyears = span / periodsperyear (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) @@ -759,8 +756,8 @@ def _second_finder(label_interval) -> None: return info -def _monthly_finder(vmin, vmax, freq): - periodsperyear = 12 +def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -795,6 +792,7 @@ def _monthly_finder(vmin, vmax, freq): quarter_start = (dates_ % 3 == 0).nonzero() info_maj[year_start] = True # TODO: Check the following : is it really info['fmt'] ? + # 2023-09-15 this is reached in test_finder_monthly info["fmt"][quarter_start] = True info["min"] = True @@ -829,8 +827,8 @@ def _monthly_finder(vmin, vmax, freq): return info -def _quarterly_finder(vmin, vmax, freq): - periodsperyear = 4 +def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 @@ -876,7 +874,8 @@ def _quarterly_finder(vmin, vmax, freq): return info -def _annual_finder(vmin, vmax, freq): +def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 @@ -889,8 +888,9 @@ def _annual_finder(vmin, vmax, freq): (min_anndef, maj_anndef) = _get_default_annual_spacing(span) major_idx = dates_ % maj_anndef == 0 + minor_idx = dates_ % min_anndef == 0 info["maj"][major_idx] = True - info["min"][(dates_ % min_anndef == 0)] = True + info["min"][minor_idx] = True info["fmt"][major_idx] = "%Y" return info @@ -942,7 +942,7 @@ def __init__( day: int = 1, plot_obj=None, ) -> None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) self.freq = freq self.base = base (self.quarter, self.month, self.day) = (quarter, month, day) @@ -1026,7 +1026,7 @@ def __init__( dynamic_mode: bool = True, plot_obj=None, ) -> None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) self.format = None self.freq = freq self.locs: list[Any] = [] # unused, for matplotlib compat @@ -1087,7 +1087,7 @@ def format_timedelta_ticks(x, pos, n_decimals: int) -> str: """ Convert seconds to 'D days HH:MM:SS.F' """ - s, ns = divmod(x, 10**9) + s, ns = divmod(x, 10**9) # TODO(non-nano): this looks like it assumes ns m, s = divmod(s, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c62f73271577d..d88605db60720 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1538,7 +1538,13 @@ def _kind(self) -> Literal["area"]: def __init__(self, data, **kwargs) -> None: kwargs.setdefault("stacked", True) - data = data.fillna(value=0) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + data = data.fillna(value=0) LinePlot.__init__(self, data, **kwargs) if not self.stacked: diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 90e6f29dd98ab..77de5c1ad7b42 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -15,7 +15,10 @@ Period, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup +from 
pandas._libs.tslibs.dtypes import ( + OFFSET_TO_PERIOD_FREQSTR, + FreqGroup, +) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -188,7 +191,7 @@ def _get_ax_freq(ax: Axes): def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq).rule_code + freqstr = to_offset(freq, is_period=True).rule_code return get_period_alias(freqstr) @@ -198,7 +201,7 @@ def _get_freq(ax: Axes, series: Series): freq = getattr(series.index, "freq", None) if freq is None: freq = getattr(series.index, "inferred_freq", None) - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) ax_freq = _get_ax_freq(ax) @@ -232,7 +235,10 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: # FIXME: hack this for 0.10.1, creating more technical debt...sigh if isinstance(data.index, ABCDatetimeIndex): # error: "BaseOffset" has no attribute "_period_dtype_code" - base = to_offset(freq_str)._period_dtype_code # type: ignore[attr-defined] + freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str) + base = to_offset( + freq_str, is_period=True + )._period_dtype_code # type: ignore[attr-defined] x = data.index if base <= FreqGroup.FR_DAY.value: return x[:1].is_normalized diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..227b72573f979 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,6 +18,13 @@ from pandas.tests.frame.common import zip_frames +@pytest.fixture(params=["python", "numba"]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + def test_apply(float_frame): with np.errstate(all="ignore"): # ufunc @@ -38,8 +45,9 @@ def test_apply(float_frame): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_args(float_frame, axis): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,)) +@pytest.mark.parametrize("raw", [True, False]) +def test_apply_args(float_frame, axis, raw): + result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), raw=raw) expected = float_frame + 1 tm.assert_frame_equal(result, expected) @@ -234,36 +242,42 @@ def test_apply_broadcast_series_lambda_func(int_frame_const_col): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame(float_frame, axis): +def test_apply_raw_float_frame(float_frame, axis, engine): + if engine == "numba": + pytest.skip("numba can't handle when UDF returns None.") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 - float_frame.apply(_assert_raw, axis=axis, raw=True) + float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame_lambda(float_frame, axis): - result = float_frame.apply(np.mean, axis=axis, raw=True) +def test_apply_raw_float_frame_lambda(float_frame, axis, engine): + result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True) expected = float_frame.apply(lambda x: x.values.mean(), axis=axis) tm.assert_series_equal(result, expected) -def test_apply_raw_float_frame_no_reduction(float_frame): +def test_apply_raw_float_frame_no_reduction(float_frame, engine): # no reduction - result = float_frame.apply(lambda x: x * 2, raw=True) + result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True) expected = float_frame * 2 tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_mixed_type_frame(mixed_type_frame, axis): +def 
test_apply_raw_mixed_type_frame(mixed_type_frame, axis, engine): + if engine == "numba": + pytest.skip("isinstance check doesn't work with numba") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, axis=axis, raw=True) + mixed_type_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) def test_apply_axis1(float_frame): @@ -300,14 +314,20 @@ def test_apply_mixed_dtype_corner_indexing(): ) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_empty_infer_type(ax, func, raw, axis): +def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): df = DataFrame(**{ax: ["a", "b", "c"]}) with np.errstate(all="ignore"): test_res = func(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) - result = df.apply(func, axis=axis, raw=raw) + if engine == "numba" and raw is False: + mark = pytest.mark.xfail( + reason="numba engine only supports raw=True at the moment" + ) + request.node.add_marker(mark) + + result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: agg_axis = df._get_agg_axis(axis) assert isinstance(result, Series) @@ -607,8 +627,10 @@ def non_reducing_function(row): assert names == list(df.index) -def test_apply_raw_function_runs_once(): +def test_apply_raw_function_runs_once(engine): # https://github.com/pandas-dev/pandas/issues/34506 + if engine == "numba": + pytest.skip("appending to list outside of numba func is not supported") df = DataFrame({"a": [1, 2, 3]}) values = [] # Save row values function is applied to @@ -623,7 +645,7 @@ def non_reducing_function(row): for func in [reducing_function, non_reducing_function]: del values[:] - df.apply(func, raw=True, axis=1) + df.apply(func, engine=engine, raw=True, axis=1) assert values == list(df.a.to_list()) @@ -1449,10 +1471,12 @@ def test_apply_no_suffix_index(): tm.assert_frame_equal(result, expected) -def test_apply_raw_returns_string(): +def test_apply_raw_returns_string(engine): # https://github.com/pandas-dev/pandas/issues/35940 + if engine == "numba": + pytest.skip("No object dtype support in numba") df = DataFrame({"A": ["aa", "bbb"]}) - result = df.apply(lambda x: x[0], axis=1, raw=True) + result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True) expected = Series(["aa", "bbb"]) tm.assert_series_equal(result, expected) @@ -1632,3 +1656,14 @@ def test_agg_dist_like_and_nonunique_columns(): result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) + + +def test_numba_unsupported(): + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) + with pytest.raises( + ValueError, + match="The numba engine in DataFrame.apply can only be used when raw=True", + ): + df.apply(lambda x: x, engine="numba", raw=False) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index d3e5ac1b4ca7a..aeb6a01eb587a 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -420,8 +420,9 @@ def test_agg_evaluate_lambdas(string_series): def test_with_nested_series(datetime_series, op_name): # GH 2316 # .agg with a reducer and a transform, what to do - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = "cannot aggregate" + warning = FutureWarning if op_name == "agg" else None + with 
tm.assert_produces_warning(warning, match=msg): # GH52123 result = getattr(datetime_series, op_name)( lambda x: Series([x, x**2], index=["x", "x^2"]) @@ -429,6 +430,10 @@ def test_with_nested_series(datetime_series, op_name): expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2}) tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = datetime_series.agg(lambda x: Series([x, x**2], index=["x", "x^2"])) + tm.assert_frame_equal(result, expected) + def test_replicate_describe(string_series): # this also tests a result set that is all scalars @@ -512,10 +517,7 @@ def test_apply_series_on_date_time_index_aware_series(dti, exp, aware): index = dti.tz_localize("UTC").index else: index = dti.index - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = Series(index).apply(lambda x: Series([1, 2])) + result = Series(index).apply(lambda x: Series([1, 2])) tm.assert_frame_equal(result, exp) @@ -662,19 +664,7 @@ def test_apply_dictlike_lambda(ops, by_row, expected): def test_apply_retains_column_name(by_row): # GH 16380 df = DataFrame({"x": range(3)}, Index(range(3), name="x")) - func = lambda x: Series(range(x + 1), Index(range(x + 1), name="y")) - - if not by_row: - # GH53400 - msg = "'Series' object cannot be interpreted as an integer" - with pytest.raises(TypeError, match=msg): - df.x.apply(func, by_row=by_row) - return - - msg = "Returning a DataFrame from Series.apply when the supplied function" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH52123 - result = df.x.apply(func, by_row=by_row) + result = df.x.apply(lambda x: Series(range(x + 1), Index(range(x + 1), name="y"))) expected = DataFrame( [[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]], columns=Index(range(3), name="y"), @@ -689,3 +679,11 @@ def test_apply_type(): result = s.apply(type) expected = Series([int, str, type], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + + +def test_series_apply_unpack_nested_data(): + # GH#55189 + ser = Series([[1, 2, 3], [4, 5, 6, 7]]) + result = ser.apply(lambda x: Series(x)) + expected = DataFrame({0: [1.0, 4.0], 1: [2.0, 5.0], 2: [3.0, 6.0], 3: [np.nan, 7]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 34b526bf97408..0c46d22ddcc2e 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1076,7 +1076,7 @@ def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): # Note: freq here includes both Tick and non-Tick offsets; this is # relevant because historically integer-addition was allowed if we had # a freq. 
- @pytest.mark.parametrize("freq", ["H", "D", "W", "M", "MS", "Q", "B", None]) + @pytest.mark.parametrize("freq", ["H", "D", "W", "2ME", "MS", "Q", "B", None]) @pytest.mark.parametrize("dtype", [None, "uint8"]) def test_dt64arr_addsub_intlike( self, request, dtype, box_with_array, freq, tz_naive_fixture diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 7a079ae7795e6..ee8391830db4c 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -698,7 +698,7 @@ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): # datetime-like arrays pd.date_range("2016-01-01", periods=3, freq="H"), pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), - pd.date_range("2016-01-01", periods=3, freq="S")._data, + pd.date_range("2016-01-01", periods=3, freq="s")._data, pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, # Miscellaneous invalid types 3.14, @@ -794,13 +794,13 @@ def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): if pi_freq == "H": result = pi - td64obj - expected = (pi.to_timestamp("S") - tdi).to_period(pi_freq) + expected = (pi.to_timestamp("s") - tdi).to_period(pi_freq) expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) # Subtract from scalar result = pi[0] - td64obj - expected = (pi[0].to_timestamp("S") - tdi).to_period(pi_freq) + expected = (pi[0].to_timestamp("s") - tdi).to_period(pi_freq) expected = tm.box_expected(expected, box) tm.assert_equal(result, expected) @@ -977,10 +977,10 @@ def test_pi_add_offset_n_gt1_not_divisible(self, box_with_array): pi = tm.box_expected(pi, box_with_array) expected = tm.box_expected(expected, box_with_array) - result = pi + to_offset("3M") + result = pi + to_offset("3ME") tm.assert_equal(result, expected) - result = to_offset("3M") + pi + result = to_offset("3ME") + pi tm.assert_equal(result, expected) # --------------------------------------------------------------- @@ -1048,7 +1048,7 @@ def test_parr_add_timedeltalike_minute_gt1(self, three_days, box_with_array): with pytest.raises(TypeError, match=msg): other - rng - @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5T", "5h", "5d"]) + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5d"]) def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array): # GH#23031 adding a time-delta-like offset to a PeriodArray that has # tick-like frequency with n != 1 diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index e15bac9f0b8aa..3377c411a7084 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -141,7 +141,8 @@ def test_getitem_listlike(self): def test_periodindex(self): idx1 = PeriodIndex( - ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", ) cat1 = Categorical(idx1) @@ -152,7 +153,8 @@ def test_periodindex(self): tm.assert_index_equal(cat1.categories, exp_idx) idx2 = PeriodIndex( - ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", ) cat2 = Categorical(idx2, ordered=True) str(cat2) diff --git a/pandas/tests/arrays/period/test_constructors.py b/pandas/tests/arrays/period/test_constructors.py index ecc9ee745bad8..0ea26a6ece7eb 100644 --- 
a/pandas/tests/arrays/period/test_constructors.py +++ b/pandas/tests/arrays/period/test_constructors.py @@ -144,3 +144,13 @@ def test_freq_deprecated(): expected = PeriodArray(data, dtype="period[M]") tm.assert_equal(res, expected) + + +def test_period_array_from_datetime64(): + arr = np.array( + ["2020-01-01T00:00:00", "2020-02-02T00:00:00"], dtype="datetime64[ns]" + ) + result = PeriodArray._from_datetime64(arr, freq=MonthEnd(2)) + + expected = period_array(["2020-01-01", "2020-02-01"], freq=MonthEnd(2)) + tm.assert_period_array_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py index de93e89ecacd5..89cc31ec5ecc8 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,12 +5,23 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under12p0 + from pandas.core.dtypes.common import is_dtype_equal import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringArray -from pandas.util.version import Version +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) + + +def na_val(dtype): + if dtype.storage == "pyarrow_numpy": + return np.nan + else: + return pd.NA @pytest.fixture @@ -27,21 +38,34 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - expected = " A\n0 a\n1 <NA>\n2 b" + if dtype.storage == "pyarrow_numpy": + expected = " A\n0 a\n1 NaN\n2 b" + else: + expected = " A\n0 a\n1 <NA>\n2 b" assert repr(df) == expected - expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" + if dtype.storage == "pyarrow_numpy": + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + else: + expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" - expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" + else: + arr_name = "StringArray" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected def test_none_to_nan(cls): a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None - assert a[1] is pd.NA + assert a[1] is na_val(a.dtype) def test_setitem_validates(cls): @@ -94,6 +118,14 @@ def test_astype_roundtrip(dtype): result = casted.astype("datetime64[ns]") tm.assert_series_equal(result, ser) + # GH#38509 same thing for timedelta64 + ser2 = ser - ser.iloc[-1] + casted2 = ser2.astype(dtype) + assert is_dtype_equal(casted2.dtype, dtype) + + result2 = casted2.astype(ser2.dtype) + tm.assert_series_equal(result2, ser2) + def test_add(dtype): a = pd.Series(["a", "b", "c", None, None], dtype=dtype) @@ -195,19 +227,30 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == 
"pyarrow_numpy": + expected = np.array([getattr(item, op_name)(other) for item in a]) + expected[1] = False + tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_pd_na(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_not_string(comparison_op, dtype): @@ -223,12 +266,21 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): return result = getattr(a, op_name)(other) - expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ - op_name - ] - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array(expected_data, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected_data = { + "__eq__": [False, False, False], + "__ne__": [True, False, True], + }[op_name] + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + else: + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array(expected_data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_array(comparison_op, dtype): @@ -237,15 +289,25 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.full(len(a), fill_value=None, dtype="object") - expected[-1] = getattr(other[-1], op_name)(a[-1]) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_numpy_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + result = getattr(a, op_name)(pd.NA) + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + 
tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_constructor_raises(cls): @@ -297,7 +359,7 @@ def test_from_sequence_no_mutate(copy, cls, request): result = cls._from_sequence(nan_arr, copy=copy) - if cls is ArrowStringArray: + if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) @@ -315,8 +377,16 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" - with pytest.raises(TypeError, match=msg): + if dtype.storage == "pyarrow_numpy": + err = ValueError + msg = "cannot convert float NaN to integer" + else: + err = TypeError + msg = ( + r"int\(\) argument must be a string, a bytes-like " + r"object or a( real)? number" + ) + with pytest.raises(err, match=msg): arr.astype("int64") @@ -364,7 +434,7 @@ def test_min_max(method, skipna, dtype, request): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is pd.NA + assert result is na_val(arr.dtype) @pytest.mark.parametrize("method", ["min", "max"]) @@ -412,7 +482,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"): + if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) assert arr.equals(expected) @@ -432,7 +502,7 @@ def test_arrow_roundtrip(dtype, string_storage2): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is pd.NA + assert result.loc[2, "a"] is na_val(result["a"].dtype) def test_arrow_load_from_zero_chunks(dtype, string_storage2): @@ -455,6 +525,8 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -470,6 +542,8 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = np.float64 else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -526,7 +600,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", pd.NA, "b"], dtype=object) + expected = np.array(["a", na_val(dtype), "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -566,7 +640,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is pd.NA + assert ser.array[1] is na_val(ser.dtype) # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) diff --git 
a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 1ab628f186b47..c1d424f12bfc4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -12,7 +12,10 @@ StringArray, StringDtype, ) -from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) skip_if_no_pyarrow = pytest.mark.skipif( pa_version_under7p0, @@ -166,6 +169,9 @@ def test_pyarrow_not_installed_raises(): with pytest.raises(ImportError, match=msg): ArrowStringArray([]) + with pytest.raises(ImportError, match=msg): + ArrowStringArrayNumpySemantics([]) + with pytest.raises(ImportError, match=msg): ArrowStringArray._from_sequence(["a", None, "b"]) @@ -235,9 +241,10 @@ def test_setitem_invalid_indexer_raises(): @skip_if_no_pyarrow -def test_pickle_roundtrip(): +@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) +def test_pickle_roundtrip(dtype): # GH 42600 - expected = pd.Series(range(10), dtype="string[pyarrow]") + expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) sliced_pickled = pickle.dumps(expected_sliced) @@ -249,3 +256,11 @@ def test_pickle_roundtrip(): result_sliced = pickle.loads(sliced_pickled) tm.assert_series_equal(result_sliced, expected_sliced) + + +@skip_if_no_pyarrow +def test_string_dtype_error_message(): + # GH#55051 + msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + with pytest.raises(ValueError, match=msg): + StringDtype("bla") diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9eee2e0bea687..291c687d84125 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -11,6 +11,7 @@ OutOfBoundsDatetime, Timestamp, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr import pandas as pd from pandas import ( @@ -31,7 +32,7 @@ # TODO: more freq variants -@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) +@pytest.fixture(params=["D", "B", "W", "ME", "Q", "Y"]) def freqstr(request): """Fixture returning parametrized frequency in string format.""" return request.param @@ -52,6 +53,7 @@ def period_index(freqstr): warnings.filterwarnings( "ignore", message="Period with BDay freq", category=FutureWarning ) + freqstr = freq_to_period_freqstr(1, freqstr) pi = pd.period_range(start=Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi @@ -258,7 +260,7 @@ def test_fillna_method_doesnt_change_orig(self, method): fill_value = arr[3] if method == "pad" else arr[5] - result = arr.pad_or_backfill(method=method) + result = arr._pad_or_backfill(method=method) assert result[4] == fill_value # check that the original was not changed @@ -324,14 +326,12 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): ): arr.searchsorted("foo") - arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" - with pd.option_context("string_storage", string_storage): with pytest.raises( TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - f"or array of those. Got '{arr_type}' instead." + "or array of those. Got string array instead." 
), ): arr.searchsorted([str(arr[1]), "baz"]) @@ -622,13 +622,13 @@ def test_round(self, arr1d): # GH#24064 dti = self.index_cls(arr1d) - result = dti.round(freq="2T") + result = dti.round(freq="2min") expected = dti - pd.Timedelta(minutes=1) expected = expected._with_freq(None) tm.assert_index_equal(result, expected) dta = dti._data - result = dta.round(freq="2T") + result = dta.round(freq="2min") expected = expected._data._with_freq(None) tm.assert_datetime_array_equal(result, expected) @@ -757,6 +757,7 @@ def test_to_period(self, datetime_index, freqstr): dti = datetime_index arr = DatetimeArray(dti) + freqstr = freq_to_period_freqstr(1, freqstr) expected = dti.to_period(freq=freqstr) result = arr.to_period(freq=freqstr) assert isinstance(result, PeriodArray) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8e38a8c741b8d..a105852395b3a 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -497,7 +497,7 @@ def test_fillna_preserves_tz(self, method): dtype=DatetimeTZDtype(tz="US/Central"), ) - result = arr.pad_or_backfill(method=method) + result = arr._pad_or_backfill(method=method) tm.assert_extension_array_equal(result, expected) # assert that arr and dti were not modified in-place @@ -510,12 +510,12 @@ def test_fillna_2d(self): dta[0, 1] = pd.NaT dta[1, 0] = pd.NaT - res1 = dta.pad_or_backfill(method="pad") + res1 = dta._pad_or_backfill(method="pad") expected1 = dta.copy() expected1[1, 0] = dta[0, 0] tm.assert_extension_array_equal(res1, expected1) - res2 = dta.pad_or_backfill(method="backfill") + res2 = dta._pad_or_backfill(method="backfill") expected2 = dta.copy() expected2 = dta.copy() expected2[1, 0] = dta[2, 0] @@ -529,10 +529,10 @@ def test_fillna_2d(self): assert not dta2._ndarray.flags["C_CONTIGUOUS"] tm.assert_extension_array_equal(dta, dta2) - res3 = dta2.pad_or_backfill(method="pad") + res3 = dta2._pad_or_backfill(method="pad") tm.assert_extension_array_equal(res3, expected1) - res4 = dta2.pad_or_backfill(method="backfill") + res4 = dta2._pad_or_backfill(method="backfill") tm.assert_extension_array_equal(res4, expected2) # test the DataFrame method while we're here @@ -746,6 +746,14 @@ def test_iter_zoneinfo_fold(self, tz): assert str(left) == str(right2) assert left.utcoffset() == right2.utcoffset() + def test_date_range_frequency_M_deprecated(self): + depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + + expected = pd.date_range("1/1/2000", periods=4, freq="2ME") + with tm.assert_produces_warning(UserWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq="2M") + tm.assert_index_equal(result, expected) + def test_factorize_sort_without_freq(): dta = DatetimeArray._from_sequence([0, 2, 1]) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 5b9618bfb2abf..db13e979a3c2d 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -176,7 +176,7 @@ def test_iter_box(self): assert s.dtype == "Period[M]" for res, exp in zip(s, vals): assert isinstance(res, pd.Period) - assert res.freq == "M" + assert res.freq == "ME" assert res == exp diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index fe1be2d8b6a0a..76b974330cbf1 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1541,10 +1541,10 @@ def test_chained_where_mask(using_copy_on_write, func): def 
test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, - index=date_range("1/1/2000", periods=4, freq="T"), + index=date_range("1/1/2000", periods=4, freq="min"), ) df_orig = df.copy() - df2 = df.asfreq(freq="T") + df2 = df.asfreq(freq="min") if using_copy_on_write: assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 471e456146178..4507857418e9e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -94,10 +94,10 @@ def test_categorical_dtype(self): [ "period[D]", "period[3M]", - "period[U]", + "period[us]", "Period[D]", "Period[3M]", - "Period[U]", + "Period[us]", ], ) def test_period_dtype(self, dtype): @@ -782,3 +782,9 @@ def test_pandas_dtype_numpy_warning(): match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated", ): pandas_dtype(np.integer) + + +def test_pandas_dtype_ea_not_instance(): + # GH 31356 GH 54592 + with tm.assert_produces_warning(UserWarning): + assert pandas_dtype(CategoricalDtype) == CategoricalDtype() diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index f57093c29b733..6562074eee634 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -467,8 +467,8 @@ def test_identity(self): assert PeriodDtype("period[3D]") == PeriodDtype("period[3D]") assert PeriodDtype("period[3D]") is not PeriodDtype("period[3D]") - assert PeriodDtype("period[1S1U]") == PeriodDtype("period[1000001U]") - assert PeriodDtype("period[1S1U]") is not PeriodDtype("period[1000001U]") + assert PeriodDtype("period[1s1us]") == PeriodDtype("period[1000001us]") + assert PeriodDtype("period[1s1us]") is not PeriodDtype("period[1000001us]") def test_compat(self, dtype): assert not is_datetime64_ns_dtype(dtype) @@ -505,15 +505,15 @@ def test_is_dtype(self, dtype): assert PeriodDtype.is_dtype("period[D]") assert PeriodDtype.is_dtype("period[3D]") assert PeriodDtype.is_dtype(PeriodDtype("3D")) - assert PeriodDtype.is_dtype("period[U]") - assert PeriodDtype.is_dtype("period[S]") - assert PeriodDtype.is_dtype(PeriodDtype("U")) - assert PeriodDtype.is_dtype(PeriodDtype("S")) + assert PeriodDtype.is_dtype("period[us]") + assert PeriodDtype.is_dtype("period[s]") + assert PeriodDtype.is_dtype(PeriodDtype("us")) + assert PeriodDtype.is_dtype(PeriodDtype("s")) assert not PeriodDtype.is_dtype("D") assert not PeriodDtype.is_dtype("3D") assert not PeriodDtype.is_dtype("U") - assert not PeriodDtype.is_dtype("S") + assert not PeriodDtype.is_dtype("s") assert not PeriodDtype.is_dtype("foo") assert not PeriodDtype.is_dtype(np.object_) assert not PeriodDtype.is_dtype(np.int64) @@ -728,7 +728,7 @@ def test_is_dtype(self, dtype): assert not IntervalDtype.is_dtype("D") assert not IntervalDtype.is_dtype("3D") - assert not IntervalDtype.is_dtype("U") + assert not IntervalDtype.is_dtype("us") assert not IntervalDtype.is_dtype("S") assert not IntervalDtype.is_dtype("foo") assert not IntervalDtype.is_dtype("IntervalA") diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 7cd55b7240d54..6efaa95aef1b5 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -56,12 +56,7 @@ class TestMyDtype(BaseDtypeTests): BaseUnaryOpsTests, ) from pandas.tests.extension.base.printing import BasePrintingTests -from pandas.tests.extension.base.reduce import ( # noqa: F401 - BaseBooleanReduceTests, - 
BaseNoReduceTests, - BaseNumericReduceTests, - BaseReduceTests, -) +from pandas.tests.extension.base.reduce import BaseReduceTests from pandas.tests.extension.base.reshaping import BaseReshapingTests from pandas.tests.extension.base.setitem import BaseSetitemTests @@ -90,5 +85,47 @@ class ExtensionTests( BaseReduceTests, BaseReshapingTests, BaseSetitemTests, + Dim2CompatTests, ): pass + + +def __getattr__(name: str): + import warnings + + if name == "BaseNoReduceTests": + warnings.warn( + "BaseNoReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseNoReduceTests + + return BaseNoReduceTests + + elif name == "BaseNumericReduceTests": + warnings.warn( + "BaseNumericReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseNumericReduceTests + + return BaseNumericReduceTests + + elif name == "BaseBooleanReduceTests": + warnings.warn( + "BaseBooleanReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseBooleanReduceTests + + return BaseBooleanReduceTests + + raise AttributeError( + f"module 'pandas.tests.extension.base' has no attribute '{name}'" + ) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 4648f66112e80..9a41a3a582c4a 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -16,16 +16,13 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: return False def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): - alt = ser.astype("float64") - result = getattr(ser, op_name)(skipna=skipna) - - if result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna: - # TODO: avoid special-casing here - pytest.skip( - f"Float32 precision lead to large differences with op {op_name} " - f"and skipna={skipna}" - ) + try: + alt = ser.astype("float64") + except TypeError: + # e.g. Period can't be cast to float64 + alt = ser.astype(object) + result = getattr(ser, op_name)(skipna=skipna) expected = getattr(alt, op_name)(skipna=skipna) tm.assert_series_equal(result, expected, check_dtype=False) @@ -37,5 +34,6 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna): if self._supports_accumulation(ser, op_name): self.check_accumulate(ser, op_name, skipna) else: - with pytest.raises(NotImplementedError): + with pytest.raises((NotImplementedError, TypeError)): + # TODO: require TypeError for things that will _never_ work? 
getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 8828f33b7c62c..7215e910365cf 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -35,7 +35,8 @@ def test_series_constructor(self, data): if hasattr(result._mgr, "blocks"): assert isinstance(result2._mgr.blocks[0], EABackedBlock) - def test_series_constructor_no_data_with_index(self, dtype, na_value): + def test_series_constructor_no_data_with_index(self, dtype): + na_value = dtype.na_value result = pd.Series(index=[1, 2, 3], dtype=dtype) expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) @@ -45,7 +46,8 @@ def test_series_constructor_no_data_with_index(self, dtype, na_value): expected = pd.Series([], index=pd.Index([], dtype="object"), dtype=dtype) tm.assert_series_equal(result, expected) - def test_series_constructor_scalar_na_with_index(self, dtype, na_value): + def test_series_constructor_scalar_na_with_index(self, dtype): + na_value = dtype.na_value result = pd.Series(na_value, index=[1, 2, 3], dtype=dtype) expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index a0c24ee068e81..3d1274df0a21b 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -20,6 +20,17 @@ class Dim2CompatTests: # Note: these are ONLY for ExtensionArray subclasses that support 2D arrays. # i.e. not for pyarrow-backed EAs. + @pytest.fixture(autouse=True) + def skip_if_doesnt_support_2d(self, dtype, request): + if not dtype._supports_2d: + node = request.node + # In cases where we are mixed in to ExtensionTests, we only want to + # skip tests that are defined in Dim2CompatTests + test_func = node._obj + if test_func.__qualname__.startswith("Dim2CompatTests"): + # TODO: is there a less hacky way of checking this? + pytest.skip("Test is only for EAs that support 2D.") + def test_transpose(self, data): arr2d = data.repeat(2).reshape(-1, 2) shape = arr2d.shape @@ -160,9 +171,9 @@ def test_fillna_2d_method(self, data_missing, method): assert arr[0].isna().all() assert not arr[1].isna().any() - result = arr.pad_or_backfill(method=method, limit=None) + result = arr._pad_or_backfill(method=method, limit=None) - expected = data_missing.pad_or_backfill(method=method).repeat(2).reshape(2, 2) + expected = data_missing._pad_or_backfill(method=method).repeat(2).reshape(2, 2) tm.assert_extension_array_equal(result, expected) # Reverse so that backfill is not a no-op. 
@@ -170,10 +181,10 @@ def test_fillna_2d_method(self, data_missing, method): assert not arr2[0].isna().any() assert arr2[1].isna().all() - result2 = arr2.pad_or_backfill(method=method, limit=None) + result2 = arr2._pad_or_backfill(method=method, limit=None) expected2 = ( - data_missing[::-1].pad_or_backfill(method=method).repeat(2).reshape(2, 2) + data_missing[::-1]._pad_or_backfill(method=method).repeat(2).reshape(2, 2) ) tm.assert_extension_array_equal(result2, expected2) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 5f0c1b960a475..be16a105384c6 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -148,7 +148,8 @@ def test_getitem_invalid(self, data): with pytest.raises(IndexError, match=msg): data[-ub - 1] - def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): + def test_getitem_scalar_na(self, data_missing, na_cmp): + na_value = data_missing.dtype.na_value result = data_missing[0] assert na_cmp(result, na_value) @@ -348,7 +349,8 @@ def test_take_sequence(self, data): assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] - def test_take(self, data, na_value, na_cmp): + def test_take(self, data, na_cmp): + na_value = data.dtype.na_value result = data.take([0, -1]) assert result.dtype == data.dtype assert result[0] == data[0] @@ -361,7 +363,8 @@ def test_take(self, data, na_value, na_cmp): with pytest.raises(IndexError, match="out of bounds"): data.take([len(data) + 1]) - def test_take_empty(self, data, na_value, na_cmp): + def test_take_empty(self, data, na_cmp): + na_value = data.dtype.na_value empty = data[:0] result = empty.take([-1], allow_fill=True) @@ -393,7 +396,8 @@ def test_take_non_na_fill_value(self, data_missing): expected = arr.take([1, 1]) tm.assert_extension_array_equal(result, expected) - def test_take_pandas_style_negative_raises(self, data, na_value): + def test_take_pandas_style_negative_raises(self, data): + na_value = data.dtype.na_value with pytest.raises(ValueError, match=""): data.take([0, -2], fill_value=na_value, allow_fill=True) @@ -413,7 +417,8 @@ def test_take_series(self, data): ) tm.assert_series_equal(result, expected) - def test_reindex(self, data, na_value): + def test_reindex(self, data): + na_value = data.dtype.na_value s = pd.Series(data) result = s.reindex([0, 1, 3]) expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3]) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 489f43729a004..5c21c4f7137a5 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -108,9 +108,13 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - df.groupby("B", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("B", group_keys=False).apply(groupby_apply_op) df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - df.groupby("A", group_keys=False).apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("A", group_keys=False).apply(groupby_apply_op) df.groupby("A", group_keys=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, 
data_for_grouping): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2dd62a4ca7538..4e0bc8d804bab 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -70,6 +70,9 @@ def test_value_counts_with_normalize(self, data): ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") + elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": + # TODO: avoid special-casing + expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") @@ -118,7 +121,7 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) tm.assert_series_equal(result, expected) - def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting): # GH 24382 is_bool = data_for_sorting.dtype._is_boolean @@ -151,9 +154,10 @@ def test_argmin_argmax_empty_array(self, method, data): getattr(data[:0], method)() @pytest.mark.parametrize("method", ["argmax", "argmin"]) - def test_argmin_argmax_all_na(self, method, data, na_value): + def test_argmin_argmax_all_na(self, method, data): # all missing with skipna=True is the same as empty err_msg = "attempt to get" + na_value = data.dtype.na_value data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype) with pytest.raises(ValueError, match=err_msg): getattr(data_na, method)() @@ -540,7 +544,8 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): sorter = np.array([1, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 - def test_where_series(self, data, na_value, as_frame): + def test_where_series(self, data, as_frame): + na_value = data.dtype.na_value assert data[0] != data[1] cls = type(data) a, b = data[:2] @@ -667,7 +672,8 @@ def test_insert_invalid_loc(self, data): data.insert(1.5, data[0]) @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) - def test_equals(self, data, na_value, as_series, box): + def test_equals(self, data, as_series, box): + na_value = data.dtype.na_value data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype) data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 2a0bd0770dc07..40cc952d44200 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -94,7 +94,7 @@ def test_fillna_no_op_returns_copy(self, data): assert result is not data tm.assert_extension_array_equal(result, data) - result = data.pad_or_backfill(method="backfill") + result = data._pad_or_backfill(method="backfill") assert result is not data tm.assert_extension_array_equal(result, data) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index d5eb65ec9d35d..d2aa4bd63c428 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -128,12 +130,18 @@ class BaseArithmeticOpsTests(BaseOpsUtil): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar + if all_arithmetic_operators == "__rmod__" and 
is_string_dtype(data.dtype): + pytest.skip("Skip testing Python string formatting") + op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar + if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): + pytest.skip("Skip testing Python string formatting") + op_name = all_arithmetic_operators df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 9b56b10681e15..1b94a635d1748 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -83,6 +83,7 @@ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): ser = pd.Series(data) if not self._supports_reduction(ser, op_name): + # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" "does not support reduction|" @@ -101,6 +102,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): ser = pd.Series(data) if not self._supports_reduction(ser, op_name): + # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" "does not support reduction|" @@ -129,7 +131,8 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna): self.check_reduce_frame(ser, op_name, skipna) -# TODO: deprecate BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests +# TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests, +# BaseBooleanReduceTests class BaseNoReduceTests(BaseReduceTests): """we don't define any reductions""" diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 5d9c03e1b2569..13ab56adfe914 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -72,7 +72,8 @@ def test_concat_mixed_dtypes(self, data): expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) tm.assert_series_equal(result, expected) - def test_concat_columns(self, data, na_value): + def test_concat_columns(self, data): + na_value = data.dtype.na_value df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"B": [1, 2, 3]}) @@ -96,8 +97,9 @@ def test_concat_columns(self, data, na_value): result = pd.concat([df1["A"], df2["B"]], axis=1) tm.assert_frame_equal(result, expected) - def test_concat_extension_arrays_copy_false(self, data, na_value): + def test_concat_extension_arrays_copy_false(self, data): # GH 20756 + na_value = data.dtype.na_value df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"B": data[3:7]}) expected = pd.DataFrame( @@ -122,7 +124,8 @@ def test_concat_with_reindex(self, data): ) tm.assert_frame_equal(result, expected) - def test_align(self, data, na_value): + def test_align(self, data): + na_value = data.dtype.na_value a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) @@ -133,7 +136,8 @@ def test_align(self, data, na_value): tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) - def test_align_frame(self, data, na_value): + def test_align_frame(self, data): + na_value = data.dtype.na_value a = data[:3] b = data[2:5] r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3])) @@ -148,8 +152,9 @@ def test_align_frame(self, data, na_value): tm.assert_frame_equal(r1, 
e1) tm.assert_frame_equal(r2, e2) - def test_align_series_frame(self, data, na_value): + def test_align_series_frame(self, data): # https://github.com/pandas-dev/pandas/issues/20576 + na_value = data.dtype.na_value ser = pd.Series(data, name="a") df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) r1, r2 = ser.align(df) @@ -180,7 +185,7 @@ def test_set_frame_overwrite_object(self, data): df["A"] = data assert df.dtypes["A"] == data.dtype - def test_merge(self, data, na_value): + def test_merge(self, data): # GH-20743 df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]}) df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]}) @@ -205,7 +210,8 @@ def test_merge(self, data, na_value): "int2": [1, 2, 3, np.nan, 4], "key": [0, 0, 1, 2, 3], "ext": data._from_sequence( - [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype + [data[0], data[0], data[1], data[2], data.dtype.na_value], + dtype=data.dtype, ), } ) @@ -236,9 +242,9 @@ def test_merge_on_extension_array_duplicates(self, data): result = pd.merge(df1, df2, on="key") expected = pd.DataFrame( { - "key": key.take([0, 0, 0, 0, 1]), - "val_x": [1, 1, 3, 3, 2], - "val_y": [1, 3, 1, 3, 2], + "key": key.take([0, 0, 1, 2, 2]), + "val_x": [1, 1, 2, 3, 3], + "val_y": [1, 3, 2, 1, 3], } ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index fd90635deffac..c975138837e6b 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -359,7 +359,8 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): tm.assert_frame_equal(result, expected) - def test_setitem_with_expansion_row(self, data, na_value): + def test_setitem_with_expansion_row(self, data): + na_value = data.dtype.na_value df = pd.DataFrame({"data": data[:1]}) df.loc[1, "data"] = data[1] diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 7b7945b15ed83..a94f7de283d01 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -2,6 +2,8 @@ import pytest +from pandas._config.config import _get_option + from pandas import ( Series, options, @@ -116,12 +118,6 @@ def na_cmp(): return operator.is_ -@pytest.fixture -def na_value(dtype): - """The scalar missing value for this type. Default dtype.na_value""" - return dtype.na_value - - @pytest.fixture def data_for_grouping(): """ @@ -218,4 +214,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. 
""" - return options.mode.copy_on_write and options.mode.data_manager == "block" + return ( + options.mode.copy_on_write + and _get_option("mode.data_manager", silent=True) == "block" + ) diff --git a/pandas/tests/extension/date/array.py b/pandas/tests/extension/date/array.py index 39accd6d223a7..2306f5974ba18 100644 --- a/pandas/tests/extension/date/array.py +++ b/pandas/tests/extension/date/array.py @@ -176,9 +176,13 @@ def isna(self) -> np.ndarray: @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): if isinstance(scalars, dt.date): - pass + raise TypeError elif isinstance(scalars, DateArray): - pass + if dtype is not None: + return scalars.astype(dtype, copy=copy) + if copy: + return scalars.copy() + return scalars[:] elif isinstance(scalars, np.ndarray): scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd return DateArray(scalars) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index d2c3cb395ed44..9ce7ac309b6d3 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -285,7 +285,7 @@ def value_counts(self, dropna: bool = True): return value_counts(self.to_numpy(), dropna=dropna) # We override fillna here to simulate a 3rd party EA that has done so. This - # lets us test the deprecation telling authors to implement pad_or_backfill + # lets us test the deprecation telling authors to implement _pad_or_backfill # Simulate a 3rd-party EA that has not yet updated to include a "copy" # keyword in its fillna method. # error: Signature of "fillna" incompatible with supertype "ExtensionArray" diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 2f274354f0da0..8dbd1c4c511d0 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -133,58 +133,96 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): def test_fillna_frame(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): super().test_fillna_frame(data_missing) def test_fillna_limit_pad(self, data_missing): msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_pad(data_missing) + + msg = "The 'method' keyword in DecimalArray.fillna is deprecated" + with tm.assert_produces_warning( + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, ): super().test_fillna_limit_pad(data_missing) def test_fillna_limit_backfill(self, data_missing): - msg = "|".join( - [ - "ExtensionArray.fillna added a 'copy' keyword", - "Series.fillna with 'method' is deprecated", - ] - ) + msg = "Series.fillna with 'method' is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, ): super().test_fillna_limit_backfill(data_missing) - def test_fillna_no_op_returns_copy(self, data): msg = "ExtensionArray.fillna 'method' keyword is deprecated" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, + match=msg, + 
check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_backfill(data_missing) + + msg = "The 'method' keyword in DecimalArray.fillna is deprecated" + with tm.assert_produces_warning( + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + super().test_fillna_limit_backfill(data_missing) + + def test_fillna_no_op_returns_copy(self, data): + msg = "|".join( + [ + "ExtensionArray.fillna 'method' keyword is deprecated", + "The 'method' keyword in DecimalArray.fillna is deprecated", + ] + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False ): super().test_fillna_no_op_returns_copy(data) def test_fillna_series(self, data_missing): msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + DeprecationWarning, match=msg, check_stacklevel=False ): super().test_fillna_series(data_missing) def test_fillna_series_method(self, data_missing, fillna_method): - msg = "ExtensionArray.fillna 'method' keyword is deprecated" + msg = "|".join( + [ + "ExtensionArray.fillna 'method' keyword is deprecated", + "The 'method' keyword in DecimalArray.fillna is deprecated", + ] + ) with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + (FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False ): super().test_fillna_series_method(data_missing, fillna_method) def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - warn = FutureWarning if not using_copy_on_write else None + warn = DeprecationWarning if not using_copy_on_write else None msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): super().test_fillna_copy_frame(data_missing) def test_fillna_copy_series(self, data_missing, using_copy_on_write): - warn = FutureWarning if not using_copy_on_write else None + warn = DeprecationWarning if not using_copy_on_write else None msg = "ExtensionArray.fillna added a 'copy' keyword" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): super().test_fillna_copy_series(data_missing) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 9e1a4fb5da251..68a27a28b160f 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -97,24 +97,24 @@ def test_from_dtype(self, data): super().test_from_dtype(data) @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_no_data_with_index(self, dtype, na_value): + def test_series_constructor_no_data_with_index(self, dtype): # RecursionError: maximum recursion depth exceeded in comparison rec_limit = sys.getrecursionlimit() try: # Limit to avoid stack overflow on Windows CI sys.setrecursionlimit(100) - super().test_series_constructor_no_data_with_index(dtype, na_value) + super().test_series_constructor_no_data_with_index(dtype) finally: sys.setrecursionlimit(rec_limit) @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_scalar_na_with_index(self, dtype, na_value): + def test_series_constructor_scalar_na_with_index(self, dtype): # RecursionError: maximum recursion depth exceeded in comparison rec_limit = sys.getrecursionlimit() try: # Limit to avoid stack overflow on Windows CI sys.setrecursionlimit(100) - super().test_series_constructor_scalar_na_with_index(dtype, na_value) + 
super().test_series_constructor_scalar_na_with_index(dtype) finally: sys.setrecursionlimit(rec_limit) @@ -214,19 +214,19 @@ def test_combine_first(self, data): super().test_combine_first(data) @pytest.mark.xfail(reason="broadcasting error") - def test_where_series(self, data, na_value): + def test_where_series(self, data): # Fails with # *** ValueError: operands could not be broadcast together # with shapes (4,) (4,) (0,) - super().test_where_series(data, na_value) + super().test_where_series(data) @pytest.mark.xfail(reason="Can't compare dicts.") def test_searchsorted(self, data_for_sorting): super().test_searchsorted(data_for_sorting) @pytest.mark.xfail(reason="Can't compare dicts.") - def test_equals(self, data, na_value, as_series): - super().test_equals(data, na_value, as_series) + def test_equals(self, data, as_series): + super().test_equals(data, as_series) @pytest.mark.skip("fill-value is interpreted as a dict of values") def test_fillna_copy_frame(self, data_missing): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5955cfc2ef5e4..339e97e735f85 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -31,6 +31,7 @@ import pytest from pandas._libs import lib +from pandas._libs.tslibs import timezones from pandas.compat import ( PY311, is_ci_environment, @@ -40,6 +41,7 @@ pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under14p0, ) from pandas.core.dtypes.dtypes import ( @@ -518,9 +520,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_boolean( - self, data, all_boolean_reductions, skipna, na_value, request - ): + def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna, request): pa_dtype = data.dtype.pyarrow_dtype xfail_mark = pytest.mark.xfail( raises=TypeError, @@ -724,7 +724,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): def test_invert(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if not pa.types.is_boolean(pa_dtype): + if not (pa.types.is_boolean(pa_dtype) or pa.types.is_integer(pa_dtype)): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -753,9 +753,7 @@ def test_value_counts_returns_pyarrow_int64(self, data): result = data.value_counts() assert result.dtype == ArrowDtype(pa.int64()) - def test_argmin_argmax( - self, data_for_sorting, data_missing_for_sorting, na_value, request - ): + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, request): pa_dtype = data_for_sorting.dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype) and pa_version_under7p0: request.node.add_marker( @@ -764,7 +762,7 @@ def test_argmin_argmax( raises=pa.ArrowNotImplementedError, ) ) - super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value) + super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting) @pytest.mark.parametrize( "op_name, skipna, expected", @@ -921,7 +919,7 @@ def _is_temporal_supported(self, opname, pa_dtype): or ( opname in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__") - and not pa_version_under13p0 + and not pa_version_under14p0 ) ) and pa.types.is_duration(pa_dtype) @@ -1033,9 +1031,7 @@ def _get_arith_xfail_marker(self, opname, pa_dtype): def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): pa_dtype = 
data.dtype.pyarrow_dtype - if all_arithmetic_operators == "__rmod__" and ( - pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) - ): + if all_arithmetic_operators == "__rmod__" and (pa.types.is_binary(pa_dtype)): pytest.skip("Skip testing Python string formatting") mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) @@ -1287,6 +1283,31 @@ def test_logical_masked_numpy(self, op, exp): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_bitwise(pa_type): + # GH 54495 + dtype = ArrowDtype(pa_type) + left = pd.Series([1, None, 3, 4], dtype=dtype) + right = pd.Series([None, 3, 5, 4], dtype=dtype) + + result = left | right + expected = pd.Series([None, None, 3 | 5, 4 | 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = left & right + expected = pd.Series([None, None, 3 & 5, 4 & 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = left ^ right + expected = pd.Series([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype) + tm.assert_series_equal(result, expected) + + result = ~left + expected = ~(left.fillna(0).to_numpy()) + expected = pd.Series(expected, dtype=dtype).mask(left.isnull()) + tm.assert_series_equal(result, expected) + + def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): with pytest.raises(NotImplementedError, match="Passing pyarrow type"): ArrowDtype.construct_from_string("not_a_real_dype[s, tz=UTC][pyarrow]") @@ -1576,6 +1597,19 @@ def test_to_numpy_null_array_no_dtype(): tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_without_dtype(): + # GH 54808 + arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]") + result = arr.to_numpy(na_value=False) + expected = np.array([True, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]") + result = arr.to_numpy(na_value=0.0) + expected = np.array([1.0, 0.0], dtype=np.float32) + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy() @@ -2399,7 +2433,7 @@ def test_dt_tz(tz): dtype=ArrowDtype(pa.timestamp("ns", tz=tz)), ) result = ser.dt.tz - assert result == tz + assert result == timezones.maybe_get_tz(tz) def test_dt_isocalendar(): @@ -2472,7 +2506,7 @@ def test_dt_roundlike_unsupported_freq(method): @pytest.mark.xfail( pa_version_under7p0, reason="Methods not supported for pyarrow < 7.0" ) -@pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize("freq", ["D", "H", "min", "s", "ms", "us", "ns"]) @pytest.mark.parametrize("method", ["ceil", "floor", "round"]) def test_dt_ceil_year_floor(freq, method): ser = pd.Series( @@ -2960,6 +2994,15 @@ def test_groupby_count_return_arrow_dtype(data_missing): tm.assert_frame_equal(result, expected) +def test_fixed_size_list(): + # GH#55000 + ser = pd.Series( + [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2)) + ) + result = ser.dtype.type + assert result == list + + def test_arrowextensiondtype_dataframe_repr(): # GH 54062 df = pd.DataFrame( @@ -2972,3 +3015,34 @@ def test_arrowextensiondtype_dataframe_repr(): # pyarrow.ExtensionType values are displayed expected = " col\n0 15340\n1 15341\n2 15342" assert result == expected + + +@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES) +def test_duration_fillna_numpy(pa_type): + # GH 54707 + ser1 = pd.Series([None, 2], dtype=ArrowDtype(pa_type)) + ser2 = pd.Series(np.array([1, 3], 
dtype=f"m8[{pa_type.unit}]")) + result = ser1.fillna(ser2) + expected = pd.Series([1, 2], dtype=ArrowDtype(pa_type)) + tm.assert_series_equal(result, expected) + + +def test_comparison_not_propagating_arrow_error(): + # GH#54944 + a = pd.Series([1 << 63], dtype="uint64[pyarrow]") + b = pd.Series([None], dtype="int64[pyarrow]") + with pytest.raises(pa.lib.ArrowInvalid, match="Integer value"): + a < b + + +def test_factorize_chunked_dictionary(): + # GH 54844 + pa_array = pa.chunked_array( + [pa.array(["a"]).dictionary_encode(), pa.array(["b"]).dictionary_encode()] + ) + ser = pd.Series(ArrowExtensionArray(pa_array)) + res_indices, res_uniques = ser.factorize() + exp_indicies = np.array([0, 1], dtype=np.intp) + exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks())) + tm.assert_numpy_array_equal(res_indices, exp_indicies) + tm.assert_index_equal(res_uniques, exp_uniques) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 79b8e9ddbf8f5..82b6c54bc3106 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -165,14 +165,14 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) ) super().test_arith_series_with_scalar(data, op_name) - def _compare_other(self, s, data, op, other): + def _compare_other(self, ser: pd.Series, data, op, other): op_name = f"__{op.__name__}__" if op_name not in ["__eq__", "__ne__"]: msg = "Unordered Categoricals can only compare equality or not" with pytest.raises(TypeError, match=msg): op(data, other) else: - return super()._compare_other(s, data, op, other) + return super()._compare_other(ser, data, op, other) @pytest.mark.xfail(reason="Categorical overrides __repr__") @pytest.mark.parametrize("size", ["big", "small"]) diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 97773d0d40a57..5a7b15ddb01ce 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -82,78 +82,63 @@ def cmp(a, b): # ---------------------------------------------------------------------------- -class BaseDatetimeTests: - pass +class TestDatetimeArray(base.ExtensionTests): + def _get_expected_exception(self, op_name, obj, other): + if op_name in ["__sub__", "__rsub__"]: + return None + return super()._get_expected_exception(op_name, obj, other) + def _supports_accumulation(self, ser, op_name: str) -> bool: + return op_name in ["cummin", "cummax"] -# ---------------------------------------------------------------------------- -# Tests -class TestDatetimeDtype(BaseDatetimeTests, base.BaseDtypeTests): - pass + def _supports_reduction(self, obj, op_name: str) -> bool: + return op_name in ["min", "max", "median", "mean", "std", "any", "all"] + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): + meth = all_boolean_reductions + msg = f"'{meth}' with datetime64 dtypes is deprecated and will raise in" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) -class TestConstructors(BaseDatetimeTests, base.BaseConstructorsTests): def test_series_constructor(self, data): # Series construction drops any .freq attr data = data._with_freq(None) super().test_series_constructor(data) - -class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): - pass - - -class 
TestIndex(base.BaseIndexTests): - pass - - -class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) - -class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): - pass - - -class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests): - implements = {"__sub__", "__rsub__"} - - def _get_expected_exception(self, op_name, obj, other): - if op_name in self.implements: - return None - return super()._get_expected_exception(op_name, obj, other) - - -class TestCasting(BaseDatetimeTests, base.BaseCastingTests): - pass - - -class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests): - pass - - -class TestMissing(BaseDatetimeTests, base.BaseMissingTests): - pass - - -class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests): - pass - - -class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests): - pass - - -class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests): - pass - - -class TestPrinting(BaseDatetimeTests, base.BasePrintingTests): - pass - - -class Test2DCompat(BaseDatetimeTests, base.NDArrayBacked2DTests): + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data): + expected_msg = r".*must implement _from_sequence_of_strings.*" + with pytest.raises(NotImplementedError, match=expected_msg): + super().test_EA_types(engine, data) + + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): + if op_name in ["median", "mean", "std"]: + alt = ser.astype("int64") + + res_op = getattr(ser, op_name) + exp_op = getattr(alt, op_name) + result = res_op(skipna=skipna) + expected = exp_op(skipna=skipna) + if op_name in ["mean", "median"]: + # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" + # has no attribute "tz" + tz = ser.dtype.tz # type: ignore[union-attr] + expected = pd.Timestamp(expected, tz=tz) + else: + expected = pd.Timedelta(expected) + tm.assert_almost_equal(result, expected) + + else: + return super().check_reduce(ser, op_name, skipna) + + +class Test2DCompat(base.NDArrayBacked2DTests): pass diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 7efb8fbad8cd1..d27e9b8b9e983 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -13,6 +13,8 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ +import warnings + import numpy as np import pytest @@ -186,7 +188,14 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): if sdtype.kind in "iu": if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.fillna(np.nan).astype("Float64") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + filled = expected.fillna(np.nan) + expected = filled.astype("Float64") else: # combine method result in 'biggest' (int64) dtype expected = expected.astype(sdtype) @@ -323,6 +332,13 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): else: expected_dtype = f"Int{length}" + if expected_dtype == "Float32" and op_name == "cumprod" and skipna: + # TODO: xfail? 
+ pytest.skip( + f"Float32 precision leads to large differences with op {op_name} " + f"and skipna={skipna}" + ) + if op_name == "cumsum": result = getattr(ser, op_name)(skipna=skipna) expected = pd.Series( diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 63297c20daa97..2d1d213322bac 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -13,10 +13,17 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ +from __future__ import annotations + +from typing import TYPE_CHECKING + import numpy as np import pytest -from pandas._libs import iNaT +from pandas._libs import ( + Period, + iNaT, +) from pandas.compat import is_platform_windows from pandas.compat.numpy import np_version_gte1p24 @@ -26,6 +33,9 @@ from pandas.core.arrays import PeriodArray from pandas.tests.extension import base +if TYPE_CHECKING: + import pandas as pd + @pytest.fixture(params=["D", "2D"]) def dtype(request): @@ -61,27 +71,36 @@ def data_for_grouping(dtype): return PeriodArray([B, B, NA, NA, A, A, B, C], dtype=dtype) -class BasePeriodTests: - pass - - -class TestPeriodDtype(BasePeriodTests, base.BaseDtypeTests): - pass +class TestPeriodArray(base.ExtensionTests): + def _get_expected_exception(self, op_name, obj, other): + if op_name in ("__sub__", "__rsub__"): + return None + return super()._get_expected_exception(op_name, obj, other) + def _supports_accumulation(self, ser, op_name: str) -> bool: + return op_name in ["cummin", "cummax"] -class TestConstructors(BasePeriodTests, base.BaseConstructorsTests): - pass + def _supports_reduction(self, obj, op_name: str) -> bool: + return op_name in ["min", "max", "median"] + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): + if op_name == "median": + res_op = getattr(ser, op_name) -class TestGetitem(BasePeriodTests, base.BaseGetitemTests): - pass + alt = ser.astype("int64") + exp_op = getattr(alt, op_name) + result = res_op(skipna=skipna) + expected = exp_op(skipna=skipna) + # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no + # attribute "freq" + freq = ser.dtype.freq # type: ignore[union-attr] + expected = Period._from_ordinal(int(expected), freq=freq) + tm.assert_almost_equal(result, expected) -class TestIndex(base.BaseIndexTests): - pass - + else: + return super().check_reduce(ser, op_name, skipna) -class TestMethods(BasePeriodTests, base.BaseMethodsTests): @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods): if is_platform_windows() and np_version_gte1p24: @@ -96,48 +115,5 @@ def test_map(self, data, na_action): tm.assert_extension_array_equal(result, data) -class TestInterface(BasePeriodTests, base.BaseInterfaceTests): - pass - - -class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests): - def _get_expected_exception(self, op_name, obj, other): - if op_name in ("__sub__", "__rsub__"): - return None - return super()._get_expected_exception(op_name, obj, other) - - -class TestCasting(BasePeriodTests, base.BaseCastingTests): - pass - - -class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests): - pass - - -class TestMissing(BasePeriodTests, base.BaseMissingTests): - pass - - -class TestReshaping(BasePeriodTests, base.BaseReshapingTests): - pass - - -class TestSetitem(BasePeriodTests, base.BaseSetitemTests): - pass - - -class TestGroupby(BasePeriodTests, base.BaseGroupbyTests): - pass - - -class TestPrinting(BasePeriodTests, base.BasePrintingTests): - pass - - -class 
TestParsing(BasePeriodTests, base.BaseParsingTests): - pass - - -class Test2DCompat(BasePeriodTests, base.NDArrayBacked2DTests): +class Test2DCompat(base.NDArrayBacked2DTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 01448a2f83f75..f56dea3f43de7 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -146,29 +146,29 @@ def test_concat_mixed_dtypes(self, data): def test_stack(self, data, columns, future_stack): super().test_stack(data, columns, future_stack) - def test_concat_columns(self, data, na_value): + def test_concat_columns(self, data): self._check_unsupported(data) - super().test_concat_columns(data, na_value) + super().test_concat_columns(data) - def test_concat_extension_arrays_copy_false(self, data, na_value): + def test_concat_extension_arrays_copy_false(self, data): self._check_unsupported(data) - super().test_concat_extension_arrays_copy_false(data, na_value) + super().test_concat_extension_arrays_copy_false(data) - def test_align(self, data, na_value): + def test_align(self, data): self._check_unsupported(data) - super().test_align(data, na_value) + super().test_align(data) - def test_align_frame(self, data, na_value): + def test_align_frame(self, data): self._check_unsupported(data) - super().test_align_frame(data, na_value) + super().test_align_frame(data) - def test_align_series_frame(self, data, na_value): + def test_align_series_frame(self, data): self._check_unsupported(data) - super().test_align_series_frame(data, na_value) + super().test_align_series_frame(data) - def test_merge(self, data, na_value): + def test_merge(self, data): self._check_unsupported(data) - super().test_merge(data, na_value) + super().test_merge(data) class TestGetitem(BaseSparseTests, base.BaseGetitemTests): @@ -180,9 +180,9 @@ def test_get(self, data): assert ser.get(4) == ser.iloc[2] assert ser.get(2) == ser.iloc[1] - def test_reindex(self, data, na_value): + def test_reindex(self, data): self._check_unsupported(data) - super().test_reindex(data, na_value) + super().test_reindex(data) class TestSetitem(BaseSparseTests, base.BaseSetitemTests): @@ -220,8 +220,10 @@ def test_fillna_no_op_returns_copy(self, data, request): super().test_fillna_no_op_returns_copy(data) @pytest.mark.xfail(reason="Unsupported") - def test_fillna_series(self): + def test_fillna_series(self, data_missing): # this one looks doable. + # TODO: this fails bc we do not pass through data_missing. 
If we did, + # the 0-fill case would xpass super().test_fillna_series() def test_fillna_frame(self, data_missing): @@ -280,7 +282,7 @@ def test_fillna_copy_series(self, data_missing, using_copy_on_write): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) - def test_where_series(self, data, na_value): + def test_where_series(self, data): assert data[0] != data[1] cls = type(data) a, b = data[:2] @@ -291,6 +293,7 @@ def test_where_series(self, data, na_value): result = ser.where(cond) new_dtype = SparseDtype("float", 0.0) + na_value = data.dtype.na_value expected = pd.Series( cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype) ) @@ -314,15 +317,15 @@ def test_shift_0_periods(self, data): assert result._sparse_values[0] != result._sparse_values[1] @pytest.mark.parametrize("method", ["argmax", "argmin"]) - def test_argmin_argmax_all_na(self, method, data, na_value): + def test_argmin_argmax_all_na(self, method, data): # overriding because Sparse[int64, 0] cannot handle na_value self._check_unsupported(data) - super().test_argmin_argmax_all_na(method, data, na_value) + super().test_argmin_argmax_all_na(method, data) @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) - def test_equals(self, data, na_value, as_series, box): + def test_equals(self, data, as_series, box): self._check_unsupported(data) - super().test_equals(data, na_value, as_series, box) + super().test_equals(data, as_series, box) @pytest.mark.parametrize( "func, na_action, expected", @@ -349,7 +352,9 @@ def test_map_raises(self, data, na_action): class TestCasting(BaseSparseTests, base.BaseCastingTests): @pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype") - def test_astype_string(self, data): + def test_astype_string(self, data, nullable_string_dtype): + # TODO: this fails bc we do not pass through nullable_string_dtype; + # If we did, the 0-cases would xpass super().test_astype_string(data) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 8268de9a47f11..840dd1057745f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -158,7 +158,11 @@ def test_fillna_no_op_returns_copy(self, data): class TestReduce(base.BaseReduceTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return op_name in ["min", "max"] + return ( + op_name in ["min", "max"] + or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + and op_name in ("any", "all") + ) class TestMethods(base.BaseMethodsTests): @@ -176,6 +180,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: ignore[union-attr] cast_to = "boolean[pyarrow]" + elif dtype.storage == "pyarrow_numpy": # type: ignore[union-attr] + cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" return pointwise_result.astype(cast_to) @@ -201,7 +207,7 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) - def arrow_not_supported(self, data, request): + def arrow_not_supported(self, data): if isinstance(data, ArrowStringArray): pytest.skip(reason="2D support not implemented for ArrowStringArray") diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 59dca5055f170..7dffa7bb242d5 100644 --- 
a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -93,7 +93,7 @@ def test_from_records_sequencelike(self): tup.extend(b.iloc[i].values) tuples.append(tuple(tup)) - recarray = np.array(tuples, dtype=dtypes).view(np.recarray) + recarray = np.array(tuples, dtype=dtypes).view(np.rec.recarray) recarray2 = df.to_records() lists = [list(x) for x in tuples] diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 9fed2116b2896..9d9324f557c8d 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -458,6 +458,14 @@ def test_getitem_datetime_slice(self): ): df["2011-01-01":"2011-11-01"] + def test_getitem_slice_same_dim_only_one_axis(self): + # GH#54622 + df = DataFrame(np.random.default_rng(2).standard_normal((10, 8))) + result = df.iloc[(slice(None, None, 2),)] + assert result.shape == (5, 8) + expected = df.iloc[slice(None, None, 2), slice(None)] + tm.assert_frame_equal(result, expected) + class TestGetitemDeprecatedIndexers: @pytest.mark.parametrize("key", [{"a", "b"}, {"a": "a"}]) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 288aa1af746b6..b324291bab31e 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -337,18 +337,12 @@ def test_setitem(self, float_frame, using_copy_on_write): def test_setitem2(self): # dtype changing GH4204 df = DataFrame([[0, 0]]) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - df.iloc[0] = np.nan + df.iloc[0] = np.nan expected = DataFrame([[np.nan, np.nan]]) tm.assert_frame_equal(df, expected) df = DataFrame([[0, 0]]) - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - df.loc[0] = np.nan + df.loc[0] = np.nan tm.assert_frame_equal(df, expected) def test_setitem_boolean(self, float_frame): @@ -1579,9 +1573,7 @@ def test_setitem(self, uint64_frame): # With NaN: because uint64 has no NaN element, # the column should be cast to object. df2 = df.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT result = df2["B"] @@ -1901,19 +1893,19 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. 
- def _check_setitem_invalid(self, df, invalid, indexer): + def _check_setitem_invalid(self, df, invalid, indexer, warn): msg = "Setting an item of incompatible dtype is deprecated" msg = re.escape(msg) orig_df = df.copy() # iloc - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): df.iloc[indexer, 0] = invalid df = orig_df.copy() # loc - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): df.loc[indexer, "a"] = invalid df = orig_df.copy() @@ -1934,16 +1926,20 @@ def _check_setitem_invalid(self, df, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): df = DataFrame({"a": [True, False, False]}, dtype="bool") - self._check_setitem_invalid(df, invalid, indexer) + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer) + if isna(invalid) and invalid is not pd.NaT: + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(df, invalid, indexer, warn) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype) - self._check_setitem_invalid(df, invalid, indexer) + self._check_setitem_invalid(df, invalid, indexer, FutureWarning) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 3d3df2d714ca4..4576a86ad27cd 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -96,6 +96,7 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -170,12 +171,13 @@ def test_where_invalid(self): with pytest.raises(ValueError, match=msg): df.mask(0) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace def _check_set(df, cond, check_dtypes=True): dfi = df.copy() - econd = cond.reindex_like(df).fillna(True) + econd = cond.reindex_like(df).fillna(True).infer_objects(copy=False) expected = dfi.mask(~econd) return_value = dfi.where(cond, np.nan, inplace=True) @@ -356,7 +358,9 @@ def test_where_bug_transposition(self): expected = a.copy() expected[~do_not_replace] = b - result = a.where(do_not_replace, b) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = a.where(do_not_replace, b) tm.assert_frame_equal(result, expected) a = DataFrame({0: [4, 6], 1: [1, 0]}) @@ -366,7 +370,8 @@ def test_where_bug_transposition(self): expected = a.copy() expected[~do_not_replace] = b - result = a.where(do_not_replace, b) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = a.where(do_not_replace, b) tm.assert_frame_equal(result, expected) 
def test_where_datetime(self): @@ -718,7 +723,9 @@ def test_where_ea_other(self): ser2 = Series(arr[:2], index=["A", "B"]) expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]}) expected["B"] = expected["B"].astype(object) - result = df.where(mask, ser2, axis=1) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.where(mask, ser2, axis=1) tm.assert_frame_equal(result, expected) def test_where_interval_noop(self): @@ -735,7 +742,10 @@ def test_where_interval_fullop_downcast(self, frame_or_series): # GH#45768 obj = frame_or_series([pd.Interval(0, 0)] * 2) other = frame_or_series([1.0, 2.0]) - res = obj.where(~obj.notna(), other) + + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = obj.where(~obj.notna(), other) # since all entries are being changed, we will downcast result # from object to ints (not floats) @@ -780,7 +790,9 @@ def test_where_datetimelike_noop(self, dtype): # opposite case where we are replacing *all* values -> we downcast # from object dtype # GH#45768 - res5 = df.where(mask2, 4) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res5 = df.where(mask2, 4) expected = DataFrame(4, index=df.index, columns=df.columns) tm.assert_frame_equal(res5, expected) @@ -984,10 +996,18 @@ def test_where_downcast_to_td64(): td = pd.Timedelta(days=1) - res = ser.where(mask, td) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.where(mask, td) expected = Series([td, td, td], dtype="m8[ns]") tm.assert_series_equal(res, expected) + with pd.option_context("future.no_silent_downcasting", True): + with tm.assert_produces_warning(None, match=msg): + res2 = ser.where(mask, td) + expected2 = expected.astype(object) + tm.assert_series_equal(res2, expected2) + def _check_where_equivalences(df, mask, other, expected): # similar to tests.series.indexing.test_setitem.SetitemCastingEquivalences diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 2c5137db94c16..a2f8f3e278395 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -48,7 +48,7 @@ def test_asfreq2(self, frame_or_series): monthly_ts = daily_ts.asfreq(offsets.BMonthEnd()) tm.assert_equal(monthly_ts, ts) - result = ts[:0].asfreq("M") + result = ts[:0].asfreq("ME") assert len(result) == 0 assert result is not ts @@ -76,7 +76,7 @@ def test_tz_aware_asfreq_smoke(self, tz, frame_or_series): ) # it works! 
- obj.asfreq("T") + obj.asfreq("min") def test_asfreq_normalize(self, frame_or_series): rng = date_range("1/1/2000 09:30", periods=20) @@ -170,7 +170,7 @@ def test_asfreq_fillvalue(self): # test for fill value during upsampling, related to issue 3715 # setup - rng = date_range("1/1/2016", periods=10, freq="2S") + rng = date_range("1/1/2016", periods=10, freq="2s") # Explicit cast to 'float' to avoid implicit cast when setting None ts = Series(np.arange(len(rng)), index=rng, dtype="float") df = DataFrame({"one": ts}) @@ -178,13 +178,13 @@ def test_asfreq_fillvalue(self): # insert pre-existing missing value df.loc["2016-01-01 00:00:08", "one"] = None - actual_df = df.asfreq(freq="1S", fill_value=9.0) - expected_df = df.asfreq(freq="1S").fillna(9.0) + actual_df = df.asfreq(freq="1s", fill_value=9.0) + expected_df = df.asfreq(freq="1s").fillna(9.0) expected_df.loc["2016-01-01 00:00:08", "one"] = None tm.assert_frame_equal(expected_df, actual_df) - expected_series = ts.asfreq(freq="1S").fillna(9.0) - actual_series = ts.asfreq(freq="1S", fill_value=9.0) + expected_series = ts.asfreq(freq="1s").fillna(9.0) + actual_series = ts.asfreq(freq="1s", fill_value=9.0) tm.assert_series_equal(expected_series, actual_series) def test_asfreq_with_date_object_index(self, frame_or_series): @@ -221,11 +221,11 @@ def test_asfreq_after_normalize(self, unit): @pytest.mark.parametrize( "freq, freq_half", [ - ("2M", "M"), + ("2ME", "ME"), (MonthEnd(2), MonthEnd(1)), ], ) - def test_asfreq_2M(self, freq, freq_half): + def test_asfreq_2ME(self, freq, freq_half): index = date_range("1/1/2000", periods=6, freq=freq_half) df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)}) expected = df.asfreq(freq=freq) @@ -233,3 +233,13 @@ def test_asfreq_2M(self, freq, freq_half): index = date_range("1/1/2000", periods=3, freq=freq) result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)}) tm.assert_frame_equal(result, expected) + + def test_asfreq_frequency_M_deprecated(self): + depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + + index = date_range("1/1/2000", periods=4, freq="ME") + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) + expected = df.asfreq(freq="5ME") + with tm.assert_produces_warning(UserWarning, match=depr_msg): + result = df.asfreq(freq="5M") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 9bd032a0aefc4..ed8ccaea92c58 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -151,7 +151,11 @@ def test_clip_with_na_args(self, float_frame): # GH#19992 and adjusted in GH#40420 df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) - result = df.clip(lower=[4, 5, np.nan], axis=0) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + # TODO: avoid this warning here? seems like we should never be upcasting + # in the first place? 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.clip(lower=[4, 5, np.nan], axis=0) expected = DataFrame( {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]} ) @@ -167,7 +171,8 @@ def test_clip_with_na_args(self, float_frame): data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} df = DataFrame(data) t = Series([2, -4, np.nan, 6, 3]) - result = df.clip(lower=t, axis=0) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.clip(lower=t, axis=0) expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) tm.assert_frame_equal(result, expected) @@ -177,3 +182,14 @@ def test_clip_int_data_with_float_bound(self): result = df.clip(lower=1.5) expected = DataFrame({"a": [1.5, 2.0, 3.0]}) tm.assert_frame_equal(result, expected) + + def test_clip_with_list_bound(self): + # GH#54817 + df = DataFrame([1, 5]) + expected = DataFrame([3, 5]) + result = df.clip([3]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([1, 3]) + result = df.clip(upper=[3]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py index 95fcaaa473067..e7901ed363106 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -56,7 +56,7 @@ def test_copy_consolidates(self): } ) - for i in range(0, 10): + for i in range(10): df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55) assert len(df._mgr.blocks) == 11 diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index 4028a26dfdc65..6fcf670f96ef0 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -35,7 +35,7 @@ def test_equals(self): np.random.default_rng(2).random(10), index=index, columns=["floats"] ) df1["text"] = "the sky is so blue. 
we could use more chocolate.".split() - df1["start"] = date_range("2000-1-1", periods=10, freq="T") + df1["start"] = date_range("2000-1-1", periods=10, freq="min") df1["end"] = date_range("2000-1-1", periods=10, freq="D") df1["diff"] = df1["end"] - df1["start"] # Explicitly cast to object, to avoid implicit cast when setting np.nan @@ -66,7 +66,7 @@ def test_equals(self): assert not df1.equals(different) # DatetimeIndex - index = date_range("2000-1-1", periods=10, freq="T") + index = date_range("2000-1-1", periods=10, freq="min") df1 = df1.set_index(index) df2 = df1.copy() assert df1.equals(df2) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 812150bb860e9..52b4b64ee279f 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -360,7 +360,9 @@ def test_fillna_dtype_conversion(self): expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) tm.assert_series_equal(result, expected) - result = df.fillna(1) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(1) expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) tm.assert_frame_equal(result, expected) @@ -817,7 +819,8 @@ def test_fillna_nones_inplace(): [[None, None], [None, None]], columns=["A", "B"], ) - with tm.assert_produces_warning(False): + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): df.fillna(value={"A": 1, "B": 2}, inplace=True) expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"]) diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py index 1a2fbf8a65a55..9d5e6876bb08c 100644 --- a/pandas/tests/frame/methods/test_filter.py +++ b/pandas/tests/frame/methods/test_filter.py @@ -137,3 +137,17 @@ def test_filter_regex_non_string(self): result = df.filter(regex="STRING") expected = df[["STRING"]] tm.assert_frame_equal(result, expected) + + def test_filter_keep_order(self): + # GH#54980 + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[["B", "A"]] + tm.assert_frame_equal(result, expected) + + def test_filter_different_dtype(self): + # GH#54980 + df = DataFrame({1: [1, 2, 3], 2: [4, 5, 6]}) + result = df.filter(items=["B", "A"]) + expected = df[[]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py index 2e85edc7ed0ea..23355a5549a88 100644 --- a/pandas/tests/frame/methods/test_first_and_last.py +++ b/pandas/tests/frame/methods/test_first_and_last.py @@ -29,7 +29,7 @@ def test_first_subset(self, frame_or_series): assert len(result) == 10 with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts.first("3M") + result = ts.first("3ME") expected = ts[:"3/31/2000"] tm.assert_equal(result, expected) @@ -39,7 +39,7 @@ def test_first_subset(self, frame_or_series): tm.assert_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts[:0].first("3M") + result = ts[:0].first("3ME") tm.assert_equal(result, ts[:0]) def test_first_last_raises(self, frame_or_series): @@ -87,7 +87,7 @@ def test_last_subset(self, frame_or_series): tm.assert_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts[:0].last("3M") + result = ts[:0].last("3ME") tm.assert_equal(result, 
ts[:0]) @pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)]) @@ -95,7 +95,7 @@ def test_first_with_first_day_last_of_month(self, frame_or_series, start, period # GH#29623 x = frame_or_series([1] * 100, index=bdate_range(start, periods=100)) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("1M") + result = x.first("1ME") expected = frame_or_series( [1] * periods, index=bdate_range(start, periods=periods) ) @@ -105,7 +105,7 @@ def test_first_with_first_day_end_of_frq_n_greater_one(self, frame_or_series): # GH#29623 x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100)) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("2M") + result = x.first("2ME") expected = frame_or_series( [1] * 23, index=bdate_range("2010-03-31", "2010-04-30") ) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 98f3926968ad0..2d4ac1d4a4444 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -553,13 +553,13 @@ def test_frame_join_tzaware(self): test1 = DataFrame( np.zeros((6, 3)), index=date_range( - "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" + "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central" ), ) test2 = DataFrame( np.zeros((3, 3)), index=date_range( - "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" + "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central" ), columns=range(3, 6), ) diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index d0153da038a75..ede212ae18ae9 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -160,3 +160,21 @@ def test_pct_change_with_duplicated_indices(fill_method): index=["a", "b"] * 3, ) tm.assert_frame_equal(result, expected) + + +def test_pct_change_none_beginning_no_warning(): + # GH#54481 + df = DataFrame( + [ + [1, None], + [2, 1], + [3, 2], + [4, 3], + [5, 4], + ] + ) + result = df.pct_change() + expected = DataFrame( + {0: [np.nan, 1, 0.5, 1 / 3, 0.25], 1: [np.nan, np.nan, 1, 0.5, 1 / 3]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0858e33a989b7..0105c41bd0eca 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -26,7 +26,7 @@ isna, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype class TestReindexSetIndex: @@ -35,7 +35,7 @@ class TestReindexSetIndex: def test_dti_set_index_reindex_datetimeindex(self): # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) - idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern") + idx1 = date_range("2011/01/01", periods=6, freq="ME", tz="US/Eastern") idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") df = df.set_index(idx1) @@ -1082,7 +1082,9 @@ def test_reindex_with_categoricalindex(self): { "A": np.arange(3, dtype="int64"), }, - index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("abc"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # reindexing @@ -1111,13 +1113,13 @@ def test_reindex_with_categoricalindex(self): result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( - {"A": [0, 
np.nan], "B": Series(list("ae")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( - {"A": [0], "B": Series(list("a")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1138,13 +1140,19 @@ def test_reindex_with_categoricalindex(self): # give back the type of categorical that we received result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} + { + "A": [0, np.nan], + "B": Series(list("ae")).astype(CategoricalDtype(cats, ordered=True)), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} + { + "A": [0, np.nan], + "B": Series(list("ad")).astype(CategoricalDtype(["a", "d"])), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1152,7 +1160,9 @@ def test_reindex_with_categoricalindex(self): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # passed duplicate indexers are not allowed msg = "cannot reindex on an axis with duplicate labels" diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 61e44b4e24c08..f07c53060a06b 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -289,7 +289,9 @@ def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) - result = df.replace({"Type": {"Q": 0, "T": 1}}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) def test_regex_replace_list_to_scalar(self, mix_abc): @@ -301,16 +303,20 @@ def test_regex_replace_list_to_scalar(self, mix_abc): "c": [np.nan, np.nan, np.nan, "d"], } ) - res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - return_value = res2.replace( - [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) assert return_value is None - return_value = res3.replace( - regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) @@ -520,7 +526,9 @@ def test_replace_convert(self): # gh 3907 df = 
DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) m = {"foo": 1, "bar": 2, "bah": 3} - rep = df.replace(m) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + rep = df.replace(m) expec = Series([np.int64] * 3) res = rep.dtypes tm.assert_series_equal(expec, res) @@ -838,7 +846,12 @@ def test_replace_for_new_dtypes(self, datetime_frame): ], ) def test_replace_dtypes(self, frame, to_replace, value, expected): - result = frame.replace(to_replace, value) + warn = None + if isinstance(to_replace, datetime) and to_replace.year == 2920: + warn = FutureWarning + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(warn, match=msg): + result = frame.replace(to_replace, value) tm.assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): @@ -927,7 +940,9 @@ def test_replace_dict_no_regex(self): "Strongly Disagree": 1, } expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_series_no_regex(self): @@ -950,7 +965,9 @@ def test_replace_series_no_regex(self): } ) expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_dict_tuple_list_ordering_remains_the_same(self): @@ -1076,7 +1093,9 @@ def test_replace_period(self): expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) assert expected.dtypes.iloc[0] == "Period[M]" - result = df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetime(self): @@ -1106,7 +1125,9 @@ def test_replace_datetime(self): ) assert set(df.fname.values) == set(d["fname"].keys()) expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) - result = df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetimetz(self): @@ -1307,10 +1328,12 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) - def test_replace_replacer_dtype(self, request, replacer): + def test_replace_replacer_dtype(self, replacer): # GH26632 df = DataFrame(["a"]) - result = df.replace({"a": replacer, "b": replacer}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) tm.assert_frame_equal(result, expected) @@ -1564,12 +1587,15 @@ def test_replace_regex_dtype_frame(self, regex): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) expected_df1 = DataFrame({"A": [1], "B": [1]}) - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df1 = df1.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) expected_df2 = DataFrame({"A": [1], "B": ["1"]}) - result_df2 = 
df2.replace(to_replace="0", value=1, regex=regex) + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index d99dd36f3a2e3..339e19254fd10 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -788,15 +788,15 @@ def test_errorreset_index_rename(float_frame): def test_reset_index_false_index_name(): - result_series = Series(data=range(5, 10), index=range(0, 5)) + result_series = Series(data=range(5, 10), index=range(5)) result_series.index.name = False result_series.reset_index() - expected_series = Series(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_series = Series(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_series_equal(result_series, expected_series) # GH 38147 - result_frame = DataFrame(data=range(5, 10), index=range(0, 5)) + result_frame = DataFrame(data=range(5, 10), index=range(5)) result_frame.index.name = False result_frame.reset_index() - expected_frame = DataFrame(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_frame_equal(result_frame, expected_frame) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 808f0cff2485c..60c05767a5e1a 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -32,19 +32,23 @@ def test_shift_axis1_with_valid_fill_value_one_array(self): expected2 = DataFrame([12345] * 5, dtype="Float64") tm.assert_frame_equal(res2, expected2) - def test_shift_disallow_freq_and_fill_value(self, frame_or_series): + def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): # Can't pass both! 
obj = frame_or_series( np.random.default_rng(2).standard_normal(5), index=date_range("1/1/2000", periods=5, freq="H"), ) - msg = "Cannot pass both 'freq' and 'fill_value' to (Series|DataFrame).shift" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): obj.shift(1, fill_value=1, freq="H") if frame_or_series is DataFrame: - with pytest.raises(ValueError, match=msg): + obj.columns = date_range("1/1/2000", periods=1, freq="H") + with tm.assert_produces_warning(FutureWarning, match=msg): obj.shift(1, axis=1, fill_value=1, freq="H") @pytest.mark.parametrize( @@ -75,8 +79,8 @@ def test_shift_mismatched_freq(self, frame_or_series): index=date_range("1/1/2000", periods=5, freq="H"), ) - result = ts.shift(1, freq="5T") - exp_index = ts.index.shift(1, freq="5T") + result = ts.shift(1, freq="5min") + exp_index = ts.index.shift(1, freq="5min") tm.assert_index_equal(result.index, exp_index) # GH#1063, multiple of same base @@ -716,8 +720,11 @@ def test_shift_with_iterable_freq_and_fill_value(self): df.shift(1, freq="H"), ) - msg = r"Cannot pass both 'freq' and 'fill_value' to.*" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.shift([1, 2], fill_value=1, freq="H") def test_shift_with_iterable_check_other_arguments(self): diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 228b62a418813..985a9e3602410 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -911,7 +911,7 @@ def test_sort_index_multiindex_sparse_column(self): expected = DataFrame( { i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) - for i in range(0, 4) + for i in range(4) }, index=MultiIndex.from_product([[1, 2], [1, 2]]), ) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 294da02e259b7..9f45347c31165 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1326,6 +1326,6 @@ def test_to_csv_categorical_and_interval(self): ) df["a"] = df["a"].astype("category") result = df.to_csv() - expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] + expected_rows = [",a", '0,"[2020-01-01 00:00:00, 2020-01-02 00:00:00]"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 1118ad88d5092..61f0ad30b4519 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -99,19 +99,19 @@ def test_to_dict(self, mapping): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("list", mapping) + recons_data = DataFrame(test_data).to_dict("list", into=mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][int(k2) - 1] - recons_data = DataFrame(test_data).to_dict("series", mapping) + recons_data = DataFrame(test_data).to_dict("series", into=mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("split", mapping) + recons_data = 
DataFrame(test_data).to_dict("split", into=mapping) expected_split = { "columns": ["A", "B"], "index": ["1", "2", "3"], @@ -119,7 +119,7 @@ def test_to_dict(self, mapping): } tm.assert_dict_equal(recons_data, expected_split) - recons_data = DataFrame(test_data).to_dict("records", mapping) + recons_data = DataFrame(test_data).to_dict("records", into=mapping) expected_records = [ {"A": 1.0, "B": "1"}, {"A": 2.0, "B": "2"}, @@ -166,6 +166,21 @@ def test_to_dict_not_unique_warning(self): with tm.assert_produces_warning(UserWarning): df.to_dict() + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.parametrize( + "orient,expected", + [ + ("list", {"A": [2, 5], "B": [3, 6]}), + ("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}), + ], + ) + def test_to_dict_not_unique(self, orient, expected): + # GH#54824: This is to make sure that dataframes with non-unique column + # would have uniform behavior throughout different orients + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"]) + result = df.to_dict(orient) + assert result == expected + # orient - orient argument to to_dict function # item_getter - function for extracting value from # the resulting dict using column name and index @@ -494,3 +509,13 @@ def test_to_dict_masked_native_python(self): df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1}) result = df.to_dict(orient="records") assert isinstance(result[0]["a"], int) + + def test_to_dict_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_dict except for the " + r"argument 'orient' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_dict("records", {}) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 8853d718270f4..fa8c4e4811ea6 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -391,7 +391,7 @@ def test_to_records_dtype(self, kwargs, expected): # see GH#18146 df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) - if not isinstance(expected, np.recarray): + if not isinstance(expected, np.rec.recarray): with pytest.raises(expected[0], match=expected[1]): df.to_records(**kwargs) else: @@ -512,7 +512,7 @@ def keys(self): @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"]) def test_to_records_datetimeindex_with_tz(self, tz): # GH#13937 - dr = date_range("2016-01-01", periods=10, freq="S", tz=tz) + dr = date_range("2016-01-01", periods=10, freq="s", tz=tz) df = DataFrame({"datetime": dr}, index=dr) diff --git a/pandas/tests/frame/methods/test_to_timestamp.py b/pandas/tests/frame/methods/test_to_timestamp.py index 2f73e3d58b516..e72b576fca833 100644 --- a/pandas/tests/frame/methods/test_to_timestamp.py +++ b/pandas/tests/frame/methods/test_to_timestamp.py @@ -99,7 +99,7 @@ def test_to_timestamp_columns(self): tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp("T", "end", axis=1) + result = df.to_timestamp("min", "end", axis=1) exp_index = _get_with_delta(delta) exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) @@ -110,8 +110,8 @@ def test_to_timestamp_columns(self): exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) - result1 = df.to_timestamp("5t", axis=1) - result2 = df.to_timestamp("t", axis=1) + 
result1 = df.to_timestamp("5min", axis=1) + result2 = df.to_timestamp("min", axis=1) expected = date_range("2001-01-01", "2009-01-01", freq="AS") assert isinstance(result1.columns, DatetimeIndex) assert isinstance(result2.columns, DatetimeIndex) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 355f05cd5156c..f30db91f82b60 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -175,3 +175,31 @@ def test_data_frame_value_counts_subset(nulls_fixture, columns): ) tm.assert_series_equal(result, expected) + + +def test_value_counts_categorical_future_warning(): + # GH#54775 + df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category") + result = df.value_counts() + expected = pd.Series( + 1, + index=pd.MultiIndex.from_arrays( + [pd.Index([1, 2, 3], name="a", dtype="category")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_missing_category(): + # GH-54836 + df = pd.DataFrame({"a": pd.Categorical([1, 2, 4], categories=[1, 2, 3, 4])}) + result = df.value_counts() + expected = pd.Series( + [1, 1, 1, 0], + index=pd.MultiIndex.from_arrays( + [pd.CategoricalIndex([1, 2, 4, 3], categories=[1, 2, 3, 4], name="a")] + ), + name="count", + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 878e94c15e16b..1488fa65fabc0 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1254,7 +1254,9 @@ def test_operators_none_as_na(self, op): # since filling converts dtypes from object, changed expected to be # object - filled = df.fillna(np.nan) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + filled = df.fillna(np.nan) result = op(df, 3) expected = op(filled, 3).astype(object) expected[pd.isna(expected)] = np.nan @@ -1265,10 +1267,14 @@ def test_operators_none_as_na(self, op): expected[pd.isna(expected)] = np.nan tm.assert_frame_equal(result, expected) - result = op(df, df.fillna(7)) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = op(df, df.fillna(7)) tm.assert_frame_equal(result, expected) - result = op(df.fillna(7), df) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = op(df.fillna(7), df) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c170704150383..fd851ab244cb8 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -692,12 +692,12 @@ def test_constructor_error_msgs(self): arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) arr = np.array([4, 5, 6]) msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) # higher dim raise exception with pytest.raises(ValueError, match="Must pass 2-d input"): @@ -2391,7 +2391,7 @@ def 
test_construct_with_two_categoricalindex_series(self): def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) @@ -2685,8 +2685,8 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2718,6 +2718,31 @@ def test_frame_string_inference(self): df = DataFrame({"a": ["a", "b"]}, dtype="object") tm.assert_frame_equal(df, expected) + def test_frame_string_inference_array_string_dtype(self): + # GH#54496 + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + with pd.option_context("future.infer_string", True): + df = DataFrame({"a": np.array(["a", "b"])}) + tm.assert_frame_equal(df, expected) + + expected = DataFrame({0: ["a", "b"], 1: ["c", "d"]}, dtype=dtype) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]])) + tm.assert_frame_equal(df, expected) + + expected = DataFrame( + {"a": ["a", "b"], "b": ["c", "d"]}, + dtype=dtype, + columns=Index(["a", "b"], dtype=dtype), + ) + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) + tm.assert_frame_equal(df, expected) + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): @@ -2936,7 +2961,9 @@ def test_frame_datetime64_mixed_index_ctor_1681(self): def test_frame_timeseries_column(self): # GH19157 - dr = date_range(start="20130101T10:00:00", periods=3, freq="T", tz="US/Eastern") + dr = date_range( + start="20130101T10:00:00", periods=3, freq="min", tz="US/Eastern" + ) result = DataFrame(dr, columns=["timestamps"]) expected = DataFrame( { diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py index 8bc26bff41767..216fd5c2f70c5 100644 --- a/pandas/tests/frame/test_iteration.py +++ b/pandas/tests/frame/test_iteration.py @@ -55,7 +55,7 @@ def test_iterrows_iso8601(self): s = DataFrame( { "non_iso8601": ["M1701", "M1802", "M1903", "M2004"], - "iso8601": date_range("2000-01-01", periods=4, freq="M"), + "iso8601": date_range("2000-01-01", periods=4, freq="ME"), } ) for k, v in s.iterrows(): diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index 2cc3b67e7ac02..a15d7d7f93f01 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -151,6 +151,7 @@ def _check_unary_op(op): _check_unary_op(operator.inv) # TODO: belongs elsewhere + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_logical_with_nas(self): d = DataFrame({"a": [np.nan, False], "b": [True, True]}) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e7b6a0c0b39b0..e66557f132c1d 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -752,7 +752,7 @@ def test_sum_corner(self): tm.makeDateIndex(0), tm.makeNumericIndex(0, dtype=int), tm.makeNumericIndex(0, dtype=float), - 
tm.makeDateIndex(0, freq="M"), + tm.makeDateIndex(0, freq="ME"), tm.makePeriodIndex(0), ], ) @@ -1155,6 +1155,7 @@ def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame): def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na): getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all_bool_frame(self, opname, bool_frame_with_na): # GH#12863: numpy gives back non-boolean data for object type diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 49375658abfee..64d516e484991 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -455,3 +455,14 @@ def test_masked_ea_with_formatter(self): 0 0.12 1.00 1 1.12 2.00""" assert result == expected + + def test_repr_ea_columns(self, any_string_dtype): + # GH#54797 + pytest.importorskip("pyarrow") + df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]}) + df.columns = df.columns.astype(any_string_dtype) + expected = """ long_column_name col2 +0 1 4 +1 2 5 +2 3 6""" + assert repr(df) == expected diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index c90b871d5d66f..9b76ae093e8c4 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1184,6 +1184,7 @@ def test_stack_preserve_categorical_dtype_values(self, future_stack): ) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "index, columns", [ @@ -1194,6 +1195,7 @@ def test_stack_preserve_categorical_dtype_values(self, future_stack): ) def test_stack_multi_columns_non_unique_index(self, index, columns, future_stack): # GH-28301 + df = DataFrame(index=index, columns=columns).fillna(1) stacked = df.stack(future_stack=future_stack) new_index = MultiIndex.from_tuples(stacked.index.to_numpy()) @@ -1767,7 +1769,9 @@ def test_unstack_bug(self, future_stack): } ) - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) @@ -2508,3 +2512,19 @@ def test_unstack_mixed_level_names(self): index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]), ) tm.assert_frame_equal(result, expected) + + +def test_stack_tuple_columns(future_stack): + # GH#54948 - test stack when the input has a non-MultiIndex with tuples + df = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=[("a", 1), ("a", 2), ("b", 1)] + ) + result = df.stack(future_stack=future_stack) + expected = Series( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=MultiIndex( + levels=[[0, 1, 2], [("a", 1), ("a", 2), ("b", 1)]], + codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 620d5055f5d3b..fc7aa9e7b2c46 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -87,7 +87,7 @@ def test_metadata_propagation_indiv_resample(self): np.random.default_rng(2).standard_normal((1000, 2)), 
index=date_range("20130101", periods=1000, freq="s"), ) - result = df.resample("1T") + result = df.resample("1min") tm.assert_metadata_equivalent(df, result) def test_metadata_propagation_indiv(self, monkeypatch): diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 4ea205ac13c47..3648961eb3808 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -111,13 +111,13 @@ def test_metadata_propagation_indiv_resample(self): index=date_range("20130101", periods=1000, freq="s"), name="foo", ) - result = ts.resample("1T").mean() + result = ts.resample("1min").mean() tm.assert_metadata_equivalent(ts, result) - result = ts.resample("1T").min() + result = ts.resample("1min").min() tm.assert_metadata_equivalent(ts, result) - result = ts.resample("1T").apply(lambda x: x.sum()) + result = ts.resample("1min").apply(lambda x: x.sum()) tm.assert_metadata_equivalent(ts, result) def test_metadata_propagation_indiv(self, monkeypatch): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index cdfa80c8c7cb5..882f42ff18bdd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -360,16 +360,16 @@ def test_agg_multiple_functions_same_name(): # GH 30880 df = DataFrame( np.random.default_rng(2).standard_normal((1000, 3)), - index=pd.date_range("1/1/2012", freq="S", periods=1000), + index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) - result = df.resample("3T").agg( + result = df.resample("3min").agg( {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) - expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) + expected_index = pd.date_range("1/1/2012", freq="3min", periods=6) expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) expected_values = np.array( - [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T expected = DataFrame( expected_values, columns=expected_columns, index=expected_index @@ -382,13 +382,13 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): # ohlc expands dimensions, so different test to the above is required. 
df = DataFrame( np.random.default_rng(2).standard_normal((1000, 3)), - index=pd.date_range("1/1/2012", freq="S", periods=1000, name="dti"), + index=pd.date_range("1/1/2012", freq="s", periods=1000, name="dti"), columns=Index(["A", "B", "C"], name="alpha"), ) - result = df.resample("3T").agg( + result = df.resample("3min").agg( {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) - expected_index = pd.date_range("1/1/2012", freq="3T", periods=6, name="dti") + expected_index = pd.date_range("1/1/2012", freq="3min", periods=6, name="dti") expected_columns = MultiIndex.from_tuples( [ ("A", "ohlc", "open"), @@ -401,9 +401,11 @@ def test_agg_multiple_functions_same_name_with_ohlc_present(): names=["alpha", None, None], ) non_ohlc_expected_values = np.array( - [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T - expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values]) + expected_values = np.hstack( + [df.resample("3min").A.ohlc(), non_ohlc_expected_values] + ) expected = DataFrame( expected_values, columns=expected_columns, index=expected_index ) @@ -513,6 +515,18 @@ def test_groupby_agg_dict_with_getitem(): tm.assert_frame_equal(result, expected) +def test_groupby_agg_dict_dup_columns(): + # GH#55006 + df = DataFrame( + [[1, 2, 3, 4], [1, 3, 4, 5], [2, 4, 5, 6]], + columns=["a", "b", "c", "c"], + ) + gb = df.groupby("a") + result = gb.agg({"b": "sum"}) + expected = DataFrame({"b": [5, 4]}, index=Index([1, 2], name="a")) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "op", [ diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index f917f567e1ce3..865fda0ab54a2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -118,7 +118,7 @@ def test_cython_agg_nothing_to_agg_with_dates(): { "a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25, - "dates": pd.date_range("now", periods=50, freq="T"), + "dates": pd.date_range("now", periods=50, freq="min"), } ) msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 9d3ebbd3672ae..7ea107f254104 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -499,13 +499,17 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d04ee7cec0db1..abcb9f68e0f5c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -28,7 +28,9 @@ def 
test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - df.groupby("index").apply(store) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("index").apply(store) expected_value = DataFrame( {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) ) @@ -71,9 +73,11 @@ def test_apply_issues(): ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("date", group_keys=False).apply( + lambda x: x["time"][x["value"].idxmax()] + ) tm.assert_series_equal(result, expected) @@ -179,7 +183,9 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - df.groupby("a", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -197,9 +203,11 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -219,8 +227,11 @@ def slow(group): def fast(group): return group.copy() - fast_df = df.groupby("A", group_keys=False).apply(fast) - slow_df = df.groupby("A", group_keys=False).apply(slow) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + fast_df = df.groupby("A", group_keys=False).apply(fast) + with tm.assert_produces_warning(FutureWarning, match=msg): + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -242,7 +253,9 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - result = df.groupby("g", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -285,8 +298,11 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_as, exp) tm.assert_index_equal(res_not_as, exp) - res_as_apply = g_as.apply(lambda x: x.head(2)).index - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res_as_apply = g_as.apply(lambda x: x.head(2)).index + with tm.assert_produces_warning(FutureWarning, match=msg): + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here @@ -299,7 +315,9 @@ def 
test_groupby_as_index_apply(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -328,13 +346,19 @@ def desc3(group): # weirdo return result - result = grouped.apply(desc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - result2 = grouped.apply(desc2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - result3 = grouped.apply(desc3) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -364,7 +388,9 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - result = df.groupby(["A", "B"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -375,7 +401,9 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -384,7 +412,9 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -407,7 +437,9 @@ def trans2(group): } ) - result = df.groupby("A").apply(trans) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -436,7 +468,9 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 
1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( @@ -457,7 +491,9 @@ def test_apply_no_name_column_conflict(): # it works! #2605 grouped = df.groupby(["name", "name2"]) - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -474,7 +510,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -498,7 +536,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -536,8 +576,11 @@ def filt2(x): else: return x[x.category == "c"] - expected = data.groupby("id_field").apply(filt1) - result = data.groupby("id_field").apply(filt2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = data.groupby("id_field").apply(filt1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -556,7 +599,9 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -601,7 +646,9 @@ def f(g): g["value3"] = g["value1"] * 2 return g - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert "value3" in result @@ -615,9 +662,13 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 @@ -628,7 +679,9 @@ def test_apply_numeric_coercion_when_datetime(): def get_B(g): return g.iloc[0][["B"]] - result = df.groupby("A").apply(get_B)["B"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -653,8 +706,11 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby("Key").apply(predictions).p1 - result = df2.groupby("Key").apply(predictions).p1 + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df1.groupby("Key").apply(predictions).p1 + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -669,11 +725,13 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + ) ) - ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -716,11 +774,15 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] ) @@ -764,7 +826,9 @@ def test_groupby_apply_all_none(): def test_func(x): pass - result = test_df.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -779,8 +843,11 @@ def test_func(x): return None return x.iloc[[0, -1]] - result1 = test_df1.groupby("groups").apply(test_func) - result2 = test_df2.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = test_df1.groupby("groups").apply(test_func) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) @@ -793,7 +860,9 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - result = groups.apply(lambda group: group[group.value != 1]["value"]) + msg = 
"DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -820,7 +889,9 @@ def test_apply_with_mixed_types(): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - result = df.groupby("a").apply(lambda g: g.index) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -837,7 +908,9 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame( ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] @@ -876,7 +949,9 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - result = tdf.groupby("day").apply(most_common_values)["userId"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -917,7 +992,9 @@ def test_groupby_apply_datetime_result_dtypes(): ], columns=["observation", "color", "mood", "intensity", "score"], ) - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes expected = Series( [np.dtype("datetime64[ns]"), object, object, np.int64, object], index=["observation", "color", "mood", "intensity", "score"], @@ -937,7 +1014,9 @@ def test_groupby_apply_datetime_result_dtypes(): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - result = df.groupby("group", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -960,7 +1039,9 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - result = df.groupby("groups").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -972,7 +1053,9 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - result = df.groupby("A").apply(fct) + msg = 
"DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -983,7 +1066,9 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - result = df.groupby("id").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1019,7 +1104,9 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1034,8 +1121,11 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - result = df.groupby("A", group_keys=False).apply(lambda x: x) - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A", group_keys=False).apply(lambda x: x) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1047,8 +1137,15 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) - result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df1.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) tm.assert_frame_equal(result1, result2) @@ -1103,7 +1200,9 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - result = grp.apply(lambda x: x.head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() @@ -1151,7 +1250,9 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = 
df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1176,7 +1277,9 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1186,7 +1289,9 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1206,9 +1311,11 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) tm.assert_frame_equal(result, expected) @@ -1251,24 +1358,29 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } + ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) - ) expected = DataFrame( [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], columns=["a", "b", "c"], @@ -1293,9 +1405,11 @@ def test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1343,7 +1457,9 @@ def 
test_apply_index_key_error_bug(index_values): def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1390,33 +1506,16 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "group_col", - [([0.0, np.nan, 0.0, 0.0]), ([np.nan, 0.0, 0.0, 0.0]), ([0, 0.0, 0.0, np.nan])], -) -def test_apply_inconsistent_output(group_col): - # GH 34478 - df = DataFrame({"group_col": group_col, "value_col": [2, 2, 2, 2]}) - - result = df.groupby("group_col").value_col.apply( - lambda x: x.value_counts().reindex(index=[1, 2, 3]) - ) - expected = Series( - [np.nan, 3.0, np.nan], - name="value_col", - index=MultiIndex.from_product([[0.0], [1, 2, 3]], names=["group_col", 0.0]), - ) - - tm.assert_series_equal(result, expected) - - -def test_apply_array_output_multi_getitem(): - # GH 18930 - df = DataFrame( - {"A": {"a": 1, "b": 2}, "B": {"a": 1, "b": 2}, "C": {"a": 1, "b": 2}} - ) - result = df.groupby("A")[["B", "C"]].apply(lambda x: np.array([0])) - expected = Series( - [np.array([0])] * 2, index=Index([1, 2], name="A"), name=("B", "C") - ) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("include_groups", [True, False]) +def test_include_groups(include_groups): + # GH#7155 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + gb = df.groupby("a") + warn = FutureWarning if include_groups else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + result = gb.apply(lambda x: x.sum(), include_groups=include_groups) + expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) + if not include_groups: + expected = expected[["b"]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 9bc07b584e9d1..09d5e06bf6ddd 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -13,10 +13,16 @@ def test_group_by_copy(): } ).set_index("name") - grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_same_value = df.groupby(["age"], group_keys=False).apply( + lambda group: group + ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) tm.assert_frame_equal(grp_by_same_value, grp_by_copy) @@ -47,8 +53,11 @@ def f_no_copy(x): x["rank"] = x.val.rank(method="min") return x.groupby("cat2")["rank"].min() - grpby_copy = df.groupby("cat1").apply(f_copy) - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_copy = df.groupby("cat1").apply(f_copy) + with tm.assert_produces_warning(FutureWarning, match=msg): + grpby_no_copy = 
df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -58,8 +67,11 @@ def test_no_mutate_but_looks_like(): # second does not, but should yield the same results df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) @@ -73,7 +85,9 @@ def fn(x): x.loc[x.index[-1], "col2"] = 0 return x.col2 - result = df.groupby(["col1"], as_index=False).apply(fn) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 68ce58ad23690..b11240c841420 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -124,7 +124,9 @@ def test_basic(): # TODO: split this test def f(x): return x.drop_duplicates("person_name").iloc[0] - result = g.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") @@ -268,7 +270,10 @@ def test_level_get_group(observed): names=["Index1", "Index2"], ), ) - result = g.get_group("a") + msg = "you will need to pass a length-1 tuple" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#25971 - warn when not passing a length-1 tuple + result = g.get_group("a") tm.assert_frame_equal(result, expected) @@ -326,7 +331,9 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - result = grouped.apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -1147,7 +1154,7 @@ def test_groupby_multiindex_categorical_datetime(): { "key1": Categorical(list("abcbabcba")), "key2": Categorical( - list(pd.date_range("2018-06-01 00", freq="1T", periods=3)) * 3 + list(pd.date_range("2018-06-01 00", freq="1min", periods=3)) * 3 ), "values": np.arange(9), } @@ -1157,7 +1164,7 @@ def test_groupby_multiindex_categorical_datetime(): idx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)), + Categorical(pd.date_range("2018-06-01 00", freq="1min", periods=3)), ], names=["key1", "key2"], ) @@ -2010,7 +2017,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + 
warn = FutureWarning if method == "apply" and index_kind == "range" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 6c27344ce3110..16d7fe61b90ad 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -265,7 +265,7 @@ def test_groupby_timedelta_cython_count(): def test_count(): n = 1 << 15 - dr = date_range("2015-08-30", periods=n // 10, freq="T") + dr = date_range("2015-08-30", periods=n // 10, freq="min") df = DataFrame( { @@ -289,7 +289,9 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) @@ -379,3 +381,14 @@ def __eq__(self, other): result = df.groupby("grp").count() expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp")) tm.assert_frame_equal(result, expected) + + +def test_count_arrow_string_array(any_string_dtype): + # GH#54751 + pytest.importorskip("pyarrow") + df = DataFrame( + {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)} + ) + result = df.groupby("a").count() + expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0abf6428730ff..41bbfcf6840a9 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -95,10 +95,12 @@ def test_builtins_apply(keys, f): assert result.shape == (ngroups, 3), assert_msg npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - expected = gb.apply(npfunc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = gb.apply(npfunc) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(FutureWarning, match=msg): expected2 = gb.apply(lambda x: npfunc(x)) tm.assert_frame_equal(result, expected2) @@ -1532,6 +1534,7 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): method(*args, **kwargs) +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("dtype", [bool, int, float, object]) def test_deprecate_numeric_only_series(dtype, groupby_func, request): # GH#46560 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9b260d5757767..9132be50d5857 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -150,7 +150,9 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): def max_value(group): return group.loc[group["value"].idxmax()] - applied = df.groupby("A").apply(max_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes 
tm.assert_series_equal(result, expected) @@ -171,7 +173,9 @@ def f_0(grp): return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - result = df.groupby("A").apply(f_0)[["B"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -179,9 +183,10 @@ def f_1(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_1)[["B"]] - # Cast to avoid upcast when setting nan below - e = expected.copy().astype("float64") + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_1)[["B"]] + e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -190,9 +195,10 @@ def f_2(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_2)[["B"]] - # Explicit cast to float to avoid implicit cast when setting nan - e = expected.copy().astype({"B": "float"}) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_2)[["B"]] + e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -202,7 +208,9 @@ def f_3(grp): return None return grp.iloc[0] - result = df.groupby("A").apply(f_3)[["C"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -213,7 +221,9 @@ def f_4(grp): return None return grp.iloc[0].loc["C"] - result = df.groupby("A").apply(f_4) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None @@ -392,8 +402,11 @@ def f3(x): depr_msg = "The behavior of array concatenation with empty entries is deprecated" # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -1322,11 +1335,15 @@ def summarize_random_name(df): # inconsistent. 
return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby("A").apply(summarize) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - metrics = df.groupby("A").apply(summarize, "metrics") + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - metrics = df.groupby("A").apply(summarize_random_name) + with tm.assert_produces_warning(FutureWarning, match=msg): + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1619,7 +1636,9 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - result = df.groupby("key", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -1693,7 +1712,9 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1714,7 +1735,9 @@ def f(group): names.append(group.name) return group.copy() - df.groupby("a", sort=False, group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1920,15 +1943,17 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - g.apply(test_sort) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + g.apply(test_sort) def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = DataFrame( { - "eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(), - "thename": range(0, 20), + "eventDate": date_range(datetime.today(), periods=20, freq="ME").tolist(), + "thename": range(20), } ) @@ -2102,7 +2127,9 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - res = gb.apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x) assert (res.dtypes == df.dtypes).all() @@ -3141,13 +3168,13 @@ def test_groupby_with_Time_Grouper(): expected_output = DataFrame( { - "time2": date_range("2016-08-31 22:08:00", periods=13, freq="1T"), + "time2": date_range("2016-08-31 22:08:00", periods=13, freq="1min"), "quant": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], "quant2": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], } ) - df = test_data.groupby(Grouper(key="time2", freq="1T")).count().reset_index() + df = test_data.groupby(Grouper(key="time2", freq="1min")).count().reset_index() tm.assert_frame_equal(df, expected_output) @@ -3159,3 +3186,62 
@@ def test_groupby_series_with_datetimeindex_month_name(): expected = Series([2, 1], name="jan") expected.index.name = "jan" tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize( + "kwarg, value, name, warn", + [ + ("by", "a", 1, None), + ("by", ["a"], 1, FutureWarning), + ("by", ["a"], (1,), None), + ("level", 0, 1, None), + ("level", [0], 1, FutureWarning), + ("level", [0], (1,), None), + ], +) +def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn): + # GH#25971 + obj = DataFrame({"b": [3, 4, 5]}, index=Index([1, 1, 2], name="a")) + if test_series: + obj = obj["b"] + gb = obj.groupby(**{kwarg: value}) + msg = "you will need to pass a length-1 tuple" + with tm.assert_produces_warning(warn, match=msg): + result = gb.get_group(name) + if test_series: + expected = Series([3, 4], index=Index([1, 1], name="a"), name="b") + else: + expected = DataFrame({"b": [3, 4]}, index=Index([1, 1], name="a")) + tm.assert_equal(result, expected) + + +def test_groupby_ngroup_with_nan(): + # GH#50100 + df = DataFrame({"a": Categorical([np.nan]), "b": [1]}) + result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup() + expected = Series([0]) + tm.assert_series_equal(result, expected) + + +def test_get_group_axis_1(): + # GH#54858 + df = DataFrame( + { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + "col3": [3, 8, 2, 10], + "col4": [1, 13, 6, 15], + "col5": [-4, 5, 6, -7], + } + ) + with tm.assert_produces_warning(FutureWarning, match="deprecated"): + grouped = df.groupby(axis=1, by=[1, 2, 3, 2, 1]) + result = grouped.get_group(1) + expected = DataFrame( + { + "col1": [0, 3, 2, 3], + "col5": [-4, 5, 6, -7], + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 099e7bc3890d0..d82278c277d48 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -324,7 +324,9 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index 495f3fcd359c7..bb4b9aa866ac9 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -166,12 +166,14 @@ def test_shift_periods_freq(): tm.assert_frame_equal(result, expected) -def test_shift_disallow_freq_and_fill_value(): +def test_shift_deprecate_freq_and_fill_value(): # GH 53832 data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} df = DataFrame(data, index=date_range(start="20100101", periods=6)) - msg = "Cannot pass both 'freq' and 'fill_value' to (Series|DataFrame).shift" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") @@ -238,12 +240,15 @@ def 
test_group_shift_with_multiple_periods_and_fill_value(): tm.assert_frame_equal(shifted_df, expected_df) -def test_group_shift_with_multiple_periods_and_both_fill_and_freq_fails(): +def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): # GH#44424 df = DataFrame( {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, index=date_range("1/1/2000", periods=5, freq="H"), ) - msg = r"Cannot pass both 'freq' and 'fill_value' to.*" - with pytest.raises(ValueError, match=msg): + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="H") diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..601e67bbca5e3 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -63,7 +63,9 @@ def func(group): assert hasattr(group, "testattr") return group.testattr - result = custom_df.groupby("c").apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) @@ -101,5 +103,7 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - result = df.groupby("Buyer").resample("5D").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e0793ada679c2..5adf9ace255ea 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -131,6 +131,20 @@ def test_getitem_single_column(self): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "func", [lambda x: x.sum(), lambda x: x.agg(lambda y: y.sum())] + ) + def test_getitem_from_grouper(self, func): + # GH 50383 + df = DataFrame({"a": [1, 1, 2], "b": 3, "c": 4, "d": 5}) + gb = df.groupby(["a", "b"])[["a", "c"]] + + idx = MultiIndex.from_tuples([(1, 3), (2, 3)], names=["a", "b"]) + expected = DataFrame({"a": [2, 2], "c": [8, 4]}, index=idx) + result = func(gb) + + tm.assert_frame_equal(result, expected) + def test_indices_grouped_by_tuple_with_lambda(self): # GH 36158 df = DataFrame( @@ -224,7 +238,9 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] tm.assert_frame_equal(result, expected) @@ -267,10 +283,10 @@ def test_grouper_creation_bug(self): names=["one", "two", "three"], ), ) - result = s.groupby(Grouper(level="three", freq="M")).sum() + result = s.groupby(Grouper(level="three", freq="ME")).sum() expected = Series( [28], - index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="M", name="three"), + index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="ME", name="three"), ) tm.assert_series_equal(result, expected) @@ 
-377,12 +393,12 @@ def test_grouper_getting_correct_binner(self): ), ) result = df.groupby( - [Grouper(level="one"), Grouper(level="two", freq="M")] + [Grouper(level="one"), Grouper(level="two", freq="ME")] ).sum() expected = DataFrame( {"A": [31, 28, 21, 31, 28, 21]}, index=MultiIndex.from_product( - [list("ab"), date_range("20130101", freq="M", periods=3)], + [list("ab"), date_range("20130101", freq="ME", periods=3)], names=["one", "two"], ), ) @@ -1067,7 +1083,7 @@ def test_groupby_with_small_elem(self): {"event": ["start", "start"], "change": [1234, 5678]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (Timestamp("2014-09-30"), "start") in grouped.groups @@ -1082,7 +1098,7 @@ def test_groupby_with_small_elem(self): {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (Timestamp("2014-09-30"), "start") in grouped.groups @@ -1098,7 +1114,7 @@ def test_groupby_with_small_elem(self): {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 3 assert grouped.ngroups == 3 assert (Timestamp("2014-09-30"), "start") in grouped.groups diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 165d72bf3e878..efe7b171d630d 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -420,7 +420,7 @@ def test_timestamp_groupby_quantile(): df = DataFrame( { "timestamp": pd.date_range( - start="2020-04-19 00:00:00", freq="1T", periods=100, tz="UTC" + start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC" ).floor("1H"), "category": list(range(1, 101)), "value": list(range(101, 201)), diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 5d85a0783e024..a3b7da3fa836c 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -710,3 +710,12 @@ def test_rank_categorical(): expected = df.astype(object).groupby("col1").rank() tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize("na_option", ["top", "bottom"]) +def test_groupby_op_with_nullables(na_option): + # GH 54206 + df = DataFrame({"x": [None]}, dtype="Float64") + result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option) + expected = Series([1.0], dtype="Float64", name=result.name) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 527e7c6081970..a9e67df1fb793 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -147,7 +147,7 @@ def test_groupby_with_timegrouper_methods(self, should_sort): df = df.sort_values(by="Quantity", ascending=False) df = df.set_index("Date", drop=False) - g = df.groupby(Grouper(freq="6M")) + g = df.groupby(Grouper(freq="6ME")) assert g.group_keys assert isinstance(g.grouper, BinGrouper) @@ -248,7 +248,7 @@ def 
test_timegrouper_with_reg_groups(self): result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -264,32 +264,32 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): - df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum() + df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum() # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): - df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum() + df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum() # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( numeric_only=True ) expected = DataFrame( @@ -309,7 +309,7 @@ def test_timegrouper_with_reg_groups(self): msg = "The Grouper cannot specify both a key and a level!" 
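Illustrative aside (editor's sketch, not part of the patch): the hunks in this file swap the month-end offset alias "M" for "ME" in Grouper and date_range calls. A minimal example of the renamed spelling, assuming a pandas build that already ships the rename; the DataFrame below is hypothetical:

import pandas as pd

df = pd.DataFrame(
    {
        "Date": pd.date_range("2013-10-01", periods=4, freq="D"),
        "Quantity": [1, 3, 5, 7],
    }
)
# "1ME" is the month-end offset alias; Period/PeriodIndex keep the old "M" spelling.
monthly = df.groupby(pd.Grouper(key="Date", freq="1ME"))["Quantity"].sum()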
with pytest.raises(ValueError, match=msg): df.groupby( - [Grouper(freq="1M", key="Date", level="Date"), "Buyer"] + [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"] ).sum() # single groupers @@ -320,21 +320,23 @@ def test_timegrouper_with_reg_groups(self): [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) - result = df.groupby(Grouper(freq="1M")).sum(numeric_only=True) + result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M")]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() - result = df.groupby(Grouper(freq="1M", key="Date")).sum(numeric_only=True) + result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", key="Date")]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME", key="Date")]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) + @pytest.mark.parametrize("freq", ["D", "ME", "A", "Q-APR"]) def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame( @@ -421,7 +423,7 @@ def test_timegrouper_get_group(self): dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"] for df in [df_original, df_reordered]: - grouped = df.groupby(Grouper(freq="M", key="Date")) + grouped = df.groupby(Grouper(freq="ME", key="Date")) for t, expected in zip(dt_list, expected_list): dt = Timestamp(t) result = grouped.get_group(dt) @@ -436,7 +438,7 @@ def test_timegrouper_get_group(self): g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")] for df in [df_original, df_reordered]: - grouped = df.groupby(["Buyer", Grouper(freq="M", key="Date")]) + grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")]) for (b, t), expected in zip(g_list, expected_list): dt = Timestamp(t) result = grouped.get_group((b, dt)) @@ -453,7 +455,7 @@ def test_timegrouper_get_group(self): ] for df in [df_original, df_reordered]: - grouped = df.groupby(Grouper(freq="M")) + grouped = df.groupby(Grouper(freq="ME")) for t, expected in zip(dt_list, expected_list): dt = Timestamp(t) result = grouped.get_group(dt) @@ -470,8 +472,12 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -487,8 +493,11 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + msg = "DataFrameGroupBy.apply operated on the grouping 
columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -753,7 +762,7 @@ def test_timezone_info(self): def test_datetime_count(self): df = DataFrame( - {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="T")} + {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")} ) result = df.groupby("a").dates.count() expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates") @@ -842,7 +851,7 @@ def test_grouper_period_index(self): result = period_series.groupby(period_series.index.month).sum() expected = Series( - range(0, periods), index=Index(range(1, periods + 1), name=index.name) + range(periods), index=Index(range(1, periods + 1), name=index.name) ) tm.assert_series_equal(result, expected) @@ -895,7 +904,9 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj._get_axis(gb.axis).nlevels == 1 # function that returns a Series - res = gb.apply(lambda x: x["Quantity"] * 2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.apply(lambda x: x["Quantity"] * 2) expected = DataFrame( [[36, 6, 6, 10, 2]], diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 7c50124e57e29..944dda8977882 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -327,9 +327,12 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + warn = FutureWarning if groupby == "column" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 062dfe3931423..cf3f41e04902c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -70,7 +70,7 @@ def demean(arr): # GH 8430 df = tm.makeTimeDataFrame() - g = df.groupby(pd.Grouper(freq="M")) + g = df.groupby(pd.Grouper(freq="ME")) g.transform(lambda x: x - 1) # GH 9700 @@ -636,7 +636,9 @@ def f(group): return group[:1] grouped = df.groupby("c") - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -790,7 +792,13 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - expected = gb.apply(targop) + if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): + warn = None + else: + warn = FutureWarning + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.apply(targop) expected = expected.sort_index(axis=1) 
if op == "shift": diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 638124ac20e06..60abbfc441e8e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -46,8 +46,8 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Index(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 64cbe657a8aff..87facbf529411 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -228,6 +228,13 @@ def test_isin(self): expected = np.array([False] * 5 + [True]) tm.assert_numpy_array_equal(result, expected) + def test_isin_overlapping_intervals(self): + # GH 34974 + idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)]) + result = CategoricalIndex(idx).isin(idx) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + def test_identical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 458a37c994091..fe397e2c7c88e 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -25,7 +25,7 @@ def sort(request): return request.param -@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]) +@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "min", "2min", "s", "-3s"]) def freq_sample(request): """ Valid values for 'freq' parameter used to create date_range and diff --git a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py index e5da06cb005f6..c38e24232f181 100644 --- a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py +++ b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py @@ -68,7 +68,7 @@ def test_drop_duplicates(self, keep, expected, index, idx): class TestDropDuplicatesPeriodIndex(DropDuplicates): - @pytest.fixture(params=["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) + @pytest.fixture(params=["D", "3D", "H", "2H", "min", "2min", "s", "3s"]) def freq(self, request): return request.param diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index d339639dc5def..08a2473f22556 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -168,7 +168,7 @@ def test_astype_object(self): @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_astype_object_tz(self, tz): - idx = date_range(start="2013-01-01", periods=4, freq="M", name="idx", tz=tz) + idx = date_range(start="2013-01-01", periods=4, freq="ME", name="idx", tz=tz) expected_list = [ Timestamp("2013-01-31", tz=tz), Timestamp("2013-02-28", tz=tz), diff --git a/pandas/tests/indexes/datetimes/methods/test_factorize.py b/pandas/tests/indexes/datetimes/methods/test_factorize.py index 3ad927f133fb2..99a3bc910b9ca 100644 --- 
a/pandas/tests/indexes/datetimes/methods/test_factorize.py +++ b/pandas/tests/indexes/datetimes/methods/test_factorize.py @@ -58,7 +58,7 @@ def test_factorize(self): def test_factorize_preserves_freq(self): # GH#38120 freq should be preserved - idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") + idx3 = date_range("2000-01", periods=4, freq="ME", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index cedf8cd54b81e..9ef43ace747e2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -76,18 +76,18 @@ def test_insert(self): tm.assert_index_equal(result, expected) assert result.name == expected.name - idx = date_range("1/1/2000", periods=3, freq="M", name="idx") + idx = date_range("1/1/2000", periods=3, freq="ME", name="idx") # preserve freq expected_0 = DatetimeIndex( ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], name="idx", - freq="M", + freq="ME", ) expected_3 = DatetimeIndex( ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], name="idx", - freq="M", + freq="ME", ) # reset freq to None diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index 65bdfc9053e5e..e8661fafc3bb7 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -96,7 +96,7 @@ def test_dti_shift_localized(self, tzstr): dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") dr_tz = dr.tz_localize(tzstr) - result = dr_tz.shift(1, "10T") + result = dr_tz.shift(1, "10min") assert result.tz == dr_tz.tz def test_dti_shift_across_dst(self): diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 14de6c5907d03..5f266ea0b42a6 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -20,7 +20,7 @@ class TestToPeriod: def test_dti_to_period(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") pi1 = dti.to_period() pi2 = dti.to_period(freq="D") pi3 = dti.to_period(freq="3D") @@ -67,21 +67,27 @@ def test_to_period_monthish(self): for off in offsets: rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == "M" + assert prng.freqstr == "M" - rng = date_range("01-Jan-2012", periods=8, freq="M") + rng = date_range("01-Jan-2012", periods=8, freq="ME") prng = rng.to_period() - assert prng.freq == "M" + assert prng.freqstr == "M" with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): date_range("01-Jan-2012", periods=8, freq="EOM") - @pytest.mark.parametrize("freq", ["2M", MonthEnd(2)]) - def test_dti_to_period_2monthish(self, freq): - dti = date_range("2020-01-01", periods=3, freq=freq) + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("2ME", "2M"), + (MonthEnd(2), MonthEnd(2)), + ], + ) + def test_dti_to_period_2monthish(self, freq_offset, freq_period): + dti = date_range("2020-01-01", periods=3, freq=freq_offset) pi = dti.to_period() - tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq)) + tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) def test_to_period_infer(self): # 
https://github.com/pandas-dev/pandas/issues/33358 @@ -119,10 +125,10 @@ def test_to_period_millisecond(self): with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq="L") + period = index.to_period(freq="ms") assert 2 == len(period) - assert period[0] == Period("2007-01-01 10:11:12.123Z", "L") - assert period[1] == Period("2007-01-01 10:11:13.789Z", "L") + assert period[0] == Period("2007-01-01 10:11:12.123Z", "ms") + assert period[1] == Period("2007-01-01 10:11:13.789Z", "ms") def test_to_period_microsecond(self): index = DatetimeIndex( @@ -134,10 +140,10 @@ def test_to_period_microsecond(self): with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq="U") + period = index.to_period(freq="us") assert 2 == len(period) - assert period[0] == Period("2007-01-01 10:11:12.123456Z", "U") - assert period[1] == Period("2007-01-01 10:11:13.789123Z", "U") + assert period[0] == Period("2007-01-01 10:11:12.123456Z", "us") + assert period[1] == Period("2007-01-01 10:11:13.789123Z", "us") @pytest.mark.parametrize( "tz", diff --git a/pandas/tests/indexes/datetimes/test_asof.py b/pandas/tests/indexes/datetimes/test_asof.py index 7adc400302cb9..f52b6da5b2f07 100644 --- a/pandas/tests/indexes/datetimes/test_asof.py +++ b/pandas/tests/indexes/datetimes/test_asof.py @@ -11,7 +11,7 @@ class TestAsOf: def test_asof_partial(self): - index = date_range("2010-01-01", periods=2, freq="m") + index = date_range("2010-01-01", periods=2, freq="ME") expected = Timestamp("2010-02-28") result = index.asof("2010-02") assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 733c14f33567a..61c8cc4a50fe2 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -972,8 +972,8 @@ def test_explicit_none_freq(self): def test_dti_constructor_years_only(self, tz_naive_fixture): tz = tz_naive_fixture # GH 6961 - rng1 = date_range("2014", "2015", freq="M", tz=tz) - expected1 = date_range("2014-01-31", "2014-12-31", freq="M", tz=tz) + rng1 = date_range("2014", "2015", freq="ME", tz=tz) + expected1 = date_range("2014-01-31", "2014-12-31", freq="ME", tz=tz) rng2 = date_range("2014", "2015", freq="MS", tz=tz) expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) @@ -1010,7 +1010,7 @@ def test_ctor_str_intraday(self): assert rng[0].second == 1 def test_is_(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") assert dti.is_(dti) assert dti.is_(dti.view()) assert not dti.is_(dti.copy()) @@ -1036,7 +1036,7 @@ def test_constructor_int64_nocopy(self): assert (index.asi8[50:100] != -1).all() @pytest.mark.parametrize( - "freq", ["M", "Q", "A", "D", "B", "BH", "T", "S", "L", "U", "H", "N", "C"] + "freq", ["ME", "Q", "A", "D", "B", "BH", "min", "s", "ms", "us", "H", "ns", "C"] ) def test_from_freq_recreate_from_data(self, freq): org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 2e2e33e2fb366..b93aee1d988de 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -123,7 +123,7 @@ def test_date_range_timestamp_equiv_preserve_frequency(self): class TestDateRanges: - 
@pytest.mark.parametrize("freq", ["N", "U", "L", "T", "S", "H", "D"]) + @pytest.mark.parametrize("freq", ["ns", "us", "ms", "min", "s", "H", "D"]) def test_date_range_edges(self, freq): # GH#13672 td = Timedelta(f"1{freq}") @@ -277,10 +277,10 @@ def test_date_range_negative_freq(self): tm.assert_index_equal(rng, exp) assert rng.freq == "-2A" - rng = date_range("2011-01-31", freq="-2M", periods=3) - exp = DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2M") + rng = date_range("2011-01-31", freq="-2ME", periods=3) + exp = DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2ME") tm.assert_index_equal(rng, exp) - assert rng.freq == "-2M" + assert rng.freq == "-2ME" def test_date_range_bms_bug(self): # #1645 @@ -638,7 +638,7 @@ def test_range_tz_dateutil(self): assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) def test_range_closed(self, freq, inclusive_endpoints_fixture): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -653,7 +653,7 @@ def test_range_closed(self, freq, inclusive_endpoints_fixture): tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) def test_range_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -674,7 +674,7 @@ def test_range_closed_with_tz_aware_start_end( tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) def test_range_with_tz_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -750,7 +750,7 @@ def test_range_closed_boundary(self, inclusive_endpoints_fixture): def test_years_only(self): # GH 6961 - dr = date_range("2014", "2015", freq="M") + dr = date_range("2014", "2015", freq="ME") assert dr[0] == datetime(2014, 1, 31) assert dr[-1] == datetime(2014, 12, 31) @@ -761,13 +761,13 @@ def test_freq_divides_end_in_nanos(self): expected_1 = DatetimeIndex( ["2005-01-12 10:00:00", "2005-01-12 15:45:00"], dtype="datetime64[ns]", - freq="345T", + freq="345min", tz=None, ) expected_2 = DatetimeIndex( ["2005-01-13 10:00:00", "2005-01-13 15:45:00"], dtype="datetime64[ns]", - freq="345T", + freq="345min", tz=None, ) tm.assert_index_equal(result_1, expected_1) @@ -836,6 +836,25 @@ def test_freq_dateoffset_with_relateivedelta_nanos(self): ) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("min", "T"), + ("s", "S"), + ("ms", "L"), + ("us", "U"), + ("ns", "N"), + ], + ) + def test_frequencies_T_S_L_U_N_deprecated(self, freq, freq_depr): + # GH#52536 + msg = f"'{freq_depr}' is deprecated and will be removed in a future version." 
+ + expected = date_range("1/1/2000", periods=4, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + class TestDateRangeTZ: """Tests for date_range with timezones""" diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index aa4954ff0ba85..f44cbbf560584 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -56,10 +56,10 @@ def test_time_overflow_for_32bit_machines(self): # overflow. periods = np.int_(1000) - idx1 = date_range(start="2000", periods=periods, freq="S") + idx1 = date_range(start="2000", periods=periods, freq="s") assert len(idx1) == periods - idx2 = date_range(end="2000", periods=periods, freq="S") + idx2 = date_range(end="2000", periods=periods, freq="s") assert len(idx2) == periods def test_nat(self): @@ -148,8 +148,8 @@ def test_groupby_function_tuple_1677(self): assert isinstance(result.index[0], tuple) def assert_index_parameters(self, index): - assert index.freq == "40960N" - assert index.inferred_freq == "40960N" + assert index.freq == "40960ns" + assert index.inferred_freq == "40960ns" def test_ns_index(self): nsamples = 400 diff --git a/pandas/tests/indexes/datetimes/test_delete.py b/pandas/tests/indexes/datetimes/test_delete.py index e9de5a055a5c2..3565e516e69d5 100644 --- a/pandas/tests/indexes/datetimes/test_delete.py +++ b/pandas/tests/indexes/datetimes/test_delete.py @@ -10,11 +10,11 @@ class TestDelete: def test_delete(self): - idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") + idx = date_range(start="2000-01-01", periods=5, freq="ME", name="idx") # preserve freq - expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") - expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") + expected_0 = date_range(start="2000-02-01", periods=4, freq="ME", name="idx") + expected_4 = date_range(start="2000-01-01", periods=4, freq="ME", name="idx") # reset freq to None expected_1 = DatetimeIndex( diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index cb3e0179bf46c..502cb0407bfcd 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -73,17 +73,17 @@ def test_dti_repr_short(self): [ ( ["2012-01-01 00:00:00"], - "60T", + "60min", ( "DatetimeIndex(['2012-01-01 00:00:00'], " - "dtype='datetime64[ns]', freq='60T')" + "dtype='datetime64[ns]', freq='60min')" ), ), ( ["2012-01-01 00:00:00", "2012-01-01 01:00:00"], - "60T", + "60min", "DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 01:00:00'], " - "dtype='datetime64[ns]', freq='60T')", + "dtype='datetime64[ns]', freq='60min')", ), ( ["2012-01-01"], diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 8e1b41095e056..c3944a4443d67 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -101,7 +101,7 @@ def test_dti_business_getitem_matplotlib_hackaround(self, freq): rng[:, None] def test_getitem_int_list(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") dti2 = dti[[1, 3, 5]] v1 = dti2[0] @@ -426,7 +426,7 @@ def test_get_loc_time_obj2(self): step = 24 * 3600 for n in ns: - idx = date_range("2014-11-26", 
periods=n, freq="S") + idx = date_range("2014-11-26", periods=n, freq="s") ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx) locs = np.arange(start, n, step, dtype=np.intp) diff --git a/pandas/tests/indexes/datetimes/test_map.py b/pandas/tests/indexes/datetimes/test_map.py index 45698ef225151..c31e2407190ea 100644 --- a/pandas/tests/indexes/datetimes/test_map.py +++ b/pandas/tests/indexes/datetimes/test_map.py @@ -40,7 +40,7 @@ def test_map_bug_1677(self): def test_index_map(self, name): # see GH#20990 count = 6 - index = date_range("2018-01-01", periods=count, freq="M", name=name).map( + index = date_range("2018-01-01", periods=count, freq="ME", name=name).map( lambda x: (x.year, x.month) ) exp_index = MultiIndex.from_product(((2018,), range(1, 7)), names=[name, name]) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 139190962895d..185134af165f4 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -140,7 +140,7 @@ def test_datetimeindex_accessors4(self): assert dti.is_month_start[0] == 1 def test_datetimeindex_accessors5(self): - freq_m = to_offset("M") + freq_m = to_offset("ME") bm = to_offset("BM") qfeb = to_offset("Q-FEB") qsfeb = to_offset("QS-FEB") @@ -256,7 +256,7 @@ def test_datetime_name_accessors(self, time_locale): assert np.isnan(ts.day_name(locale=time_locale)) # GH#12805 - dti = date_range(freq="M", start="2012", end="2013") + dti = date_range(freq="ME", start="2012", end="2013") result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) diff --git a/pandas/tests/indexes/datetimes/test_npfuncs.py b/pandas/tests/indexes/datetimes/test_npfuncs.py index 301466c0da41c..6c3e44c2a5db1 100644 --- a/pandas/tests/indexes/datetimes/test_npfuncs.py +++ b/pandas/tests/indexes/datetimes/test_npfuncs.py @@ -7,7 +7,7 @@ class TestSplit: def test_split_non_utc(self): # GH#14042 - indices = date_range("2016-01-01 00:00:00+0200", freq="S", periods=10) + indices = date_range("2016-01-01 00:00:00+0200", freq="s", periods=10) result = np.split(indices, indices_or_sections=[])[0] expected = indices._with_freq(None) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index d6ef4198fad2e..21dc22bea87dc 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -22,13 +22,13 @@ class TestDatetimeIndexOps: [ ("A", "day"), ("Q", "day"), - ("M", "day"), + ("ME", "day"), ("D", "day"), ("H", "hour"), - ("T", "minute"), - ("S", "second"), - ("L", "millisecond"), - ("U", "microsecond"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), ], ) def test_resolution(self, request, tz_naive_fixture, freq, expected): diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 7978e596e6ee5..7ad0dfbaf6cb1 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -204,7 +204,7 @@ def test_partial_slice_daily(self): s["2004-12-31 00"] def test_partial_slice_hourly(self): - rng = date_range(freq="T", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) + rng = date_range(freq="min", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-1"] @@ -218,7 +218,7 @@ 
def test_partial_slice_hourly(self): s["2004-12-31 00:15"] def test_partial_slice_minutely(self): - rng = date_range(freq="S", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) + rng = date_range(freq="s", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-1 23:59"] @@ -336,7 +336,7 @@ def test_partial_slicing_with_multiindex(self): "TICKER": ["ABC", "MNP", "XYZ", "XYZ"], "val": [1, 2, 3, 4], }, - index=date_range("2013-06-19 09:30:00", periods=4, freq="5T"), + index=date_range("2013-06-19 09:30:00", periods=4, freq="5min"), ) df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True) @@ -453,9 +453,11 @@ def test_getitem_with_datestring_with_UTC_offset(self, start, end): def test_slice_reduce_to_series(self): # GH 27516 - df = DataFrame({"A": range(24)}, index=date_range("2000", periods=24, freq="M")) + df = DataFrame( + {"A": range(24)}, index=date_range("2000", periods=24, freq="ME") + ) expected = Series( - range(12), index=date_range("2000", periods=12, freq="M"), name="A" + range(12), index=date_range("2000", periods=12, freq="ME"), name="A" ) result = df.loc["2000", "A"] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index f07a9dce5f6ae..49597ef885183 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -96,7 +96,7 @@ def test_round_daily(self): "freq, error_msg", [ ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), ("foobar", "Invalid frequency: foobar"), ], ) @@ -133,9 +133,9 @@ def test_round(self, tz_naive_fixture): msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - rng.round(freq="M") + rng.round(freq="ME") with pytest.raises(ValueError, match=msg): - elt.round(freq="M") + elt.round(freq="ME") # GH#14440 & GH#15578 index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz) @@ -175,7 +175,7 @@ def test_no_rounding_occurs(self, tz_naive_fixture): ] ) - tm.assert_index_equal(rng.round(freq="2T"), expected_rng) + tm.assert_index_equal(rng.round(freq="2min"), expected_rng) @pytest.mark.parametrize( "test_input, rounder, freq, expected", @@ -196,8 +196,8 @@ def test_no_rounding_occurs(self, tz_naive_fixture): ), (["1823-01-01 00:00:01"], "floor", "1s", ["1823-01-01 00:00:01"]), (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), - (["2018-01-01 00:15:00"], "ceil", "15T", ["2018-01-01 00:15:00"]), - (["2018-01-01 00:15:00"], "floor", "15T", ["2018-01-01 00:15:00"]), + (["2018-01-01 00:15:00"], "ceil", "15min", ["2018-01-01 00:15:00"]), + (["2018-01-01 00:15:00"], "floor", "15min", ["2018-01-01 00:15:00"]), (["1823-01-01 03:00:00"], "ceil", "3H", ["1823-01-01 03:00:00"]), (["1823-01-01 03:00:00"], "floor", "3H", ["1823-01-01 03:00:00"]), ( @@ -333,14 +333,14 @@ def test_hour(self): tm.assert_index_equal(r1, r2) def test_minute(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="T") + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="min") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Index) and r2.dtype == np.float64 tm.assert_index_equal(r1, r2) def test_second(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="S") + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="s") r1 = pd.Index([x.to_julian_date() for x 
in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Index) and r2.dtype == np.float64 diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index adf7acfa59e0c..b56bad7f2e833 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -269,7 +269,7 @@ def test_intersection(self, tz, sort): # parametrize over both anchored and non-anchored freqs, as they # have different code paths - @pytest.mark.parametrize("freq", ["T", "B"]) + @pytest.mark.parametrize("freq", ["min", "B"]) def test_intersection_empty(self, tz_aware_fixture, freq): # empty same freq GH2129 tz = tz_aware_fixture @@ -283,7 +283,7 @@ def test_intersection_empty(self, tz_aware_fixture, freq): assert result.freq == rng.freq # no overlap GH#33604 - check_freq = freq != "T" # We don't preserve freq on non-anchored offsets + check_freq = freq != "min" # We don't preserve freq on non-anchored offsets result = rng[:3].intersection(rng[-3:]) tm.assert_index_equal(result, rng[:0]) if check_freq: @@ -343,9 +343,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = DatetimeIndex(["20160920", "20160921"], freq=None) + expected = DatetimeIndex(["20160920", "20160921"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 09b06ecd5630d..756a72cf1849a 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -192,7 +192,7 @@ def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): expected = Index([9, 9, 9], dtype=np.int32) tm.assert_index_equal(ut.hour, expected) - @pytest.mark.parametrize("freq, n", [("H", 1), ("T", 60), ("S", 3600)]) + @pytest.mark.parametrize("freq, n", [("H", 1), ("min", 60), ("s", 3600)]) def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pandas-dev/pandas/issues/4496 for details. 
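Illustrative aside (editor's sketch, not part of the patch): the frequency strings replaced throughout these tests follow the lowercase sub-daily aliases ("T" becomes "min", "S" becomes "s", "L" becomes "ms", "U" becomes "us", "N" becomes "ns"), while the single-letter forms now emit a FutureWarning. A minimal sketch with hypothetical values, assuming a pandas version carrying the deprecation:

import pandas as pd

rng = pd.date_range("2014-06-01", periods=8, freq="30min")   # formerly freq="30T"
stamp = pd.Timestamp("2014-06-01 09:07").round("15min")      # formerly "15T"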
@@ -204,7 +204,7 @@ def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) def test_dti_tz_convert_dst(self): - for freq, n in [("H", 1), ("T", 60), ("S", 3600)]: + for freq, n in [("H", 1), ("min", 60), ("s", 3600)]: # Start DST idx = date_range( "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" @@ -272,8 +272,8 @@ def test_dti_tz_convert_dst(self): def test_tz_convert_roundtrip(self, tz_aware_fixture): tz = tz_aware_fixture - idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="M", tz="UTC") - exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="M") + idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME", tz="UTC") + exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME") idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") @@ -281,8 +281,8 @@ def test_tz_convert_roundtrip(self, tz_aware_fixture): idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="H", tz="UTC") exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="H") - idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="T", tz="UTC") - exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="T") + idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="min", tz="UTC") + exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="min") for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), (idx4, exp4)]: converted = idx.tz_convert(tz) @@ -440,11 +440,11 @@ def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_dti_tz_localize(self, prefix): tzstr = prefix + "US/Eastern" - dti = date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="L") + dti = date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="ms") dti2 = dti.tz_localize(tzstr) dti_utc = date_range( - start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="L", tz="utc" + start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="ms", tz="utc" ) tm.assert_numpy_array_equal(dti2.values, dti_utc.values) @@ -452,11 +452,11 @@ def test_dti_tz_localize(self, prefix): dti3 = dti2.tz_convert(prefix + "US/Pacific") tm.assert_numpy_array_equal(dti3.values, dti_utc.values) - dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="L") + dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms") with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dti.tz_localize(tzstr) - dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="L") + dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms") with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) @@ -474,14 +474,14 @@ def test_dti_tz_localize_utc_conversion(self, tz): # 1) check for DST ambiguities # 2) convert to UTC - rng = date_range("3/10/2012", "3/11/2012", freq="30T") + rng = date_range("3/10/2012", "3/11/2012", freq="30min") converted = rng.tz_localize(tz) expected_naive = rng + pd.offsets.Hour(5) tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) # DST ambiguity, this should fail - rng = date_range("3/11/2012", "3/12/2012", freq="30T") + rng = date_range("3/11/2012", "3/12/2012", freq="30min") # Is this really how it should fail?? 
with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): rng.tz_localize(tz) @@ -490,7 +490,7 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): # note: this tz tests that a tz-naive index can be localized # and de-localized successfully, when there are no DST transitions # in the range. - idx = date_range(start="2014-06-01", end="2014-08-30", freq="15T") + idx = date_range(start="2014-06-01", end="2014-08-30", freq="15min") tz = tz_aware_fixture localized = idx.tz_localize(tz) # can't localize a tz-aware object @@ -879,7 +879,7 @@ def test_dti_tz_conversion_freq(self, tz_naive_fixture): # GH25241 t3 = DatetimeIndex(["2019-01-01 10:00"], freq="H") assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq - t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="T") + t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="min") assert t4.tz_convert(tz="UTC").freq == t4.freq def test_drop_dst_boundary(self): diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 4d6f3a62d4dd0..f003211abd857 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -80,7 +80,11 @@ def test_repr_floats(self): ((Timestamp("20180102"), Timestamp("20180103"))), ], "both", - ["[2018-01-01, 2018-01-02]", "NaN", "[2018-01-02, 2018-01-03]"], + [ + "[2018-01-01 00:00:00, 2018-01-02 00:00:00]", + "NaN", + "[2018-01-02 00:00:00, 2018-01-03 00:00:00]", + ], ), ( [ @@ -103,3 +107,19 @@ def test_to_native_types(self, tuples, closed, expected_data): result = index._format_native_types() expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) + + def test_timestamp_with_timezone(self): + # GH 55035 + index = IntervalIndex( + [ + Interval( + Timestamp("2020-01-01", tz="UTC"), Timestamp("2020-01-02", tz="UTC") + ) + ] + ) + result = repr(index) + expected = ( + "IntervalIndex([(2020-01-01 00:00:00+00:00, 2020-01-02 00:00:00+00:00]], " + "dtype='interval[datetime64[ns, UTC], right]')" + ) + assert result == expected diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index e65ae52e348c6..db8f697b95cd8 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -307,9 +307,9 @@ def test_get_indexer_datetime(self): result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str)) tm.assert_numpy_array_equal(result, expected) - # TODO this should probably be deprecated? 
# https://github.com/pandas-dev/pandas/issues/47772 result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8) + expected = np.array([-1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 57783265b04b3..499220b39279d 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -58,7 +58,7 @@ def test_constructor_numeric(self, closed, name, freq, periods): @pytest.mark.parametrize("tz", [None, "US/Eastern"]) @pytest.mark.parametrize( - "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("M", 11)] + "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("ME", 11)] ) def test_constructor_timestamp(self, closed, name, freq, periods, tz): start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 7097aa2b61632..87f1439db5fc8 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -98,8 +98,8 @@ def test_numpy_repeat(): def test_append_mixed_dtypes(): # GH 13660 - dti = date_range("2011-01-01", freq="M", periods=3) - dti_tz = date_range("2011-01-01", freq="M", periods=3, tz="US/Eastern") + dti = date_range("2011-01-01", freq="ME", periods=3) + dti_tz = date_range("2011-01-01", freq="ME", periods=3, tz="US/Eastern") pi = period_range("2011-01", freq="M", periods=3) mi = MultiIndex.from_arrays( diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 91ec1b2475cde..cce6c98d71b47 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -778,7 +778,7 @@ def test_datetimeindex(): idx1 = pd.DatetimeIndex( ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo" ) - idx2 = date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") + idx2 = date_range("2010/01/01", periods=6, freq="ME", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) expected1 = pd.DatetimeIndex( diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 47efc43d5eae0..66163dad3deae 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -31,7 +31,7 @@ def df(): dr = date_range("2016-01-01", "2016-01-03", freq="12H") abc = ["a", "b", "c"] mi = MultiIndex.from_product([dr, abc]) - frame = DataFrame({"c1": range(0, 15)}, index=mi) + frame = DataFrame({"c1": range(15)}, index=mi) return frame diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index da9838d4a2ed3..06dbb33aadf97 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -169,6 +169,28 @@ def test_append_names_dont_match(): tm.assert_index_equal(result, expected) +def test_append_overlapping_interval_levels(): + # GH 54934 + ivl1 = pd.IntervalIndex.from_breaks([0.0, 1.0, 2.0]) + ivl2 = pd.IntervalIndex.from_breaks([0.5, 1.5, 2.5]) + mi1 = MultiIndex.from_product([ivl1, ivl1]) + mi2 = MultiIndex.from_product([ivl2, ivl2]) + result = mi1.append(mi2) + expected = MultiIndex.from_tuples( + [ + (pd.Interval(0.0, 1.0), pd.Interval(0.0, 1.0)), + (pd.Interval(0.0, 1.0), 
pd.Interval(1.0, 2.0)), + (pd.Interval(1.0, 2.0), pd.Interval(0.0, 1.0)), + (pd.Interval(1.0, 2.0), pd.Interval(1.0, 2.0)), + (pd.Interval(0.5, 1.5), pd.Interval(0.5, 1.5)), + (pd.Interval(0.5, 1.5), pd.Interval(1.5, 2.5)), + (pd.Interval(1.5, 2.5), pd.Interval(0.5, 1.5)), + (pd.Interval(1.5, 2.5), pd.Interval(1.5, 2.5)), + ] + ) + tm.assert_index_equal(result, expected) + + def test_repeat(): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py index 93ff6238b90ff..918d505216735 100644 --- a/pandas/tests/indexes/numeric/test_join.py +++ b/pandas/tests/indexes/numeric/test_join.py @@ -11,13 +11,13 @@ def test_join_non_unique(self): joined, lidx, ridx = left.join(left, return_indexers=True) - exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) + exp_joined = Index([4, 4, 4, 4, 3, 3, 3, 3]) tm.assert_index_equal(joined, exp_joined) - exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) + exp_lidx = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.intp) tm.assert_numpy_array_equal(lidx, exp_lidx) - exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) + exp_ridx = np.array([0, 1, 0, 1, 2, 3, 2, 3], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_inner(self): diff --git a/pandas/tests/indexes/numeric/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py index 977c7da7d866f..8cd295802a5d1 100644 --- a/pandas/tests/indexes/numeric/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -227,6 +227,14 @@ def test_fillna_float64(self): exp = Index([1.0, "obj", 3.0], name="x") tm.assert_index_equal(idx.fillna("obj"), exp, exact=True) + def test_logical_compat(self, simple_index): + idx = simple_index + assert idx.all() == idx.values.all() + assert idx.any() == idx.values.any() + + assert idx.all() == idx.to_series().all() + assert idx.any() == idx.to_series().any() + class TestNumericInt: @pytest.fixture(params=[np.int64, np.int32, np.int16, np.int8, np.uint64]) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index 4f5cfbade4d84..89ea4fb6472d0 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -16,57 +16,57 @@ def test_asfreq(self): pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") pi5 = period_range(freq="H", start="1/1/2001", end="1/1/2001 00:00") pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") - pi7 = period_range(freq="S", start="1/1/2001", end="1/1/2001 00:00:00") + pi7 = period_range(freq="s", start="1/1/2001", end="1/1/2001 00:00:00") - assert pi1.asfreq("Q", "S") == pi2 + assert pi1.asfreq("Q", "s") == pi2 assert pi1.asfreq("Q", "s") == pi2 assert pi1.asfreq("M", "start") == pi3 assert pi1.asfreq("D", "StarT") == pi4 assert pi1.asfreq("H", "beGIN") == pi5 - assert pi1.asfreq("Min", "S") == pi6 - assert pi1.asfreq("S", "S") == pi7 - - assert pi2.asfreq("A", "S") == pi1 - assert pi2.asfreq("M", "S") == pi3 - assert pi2.asfreq("D", "S") == pi4 - assert pi2.asfreq("H", "S") == pi5 - assert pi2.asfreq("Min", "S") == pi6 - assert pi2.asfreq("S", "S") == pi7 - - assert pi3.asfreq("A", "S") == pi1 - assert pi3.asfreq("Q", "S") == pi2 - assert pi3.asfreq("D", "S") == pi4 - assert pi3.asfreq("H", "S") == pi5 - assert pi3.asfreq("Min", "S") == pi6 - assert pi3.asfreq("S", "S") == pi7 - - assert pi4.asfreq("A", "S") == pi1 - assert pi4.asfreq("Q", "S") == pi2 - assert pi4.asfreq("M", "S") == pi3 - assert 
pi4.asfreq("H", "S") == pi5 - assert pi4.asfreq("Min", "S") == pi6 - assert pi4.asfreq("S", "S") == pi7 - - assert pi5.asfreq("A", "S") == pi1 - assert pi5.asfreq("Q", "S") == pi2 - assert pi5.asfreq("M", "S") == pi3 - assert pi5.asfreq("D", "S") == pi4 - assert pi5.asfreq("Min", "S") == pi6 - assert pi5.asfreq("S", "S") == pi7 - - assert pi6.asfreq("A", "S") == pi1 - assert pi6.asfreq("Q", "S") == pi2 - assert pi6.asfreq("M", "S") == pi3 - assert pi6.asfreq("D", "S") == pi4 - assert pi6.asfreq("H", "S") == pi5 - assert pi6.asfreq("S", "S") == pi7 - - assert pi7.asfreq("A", "S") == pi1 - assert pi7.asfreq("Q", "S") == pi2 - assert pi7.asfreq("M", "S") == pi3 - assert pi7.asfreq("D", "S") == pi4 - assert pi7.asfreq("H", "S") == pi5 - assert pi7.asfreq("Min", "S") == pi6 + assert pi1.asfreq("Min", "s") == pi6 + assert pi1.asfreq("s", "s") == pi7 + + assert pi2.asfreq("A", "s") == pi1 + assert pi2.asfreq("M", "s") == pi3 + assert pi2.asfreq("D", "s") == pi4 + assert pi2.asfreq("H", "s") == pi5 + assert pi2.asfreq("Min", "s") == pi6 + assert pi2.asfreq("s", "s") == pi7 + + assert pi3.asfreq("A", "s") == pi1 + assert pi3.asfreq("Q", "s") == pi2 + assert pi3.asfreq("D", "s") == pi4 + assert pi3.asfreq("H", "s") == pi5 + assert pi3.asfreq("Min", "s") == pi6 + assert pi3.asfreq("s", "s") == pi7 + + assert pi4.asfreq("A", "s") == pi1 + assert pi4.asfreq("Q", "s") == pi2 + assert pi4.asfreq("M", "s") == pi3 + assert pi4.asfreq("H", "s") == pi5 + assert pi4.asfreq("Min", "s") == pi6 + assert pi4.asfreq("s", "s") == pi7 + + assert pi5.asfreq("A", "s") == pi1 + assert pi5.asfreq("Q", "s") == pi2 + assert pi5.asfreq("M", "s") == pi3 + assert pi5.asfreq("D", "s") == pi4 + assert pi5.asfreq("Min", "s") == pi6 + assert pi5.asfreq("s", "s") == pi7 + + assert pi6.asfreq("A", "s") == pi1 + assert pi6.asfreq("Q", "s") == pi2 + assert pi6.asfreq("M", "s") == pi3 + assert pi6.asfreq("D", "s") == pi4 + assert pi6.asfreq("H", "s") == pi5 + assert pi6.asfreq("s", "s") == pi7 + + assert pi7.asfreq("A", "s") == pi1 + assert pi7.asfreq("Q", "s") == pi2 + assert pi7.asfreq("M", "s") == pi3 + assert pi7.asfreq("D", "s") == pi4 + assert pi7.asfreq("H", "s") == pi5 + assert pi7.asfreq("Min", "s") == pi6 msg = "How must be one of S or E" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/period/methods/test_factorize.py b/pandas/tests/indexes/period/methods/test_factorize.py index 7705da02bb7d2..ac3d09d76157b 100644 --- a/pandas/tests/indexes/period/methods/test_factorize.py +++ b/pandas/tests/indexes/period/methods/test_factorize.py @@ -10,7 +10,8 @@ class TestFactorize: def test_factorize(self): idx1 = PeriodIndex( - ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", ) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) @@ -25,7 +26,8 @@ def test_factorize(self): tm.assert_index_equal(idx, exp_idx) idx2 = PeriodIndex( - ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", ) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 7d4d681659ab6..a5bdfa11140d1 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -110,16 +110,18 @@ def test_constructor_U(self): def test_constructor_nano(self): idx = 
period_range( - start=Period(ordinal=1, freq="N"), end=Period(ordinal=4, freq="N"), freq="N" + start=Period(ordinal=1, freq="ns"), + end=Period(ordinal=4, freq="ns"), + freq="ns", ) exp = PeriodIndex( [ - Period(ordinal=1, freq="N"), - Period(ordinal=2, freq="N"), - Period(ordinal=3, freq="N"), - Period(ordinal=4, freq="N"), + Period(ordinal=1, freq="ns"), + Period(ordinal=2, freq="ns"), + Period(ordinal=3, freq="ns"), + Period(ordinal=4, freq="ns"), ], - freq="N", + freq="ns", ) tm.assert_index_equal(idx, exp) @@ -144,7 +146,7 @@ def test_constructor_corner(self): def test_constructor_with_without_freq(self): # GH53687 - start = Period("2002-01-01 00:00", freq="30T") + start = Period("2002-01-01 00:00", freq="30min") exp = period_range(start=start, periods=5, freq=start.freq) result = period_range(start=start, periods=5) tm.assert_index_equal(exp, result) @@ -177,15 +179,15 @@ def test_constructor_fromarraylike(self): result = PeriodIndex(idx, freq=offsets.MonthEnd()) tm.assert_index_equal(result, idx) - assert result.freq == "M" + assert result.freq == "ME" result = PeriodIndex(idx, freq="2M") tm.assert_index_equal(result, idx.asfreq("2M")) - assert result.freq == "2M" + assert result.freq == "2ME" result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) tm.assert_index_equal(result, idx.asfreq("2M")) - assert result.freq == "2M" + assert result.freq == "2ME" result = PeriodIndex(idx, freq="D") exp = idx.asfreq("D", "e") @@ -203,7 +205,7 @@ def test_constructor_datetime64arr(self): @pytest.mark.parametrize("box", [None, "series", "index"]) def test_constructor_datetime64arr_ok(self, box): # https://github.com/pandas-dev/pandas/issues/23438 - data = date_range("2017", periods=4, freq="M") + data = date_range("2017", periods=4, freq="ME") if box is None: data = data._values elif box == "series": @@ -248,7 +250,7 @@ def test_constructor_empty(self): idx = PeriodIndex([], freq="M") assert isinstance(idx, PeriodIndex) assert len(idx) == 0 - assert idx.freq == "M" + assert idx.freq == "ME" with pytest.raises(ValueError, match="freq not specified"): PeriodIndex([]) @@ -413,14 +415,32 @@ def test_constructor_freq_mult(self): with pytest.raises(ValueError, match=msg): period_range("2011-01", periods=3, freq="0M") - @pytest.mark.parametrize("freq", ["A", "M", "D", "T", "S"]) + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("A", "A"), + ("ME", "M"), + ("D", "D"), + ("min", "min"), + ("s", "s"), + ], + ) @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) - def test_constructor_freq_mult_dti_compat(self, mult, freq): - freqstr = str(mult) + freq - pidx = period_range(start="2014-04-01", freq=freqstr, periods=10) - expected = date_range(start="2014-04-01", freq=freqstr, periods=10).to_period( - freqstr - ) + def test_constructor_freq_mult_dti_compat(self, mult, freq_offset, freq_period): + freqstr_offset = str(mult) + freq_offset + freqstr_period = str(mult) + freq_period + pidx = period_range(start="2014-04-01", freq=freqstr_period, periods=10) + expected = date_range( + start="2014-04-01", freq=freqstr_offset, periods=10 + ).to_period(freqstr_period) + tm.assert_index_equal(pidx, expected) + + @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) + def test_constructor_freq_mult_dti_compat_month(self, mult): + pidx = period_range(start="2014-04-01", freq=f"{mult}M", periods=10) + expected = date_range( + start="2014-04-01", freq=f"{mult}ME", periods=10 + ).to_period(f"{mult}M") tm.assert_index_equal(pidx, expected) def test_constructor_freq_combined(self): @@ -456,7 +476,7 @@ def 
test_constructor(self): pi = period_range(freq="Min", start="1/1/2001", end="1/1/2001 23:59") assert len(pi) == 24 * 60 - pi = period_range(freq="S", start="1/1/2001", end="1/1/2001 23:59:59") + pi = period_range(freq="s", start="1/1/2001", end="1/1/2001 23:59:59") assert len(pi) == 24 * 60 * 60 with tm.assert_produces_warning(FutureWarning, match=msg): @@ -506,7 +526,7 @@ def test_constructor(self): Period("2006-12-31", ("w", 1)) @pytest.mark.parametrize( - "freq", ["M", "Q", "A", "D", "B", "T", "S", "L", "U", "N", "H"] + "freq", ["M", "Q", "A", "D", "B", "min", "s", "ms", "us", "ns", "H"] ) @pytest.mark.filterwarnings( r"ignore:Period with BDay freq is deprecated:FutureWarning" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index c0c6f3c977ceb..109a4a41e2841 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -174,8 +174,8 @@ def test_getitem_list_periods(self): @pytest.mark.arm_slow def test_getitem_seconds(self): # GH#6716 - didx = date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) - pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + didx = date_range(start="2013/01/01 09:00:00", freq="s", periods=4000) + pidx = period_range(start="2013/01/01 09:00:00", freq="s", periods=4000) for idx in [didx, pidx]: # getitem against index should raise ValueError @@ -579,7 +579,7 @@ def test_where_invalid_dtypes(self): result = pi.where(mask, tdi) tm.assert_index_equal(result, expected) - dti = i2.to_timestamp("S") + dti = i2.to_timestamp("s") expected = pd.Index([dti[0], dti[1]] + tail, dtype=object) assert expected[0] is NaT result = pi.where(mask, dti) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index e52866abbe234..3a272f53091b5 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -84,7 +84,7 @@ def test_range_slice_day(self, make_range): @pytest.mark.parametrize("make_range", [date_range, period_range]) def test_range_slice_seconds(self, make_range): # GH#6716 - idx = make_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + idx = make_range(start="2013/01/01 09:00:00", freq="s", periods=4000) msg = "slice indices must be integers or None or have an __index__ method" # slices against index should raise IndexError diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 6d8ae1793d5ec..bd40fa37897d8 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -164,7 +164,7 @@ def test_period_index_length(self): period_range(freq="H", start="12/31/2001", end="1/1/2002 23:00"), period_range(freq="Min", start="12/31/2001", end="1/1/2002 00:20"), period_range( - freq="S", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" + freq="s", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" ), period_range(end=Period("2006-12-31", "W"), periods=10), ], @@ -278,6 +278,12 @@ def test_format_empty(self): assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] + def test_period_index_frequency_ME_error_message(self): + msg = "Invalid frequency: 2ME" + + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01-01", "2020-01-02"], freq="2ME") + def test_maybe_convert_timedelta(): pi = PeriodIndex(["2000", "2001"], freq="D") diff --git 
a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index c94ddf57c0ee1..63acaba2d4f3e 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -20,7 +20,7 @@ def test_required_arguments(self): with pytest.raises(ValueError, match=msg): period_range("2011-1-1", "2012-1-1", "B") - @pytest.mark.parametrize("freq", ["D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["D", "W", "Q", "A"]) def test_construction_from_string(self, freq): # non-empty expected = date_range( @@ -49,11 +49,39 @@ def test_construction_from_string(self, freq): result = period_range(start=end, end=start, freq=freq, name="foo") tm.assert_index_equal(result, expected) + def test_construction_from_string_monthly(self): + # non-empty + expected = date_range( + start="2017-01-01", periods=5, freq="ME", name="foo" + ).to_period() + start, end = str(expected[0]), str(expected[-1]) + + result = period_range(start=start, end=end, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=start, periods=5, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=5, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + # empty + expected = PeriodIndex([], freq="M", name="foo") + + result = period_range(start=start, periods=0, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=0, freq="M", name="foo") + tm.assert_index_equal(result, expected) + + result = period_range(start=end, end=start, freq="M", name="foo") + tm.assert_index_equal(result, expected) + def test_construction_from_period(self): # upsampling start, end = Period("2017Q1", freq="Q"), Period("2018Q1", freq="Q") expected = date_range( - start="2017-03-31", end="2018-03-31", freq="M", name="foo" + start="2017-03-31", end="2018-03-31", freq="ME", name="foo" ).to_period() result = period_range(start=start, end=end, freq="M", name="foo") tm.assert_index_equal(result, expected) @@ -119,3 +147,8 @@ def test_errors(self): msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): period_range(start="2017Q1", periods="foo") + + def test_period_range_frequency_ME_error_message(self): + msg = "Invalid frequency: 2ME" + with pytest.raises(ValueError, match=msg): + period_range(start="Jan-2000", end="Dec-2000", freq="2ME") diff --git a/pandas/tests/indexes/period/test_resolution.py b/pandas/tests/indexes/period/test_resolution.py index 7ecbde75cfa47..6c876b4f9366f 100644 --- a/pandas/tests/indexes/period/test_resolution.py +++ b/pandas/tests/indexes/period/test_resolution.py @@ -12,10 +12,10 @@ class TestResolution: ("M", "month"), ("D", "day"), ("H", "hour"), - ("T", "minute"), - ("S", "second"), - ("L", "millisecond"), - ("U", "microsecond"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), ], ) def test_resolution(self, freq, expected): diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py index c96444981ff14..d8afd29ff31c5 100644 --- a/pandas/tests/indexes/period/test_scalar_compat.py +++ b/pandas/tests/indexes/period/test_scalar_compat.py @@ -20,7 +20,7 @@ def test_start_time(self): def test_end_time(self): # GH#17157 index = period_range(freq="M", start="2016-01-01", end="2016-05-31") - expected_index = date_range("2016-01-01", end="2016-05-31", freq="M") + expected_index = 
date_range("2016-01-01", end="2016-05-31", freq="ME") expected_index += Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(index.end_time, expected_index) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index af89d712b5565..dd05210e417b0 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -62,10 +62,10 @@ def test_union(self, sort): ) rng5 = PeriodIndex( - ["2000-01-01 09:01", "2000-01-01 09:03", "2000-01-01 09:05"], freq="T" + ["2000-01-01 09:01", "2000-01-01 09:03", "2000-01-01 09:05"], freq="min" ) other5 = PeriodIndex( - ["2000-01-01 09:01", "2000-01-01 09:05", "2000-01-01 09:08"], freq="T" + ["2000-01-01 09:01", "2000-01-01 09:05", "2000-01-01 09:08"], freq="min" ) expected5 = PeriodIndex( [ @@ -74,7 +74,7 @@ def test_union(self, sort): "2000-01-01 09:05", "2000-01-01 09:08", ], - freq="T", + freq="min", ) rng6 = period_range("2000-01-01", freq="M", periods=7) @@ -240,7 +240,7 @@ def test_intersection_cases(self, sort): assert result.freq == "D" # empty same freq - rng = date_range("6/1/2000", "6/15/2000", freq="T") + rng = date_range("6/1/2000", "6/15/2000", freq="min") result = rng[0:0].intersection(rng) assert len(result) == 0 @@ -274,10 +274,10 @@ def test_difference(self, sort): expected4 = rng4 rng5 = PeriodIndex( - ["2000-01-01 09:03", "2000-01-01 09:01", "2000-01-01 09:05"], freq="T" + ["2000-01-01 09:03", "2000-01-01 09:01", "2000-01-01 09:05"], freq="min" ) - other5 = PeriodIndex(["2000-01-01 09:01", "2000-01-01 09:05"], freq="T") - expected5 = PeriodIndex(["2000-01-01 09:03"], freq="T") + other5 = PeriodIndex(["2000-01-01 09:01", "2000-01-01 09:05"], freq="min") + expected5 = PeriodIndex(["2000-01-01 09:03"], freq="min") period_rng = [ "2000-02-01", diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 13509bd58b4b8..18668fd357fd8 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -21,11 +21,11 @@ class TestPeriodRepresentation: ("D", "1970-01-01"), ("B", "1970-01-01"), ("H", "1970-01-01"), - ("T", "1970-01-01"), - ("S", "1970-01-01"), - ("L", "1970-01-01"), - ("U", "1970-01-01"), - ("N", "1970-01-01"), + ("min", "1970-01-01"), + ("s", "1970-01-01"), + ("ms", "1970-01-01"), + ("us", "1970-01-01"), + ("ns", "1970-01-01"), ("M", "1970-01"), ("A", 1970), ], diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 5f137df281fa3..132704434829e 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -10,9 +10,6 @@ ) import pandas._testing as tm -# aliases to make some tests easier to read -RI = RangeIndex - class TestRangeIndex: @pytest.fixture @@ -507,25 +504,31 @@ def test_len_specialised(self, step): @pytest.mark.parametrize( "indices, expected", [ - ([RI(1, 12, 5)], RI(1, 12, 5)), - ([RI(0, 6, 4)], RI(0, 6, 4)), - ([RI(1, 3), RI(3, 7)], RI(1, 7)), - ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)), - ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)), - ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)), - ([RI(-4, -8), RI(-8, -12)], RI(0, 0)), - ([RI(-4, -8), RI(3, -4)], RI(0, 0)), - ([RI(-4, -8), RI(3, 5)], RI(3, 5)), - ([RI(-4, -2), RI(3, 5)], Index([-4, -3, 3, 4])), - ([RI(-2), RI(3, 5)], RI(3, 5)), - ([RI(2), RI(2)], Index([0, 1, 0, 1])), - ([RI(2), RI(2, 5), RI(5, 8, 4)], RI(0, 6)), - ([RI(2), RI(3, 5), RI(5, 8, 4)], Index([0, 1, 3, 4, 5])), - ([RI(-2, 2), RI(2, 5), RI(5, 8, 
4)], RI(-2, 6)), - ([RI(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])), - ([RI(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])), - ([RI(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])), - ([RI(3, 1), Index(["a", None, 14])], Index(["a", None, 14])), + ([RangeIndex(1, 12, 5)], RangeIndex(1, 12, 5)), + ([RangeIndex(0, 6, 4)], RangeIndex(0, 6, 4)), + ([RangeIndex(1, 3), RangeIndex(3, 7)], RangeIndex(1, 7)), + ([RangeIndex(1, 5, 2), RangeIndex(5, 6)], RangeIndex(1, 6, 2)), + ([RangeIndex(1, 3, 2), RangeIndex(4, 7, 3)], RangeIndex(1, 7, 3)), + ([RangeIndex(-4, 3, 2), RangeIndex(4, 7, 2)], RangeIndex(-4, 7, 2)), + ([RangeIndex(-4, -8), RangeIndex(-8, -12)], RangeIndex(0, 0)), + ([RangeIndex(-4, -8), RangeIndex(3, -4)], RangeIndex(0, 0)), + ([RangeIndex(-4, -8), RangeIndex(3, 5)], RangeIndex(3, 5)), + ([RangeIndex(-4, -2), RangeIndex(3, 5)], Index([-4, -3, 3, 4])), + ([RangeIndex(-2), RangeIndex(3, 5)], RangeIndex(3, 5)), + ([RangeIndex(2), RangeIndex(2)], Index([0, 1, 0, 1])), + ([RangeIndex(2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], RangeIndex(0, 6)), + ( + [RangeIndex(2), RangeIndex(3, 5), RangeIndex(5, 8, 4)], + Index([0, 1, 3, 4, 5]), + ), + ( + [RangeIndex(-2, 2), RangeIndex(2, 5), RangeIndex(5, 8, 4)], + RangeIndex(-2, 6), + ), + ([RangeIndex(3), Index([-1, 3, 15])], Index([0, 1, 2, -1, 3, 15])), + ([RangeIndex(3), Index([-1, 3.1, 15.0])], Index([0, 1, 2, -1, 3.1, 15.0])), + ([RangeIndex(3), Index(["a", None, 14])], Index([0, 1, 2, "a", None, 14])), + ([RangeIndex(3, 1), Index(["a", None, 14])], Index(["a", None, 14])), ], ) def test_append(self, indices, expected): @@ -567,7 +570,7 @@ def test_format_empty(self): assert empty_idx.format(name=True) == [""] @pytest.mark.parametrize( - "RI", + "ri", [ RangeIndex(0, -1, -1), RangeIndex(0, 1, 1), @@ -576,10 +579,10 @@ def test_format_empty(self): RangeIndex(-3, -5, -2), ], ) - def test_append_len_one(self, RI): + def test_append_len_one(self, ri): # GH39401 - result = RI.append([]) - tm.assert_index_equal(result, RI, exact=True) + result = ri.append([]) + tm.assert_index_equal(result, ri, exact=True) @pytest.mark.parametrize("base", [RangeIndex(0, 2), Index([0, 1])]) def test_isin_range(self, base): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ffa0b115e34fb..bc04c1c6612f4 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -692,7 +692,12 @@ def test_format_missing(self, vals, nulls_fixture): @pytest.mark.parametrize("op", ["any", "all"]) def test_logical_compat(self, op, simple_index): index = simple_index - assert getattr(index, op)() == getattr(index.values, op)() + left = getattr(index, op)() + assert left == getattr(index.values, op)() + right = getattr(index.to_series(), op)() + # left might not match right exactly in e.g. 
string cases + # because we use np.any/all instead of .any/all + assert bool(left) == bool(right) @pytest.mark.parametrize( "index", ["string", "int64", "int32", "float64", "float32"], indirect=True diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index f8f5a543a9c19..79dc423f12a85 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -209,17 +209,25 @@ def test_numeric_compat(self, simple_index): 1 // idx def test_logical_compat(self, simple_index): - if ( - isinstance(simple_index, RangeIndex) - or is_numeric_dtype(simple_index.dtype) - or simple_index.dtype == object - ): + if simple_index.dtype == object: pytest.skip("Tested elsewhere.") idx = simple_index - with pytest.raises(TypeError, match="cannot perform all"): - idx.all() - with pytest.raises(TypeError, match="cannot perform any"): - idx.any() + if idx.dtype.kind in "iufcbm": + assert idx.all() == idx._values.all() + assert idx.all() == idx.to_series().all() + assert idx.any() == idx._values.any() + assert idx.any() == idx.to_series().any() + else: + msg = "cannot perform (any|all)" + if isinstance(idx, IntervalIndex): + msg = ( + r"'IntervalArray' with dtype interval\[.*\] does " + "not support reduction '(any|all)'" + ) + with pytest.raises(TypeError, match=msg): + idx.all() + with pytest.raises(TypeError, match=msg): + idx.any() def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 2fd203dbc77ed..d6304774b87c4 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -796,11 +796,21 @@ def test_difference_name_preservation(self, index, second_name, expected, sort): assert result.name == expected def test_difference_empty_arg(self, index, sort): - first = index[5:20] + first = index.copy() + first = first[5:20] first.name = "name" result = first.difference([], sort) + expected = index[5:20].unique() + expected.name = "name" + tm.assert_index_equal(result, expected) - tm.assert_index_equal(result, first) + def test_difference_should_not_compare(self): + # GH 55113 + left = Index([1, 1]) + right = Index([True]) + result = left.difference(right) + expected = Index([1]) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("index", ["string"], indirect=True) def test_difference_identity(self, index, sort): @@ -899,3 +909,10 @@ def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype): result = idx.union(idx2) expected = Index([1, 2, 3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype) tm.assert_index_equal(result, expected) + + def test_union_string_array(self, any_string_dtype): + idx1 = Index(["a"], dtype=any_string_dtype) + idx2 = Index(["b"], dtype=any_string_dtype) + result = idx1.union(idx2) + expected = Index(["a", "b"], dtype=any_string_dtype) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/methods/test_shift.py b/pandas/tests/indexes/timedeltas/methods/test_shift.py index f49af73f9f499..e33b8de3e6594 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_shift.py +++ b/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -29,11 +29,11 @@ def test_tdi_shift_hours(self): def test_tdi_shift_minutes(self): # GH#9903 idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="T"), idx) + tm.assert_index_equal(idx.shift(0, freq="min"), idx) exp =
TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") - tm.assert_index_equal(idx.shift(3, freq="T"), exp) + tm.assert_index_equal(idx.shift(3, freq="min"), exp) exp = TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") - tm.assert_index_equal(idx.shift(-3, freq="T"), exp) + tm.assert_index_equal(idx.shift(-3, freq="min"), exp) def test_tdi_shift_int(self): # GH#8083 diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 9f470b40d1f58..fe3ff1799e763 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -74,15 +74,15 @@ def test_tdi_round(self): msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - td.round(freq="M") + td.round(freq="ME") with pytest.raises(ValueError, match=msg): - elt.round(freq="M") + elt.round(freq="ME") @pytest.mark.parametrize( "freq,msg", [ ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), ("foobar", "Invalid frequency: foobar"), ], ) @@ -104,23 +104,23 @@ def test_round(self): # note that negative times round DOWN! so don't give whole numbers for freq, s1, s2 in [ - ("N", t1, t2), - ("U", t1, t2), + ("ns", t1, t2), + ("us", t1, t2), ( - "L", + "ms", t1a, TimedeltaIndex( ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), ( - "S", + "s", t1a, TimedeltaIndex( ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), - ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), ]: diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index cb6dce1e7ad80..6cdd6944e90ea 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -219,9 +219,11 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + expected = TimedeltaIndex(["0 days", "1 days"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 72bdc6da47d94..d0593b3230959 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -47,12 +47,19 @@ def test_timedelta_range(self): [ ("T", "minute"), ("t", "minute"), + ("S", "second"), ("L", "millisecond"), ("l", "millisecond"), + ("U", "microsecond"), + ("u", "microsecond"), + ("N", "nanosecond"), + ("n", "nanosecond"), ], ) - def test_timedelta_units_T_L_deprecated(self, depr_unit, unit): - depr_msg = f"Unit '{depr_unit}' is deprecated." + def test_timedelta_units_T_S_L_U_N_deprecated(self, depr_unit, unit): + depr_msg = ( + f"'{depr_unit}' is deprecated and will be removed in a future version." 
+ ) expected = to_timedelta(np.arange(5), unit=unit) with tm.assert_produces_warning(FutureWarning, match=depr_msg): @@ -60,7 +67,7 @@ def test_timedelta_units_T_L_deprecated(self, depr_unit, unit): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12T"), (7, "16H"), (9, "12H")] + "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12min"), (7, "16H"), (9, "12H")] ) def test_linspace_behavior(self, periods, freq): # GH 20976 diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index e2fbc0653695b..b86e233110e88 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -145,6 +145,23 @@ def test_frame_getitem_simple_key_error( indexer(df) +def test_tuple_string_column_names(): + # GH#50372 + mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")]) + df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi) + df["single_index"] = 0 + + df_flat = df.copy() + df_flat.columns = df_flat.columns.to_flat_index() + df_flat["new_single_index"] = 0 + + result = df_flat[[("a", "aa"), "new_single_index"]] + expected = DataFrame( + [[0, 0], [1, 0], [2, 0]], columns=Index([("a", "aa"), "new_single_index"]) + ) + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_multicolumn_empty_level(): df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]}) df.columns = [ diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 960a8a59ba5a7..51b94c3beeb6f 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -556,3 +556,21 @@ def test_frame_setitem_copy_no_write( result = df tm.assert_frame_equal(result, expected) + + +def test_frame_setitem_partial_multiindex(): + # GH 54875 + df = DataFrame( + { + "a": [1, 2, 3], + "b": [3, 4, 5], + "c": 6, + "d": 7, + } + ).set_index(["a", "b", "c"]) + ser = Series(8, index=df.index.droplevel("c")) + result = df.copy() + result["d"] = ser + expected = df.copy() + expected["d"] = 8 + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index b45d197af332e..d3a6d4bf7cebf 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -16,7 +16,6 @@ Timestamp, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT @pytest.fixture @@ -25,7 +24,9 @@ def df(): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cab")), name="B" + ), ) @@ -35,13 +36,15 @@ def df2(): { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) class TestCategoricalIndex: def test_loc_scalar(self, df): - dtype = CDT(list("cab")) + dtype = CategoricalDtype(list("cab")) result = df.loc["a"] bidx = Series(list("aaa"), name="B").astype(dtype) assert bidx.dtype == dtype diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index f36fdf0d36ea9..7353b5ef76ba3 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ 
b/pandas/tests/indexing/test_chaining_and_caching.py @@ -1,4 +1,4 @@ -from string import ascii_letters as letters +from string import ascii_letters import numpy as np import pytest @@ -24,9 +24,9 @@ def random_text(nobs=100): # Construct a DataFrame where each row is a random slice from 'letters' - idxs = np.random.default_rng(2).integers(len(letters), size=(nobs, 2)) + idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(nobs, 2)) idxs.sort(axis=1) - strings = [letters[x[0] : x[1]] for x in idxs] + strings = [ascii_letters[x[0] : x[1]] for x in idxs] return DataFrame(strings, columns=["letters"]) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 2c39729097487..82368c67dc6d4 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -836,8 +836,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): # tested below return - result = obj.replace(replacer) - if (from_key == "float64" and to_key in ("int64")) or ( from_key == "complex128" and to_key in ("int64", "float64") ): @@ -851,6 +849,17 @@ def test_replace_series(self, how, to_key, from_key, replacer): exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key + msg = "Downcasting behavior in `replace`" + warn = FutureWarning + if ( + exp.dtype == obj.dtype + or exp.dtype == object + or (exp.dtype.kind in "iufc" and obj.dtype.kind in "iufc") + ): + warn = None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -866,11 +875,14 @@ def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key + msg = "Downcasting behavior in `replace`" + warn = FutureWarning if exp.dtype != object else None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -888,16 +900,22 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") + warn = FutureWarning if isinstance(obj.dtype, pd.DatetimeTZDtype) and isinstance( exp.dtype, pd.DatetimeTZDtype ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) + warn = None else: assert exp.dtype == to_key + if to_key == from_key: + warn = None + + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) tm.assert_series_equal(result, exp) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d6d8a63797bb6..54e204c43dadd 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -830,8 +830,7 @@ def test_coercion_with_loc(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe.loc[0, ["foo"]] = None + start_dataframe.loc[0, ["foo"]] = None expected_dataframe = DataFrame({"foo": expected_result}) 
tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -841,8 +840,7 @@ def test_coercion_with_setitem_and_dataframe(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None + start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -852,10 +850,7 @@ def test_none_coercion_loc_and_dataframe(self, expected): start_data, expected_result, warn = expected start_dataframe = DataFrame({"foo": start_data}) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - start_dataframe.loc[ - start_dataframe["foo"] == start_dataframe["foo"][0] - ] = None + start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) @@ -869,10 +864,7 @@ def test_none_coercion_mixed_dtypes(self): "d": ["a", "b", "c"], } ) - with tm.assert_produces_warning( - FutureWarning, match="item of incompatible dtype" - ): - start_dataframe.iloc[0] = None + start_dataframe.iloc[0] = None exp = DataFrame( { @@ -1132,3 +1124,19 @@ def test_scalar_setitem_series_with_nested_value_length1(value, indexer_sli): assert (ser.loc[0] == value).all() else: assert ser.loc[0] == value + + +def test_object_dtype_series_set_series_element(): + # GH 48933 + s1 = Series(dtype="O", index=["a", "b"]) + + s1["a"] = Series() + s1.loc["b"] = Series() + + tm.assert_series_equal(s1.loc["a"], Series()) + tm.assert_series_equal(s1.loc["b"], Series()) + + s2 = Series(dtype="O", index=["a", "b"]) + + s2.iloc[1] = Series() + tm.assert_series_equal(s2.iloc[1], Series()) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index d0b6adfda0241..a2693c85e507f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1657,6 +1657,14 @@ def test_loc_setitem_range_key(self, frame_or_series): expected = frame_or_series([0, 1, 10, 9, 11], index=obj.index) tm.assert_equal(obj, expected) + def test_loc_setitem_numpy_frame_categorical_value(self): + # GH#52927 + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + df.loc[1:2, "a"] = Categorical([2, 2], categories=[1, 2]) + + expected = DataFrame({"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + tm.assert_frame_equal(df, expected) + class TestLocWithEllipsis: @pytest.fixture(params=[tm.loc, tm.iloc]) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index b7108896f01ed..597bc2975268e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1353,7 +1353,7 @@ def test_period_can_hold_element(self, element): with tm.assert_produces_warning(FutureWarning): self.check_series_setitem(elem, pi, False) - dti = pi.to_timestamp("S")[:-1] + dti = pi.to_timestamp("s")[:-1] elem = element(dti) with tm.assert_produces_warning(FutureWarning): self.check_series_setitem(elem, pi, False) diff --git a/pandas/tests/internals/test_managers.py b/pandas/tests/internals/test_managers.py index 75aa901fce910..f40362c299717 100644 --- a/pandas/tests/internals/test_managers.py +++ b/pandas/tests/internals/test_managers.py @@ -1,6 +1,12 @@ """ Testing interaction between the different 
managers (BlockManager, ArrayManager) """ +import os +import subprocess +import sys + +import pytest + from pandas.core.dtypes.missing import array_equivalent import pandas as pd @@ -14,12 +20,19 @@ def test_dataframe_creation(): - with pd.option_context("mode.data_manager", "block"): - df_block = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + msg = "data_manager option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "block"): + df_block = pd.DataFrame( + {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} + ) assert isinstance(df_block._mgr, BlockManager) - with pd.option_context("mode.data_manager", "array"): - df_array = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "array"): + df_array = pd.DataFrame( + {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} + ) assert isinstance(df_array._mgr, ArrayManager) # also ensure both are seen as equal @@ -45,12 +58,15 @@ def test_dataframe_creation(): def test_series_creation(): - with pd.option_context("mode.data_manager", "block"): - s_block = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) + msg = "data_manager option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "block"): + s_block = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) assert isinstance(s_block._mgr, SingleBlockManager) - with pd.option_context("mode.data_manager", "array"): - s_array = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "array"): + s_array = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) assert isinstance(s_array._mgr, SingleArrayManager) # also ensure both are seen as equal @@ -68,3 +84,20 @@ def test_series_creation(): result = s_array._as_manager("block") assert isinstance(result._mgr, SingleBlockManager) tm.assert_series_equal(result, s_array) + + +@pytest.mark.single_cpu +@pytest.mark.parametrize("manager", ["block", "array"]) +def test_array_manager_depr_env_var(manager): + # GH#55043 + test_env = os.environ.copy() + test_env["PANDAS_DATA_MANAGER"] = manager + response = subprocess.run( + [sys.executable, "-c", "import pandas"], + capture_output=True, + env=test_env, + check=True, + ) + msg = "FutureWarning: The env variable PANDAS_DATA_MANAGER is set" + stderr_msg = response.stderr.decode("utf-8") + assert msg in stderr_msg, stderr_msg diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 170e2f61e7d4a..701bfe3767db4 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -234,3 +234,19 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + ] +) +def string_storage(request): + """ + Parametrized fixture for pd.options.mode.string_storage. 
+ + * 'python' + * 'pyarrow' + """ + return request.param diff --git a/pandas/tests/io/data/excel/test6.xls b/pandas/tests/io/data/excel/test6.xls new file mode 100644 index 0000000000000..e43a1a67510d8 Binary files /dev/null and b/pandas/tests/io/data/excel/test6.xls differ diff --git a/pandas/tests/io/data/excel/test_unempty_cells.ods b/pandas/tests/io/data/excel/test_unempty_cells.ods new file mode 100644 index 0000000000000..52458753bae06 Binary files /dev/null and b/pandas/tests/io/data/excel/test_unempty_cells.ods differ diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 25079b235d332..9d49718bec357 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -48,3 +48,14 @@ def test_read_newlines_between_xml_elements_table(): result = pd.read_excel("test_newlines.ods") tm.assert_frame_equal(result, expected) + + +def test_read_unempty_cells(): + expected = pd.DataFrame( + [1, np.nan, 3, np.nan, 5], + columns=["Column 1"], + ) + + result = pd.read_excel("test_unempty_cells.ods") + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 21d31ec8a7fb5..ecee58362f8a9 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -1,7 +1,12 @@ +from datetime import ( + date, + datetime, +) import re import pytest +import pandas as pd import pandas._testing as tm from pandas.io.excel import ExcelWriter @@ -47,3 +52,47 @@ def test_book_and_sheets_consistent(ext): table = odf.table.Table(name="test_name") writer.book.spreadsheet.addElement(table) assert writer.sheets == {"test_name": table} + + +@pytest.mark.parametrize( + ["value", "cell_value_type", "cell_value_attribute", "cell_value"], + argvalues=[ + (True, "boolean", "boolean-value", "true"), + ("test string", "string", "string-value", "test string"), + (1, "float", "value", "1"), + (1.5, "float", "value", "1.5"), + ( + datetime(2010, 10, 10, 10, 10, 10), + "date", + "date-value", + "2010-10-10T10:10:10", + ), + (date(2010, 10, 10), "date", "date-value", "2010-10-10"), + ], +) +def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value): + # GH#54994 ODS: cell attributes should follow specification + # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13 + from odf.namespaces import OFFICENS + from odf.table import ( + TableCell, + TableRow, + ) + + table_cell_name = TableCell().qname + + with tm.ensure_clean(ext) as f: + pd.DataFrame([[value]]).to_excel(f, header=False, index=False) + + with pd.ExcelFile(f) as wb: + sheet = wb._reader.get_sheet_by_index(0) + sheet_rows = sheet.getElementsByType(TableRow) + sheet_cells = [ + x + for x in sheet_rows[0].childNodes + if hasattr(x, "qname") and x.qname == table_cell_name + ] + + cell = sheet_cells[0] + assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type + assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index b8d41164792e0..da94c74f2303e 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -241,7 +241,7 @@ def test_if_sheet_exists_raises(ext, if_sheet_exists, msg): df = DataFrame({"fruit": ["pear"]}) with tm.ensure_clean(ext) as f: with pytest.raises(ValueError, match=re.escape(msg)): - df.to_excel(f, "foo", engine="openpyxl") + df.to_excel(f, 
sheet_name="foo", engine="openpyxl") with ExcelWriter( f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists ) as writer: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 20bc6e98577d5..8dd9f96a05a90 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -54,6 +54,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), + pytest.param("calamine", marks=td.skip_if_no("python_calamine")), ] @@ -67,11 +68,11 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if engine == "odf" and read_ext != ".ods": return False - if read_ext == ".ods" and engine != "odf": + if read_ext == ".ods" and engine not in {"odf", "calamine"}: return False if engine == "pyxlsb" and read_ext != ".xlsb": return False - if read_ext == ".xlsb" and engine != "pyxlsb": + if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}: return False if engine == "xlrd" and read_ext != ".xls": return False @@ -160,9 +161,9 @@ def test_engine_kwargs(self, read_ext, engine): "ods": {"foo": "abcd"}, } - if read_ext[1:] in {"xls", "xlsb"}: + if engine in {"xlrd", "pyxlsb"}: msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'") - elif read_ext[1:] == "ods": + elif engine == "odf": msg = re.escape(r"load() got an unexpected keyword argument 'foo'") else: msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'") @@ -194,8 +195,8 @@ def test_usecols_int(self, read_ext): usecols=3, ) - def test_usecols_list(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_list(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -218,8 +219,8 @@ def test_usecols_list(self, request, read_ext, df_ref): tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -275,9 +276,9 @@ def test_usecols_str(self, request, read_ext, df_ref): "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order( - self, request, read_ext, usecols, df_ref + self, request, engine, read_ext, usecols, df_ref ): - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -298,8 +299,8 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) - def test_read_excel_without_slicing(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -310,8 +311,8 @@ def test_read_excel_without_slicing(self, request, read_ext, df_ref): result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) tm.assert_frame_equal(result, expected, 
check_names=False) - def test_usecols_excel_range_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -398,20 +399,26 @@ def test_excel_stop_iterator(self, read_ext): expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self, request, read_ext): - if read_ext == ".xlsb": + def test_excel_cell_error_na(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + # https://github.com/tafia/calamine/issues/355 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Calamine can't extract error from ods files") + ) + parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -431,8 +438,8 @@ def test_excel_table(self, request, read_ext, df_ref): ) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self, request, read_ext): - if read_ext == ".xlsb": + def test_reader_special_dtypes(self, request, engine, read_ext): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -591,7 +598,7 @@ def test_dtype_backend(self, read_ext, dtype_backend): } ) with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, "test", index=False) + df.to_excel(file_path, sheet_name="test", index=False) result = pd.read_excel( file_path, sheet_name="test", dtype_backend=dtype_backend ) @@ -623,7 +630,7 @@ def test_dtype_backend_and_dtype(self, read_ext): df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, "test", index=False) + df.to_excel(file_path, sheet_name="test", index=False) result = pd.read_excel( file_path, sheet_name="test", @@ -647,7 +654,7 @@ def test_dtype_backend_string(self, read_ext, string_storage): } ) with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, "test", index=False) + df.to_excel(file_path, sheet_name="test", index=False) result = pd.read_excel( file_path, sheet_name="test", dtype_backend="numpy_nullable" ) @@ -800,8 +807,8 @@ def test_date_conversion_overflow(self, request, engine, read_ext): result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, read_ext, engine, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -869,6 +876,11 @@ def test_corrupt_bytes_raises(self, engine): "Unsupported format, or corrupt file: Expected BOF " "record; found b'foo'" ) + elif engine == "calamine": + from python_calamine import CalamineError + + error = CalamineError + msg = "Cannot detect file format" else: error = BadZipFile msg = 
"File is not a zip file" @@ -969,6 +981,14 @@ def test_reader_seconds(self, request, engine, read_ext): ) ) + # GH 55045 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail( + reason="ODS file contains bad datetime (seconds as text)" + ) + ) + # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( { @@ -994,15 +1014,21 @@ def test_reader_seconds(self, request, engine, read_ext): actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self, request, read_ext): + def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" ) ) + # https://github.com/tafia/calamine/issues/354 + if engine == "calamine" and read_ext == ".ods": + request.node.add_marker( + pytest.mark.xfail(reason="Last test fails in calamine") + ) + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -1088,10 +1114,10 @@ def test_read_excel_multiindex(self, request, read_ext): ], ) def test_read_excel_multiindex_blank_after_name( - self, request, read_ext, sheet_name, idx_lvl2 + self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb (GH4679" @@ -1212,9 +1238,9 @@ def test_read_excel_bool_header_arg(self, read_ext): with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) - def test_read_excel_skiprows(self, request, read_ext): + def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1267,9 +1293,9 @@ def test_read_excel_skiprows(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) - def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1397,7 +1423,7 @@ def test_trailing_blanks(self, read_ext): def test_ignore_chartsheets_by_str(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1410,7 +1436,7 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext): def test_ignore_chartsheets_by_int(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1540,8 +1566,8 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): expected = DataFrame(expected, columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported 
by pyxlsb" @@ -1569,8 +1595,8 @@ def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": + def test_sheet_name(self, request, engine, read_ext, df_ref): + if engine == "pyxlsb": request.node.add_marker( pytest.mark.xfail( reason="Sheets containing datetimes not supported by pyxlsb" @@ -1639,7 +1665,7 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) + result = pd.read_excel(f, engine=engine) expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected) @@ -1691,7 +1717,7 @@ def test_engine_invalid_option(self, read_ext): def test_ignore_chartsheets(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": request.node.add_marker( @@ -1711,6 +1737,10 @@ def test_corrupt_files_closed(self, engine, read_ext): import xlrd errors = (BadZipFile, xlrd.biffh.XLRDError) + elif engine == "calamine": + from python_calamine import CalamineError + + errors = (CalamineError,) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt", encoding="utf-8") diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 905c733ea5ef1..e4ce969daab53 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -79,7 +79,7 @@ def test_read_one_empty_col_no_header(self, ext, header, expected): df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: - df.to_excel(path, filename, index=False, header=False) + df.to_excel(path, sheet_name=filename, index=False, header=False) result = pd.read_excel( path, sheet_name=filename, usecols=[0], header=header ) @@ -95,7 +95,7 @@ def test_read_one_empty_col_with_header(self, ext, header, expected): df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: - df.to_excel(path, "with_header", index=False, header=True) + df.to_excel(path, sheet_name="with_header", index=False, header=True) result = pd.read_excel( path, sheet_name=filename, usecols=[0], header=header ) @@ -109,8 +109,10 @@ def test_set_column_names_in_parameter(self, ext): with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as writer: - refdf.to_excel(writer, "Data_no_head", header=False, index=False) - refdf.to_excel(writer, "Data_with_head", index=False) + refdf.to_excel( + writer, sheet_name="Data_no_head", header=False, index=False + ) + refdf.to_excel(writer, sheet_name="Data_with_head", index=False) refdf.columns = ["A", "B"] @@ -145,7 +147,7 @@ def tdf(col_sheet_name): with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as ew: for sheetname, df in dfs.items(): - df.to_excel(ew, sheetname) + df.to_excel(ew, sheet_name=sheetname) dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) @@ -291,7 +293,7 @@ def test_multiindex_interval_datetimes(self, ext): [ range(4), pd.interval_range( - start=pd.Timestamp("2020-01-01"), periods=4, freq="6M" + start=pd.Timestamp("2020-01-01"), periods=4, freq="6ME" ), ] ) @@ -305,10 +307,10 @@ def test_multiindex_interval_datetimes(self, ext): [ range(4), [ - "(2020-01-31, 2020-07-31]", - "(2020-07-31, 2021-01-31]", - 
"(2021-01-31, 2021-07-31]", - "(2021-07-31, 2022-01-31]", + "(2020-01-31 00:00:00, 2020-07-31 00:00:00]", + "(2020-07-31 00:00:00, 2021-01-31 00:00:00]", + "(2021-01-31 00:00:00, 2021-07-31 00:00:00]", + "(2021-07-31 00:00:00, 2022-01-31 00:00:00]", ], ] ), @@ -371,10 +373,10 @@ def test_excel_sheet_by_name_raise(self, path): def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: - frame.to_excel(writer, "Data1") + frame.to_excel(writer, sheet_name="Data1") frame2 = frame.copy() frame2.columns = frame.columns[::-1] - frame2.to_excel(writer, "Data2") + frame2.to_excel(writer, sheet_name="Data2") with ExcelFile(path) as reader: found_df = pd.read_excel(reader, sheet_name="Data1", index_col=0) @@ -387,42 +389,42 @@ def test_roundtrip(self, frame, path): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # test roundtrip - frame.to_excel(path, "test1") + frame.to_excel(path, sheet_name="test1") recons = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1", index=False) recons = pd.read_excel(path, sheet_name="test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "test1", na_rep="NA") + frame.to_excel(path, sheet_name="test1", na_rep="NA") recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["NA"]) tm.assert_frame_equal(frame, recons) # GH 3611 - frame.to_excel(path, "test1", na_rep="88") + frame.to_excel(path, sheet_name="test1", na_rep="88") recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["88"]) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "test1", na_rep="88") + frame.to_excel(path, sheet_name="test1", na_rep="88") recons = pd.read_excel( path, sheet_name="test1", index_col=0, na_values=[88, 88.0] ) tm.assert_frame_equal(frame, recons) # GH 6573 - frame.to_excel(path, "Sheet1") + frame.to_excel(path, sheet_name="Sheet1") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "0") + frame.to_excel(path, sheet_name="0") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) @@ -436,7 +438,7 @@ def test_mixed(self, frame, path): mixed_frame = frame.copy() mixed_frame["foo"] = "bar" - mixed_frame.to_excel(path, "test1") + mixed_frame.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) @@ -448,7 +450,7 @@ def test_ts_frame(self, tsframe, path): index = pd.DatetimeIndex(np.asarray(df.index), freq=None) df.index = index - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(df, recons) @@ -456,10 +458,10 @@ def test_ts_frame(self, tsframe, path): def test_basics_with_nan(self, frame, path): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - 
frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, np_type, path): @@ -468,7 +470,7 @@ def test_int_types(self, np_type, path): df = DataFrame( np.random.default_rng(2).integers(-10, 10, size=(10, 2)), dtype=np_type ) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -483,7 +485,7 @@ def test_int_types(self, np_type, path): def test_float_types(self, np_type, path): # Test np.float values read come back as float. df = DataFrame(np.random.default_rng(2).random(10), dtype=np_type) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( @@ -495,7 +497,7 @@ def test_float_types(self, np_type, path): def test_bool_types(self, path): # Test np.bool_ values read come back as float. df = DataFrame([1, 0, True, False], dtype=np.bool_) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( @@ -506,7 +508,7 @@ def test_bool_types(self, path): def test_inf_roundtrip(self, path): df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -521,15 +523,15 @@ def test_sheets(self, frame, tsframe, path): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # Test writing to separate sheets with ExcelWriter(path) as writer: - frame.to_excel(writer, "test1") - tsframe.to_excel(writer, "test2") + frame.to_excel(writer, sheet_name="test1") + tsframe.to_excel(writer, sheet_name="test2") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) @@ -543,14 +545,14 @@ def test_colaliases(self, frame, path): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # column aliases col_aliases = Index(["AA", "X", "Y", "Z"]) - frame.to_excel(path, "test1", header=col_aliases) + frame.to_excel(path, sheet_name="test1", header=col_aliases) with ExcelFile(path) as reader: rs = pd.read_excel(reader, sheet_name="test1", index_col=0) 
xp = frame.copy() @@ -561,14 +563,16 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # test index_label df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 - df.to_excel(path, "test1", index_label=["test"], merge_cells=merge_cells) + df.to_excel( + path, sheet_name="test1", index_label=["test"], merge_cells=merge_cells + ) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 @@ -579,7 +583,7 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 df.to_excel( path, - "test1", + sheet_name="test1", index_label=["test", "dummy", "dummy2"], merge_cells=merge_cells, ) @@ -591,7 +595,9 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): assert df.index.names == recons.index.names df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 - df.to_excel(path, "test1", index_label="test", merge_cells=merge_cells) + df.to_excel( + path, sheet_name="test1", index_label="test", merge_cells=merge_cells + ) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 @@ -601,7 +607,7 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): frame.to_excel( path, - "test1", + sheet_name="test1", columns=["A", "B", "C", "D"], index=False, merge_cells=merge_cells, @@ -636,7 +642,7 @@ def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): tsf = tsframe.copy() tsf.index = [x.date() for x in tsframe.index] - tsf.to_excel(path, "test1", merge_cells=merge_cells) + tsf.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -666,14 +672,14 @@ def test_excel_date_datetime_format(self, ext, path): with tm.ensure_clean(ext) as filename2: with ExcelWriter(path) as writer1: - df.to_excel(writer1, "test1") + df.to_excel(writer1, sheet_name="test1") with ExcelWriter( filename2, date_format="DD.MM.YYYY", datetime_format="DD.MM.YYYY HH-MM-SS", ) as writer2: - df.to_excel(writer2, "test1") + df.to_excel(writer2, sheet_name="test1") with ExcelFile(path) as reader1: rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) @@ -699,7 +705,7 @@ def test_to_excel_interval_no_labels(self, path): df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype(str) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) @@ -718,7 +724,7 @@ def test_to_excel_interval_labels(self, path): df["new"] = intervals expected["new"] = pd.Series(list(intervals)) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) @@ -739,15 +745,15 @@ def test_to_excel_timedelta(self, path): lambda x: 
timedelta(seconds=x).total_seconds() / 86400 ) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, tsframe, path): - xp = tsframe.resample("M", kind="period").mean() + xp = tsframe.resample("ME", kind="period").mean() - xp.to_excel(path, "sht1") + xp.to_excel(path, sheet_name="sht1") with ExcelFile(path) as reader: rs = pd.read_excel(reader, sheet_name="sht1", index_col=0) @@ -758,11 +764,11 @@ def test_to_excel_multiindex(self, merge_cells, frame, path): new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) # round trip - frame.to_excel(path, "test1", merge_cells=merge_cells) + frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: df = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) @@ -797,7 +803,7 @@ def test_to_excel_multiindex_cols(self, merge_cells, frame, path): header = 0 # round trip - frame.to_excel(path, "test1", merge_cells=merge_cells) + frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: df = pd.read_excel( reader, sheet_name="test1", header=header, index_col=[0, 1] @@ -813,7 +819,7 @@ def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): tsframe.index = MultiIndex.from_arrays(new_index) tsframe.index.names = ["time", "foo"] - tsframe.to_excel(path, "test1", merge_cells=merge_cells) + tsframe.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) @@ -832,7 +838,7 @@ def test_to_excel_multiindex_no_write_index(self, path): frame2.index = multi_index # Write out to Excel without the index. - frame2.to_excel(path, "test1", index=False) + frame2.to_excel(path, sheet_name="test1", index=False) # Read it back in. 
with ExcelFile(path) as reader: @@ -846,7 +852,7 @@ def test_to_excel_empty_multiindex(self, path): expected = DataFrame([], columns=[0, 1, 2]) df = DataFrame([], index=MultiIndex.from_tuples([], names=[0, 1]), columns=[2]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: result = pd.read_excel(reader, sheet_name="test1") @@ -860,7 +866,7 @@ def test_to_excel_float_format(self, path): index=["A", "B"], columns=["X", "Y", "Z"], ) - df.to_excel(path, "test1", float_format="%.2f") + df.to_excel(path, sheet_name="test1", float_format="%.2f") with ExcelFile(path) as reader: result = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -898,7 +904,7 @@ def test_to_excel_unicode_filename(self, ext): index=["A", "B"], columns=["X", "Y", "Z"], ) - df.to_excel(filename, "test1", float_format="%.2f") + df.to_excel(filename, sheet_name="test1", float_format="%.2f") with ExcelFile(filename) as reader: result = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -968,7 +974,7 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): def test_duplicated_columns(self, path): # see gh-5235 df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") expected = DataFrame( [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] ) @@ -979,7 +985,7 @@ def test_duplicated_columns(self, path): # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") result = pd.read_excel(path, sheet_name="test1", index_col=0) expected = DataFrame( @@ -988,7 +994,7 @@ def test_duplicated_columns(self, path): tm.assert_frame_equal(result, expected) # see gh-10982 - df.to_excel(path, "test1", index=False, header=False) + df.to_excel(path, sheet_name="test1", index=False, header=False) result = pd.read_excel(path, sheet_name="test1", header=None) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) @@ -997,7 +1003,7 @@ def test_duplicated_columns(self, path): def test_swapped_columns(self, path): # Test for issue #5427. write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) - write_frame.to_excel(path, "test1", columns=["B", "A"]) + write_frame.to_excel(path, sheet_name="test1", columns=["B", "A"]) read_frame = pd.read_excel(path, sheet_name="test1", header=0) @@ -1009,12 +1015,12 @@ def test_invalid_columns(self, path): write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) with pytest.raises(KeyError, match="Not all names specified"): - write_frame.to_excel(path, "test1", columns=["B", "C"]) + write_frame.to_excel(path, sheet_name="test1", columns=["B", "C"]) with pytest.raises( KeyError, match="'passes columns are not ALL present dataframe'" ): - write_frame.to_excel(path, "test1", columns=["C", "D"]) + write_frame.to_excel(path, sheet_name="test1", columns=["C", "D"]) @pytest.mark.parametrize( "to_excel_index,read_excel_index_col", @@ -1027,7 +1033,7 @@ def test_write_subset_columns(self, path, to_excel_index, read_excel_index_col): # GH 31677 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]}) write_frame.to_excel( - path, "col_subset_bug", columns=["A", "B"], index=to_excel_index + path, sheet_name="col_subset_bug", columns=["A", "B"], index=to_excel_index ) expected = write_frame[["A", "B"]] @@ -1044,7 +1050,7 @@ def test_comment_arg(self, path): # Create file to read in. 
df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, "test_c") + df.to_excel(path, sheet_name="test_c") # Read file without comment arg. result1 = pd.read_excel(path, sheet_name="test_c", index_col=0) @@ -1062,7 +1068,7 @@ def test_comment_default(self, path): # Create file to read in df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, "test_c") + df.to_excel(path, sheet_name="test_c") # Read file with default and explicit comment=None result1 = pd.read_excel(path, sheet_name="test_c") @@ -1076,7 +1082,7 @@ def test_comment_used(self, path): # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, "test_c") + df.to_excel(path, sheet_name="test_c") # Test read_frame_comment against manually produced expected output. expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) @@ -1112,7 +1118,7 @@ def test_datetimes(self, path): ] write_frame = DataFrame({"A": datetimes}) - write_frame.to_excel(path, "Sheet1") + write_frame.to_excel(path, sheet_name="Sheet1") read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) @@ -1171,7 +1177,7 @@ def test_write_lists_dict(self, path): "str": ["apple", "banana", "cherry"], } ) - df.to_excel(path, "Sheet1") + df.to_excel(path, sheet_name="Sheet1") read = pd.read_excel(path, sheet_name="Sheet1", header=0, index_col=0) expected = df.copy() @@ -1183,7 +1189,7 @@ def test_write_lists_dict(self, path): def test_render_as_column_name(self, path): # see gh-34331 df = DataFrame({"render": [1, 2], "data": [3, 4]}) - df.to_excel(path, "Sheet1") + df.to_excel(path, sheet_name="Sheet1") read = pd.read_excel(path, "Sheet1", index_col=0) expected = df tm.assert_frame_equal(read, expected) @@ -1191,7 +1197,9 @@ def test_render_as_column_name(self, path): def test_true_and_false_value_options(self, path): # see gh-13347 df = DataFrame([["foo", "bar"]], columns=["col1", "col2"]) - expected = df.replace({"foo": True, "bar": False}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.replace({"foo": True, "bar": False}) df.to_excel(path) read_frame = pd.read_excel( @@ -1202,7 +1210,7 @@ def test_true_and_false_value_options(self, path): def test_freeze_panes(self, path): # see gh-15160 expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) - expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) + expected.to_excel(path, sheet_name="Sheet1", freeze_panes=(1, 1)) result = pd.read_excel(path, index_col=0) tm.assert_frame_equal(result, expected) @@ -1372,6 +1380,18 @@ def test_excelwriter_fspath(self): with ExcelWriter(path) as writer: assert os.fspath(writer) == str(path) + def test_to_excel_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_excel except " + r"for the argument 'excel_writer' will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buf = BytesIO() + writer = ExcelWriter(buf) + df.to_excel(writer, "Sheet_name_1") + @pytest.mark.parametrize("klass", _writers.values()) def test_subclass_attr(klass): diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 509029861715e..6d5008ca9ee68 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,5 +1,6 @@ import io +import numpy as np import pytest import pandas as pd @@ -44,6 +45,17 @@ def test_read_xlsx_fails(datapath): pd.read_excel(path, engine="xlrd") +def test_nan_in_xls(datapath): + # GH 54564 + path = datapath("io", "data", "excel", "test6.xls") + + expected = pd.DataFrame({0: np.r_[0, 2].astype("int64"), 1: np.r_[1, np.nan]}) + + result = pd.read_excel(path, header=None) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "file_header", [ diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 8341dda1597bb..f3a1ebe23b568 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -3291,7 +3291,7 @@ def test_dates_display(self): assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" - x = Series(date_range("20130101 09:00:00", periods=5, freq="N")) + x = Series(date_range("20130101 09:00:00", periods=5, freq="ns")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000000" @@ -3320,6 +3320,14 @@ def format_func(x): result = formatter.get_result() assert result == ["10:10", "12:12"] + def test_datetime64formatter_tz_ms(self): + x = Series( + np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") + ).dt.tz_localize("US/Pacific") + result = fmt.Datetime64TZFormatter(x).get_result() + assert result[0].strip() == "2999-01-01 00:00:00-08:00" + assert result[1].strip() == "2999-01-02 00:00:00-08:00" + class TestNaTFormatting: def test_repr(self): @@ -3342,7 +3350,7 @@ def test_period_format_and_strftime_default(self): assert per.strftime(None)[1] is np.nan # ...except for NaTs # Same test with nanoseconds freq - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") formatted = per.format() assert (formatted == per.strftime(None)).all() assert formatted[0] == "2003-01-01 12:01:01.123456789" @@ -3352,19 +3360,19 @@ def test_period_custom(self): # GH#46252 custom formatting directives %l (ms) and %u (us) # 3 digits - per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") + per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="ms") formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)" assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)" # 6 digits - per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") + per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="us") formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)" assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)" # 9 digits - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") + per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="ns") formatted = 
per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 73de2b068b699..6c3bf01cb1857 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -1,6 +1,6 @@ from io import StringIO import re -from string import ascii_uppercase as uppercase +from string import ascii_uppercase import sys import textwrap @@ -452,9 +452,9 @@ def memory_usage(f): return f.memory_usage(deep=True).sum() N = 100 - M = len(uppercase) + M = len(ascii_uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], + [list(ascii_uppercase), date_range("20160101", periods=N)], names=["id", "date"], ) df = DataFrame( diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py index 02827ee25042a..29dd704f6efa9 100644 --- a/pandas/tests/io/formats/test_series_info.py +++ b/pandas/tests/io/formats/test_series_info.py @@ -1,5 +1,5 @@ from io import StringIO -from string import ascii_uppercase as uppercase +from string import ascii_uppercase import textwrap import numpy as np @@ -165,9 +165,9 @@ def test_info_memory_usage_bug_on_multiindex(): # GH 14308 # memory usage introspection should not materialize .values N = 100 - M = len(uppercase) + M = len(ascii_uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], + [list(ascii_uppercase), date_range("20160101", periods=N)], names=["id", "date"], ) s = Series(np.random.default_rng(2).standard_normal(N * M), index=index) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index c8e984a92f418..822bd14610388 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -731,3 +731,15 @@ def test_to_csv_iterative_compression_buffer(compression): pd.read_csv(buffer, compression=compression, index_col=0), df ) assert not buffer.closed + + +def test_to_csv_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_csv except for the " + r"argument 'path_or_buf' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buffer = io.BytesIO() + df.to_csv(buffer, ";") diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 3b5fe329c320c..5811485406b86 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -978,3 +978,14 @@ def test_to_html_empty_complex_array(): "
" ) assert result == expected + + +def test_to_html_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_html except for the " + r"argument 'buf' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_html(None, None) diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 974a2174cb03b..9643cf3258e64 100644 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -142,7 +142,7 @@ def create_data(): "period": period_range("2013-01-01", freq="M", periods=10), "float": Index(np.arange(10, dtype=np.float64)), "uint": Index(np.arange(10, dtype=np.uint64)), - "timedelta": timedelta_range("00:00:00", freq="30T", periods=10), + "timedelta": timedelta_range("00:00:00", freq="30min", periods=10), } index["range"] = RangeIndex(10) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 0ee53572877a5..fc2edc7559a48 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -32,7 +32,7 @@ def df_schema(): "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="T"), + "D": pd.timedelta_range("1H", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), ) @@ -45,7 +45,7 @@ def df_table(): "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="T"), + "D": pd.timedelta_range("1H", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], @@ -695,7 +695,7 @@ def test_read_json_table_orient(self, index_nm, vals, recwarn): @pytest.mark.parametrize("index_nm", [None, "idx", "index"]) @pytest.mark.parametrize( "vals", - [{"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")}], + [{"timedeltas": pd.timedelta_range("1H", periods=4, freq="min")}], ) def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4ee9e1e2d1598..b3c2e67f7c318 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2044,7 +2044,7 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): ) if orient == "values": - expected.columns = list(range(0, 8)) + expected.columns = list(range(8)) tm.assert_frame_equal(result, expected) @@ -2095,7 +2095,7 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) @@ -2104,9 +2104,9 @@ def test_json_roundtrip_string_inference(orient): result = read_json(StringIO(out)) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["row 1", "row 2"], dtype=pd.ArrowDtype(pa.string())), - columns=pd.Index(["col 1", "col 2"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), + 
columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index c2f915e33df8a..d7baba87bba31 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -399,7 +399,7 @@ def test_to_json_append_orient(orient_): # Test ValueError when orient is not 'records' df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - r"mode='a' \(append\) is only supported when" + r"mode='a' \(append\) is only supported when " "lines is True and orient is 'records'" ) with pytest.raises(ValueError, match=msg): @@ -411,7 +411,7 @@ def test_to_json_append_lines(): # Test ValueError when lines is not True df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - r"mode='a' \(append\) is only supported when" + r"mode='a' \(append\) is only supported when " "lines is True and orient is 'records'" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 5bb7097770820..d5f8c5200c4a3 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -1033,7 +1033,7 @@ def test_decode_floating_point(self, sign, float_number): def test_encode_big_set(self): s = set() - for x in range(0, 100000): + for x in range(100000): s.add(x) # Make sure no Exception is raised. diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 492b4d5ec058e..0c5a2e0d04e5a 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -15,6 +15,7 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas import DataFrame @@ -129,18 +130,16 @@ def test_unexpected_keyword_parameter_exception(all_parsers): parser.read_table("foo.tsv", foo=1) -def test_suppress_error_output(all_parsers, capsys): +def test_suppress_error_output(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="skip") + with tm.assert_produces_warning(None): + result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert captured.err == "" - def test_error_bad_lines(all_parsers): # see gh-15925 @@ -152,19 +151,18 @@ def test_error_bad_lines(all_parsers): parser.read_csv(StringIO(data), on_bad_lines="error") -def test_warn_bad_lines(all_parsers, capsys): +def test_warn_bad_lines(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err - def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. 
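The read_errors hunks above replace the capsys-based stderr assertions with tm.assert_produces_warning, reflecting that malformed-line reports are now surfaced as ParserWarning rather than printed to stderr. A minimal standalone sketch of the new assertion pattern, assuming a pandas build with this behavior (the CSV payload mirrors the test data and is illustrative only):

    # Sketch of the warning-based bad-lines check; not part of the diff itself.
    from io import StringIO

    import pandas as pd
    import pandas._testing as tm
    from pandas.errors import ParserWarning

    data = "a\n1\n1,2,3\n4\n5,6,7"  # lines 3 and 5 have too many fields

    # Instead of capturing stderr with capsys, assert the ParserWarning directly.
    with tm.assert_produces_warning(
        ParserWarning, match="Skipping line", check_stacklevel=False
    ):
        result = pd.read_csv(StringIO(data), on_bad_lines="warn")

    # The malformed rows are skipped, so only the single-field rows survive.
    tm.assert_frame_equal(result, pd.DataFrame({"a": [1, 4]}))
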
@@ -245,7 +243,7 @@ def test_bad_header_uniform_error(all_parsers): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") -def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): +def test_on_bad_lines_warn_correct_formatting(all_parsers): # see gh-15925 parser = all_parsers data = """1,2 @@ -256,17 +254,8 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): """ expected = DataFrame({"1": "a", "2": ["b"] * 2}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - if parser.engine == "c": - warn = """Skipping line 3: expected 2 fields, saw 3 -Skipping line 4: expected 2 fields, saw 3 - -""" - else: - warn = """Skipping line 3: Expected 2 fields in line 3, saw 3 -Skipping line 4: Expected 2 fields in line 4, saw 3 -""" - assert captured.err == warn diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 1c0f0939029ff..97a32ad79a67c 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -542,8 +542,8 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" data = """a,b x,1 @@ -558,3 +558,37 @@ def test_string_inference(all_parsers): columns=pd.Index(["a", "b"], dtype=dtype), ) tm.assert_frame_equal(result, expected) + + +def test_accurate_parsing_of_large_integers(all_parsers): + # GH#52505 + data = """SYMBOL,MOMENT,ID,ID_DEAL +AAPL,20230301181139587,1925036343869802844, +AAPL,20230301181139587,2023552585717889863,2023552585717263358 +NVDA,20230301181139587,2023552585717889863,2023552585717263359 +AMC,20230301181139587,2023552585717889863,2023552585717263360 +AMZN,20230301181139587,2023552585717889759,2023552585717263360 +MSFT,20230301181139587,2023552585717889863,2023552585717263361 +NVDA,20230301181139587,2023552585717889827,2023552585717263361""" + orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2 + + +def test_dtypes_with_usecols(all_parsers): + # GH#54868 + + parser = all_parsers + data = """a,b,c +1,2,3 +4,5,6""" + + result = parser.read_csv(StringIO(data), usecols=["a", "c"], dtype={"a": object}) + if parser.engine == "pyarrow": + values = [1, 4] + else: + values = ["1", "4"] + expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 32a010b3aeb34..18eee01f87621 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -19,7 +19,10 @@ from pandas.compat import is_ci_environment from pandas.compat.numpy import np_version_gte1p24 -from pandas.errors import ParserError +from pandas.errors import ( + ParserError, + 
ParserWarning, +) import pandas.util._test_decorators as td from pandas import ( @@ -461,7 +464,7 @@ def test_data_after_quote(c_parser_only): tm.assert_frame_equal(result, expected) -def test_comment_whitespace_delimited(c_parser_only, capsys): +def test_comment_whitespace_delimited(c_parser_only): parser = c_parser_only test_input = """\ 1 2 @@ -474,18 +477,17 @@ def test_comment_whitespace_delimited(c_parser_only, capsys): 8# 1 field, NaN 9 2 3 # skipped line # comment""" - df = parser.read_csv( - StringIO(test_input), - comment="#", - header=None, - delimiter="\\s+", - skiprows=0, - on_bad_lines="warn", - ) - captured = capsys.readouterr() - # skipped lines 2, 3, 4, 9 - for line_num in (2, 3, 4, 9): - assert f"Skipping line {line_num}" in captured.err + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + on_bad_lines="warn", + ) expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 5cb54bb4e2916..d72174c40478e 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -658,3 +658,29 @@ def test_header_missing_rows(all_parsers): msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=[0, 1, 2]) + + +@skip_pyarrow +def test_header_multiple_whitespaces(all_parsers): + # GH#54931 + parser = all_parsers + data = """aa bb(1,1) cc(1,1) + 0 2 3.5""" + + result = parser.read_csv(StringIO(data), sep=r"\s+") + expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5}) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_header_delim_whitespace(all_parsers): + # GH#54918 + parser = all_parsers + data = """a,b +1,2 +3,4 + """ + + result = parser.read_csv(StringIO(data), delim_whitespace=True) + expected = DataFrame({"a,b": ["1,2", "3,4"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 959b988e208c1..dbd474c6ae0b9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -274,7 +274,7 @@ def test_multi_char_sep_quotes(python_parser_only, quoting): parser.read_csv(StringIO(data), quoting=quoting, **kwargs) -def test_none_delimiter(python_parser_only, capsys): +def test_none_delimiter(python_parser_only): # see gh-13374 and gh-17465 parser = python_parser_only data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" @@ -283,12 +283,14 @@ def test_none_delimiter(python_parser_only, capsys): # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. 
- result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line 3", check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), header=0, sep=None, on_bad_lines="warn" + ) tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - @pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) @pytest.mark.parametrize("skipfooter", [0, 1]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index f150ed3903443..e2d785a38eb51 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -12,6 +12,7 @@ import pandas._libs.parsers as parser from pandas._libs.parsers import TextReader +from pandas.errors import ParserWarning from pandas import DataFrame import pandas._testing as tm @@ -125,7 +126,7 @@ def test_integer_thousands_alt(self): expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected) - def test_skip_bad_lines(self, capsys): + def test_skip_bad_lines(self): # too many lines, see #2430 for why data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" @@ -145,14 +146,11 @@ def test_skip_bad_lines(self, capsys): } assert_array_dicts_equal(result, expected) - reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn - ) - reader.read() - captured = capsys.readouterr() - - assert "Skipping line 4" in captured.err - assert "Skipping line 6" in captured.err + with tm.assert_produces_warning(ParserWarning, match="Skipping line"): + reader = TextReader( + StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn + ) + reader.read() def test_header_not_enough_lines(self): data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index f5a0bcd2c00fd..ac171568187cd 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -12,11 +12,6 @@ import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) from pandas.errors import ParserError import pandas._testing as tm @@ -177,9 +172,6 @@ def test_close_file_handle_on_invalid_usecols(all_parsers): if parser.engine == "pyarrow": pyarrow = pytest.importorskip("pyarrow") error = pyarrow.lib.ArrowKeyError - if is_ci_environment() and (is_platform_windows() or is_platform_mac()): - # GH#45547 causes timeouts on windows/mac builds - pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22") with tm.ensure_clean("test.csv") as fname: Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8") diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 72fc2361c5053..9b4590ca3f01f 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -392,7 +392,7 @@ def test_read_py2_hdf_file_in_py3(datapath): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -400,7 +400,7 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype=pd.ArrowDtype(pa.string()), - 
columns=Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 2f2349fe1168b..e387b1b607f63 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -64,7 +64,7 @@ def test_select_with_dups(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "A", "B", "B"] ) - df.index = date_range("20130101 9:30", periods=10, freq="T") + df.index = date_range("20130101 9:30", periods=10, freq="min") with ensure_clean_store(setup_path) as store: store.append("df", df) @@ -95,7 +95,7 @@ def test_select_with_dups(setup_path): ], axis=1, ) - df.index = date_range("20130101 9:30", periods=10, freq="T") + df.index = date_range("20130101 9:30", periods=10, freq="min") with ensure_clean_store(setup_path) as store: store.append("df", df) @@ -397,7 +397,7 @@ def test_select_iterator_complete_8014(setup_path): # no iterator with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = tm.makeTimeDataFrame(100064, "s") _maybe_remove(store, "df") store.append("df", expected) @@ -428,7 +428,7 @@ def test_select_iterator_complete_8014(setup_path): # with iterator, full range with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = tm.makeTimeDataFrame(100064, "s") _maybe_remove(store, "df") store.append("df", expected) @@ -466,7 +466,7 @@ def test_select_iterator_non_complete_8014(setup_path): # with iterator, non complete range with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = tm.makeTimeDataFrame(100064, "s") _maybe_remove(store, "df") store.append("df", expected) @@ -496,7 +496,7 @@ def test_select_iterator_non_complete_8014(setup_path): # with iterator, empty where with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = tm.makeTimeDataFrame(100064, "s") _maybe_remove(store, "df") store.append("df", expected) @@ -516,7 +516,7 @@ def test_select_iterator_many_empty_frames(setup_path): # with iterator, range limited to the first chunk with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100000, "S") + expected = tm.makeTimeDataFrame(100000, "s") _maybe_remove(store, "df") store.append("df", expected) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 4b3c82ad3f083..10e0467c5d74d 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -471,3 +471,13 @@ def test_invalid_dtype_backend(self): ) with pytest.raises(ValueError, match=msg): read_clipboard(dtype_backend="numpy") + + def test_to_clipboard_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_clipboard " + r"will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_clipboard(True, None) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a0fee6751bf53..cf43203466ef4 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -222,14 +222,10 @@ def test_invalid_dtype_backend(self): def test_string_inference(self, tmp_path): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame( - data={"a": ["x", "y"]}, dtype=pd.ArrowDtype(pa.string()) - ) + expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index c2d791ba24c87..d90f803f1e607 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -426,7 +426,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.ArrowDtype(pa.string()), - columns=pd.Index(["a"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a4c6cfcf9fe0e..b043f9fab23ae 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,5 +1,6 @@ """ test parquet compat """ import datetime +from decimal import Decimal from io import BytesIO import os import pathlib @@ -7,15 +8,14 @@ import numpy as np import pytest -from pandas._config import ( - get_option, - using_copy_on_write, -) +from pandas._config import using_copy_on_write +from pandas._config.config import _get_option from pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( pa_version_under7p0, pa_version_under8p0, + pa_version_under11p0, pa_version_under13p0, ) @@ -59,7 +59,8 @@ pytest.param( "fastparquet", marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET or get_option("mode.data_manager") == "array", + not _HAVE_FASTPARQUET + or _get_option("mode.data_manager", silent=True) == "array", reason="fastparquet is not installed or ArrayManager is used", ), ), @@ -86,7 +87,7 @@ def pa(): def fp(): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") - elif get_option("mode.data_manager") == "array": + elif _get_option("mode.data_manager", silent=True) == "array": pytest.skip("ArrayManager is not supported with fastparquet") return "fastparquet" @@ -359,6 +360,20 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): tm.assert_frame_equal(result, df[["a", "d"]]) +def test_parquet_pos_args_deprecation(engine): + # GH-54229 + df = pd.DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_parquet except for the " + r"argument 'path' will be keyword-only." 
+ ) + with tm.ensure_clean() as path: + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + df.to_parquet(path, engine) + + class Base: def check_error_on_write(self, df, engine, exc, err_msg): # check that we are raising the exception on writing @@ -965,7 +980,7 @@ def test_timestamp_nanoseconds(self, pa): ver = "2.6" else: ver = "2.0" - df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) + df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): @@ -996,9 +1011,9 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 pytest.importorskip("pyarrow") - df = pd.DataFrame({"a": list(range(0, 3))}) + df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: - df.to_parquet(path, pa) + df.to_parquet(path, engine=pa) result = read_parquet( path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False ) @@ -1011,7 +1026,7 @@ def test_read_parquet_manager(self, pa, using_array_manager): ) with tm.ensure_clean() as path: - df.to_parquet(path, pa) + df.to_parquet(path, engine=pa) result = read_parquet(path, pa) if using_array_manager: assert isinstance(result._mgr, pd.core.internals.ArrayManager) @@ -1099,8 +1114,6 @@ def test_df_attrs_persistence(self, tmp_path, pa): def test_string_inference(self, tmp_path, pa): # GH#54431 - import pyarrow as pa - path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) df.to_parquet(path, engine="pyarrow") @@ -1108,8 +1121,39 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.ArrowDtype(pa.string()), - index=pd.Index(["a", "b"], dtype=pd.ArrowDtype(pa.string())), + dtype="string[pyarrow_numpy]", + index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif(pa_version_under11p0, reason="not supported before 11.0") + def test_roundtrip_decimal(self, tmp_path, pa): + # GH#54768 + import pyarrow as pa + + path = tmp_path / "decimal.p" + df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") + df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) + result = read_parquet(path) + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + tm.assert_frame_equal(result, expected) + + def test_infer_string_large_string_type(self, tmp_path, pa): + # GH#54798 + import pyarrow as pa + import pyarrow.parquet as pq + + path = tmp_path / "large_string.p" + + table = pa.table({"a": pa.array([None, "b", "c"], pa.large_string())}) + pq.write_table(table, path) + + with pd.option_context("future.infer_string", True): + result = read_parquet(path) + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, + dtype="string[pyarrow_numpy]", + columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) @@ -1174,10 +1218,10 @@ def test_categorical(self, fp): check_round_trip(df, fp) def test_filter_row_groups(self, fp): - d = {"a": list(range(0, 3))} + d = {"a": list(range(3))} df = pd.DataFrame(d) with tm.ensure_clean() as path: - df.to_parquet(path, fp, compression=None, row_group_offsets=1) + df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1) 
result = read_parquet(path, fp, filters=[("a", "==", 0)]) assert len(result) == 1 diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index a30b3f64bd75c..bb8f0ce214c96 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -526,7 +526,7 @@ def test_pickle_timeseries_periodindex(): prng = period_range("1/1/2011", "1/1/2012", freq="M") ts = Series(np.random.default_rng(2).standard_normal(len(prng)), prng) new_ts = tm.round_trip_pickle(ts) - assert new_ts.index.freq == "M" + assert new_ts.index.freqstr == "M" @pytest.mark.parametrize( diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index d1d0795234e72..5d8dce72d69c9 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -15,6 +15,7 @@ @pytest.mark.parametrize("path_klass", [lambda p: p, Path]) def test_spss_labelled_num(path_klass, datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = path_klass(datapath("io", "data", "spss", "labelled-num.sav")) df = pd.read_spss(fname, convert_categoricals=True) @@ -30,6 +31,7 @@ def test_spss_labelled_num(path_klass, datapath): @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") def test_spss_labelled_num_na(datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "labelled-num-na.sav") df = pd.read_spss(fname, convert_categoricals=True) @@ -45,6 +47,7 @@ def test_spss_labelled_num_na(datapath): @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") def test_spss_labelled_str(datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "labelled-str.sav") df = pd.read_spss(fname, convert_categoricals=True) @@ -60,6 +63,7 @@ def test_spss_labelled_str(datapath): @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") def test_spss_umlauts(datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=True) @@ -84,6 +88,7 @@ def test_spss_usecols(datapath): def test_spss_umlauts_dtype_backend(datapath, dtype_backend): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=False, dtype_backend=dtype_backend) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 02736406e109b..f015c9efe7122 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -397,6 +397,54 @@ def test_frame3(): return DataFrame(data, columns=columns) +def get_all_views(conn): + if isinstance(conn, sqlite3.Connection): + c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") + return [view[0] for view in c.fetchall()] + else: + from sqlalchemy import inspect + + return inspect(conn).get_view_names() + + +def get_all_tables(conn): + if isinstance(conn, sqlite3.Connection): + c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") + return [table[0] for table in c.fetchall()] + else: + from sqlalchemy import inspect + + return 
inspect(conn).get_table_names() + + +def drop_table( + table_name: str, + conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection, +): + if isinstance(conn, sqlite3.Connection): + conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") + conn.commit() + else: + with conn.begin() as con: + sql.SQLDatabase(con).drop_table(table_name) + + +def drop_view( + view_name: str, + conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection, +): + if isinstance(conn, sqlite3.Connection): + conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") + conn.commit() + else: + quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( + view_name + ) + stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}") + with conn.begin() as con: + con.execute(stmt) # type: ignore[union-attr] + + @pytest.fixture def mysql_pymysql_engine(iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") @@ -413,16 +461,18 @@ def mysql_pymysql_engine(iris_path, types_data): for entry in types_data: entry.pop("DateColWithTz") create_and_load_types(engine, types_data, "mysql") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) yield engine - with engine.connect() as conn: - with conn.begin(): - stmt = sqlalchemy.text("DROP TABLE IF EXISTS test_frame;") - conn.execute(stmt) + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @pytest.fixture -def mysql_pymysql_conn(mysql_pymysql_engine): +def mysql_pymysql_conn(iris_path, mysql_pymysql_engine): with mysql_pymysql_engine.connect() as conn: yield conn @@ -440,11 +490,13 @@ def postgresql_psycopg2_engine(iris_path, types_data): create_and_load_iris(engine, iris_path, "postgresql") if not insp.has_table("types"): create_and_load_types(engine, types_data, "postgresql") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) yield engine - with engine.connect() as conn: - with conn.begin(): - stmt = sqlalchemy.text("DROP TABLE IF EXISTS test_frame;") - conn.execute(stmt) + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @@ -462,10 +514,25 @@ def sqlite_str(): @pytest.fixture -def sqlite_engine(sqlite_str): +def sqlite_engine(sqlite_str, iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str, poolclass=sqlalchemy.pool.NullPool) + + insp = sqlalchemy.inspect(engine) + if not insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") + yield engine + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @@ -476,17 +543,25 @@ def sqlite_conn(sqlite_engine): @pytest.fixture -def sqlite_iris_str(sqlite_str, iris_path): +def sqlite_iris_str(sqlite_str, iris_path, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str) - create_and_load_iris(engine, iris_path, "sqlite") + + insp = sqlalchemy.inspect(engine) + if not insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + 
create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") engine.dispose() return sqlite_str @pytest.fixture def sqlite_iris_engine(sqlite_engine, iris_path): - create_and_load_iris(sqlite_engine, iris_path, "sqlite") return sqlite_engine @@ -499,37 +574,66 @@ def sqlite_iris_conn(sqlite_iris_engine): @pytest.fixture def sqlite_buildin(): with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn: + create_and_load_iris_view(closing_conn) with closing_conn as conn: yield conn @pytest.fixture -def sqlite_buildin_iris(sqlite_buildin, iris_path): +def sqlite_sqlalchemy_memory_engine(iris_path, types_data): + sqlalchemy = pytest.importorskip("sqlalchemy") + engine = sqlalchemy.create_engine("sqlite:///:memory:") + + insp = sqlalchemy.inspect(engine) + if not insp.has_table("iris"): + create_and_load_iris(engine, iris_path, "sqlite") + if not insp.has_table("iris_view"): + create_and_load_iris_view(engine) + if not insp.has_table("types"): + for entry in types_data: + entry.pop("DateColWithTz") + create_and_load_types(engine, types_data, "sqlite") + + yield engine + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) + + +@pytest.fixture +def sqlite_buildin_iris(sqlite_buildin, iris_path, types_data): create_and_load_iris_sqlite3(sqlite_buildin, iris_path) + + for entry in types_data: + entry.pop("DateColWithTz") + types_data = [tuple(entry.values()) for entry in types_data] + + create_and_load_types_sqlite3(sqlite_buildin, types_data) return sqlite_buildin mysql_connectable = [ - "mysql_pymysql_engine", - "mysql_pymysql_conn", + pytest.param("mysql_pymysql_engine", marks=pytest.mark.db), + pytest.param("mysql_pymysql_conn", marks=pytest.mark.db), ] postgresql_connectable = [ - "postgresql_psycopg2_engine", - "postgresql_psycopg2_conn", + pytest.param("postgresql_psycopg2_engine", marks=pytest.mark.db), + pytest.param("postgresql_psycopg2_conn", marks=pytest.mark.db), ] sqlite_connectable = [ - "sqlite_engine", - "sqlite_conn", - "sqlite_str", + pytest.param("sqlite_engine", marks=pytest.mark.db), + pytest.param("sqlite_conn", marks=pytest.mark.db), + pytest.param("sqlite_str", marks=pytest.mark.db), ] sqlite_iris_connectable = [ - "sqlite_iris_engine", - "sqlite_iris_conn", - "sqlite_iris_str", + pytest.param("sqlite_iris_engine", marks=pytest.mark.db), + pytest.param("sqlite_iris_conn", marks=pytest.mark.db), + pytest.param("sqlite_iris_str", marks=pytest.mark.db), ] sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable @@ -538,12 +642,17 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): mysql_connectable + postgresql_connectable + sqlite_iris_connectable ) -all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] +all_connectable = sqlalchemy_connectable + [ + "sqlite_buildin", + "sqlite_sqlalchemy_memory_engine", +] -all_connectable_iris = sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] +all_connectable_iris = sqlalchemy_connectable_iris + [ + "sqlite_buildin_iris", + "sqlite_sqlalchemy_memory_engine", +] -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql(conn, test_frame1, request): # GH 51086 if conn is sqlite_engine @@ -551,7 +660,14 @@ def test_dataframe_to_sql(conn, test_frame1, request): test_frame1.to_sql(name="test", con=conn, if_exists="append", index=False) -@pytest.mark.db 
+@pytest.mark.parametrize("conn", all_connectable) +def test_dataframe_to_sql_empty(conn, test_frame1, request): + # GH 51086 if conn is sqlite_engine + conn = request.getfixturevalue(conn) + empty_df = test_frame1.iloc[:0] + empty_df.to_sql(name="test", con=conn, if_exists="append", index=False) + + @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes(conn, request): # GH 52046 @@ -572,7 +688,6 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): # GH 52046 @@ -588,7 +703,6 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): @@ -599,7 +713,6 @@ def test_to_sql(conn, method, test_frame1, request): assert count_rows(conn, "test_frame") == len(test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)]) def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): @@ -611,7 +724,6 @@ def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): assert count_rows(conn, "test_frame") == num_row_coef * len(test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_exist_fail(conn, test_frame1, request): conn = request.getfixturevalue(conn) @@ -624,7 +736,6 @@ def test_to_sql_exist_fail(conn, test_frame1, request): pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query(conn, request): conn = request.getfixturevalue(conn) @@ -637,7 +748,6 @@ def test_read_iris_query(conn, request): assert "SepalWidth" in iris_frame.columns -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_chunksize(conn, request): conn = request.getfixturevalue(conn) @@ -650,7 +760,6 @@ def test_read_iris_query_chunksize(conn, request): assert "SepalWidth" in iris_frame.columns -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_query_expression_with_parameter(conn, request): conn = request.getfixturevalue(conn) @@ -672,7 +781,6 @@ def test_read_iris_query_expression_with_parameter(conn, request): autoload_con.dispose() -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_string_with_parameter(conn, request, sql_strings): for db, query in sql_strings["read_parameters"].items(): @@ -685,7 +793,6 @@ def test_read_iris_query_string_with_parameter(conn, request, sql_strings): check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table(conn, request): # GH 51015 if conn = sqlite_iris_str @@ -696,7 +803,6 @@ def test_read_iris_table(conn, request): check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table_chunksize(conn, request): conn = request.getfixturevalue(conn) @@ -706,7 +812,6 @@ def test_read_iris_table_chunksize(conn, request): check_iris_frame(iris_frame) 
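The connectable fixture lists in this file now wrap each fixture name in pytest.param(..., marks=pytest.mark.db), which is why the per-test @pytest.mark.db decorators are being removed in the hunks above and below. A hedged sketch of how the mark travels with the parametrized fixture name (the test name and body here are illustrative, and "db" is assumed to be a marker registered in the project's pytest configuration):

    # Sketch only; shows the pytest.param pattern used by the updated fixtures.
    import pytest

    mysql_connectable = [
        pytest.param("mysql_pymysql_engine", marks=pytest.mark.db),
        pytest.param("mysql_pymysql_conn", marks=pytest.mark.db),
    ]

    @pytest.mark.parametrize("conn", mysql_connectable)
    def test_example(conn, request):
        # Each parametrized case inherits the "db" mark from pytest.param,
        # so no extra @pytest.mark.db decorator is needed on the test.
        conn = request.getfixturevalue(conn)  # resolve the fixture by name
        ...
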
-@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable) def test_to_sql_callable(conn, test_frame1, request): conn = request.getfixturevalue(conn) @@ -725,26 +830,38 @@ def sample(pd_table, conn, keys, data_iter): assert count_rows(conn, "test_frame") == len(test_frame1) -@pytest.mark.db -@pytest.mark.parametrize("conn", mysql_connectable) +@pytest.mark.parametrize("conn", all_connectable_iris) def test_default_type_conversion(conn, request): + conn_name = conn + if conn_name == "sqlite_buildin_iris": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite_buildin connection does not implement read_sql_table" + ) + ) + conn = request.getfixturevalue(conn) df = sql.read_sql_table("types", conn) assert issubclass(df.FloatCol.dtype.type, np.floating) assert issubclass(df.IntCol.dtype.type, np.integer) - # MySQL has no real BOOL type (it's an alias for TINYINT) - assert issubclass(df.BoolCol.dtype.type, np.integer) + # MySQL/sqlite has no real BOOL type + if "postgresql" in conn_name: + assert issubclass(df.BoolCol.dtype.type, np.bool_) + else: + assert issubclass(df.BoolCol.dtype.type, np.integer) # Int column with NA values stays as float assert issubclass(df.IntColWithNull.dtype.type, np.floating) # Bool column with NA = int column with NA values => becomes float - assert issubclass(df.BoolColWithNull.dtype.type, np.floating) + if "postgresql" in conn_name: + assert issubclass(df.BoolColWithNull.dtype.type, object) + else: + assert issubclass(df.BoolColWithNull.dtype.type, np.floating) -@pytest.mark.db @pytest.mark.parametrize("conn", mysql_connectable) def test_read_procedure(conn, request): conn = request.getfixturevalue(conn) @@ -782,7 +899,6 @@ def test_read_procedure(conn, request): tm.assert_frame_equal(df, res2) -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) @pytest.mark.parametrize("expected_count", [2, "Success!"]) def test_copy_from_callable_insertion_method(conn, expected_count, request): @@ -822,7 +938,6 @@ def psql_insert_copy(table, conn, keys, data_iter): tm.assert_frame_equal(result, expected) -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) def test_insertion_method_on_conflict_do_nothing(conn, request): # GH 15988: Example in to_sql docstring @@ -881,7 +996,6 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") -@pytest.mark.db @pytest.mark.parametrize("conn", mysql_connectable) def test_insertion_method_on_conflict_update(conn, request): # GH 14553: Example in to_sql docstring @@ -935,7 +1049,6 @@ def insert_on_conflict(table, conn, keys, data_iter): pandasSQL.drop_table("test_insert_conflict") -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) def test_read_view_postgres(conn, request): # GH 52969 @@ -1017,1741 +1130,2037 @@ def test_execute_deprecated(sqlite_buildin_iris): sql.execute("select * from iris", sqlite_buildin_iris) -class MixInBase: - def teardown_method(self): - # if setup fails, there may not be a connection to close. - if hasattr(self, "conn"): - self.conn.close() - # use a fresh connection to ensure we can drop all tables. 
- try: - conn = self.connect() - except (sqlalchemy.exc.OperationalError, sqlite3.OperationalError): - pass - else: - with conn: - for view in self._get_all_views(conn): - self.drop_view(view, conn) - for tbl in self._get_all_tables(conn): - self.drop_table(tbl, conn) - +@pytest.fixture +def flavor(): + def func(conn_name): + if "postgresql" in conn_name: + return "postgresql" + elif "sqlite" in conn_name: + return "sqlite" + elif "mysql" in conn_name: + return "mysql" -class SQLiteMixIn(MixInBase): - def connect(self): - return sqlite3.connect(":memory:") + raise ValueError(f"unsupported connection: {conn_name}") - def drop_table(self, table_name, conn): - conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") - conn.commit() + return func - def _get_all_tables(self, conn): - c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") - return [table[0] for table in c.fetchall()] - def drop_view(self, view_name, conn): - conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") - conn.commit() +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_parameter(conn, request, sql_strings, flavor): + conn_name = conn + conn = request.getfixturevalue(conn) + query = sql_strings["read_parameters"][flavor(conn_name)] + params = ("Iris-setosa", 5.1) + pandasSQL = pandasSQL_builder(conn) - def _get_all_views(self, conn): - c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") - return [view[0] for view in c.fetchall()] + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) + check_iris_frame(iris_frame) -class SQLAlchemyMixIn(MixInBase): - @classmethod - def teardown_class(cls): - cls.engine.dispose() +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_named_parameter(conn, request, sql_strings, flavor): + conn_name = conn + conn = request.getfixturevalue(conn) + query = sql_strings["read_named_parameters"][flavor(conn_name)] + params = {"name": "Iris-setosa", "length": 5.1} + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) + check_iris_frame(iris_frame) - def connect(self): - return self.engine.connect() - def drop_table(self, table_name, conn): - if conn.in_transaction(): - conn.get_transaction().rollback() - with conn.begin(): - sql.SQLDatabase(conn).drop_table(table_name) +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings, flavor): + if "mysql" in conn or "postgresql" in conn: + request.node.add_marker(pytest.mark.xfail(reason="broken test")) - def _get_all_tables(self, conn): - from sqlalchemy import inspect + conn_name = conn + conn = request.getfixturevalue(conn) - return inspect(conn).get_table_names() + query = sql_strings["read_no_parameters_with_percent"][flavor(conn_name)] + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=None) + check_iris_frame(iris_frame) - def drop_view(self, view_name, conn): - quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( - view_name - ) - if conn.in_transaction(): - conn.get_transaction().rollback() - with conn.begin(): - conn.exec_driver_sql(f"DROP VIEW IF EXISTS {quoted_view}") - def _get_all_views(self, conn): - from sqlalchemy import inspect +# ----------------------------------------------------------------------------- +# -- Testing the 
public API - return inspect(conn).get_view_names() +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_view(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris_view", conn) + check_iris_frame(iris_frame) -class PandasSQLTest: - """ - Base class with common private methods for SQLAlchemy and fallback cases. - """ +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_with_chunksize_no_result(conn, request): + conn = request.getfixturevalue(conn) + query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0' + with_batch = sql.read_sql_query(query, conn, chunksize=5) + without_batch = sql.read_sql_query(query, conn) + tm.assert_frame_equal(concat(with_batch), without_batch) - def load_iris_data(self, iris_path): - self.drop_table("iris", self.conn) - if isinstance(self.conn, sqlite3.Connection): - create_and_load_iris_sqlite3(self.conn, iris_path) - else: - create_and_load_iris(self.conn, iris_path, self.flavor) - - def load_types_data(self, types_data): - if self.flavor != "postgresql": - for entry in types_data: - entry.pop("DateColWithTz") - if isinstance(self.conn, sqlite3.Connection): - types_data = [tuple(entry.values()) for entry in types_data] - create_and_load_types_sqlite3(self.conn, types_data) - else: - create_and_load_types(self.conn, types_data, self.flavor) - - def _read_sql_iris_parameter(self, sql_strings): - query = sql_strings["read_parameters"][self.flavor] - params = ("Iris-setosa", 5.1) - iris_frame = self.pandasSQL.read_query(query, params=params) - check_iris_frame(iris_frame) - - def _read_sql_iris_named_parameter(self, sql_strings): - query = sql_strings["read_named_parameters"][self.flavor] - params = {"name": "Iris-setosa", "length": 5.1} - iris_frame = self.pandasSQL.read_query(query, params=params) - check_iris_frame(iris_frame) - - def _read_sql_iris_no_parameter_with_percent(self, sql_strings): - query = sql_strings["read_no_parameters_with_percent"][self.flavor] - iris_frame = self.pandasSQL.read_query(query, params=None) - check_iris_frame(iris_frame) - - def _to_sql_empty(self, test_frame1): - self.drop_table("test_frame1", self.conn) - assert self.pandasSQL.to_sql(test_frame1.iloc[:0], "test_frame1") == 0 - - def _to_sql_with_sql_engine(self, test_frame1, engine="auto", **engine_kwargs): - """`to_sql` with the `engine` param""" - # mostly copied from this class's `_to_sql()` method - self.drop_table("test_frame1", self.conn) - assert ( - self.pandasSQL.to_sql( - test_frame1, "test_frame1", engine=engine, **engine_kwargs - ) - == 4 - ) - assert self.pandasSQL.has_table("test_frame1") +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame1", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame1") - num_entries = len(test_frame1) - num_rows = count_rows(self.conn, "test_frame1") - assert num_rows == num_entries + sql.to_sql(test_frame1, "test_frame1", conn) + assert sql.has_table("test_frame1", conn) - # Nuke table - self.drop_table("test_frame1", self.conn) - def _roundtrip(self, test_frame1): - self.drop_table("test_frame_roundtrip", self.conn) - assert self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_fail(conn, request, 
test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame2", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame2") - result.set_index("level_0", inplace=True) - # result.index.astype(int) + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") + assert sql.has_table("test_frame2", conn) - result.index.name = None + msg = "Table 'test_frame2' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") - tm.assert_frame_equal(result, test_frame1) - def _execute_sql(self): - # drop_sql = "DROP TABLE IF EXISTS test" # should already be done - iris_results = self.pandasSQL.execute("SELECT * FROM iris") - row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_replace(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame3", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame3") - def _to_sql_save_index(self): - df = DataFrame.from_records( - [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] - ) - assert self.pandasSQL.to_sql(df, "test_to_sql_saves_index") == 2 - ix_cols = self._get_index_columns("test_to_sql_saves_index") - assert ix_cols == [["A"]] - - def _transaction_test(self): - with self.pandasSQL.run_transaction() as trans: - stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(self.pandasSQL, SQLiteDatabase): - trans.execute(stmt) - else: - from sqlalchemy import text + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="fail") + # Add to table again + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="replace") + assert sql.has_table("test_frame3", conn) - stmt = text(stmt) - trans.execute(stmt) + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame3") - class DummyException(Exception): - pass + assert num_rows == num_entries - # Make sure when transaction is rolled back, no rows get inserted - ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" - if isinstance(self.pandasSQL, SQLDatabase): - from sqlalchemy import text - ins_sql = text(ins_sql) - try: - with self.pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - raise DummyException("error") - except DummyException: - # ignore raised exception - pass - res = self.pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res) == 0 - - # Make sure when transaction is committed, rows do get inserted - with self.pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res2) == 1 +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_append(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame4", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame4") + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="fail") == 4 -# ----------------------------------------------------------------------------- -# -- Testing the public API + # Add to table again + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="append") == 4 + assert sql.has_table("test_frame4", conn) + num_entries = 2 * len(test_frame1) + num_rows = count_rows(conn, "test_frame4") -class _TestSQLApi(PandasSQLTest): 
- """ - Base class to test the public API. + assert num_rows == num_entries - From this two classes are derived to run these tests for both the - sqlalchemy mode (`TestSQLApi`) and the fallback mode - (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific - tests for the different sql flavours are included in `_TestSQLAlchemy`. - Notes: - flavor can always be passed even in SQLAlchemy mode, - should be correctly ignored. +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_type_mapping(conn, request, test_frame3): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame5", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame5") - we don't use drop_table because that isn't part of the public api + sql.to_sql(test_frame3, "test_frame5", conn, index=False) + result = sql.read_sql("SELECT * FROM test_frame5", conn) - """ + tm.assert_frame_equal(test_frame3, result) - flavor = "sqlite" - mode: str - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.load_test_data_and_sql() +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_series(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_series", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_series") - def load_test_data_and_sql(self): - create_and_load_iris_view(self.conn) + s = Series(np.arange(5, dtype="int64"), name="series") + sql.to_sql(s, "test_series", conn, index=False) + s2 = sql.read_sql_query("SELECT * FROM test_series", conn) + tm.assert_frame_equal(s.to_frame(), s2) - def test_read_sql_view(self): - iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn) - check_iris_frame(iris_frame) - def test_read_sql_with_chunksize_no_result(self): - query = "SELECT * FROM iris_view WHERE SepalLength < 0.0" - with_batch = sql.read_sql_query(query, self.conn, chunksize=5) - without_batch = sql.read_sql_query(query, self.conn) - tm.assert_frame_equal(concat(with_batch), without_batch) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") - def test_to_sql(self, test_frame1): - sql.to_sql(test_frame1, "test_frame1", self.conn) - assert sql.has_table("test_frame1", self.conn) + sql.to_sql(test_frame1, "test_frame_roundtrip", con=conn) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) - def test_to_sql_fail(self, test_frame1): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") - assert sql.has_table("test_frame2", self.conn) + # HACK! 
+ result.index = test_frame1.index + result.set_index("level_0", inplace=True) + result.index.astype(int) + result.index.name = None + tm.assert_frame_equal(result, test_frame1) - msg = "Table 'test_frame2' already exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") - def test_to_sql_replace(self, test_frame1): - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="fail") - # Add to table again - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="replace") - assert sql.has_table("test_frame3", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip_chunksize(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") - num_entries = len(test_frame1) - num_rows = count_rows(self.conn, "test_frame3") + sql.to_sql( + test_frame1, + "test_frame_roundtrip", + con=conn, + index=False, + chunksize=2, + ) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) + tm.assert_frame_equal(result, test_frame1) - assert num_rows == num_entries - def test_to_sql_append(self, test_frame1): - assert sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="fail") == 4 +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_execute_sql(conn, request): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + conn = request.getfixturevalue(conn) + with sql.pandasSQL_builder(conn) as pandas_sql: + iris_results = pandas_sql.execute("SELECT * FROM iris") + row = iris_results.fetchone() + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - # Add to table again - assert ( - sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="append") == 4 - ) - assert sql.has_table("test_frame4", self.conn) - num_entries = 2 * len(test_frame1) - num_rows = count_rows(self.conn, "test_frame4") +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_date_parsing(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + # Test date parsing in read_sql + # No Parsing + df = sql.read_sql_query("SELECT * FROM types", conn) + if not ("mysql" in conn_name or "postgres" in conn_name): + assert not issubclass(df.DateCol.dtype.type, np.datetime64) - assert num_rows == num_entries + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["DateCol"]) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + Timestamp(2000, 1, 4, 0, 0, 0), + ] - def test_to_sql_type_mapping(self, test_frame3): - sql.to_sql(test_frame3, "test_frame5", self.conn, index=False) - result = sql.read_sql("SELECT * FROM test_frame5", self.conn) + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + Timestamp(2000, 1, 4, 0, 0, 0), + ] - tm.assert_frame_equal(test_frame3, result) + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["IntDateCol"]) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] - def test_to_sql_series(self): - s = Series(np.arange(5, dtype="int64"), name="series") - 
sql.to_sql(s, "test_series", self.conn, index=False) - s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) - tm.assert_frame_equal(s.to_frame(), s2) + df = sql.read_sql_query( + "SELECT * FROM types", conn, parse_dates={"IntDateCol": "s"} + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] - def test_roundtrip(self, test_frame1): - sql.to_sql(test_frame1, "test_frame_roundtrip", con=self.conn) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"IntDateOnlyCol": "%Y%m%d"}, + ) + assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) + assert df.IntDateOnlyCol.tolist() == [ + Timestamp("2010-10-10"), + Timestamp("2010-12-12"), + ] - # HACK! - result.index = test_frame1.index - result.set_index("level_0", inplace=True) - result.index.astype(int) - result.index.name = None - tm.assert_frame_equal(result, test_frame1) - def test_roundtrip_chunksize(self, test_frame1): - sql.to_sql( - test_frame1, - "test_frame_roundtrip", - con=self.conn, - index=False, - chunksize=2, +@pytest.mark.parametrize("conn", all_connectable_iris) +@pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) +@pytest.mark.parametrize( + "read_sql, text, mode", + [ + (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), + (sql.read_sql, "types", ("sqlalchemy")), + ( + sql.read_sql_query, + "SELECT * FROM types", + ("sqlalchemy", "fallback"), + ), + (sql.read_sql_table, "types", ("sqlalchemy")), + ], +) +def test_api_custom_dateparsing_error( + conn, request, read_sql, text, mode, error, types_data_frame +): + conn_name = conn + conn = request.getfixturevalue(conn) + if text == "types" and conn_name == "sqlite_buildin_iris": + request.node.add_marker( + pytest.mark.xfail(reason="failing combination of arguments") ) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) - tm.assert_frame_equal(result, test_frame1) - def test_execute_sql(self): - # drop_sql = "DROP TABLE IF EXISTS test" # should already be done - with sql.pandasSQL_builder(self.conn) as pandas_sql: - iris_results = pandas_sql.execute("SELECT * FROM iris") - row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) - def test_date_parsing(self): - # Test date parsing in read_sql - # No Parsing - df = sql.read_sql_query("SELECT * FROM types", self.conn) - assert not issubclass(df.DateCol.dtype.type, np.datetime64) + result = read_sql( + text, + con=conn, + parse_dates={ + "DateCol": {"errors": error}, + }, + ) + if "postgres" in conn_name: + # TODO: clean up types_data_frame fixture + result = result.drop(columns=["DateColWithTz"]) + result["BoolCol"] = result["BoolCol"].astype(int) + result["BoolColWithNull"] = result["BoolColWithNull"].astype(float) - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["DateCol"] - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] + tm.assert_frame_equal(result, expected) - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 
3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["IntDateCol"] - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_date_and_index(conn, request): + # Test case where same column appears in parse_date and index_col + conn = request.getfixturevalue(conn) + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + index_col="DateCol", + parse_dates=["DateCol", "IntDateCol"], + ) - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates={"IntDateCol": "s"} - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] + assert issubclass(df.index.dtype.type, np.datetime64) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - parse_dates={"IntDateOnlyCol": "%Y%m%d"}, - ) - assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) - assert df.IntDateOnlyCol.tolist() == [ - Timestamp("2010-10-10"), - Timestamp("2010-12-12"), - ] - @pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) - @pytest.mark.parametrize( - "read_sql, text, mode", - [ - (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), - (sql.read_sql, "types", ("sqlalchemy")), - ( - sql.read_sql_query, - "SELECT * FROM types", - ("sqlalchemy", "fallback"), - ), - (sql.read_sql_table, "types", ("sqlalchemy")), - ], - ) - def test_custom_dateparsing_error( - self, read_sql, text, mode, error, types_data_frame - ): - if self.mode in mode: - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) - - result = read_sql( - text, - con=self.conn, - parse_dates={ - "DateCol": {"errors": error}, - }, - ) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_timedelta(conn, request): + # see #6921 + conn = request.getfixturevalue(conn) + if sql.has_table("test_timedelta", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_timedelta") - tm.assert_frame_equal(result, expected) + df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() + with tm.assert_produces_warning(UserWarning): + result_count = df.to_sql(name="test_timedelta", con=conn) + assert result_count == 2 + result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) + tm.assert_series_equal(result["foo"], df["foo"].view("int64")) - def test_date_and_index(self): - # Test case where same column appears in parse_date and index_col - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - index_col="DateCol", - parse_dates=["DateCol", "IntDateCol"], - ) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_complex_raises(conn, request): + conn = request.getfixturevalue(conn) + df = DataFrame({"a": [1 + 1j, 2j]}) + msg = "Complex datatypes not supported" + with pytest.raises(ValueError, match=msg): + assert df.to_sql("test_complex", con=conn) is None - assert issubclass(df.index.dtype.type, np.datetime64) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - def test_timedelta(self): - # see #6921 - df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() - with tm.assert_produces_warning(UserWarning): - result_count = df.to_sql(name="test_timedelta", 
con=self.conn) - assert result_count == 2 - result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) - - def test_complex_raises(self): - df = DataFrame({"a": [1 + 1j, 2j]}) - msg = "Complex datatypes not supported" - with pytest.raises(ValueError, match=msg): - assert df.to_sql("test_complex", con=self.conn) is None +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "index_name,index_label,expected", + [ + # no index name, defaults to 'index' + (None, None, "index"), + # specifying index_label + (None, "other_label", "other_label"), + # using the index name + ("index_name", None, "index_name"), + # has index name, but specifying index_label + ("index_name", "other_label", "other_label"), + # index name is integer + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ], +) +def test_api_to_sql_index_label(conn, request, index_name, index_label, expected): + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") - @pytest.mark.parametrize( - "index_name,index_label,expected", - [ - # no index name, defaults to 'index' - (None, None, "index"), - # specifying index_label - (None, "other_label", "other_label"), - # using the index name - ("index_name", None, "index_name"), - # has index name, but specifying index_label - ("index_name", "other_label", "other_label"), - # index name is integer - (0, None, "0"), - # index name is None but index label is integer - (None, 0, "0"), - ], - ) - def test_to_sql_index_label(self, index_name, index_label, expected): - temp_frame = DataFrame({"col1": range(4)}) - temp_frame.index.name = index_name - query = "SELECT * FROM test_index_label" - sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label) - frame = sql.read_sql_query(query, self.conn) - assert frame.columns[0] == expected - - def test_to_sql_index_label_multiindex(self): - expected_row_count = 4 - temp_frame = DataFrame( - {"col1": range(4)}, - index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + temp_frame = DataFrame({"col1": range(4)}) + temp_frame.index.name = index_name + query = "SELECT * FROM test_index_label" + sql.to_sql(temp_frame, "test_index_label", conn, index_label=index_label) + frame = sql.read_sql_query(query, conn) + assert frame.columns[0] == expected + + +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_index_label_multiindex(conn, request): + conn_name = conn + if "mysql" in conn_name: + request.node.add_marker( + pytest.mark.xfail( + reason="MySQL can fail using TEXT without length as key", strict=False + ) ) - # no index name, defaults to 'level_0' and 'level_1' - result = sql.to_sql(temp_frame, "test_index_label", self.conn) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[0] == "level_0" - assert frame.columns[1] == "level_1" + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") + + expected_row_count = 4 + temp_frame = DataFrame( + {"col1": range(4)}, + index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + ) - # specifying index_label - result = sql.to_sql( + # no index name, defaults to 'level_0' and 'level_1' + 
result = sql.to_sql(temp_frame, "test_index_label", conn) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[0] == "level_0" + assert frame.columns[1] == "level_1" + + # specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["A", "B"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # using the index name + temp_frame.index.names = ["A", "B"] + result = sql.to_sql(temp_frame, "test_index_label", conn, if_exists="replace") + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # has index name, but specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["C", "D"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["C", "D"] + + msg = "Length of 'index_label' should match number of levels, which is 2" + with pytest.raises(ValueError, match=msg): + sql.to_sql( temp_frame, "test_index_label", - self.conn, + conn, if_exists="replace", - index_label=["A", "B"], + index_label="C", ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] - # using the index name - temp_frame.index.names = ["A", "B"] - result = sql.to_sql( - temp_frame, "test_index_label", self.conn, if_exists="replace" - ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] - # has index name, but specifying index_label - result = sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label=["C", "D"], - ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["C", "D"] +@pytest.mark.parametrize("conn", all_connectable) +def test_api_multiindex_roundtrip(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_multiindex_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_multiindex_roundtrip") + + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], + columns=["A", "B", "C"], + index=["A", "B"], + ) - msg = "Length of 'index_label' should match number of levels, which is 2" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label="C", - ) + df.to_sql(name="test_multiindex_roundtrip", con=conn) + result = sql.read_sql_query( + "SELECT * FROM test_multiindex_roundtrip", conn, index_col=["A", "B"] + ) + tm.assert_frame_equal(df, result, check_index_type=True) - def test_multiindex_roundtrip(self): - df = DataFrame.from_records( - [(1, 2.1, "line1"), (2, 1.5, "line2")], - columns=["A", "B", "C"], - index=["A", "B"], - ) - df.to_sql(name="test_multiindex_roundtrip", con=self.conn) - result = sql.read_sql_query( - "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"] - ) - tm.assert_frame_equal(df, result, check_index_type=True) 
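With the class-level cleanup in MixInBase.teardown_method removed above, each rewritten test now clears its target table up front using the same public entry points it exercises: sql.has_table followed by SQLDatabase.drop_table inside a transaction. A sketch of that guard factored into a helper; the helper is illustrative only and not part of this patch:

    from pandas.io import sql


    def drop_if_exists(table_name, conn):
        # Same guard the tests above inline before writing a table:
        # drop any leftover table from a previous (possibly failed) run.
        if sql.has_table(table_name, conn):
            with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
                pandasSQL.drop_table(table_name)

In the patch itself the guard is simply repeated inline at the top of each test rather than factored out.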
+@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "dtype", + [ + None, + int, + float, + {"A": int, "B": float}, + ], +) +def test_api_dtype_argument(conn, request, dtype): + # GH10285 Add dtype argument to read_sql_query + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_dtype_argument", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_dtype_argument") - @pytest.mark.parametrize( - "dtype", - [ - None, - int, - float, - {"A": int, "B": float}, - ], - ) - def test_dtype_argument(self, dtype): - # GH10285 Add dtype argument to read_sql_query - df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) - assert df.to_sql(name="test_dtype_argument", con=self.conn) == 2 - - expected = df.astype(dtype) - result = sql.read_sql_query( - "SELECT A, B FROM test_dtype_argument", con=self.conn, dtype=dtype - ) + df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) + assert df.to_sql(name="test_dtype_argument", con=conn) == 2 - tm.assert_frame_equal(result, expected) + expected = df.astype(dtype) - def test_integer_col_names(self): - df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) - sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") + if "postgres" in conn_name: + query = 'SELECT "A", "B" FROM test_dtype_argument' + else: + query = "SELECT A, B FROM test_dtype_argument" + result = sql.read_sql_query(query, con=conn, dtype=dtype) - def test_get_schema(self, test_frame1): - create_sql = sql.get_schema(test_frame1, "test", con=self.conn) - assert "CREATE" in create_sql + tm.assert_frame_equal(result, expected) - def test_get_schema_with_schema(self, test_frame1): - # GH28486 - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, schema="pypi") - assert "CREATE TABLE pypi." in create_sql - def test_get_schema_dtypes(self): - if self.mode == "sqlalchemy": - from sqlalchemy import Integer +@pytest.mark.parametrize("conn", all_connectable) +def test_api_integer_col_names(conn, request): + conn = request.getfixturevalue(conn) + df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) + sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace") - dtype = Integer - else: - dtype = "INTEGER" - float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) - create_sql = sql.get_schema( - float_frame, "test", con=self.conn, dtype={"b": dtype} - ) - assert "CREATE" in create_sql - assert "INTEGER" in create_sql +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn) + assert "CREATE" in create_sql - def test_get_schema_keys(self, test_frame1): - frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) - create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") - constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' - assert constraint_sentence in create_sql - # multiple columns as key (GH10385) - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, keys=["A", "B"]) - constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' - assert constraint_sentence in create_sql +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_with_schema(conn, request, test_frame1): + # GH28486 + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi") + assert "CREATE TABLE pypi." 
in create_sql - def test_chunksize_read(self): - df = DataFrame( - np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") - ) - df.to_sql(name="test_chunksize", con=self.conn, index=False) - # reading the query in one time - res1 = sql.read_sql_query("select * from test_chunksize", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_dtypes(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) - # reading the query in chunks with read_sql_query - res2 = DataFrame() - i = 0 - sizes = [5, 5, 5, 5, 2] + if conn_name == "sqlite_buildin": + dtype = "INTEGER" + else: + from sqlalchemy import Integer - for chunk in sql.read_sql_query( - "select * from test_chunksize", self.conn, chunksize=5 - ): - res2 = concat([res2, chunk], ignore_index=True) - assert len(chunk) == sizes[i] - i += 1 + dtype = Integer + create_sql = sql.get_schema(float_frame, "test", con=conn, dtype={"b": dtype}) + assert "CREATE" in create_sql + assert "INTEGER" in create_sql - tm.assert_frame_equal(res1, res2) - # reading the query in chunks with read_sql_query - if self.mode == "sqlalchemy": - res3 = DataFrame() - i = 0 - sizes = [5, 5, 5, 5, 2] +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_keys(conn, request, test_frame1): + conn_name = conn + conn = request.getfixturevalue(conn) + frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) + create_sql = sql.get_schema(frame, "test", con=conn, keys="Col1") - for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5): - res3 = concat([res3, chunk], ignore_index=True) - assert len(chunk) == sizes[i] - i += 1 + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`Col1`)" + else: + constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' + assert constraint_sentence in create_sql - tm.assert_frame_equal(res1, res3) + # multiple columns as key (GH10385) + create_sql = sql.get_schema(test_frame1, "test", con=conn, keys=["A", "B"]) + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`A`, `B`)" + else: + constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' + assert constraint_sentence in create_sql - def test_categorical(self): - # GH8624 - # test that categorical gets written correctly as dense column - df = DataFrame( - { - "person_id": [1, 2, 3], - "person_name": ["John P. Doe", "Jane Dove", "John P. 
Doe"], - } - ) - df2 = df.copy() - df2["person_name"] = df2["person_name"].astype("category") - df2.to_sql(name="test_categorical", con=self.conn, index=False) - res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_chunksize_read(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_chunksize", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_chunksize") - tm.assert_frame_equal(res, df) + df = DataFrame( + np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") + ) + df.to_sql(name="test_chunksize", con=conn, index=False) - def test_unicode_column_name(self): - # GH 11431 - df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) - df.to_sql(name="test_unicode", con=self.conn, index=False) + # reading the query in one time + res1 = sql.read_sql_query("select * from test_chunksize", conn) - def test_escaped_table_name(self): - # GH 13206 - df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) - df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=self.conn, index=False) + # reading the query in chunks with read_sql_query + res2 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] - res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn) + for chunk in sql.read_sql_query("select * from test_chunksize", conn, chunksize=5): + res2 = concat([res2, chunk], ignore_index=True) + assert len(chunk) == sizes[i] + i += 1 - tm.assert_frame_equal(res, df) + tm.assert_frame_equal(res1, res2) - def test_read_sql_duplicate_columns(self): - # GH#53117 - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) - df.to_sql(name="test_table", con=self.conn, index=False) + # reading the query in chunks with read_sql_query + if conn_name == "sqlite_buildin": + with pytest.raises(NotImplementedError, match=""): + sql.read_sql_table("test_chunksize", conn, chunksize=5) + else: + res3 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] - result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", self.conn) - expected = DataFrame( - [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], - columns=["a", "b", "a", "c"], - ) - tm.assert_frame_equal(result, expected) + for chunk in sql.read_sql_table("test_chunksize", conn, chunksize=5): + res3 = concat([res3, chunk], ignore_index=True) + assert len(chunk) == sizes[i] + i += 1 + tm.assert_frame_equal(res1, res3) -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") -class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): - """ - Test the public API as it would be used directly - Tests for `read_sql_table` are included here, as this is specific for the - sqlalchemy mode. +@pytest.mark.parametrize("conn", all_connectable) +def test_api_categorical(conn, request): + # GH8624 + # test that categorical gets written correctly as dense column + conn = request.getfixturevalue(conn) + if sql.has_table("test_categorical", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_categorical") - """ + df = DataFrame( + { + "person_id": [1, 2, 3], + "person_name": ["John P. Doe", "Jane Dove", "John P. 
Doe"], + } + ) + df2 = df.copy() + df2["person_name"] = df2["person_name"].astype("category") - flavor = "sqlite" - mode = "sqlalchemy" + df2.to_sql(name="test_categorical", con=conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_categorical", conn) - @classmethod - def setup_class(cls): - cls.engine = sqlalchemy.create_engine("sqlite:///:memory:") + tm.assert_frame_equal(res, df) - def test_read_table_columns(self, test_frame1): - # test columns argument in read_table - sql.to_sql(test_frame1, "test_frame", self.conn) - cols = ["A", "B"] - result = sql.read_sql_table("test_frame", self.conn, columns=cols) - assert result.columns.tolist() == cols +@pytest.mark.parametrize("conn", all_connectable) +def test_api_unicode_column_name(conn, request): + # GH 11431 + conn = request.getfixturevalue(conn) + if sql.has_table("test_unicode", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_unicode") - def test_read_table_index_col(self, test_frame1): - # test columns argument in read_table - sql.to_sql(test_frame1, "test_frame", self.conn) + df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) + df.to_sql(name="test_unicode", con=conn, index=False) - result = sql.read_sql_table("test_frame", self.conn, index_col="index") - assert result.index.names == ["index"] - result = sql.read_sql_table("test_frame", self.conn, index_col=["A", "B"]) - assert result.index.names == ["A", "B"] +@pytest.mark.parametrize("conn", all_connectable) +def test_api_escaped_table_name(conn, request): + # GH 13206 + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("d1187b08-4943-4c8d-a7f6", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("d1187b08-4943-4c8d-a7f6") - result = sql.read_sql_table( - "test_frame", self.conn, index_col=["A", "B"], columns=["C", "D"] - ) - assert result.index.names == ["A", "B"] - assert result.columns.tolist() == ["C", "D"] + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=conn, index=False) - def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) + if "postgres" in conn_name: + query = 'SELECT * FROM "d1187b08-4943-4c8d-a7f6"' + else: + query = "SELECT * FROM `d1187b08-4943-4c8d-a7f6`" + res = sql.read_sql_query(query, conn) - iris_frame1 = sql.read_sql_table("iris", self.conn) - iris_frame2 = sql.read_sql("iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) + tm.assert_frame_equal(res, df) - def test_not_reflect_all_tables(self): - from sqlalchemy import text - from sqlalchemy.engine import Engine - # create invalid table - query_list = [ - text("CREATE TABLE invalid (x INTEGER, y UNKNOWN);"), - text("CREATE TABLE other_table (x INTEGER, y INTEGER);"), - ] - for query in query_list: - if isinstance(self.conn, Engine): - with self.conn.connect() as conn: - with conn.begin(): - conn.execute(query) - else: - with self.conn.begin(): - self.conn.execute(query) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_read_sql_duplicate_columns(conn, request): + # GH#53117 + conn = request.getfixturevalue(conn) + if sql.has_table("test_table", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_table") - with tm.assert_produces_warning(None): - 
sql.read_sql_table("other_table", self.conn) - sql.read_sql_query("SELECT * FROM other_table", self.conn) + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) + df.to_sql(name="test_table", con=conn, index=False) - def test_warning_case_insensitive_table_name(self, test_frame1): - # see gh-7815 - with tm.assert_produces_warning( - UserWarning, - match=( - r"The provided table name 'TABLE1' is not found exactly as such in " - r"the database after writing the table, possibly due to case " - r"sensitivity issues. Consider using lower case table names." - ), - ): - sql.SQLDatabase(self.conn).check_case_sensitive("TABLE1", "") + result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", conn) + expected = DataFrame( + [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], + columns=["a", "b", "a", "c"], + ) + tm.assert_frame_equal(result, expected) - # Test that the warning is certainly NOT triggered in a normal case. - with tm.assert_produces_warning(None): - test_frame1.to_sql(name="CaseSensitive", con=self.conn) - def _get_index_columns(self, tbl_name): - from sqlalchemy.engine import reflection +@pytest.mark.parametrize("conn", all_connectable) +def test_read_table_columns(conn, request, test_frame1): + # test columns argument in read_table + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) - insp = reflection.Inspector.from_engine(self.conn) - ixs = insp.get_indexes("test_index_saved") - ixs = [i["column_names"] for i in ixs] - return ixs + conn = request.getfixturevalue(conn) + sql.to_sql(test_frame1, "test_frame", conn) - def test_sqlalchemy_type_mapping(self): - from sqlalchemy import TIMESTAMP + cols = ["A", "B"] - # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame( - {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} - ) - db = sql.SQLDatabase(self.conn) - table = sql.SQLTable("test_type", db, frame=df) - # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones - assert isinstance(table.table.c["time"].type, TIMESTAMP) + result = sql.read_sql_table("test_frame", conn, columns=cols) + assert result.columns.tolist() == cols - @pytest.mark.parametrize( - "integer, expected", - [ - ("int8", "SMALLINT"), - ("Int8", "SMALLINT"), - ("uint8", "SMALLINT"), - ("UInt8", "SMALLINT"), - ("int16", "SMALLINT"), - ("Int16", "SMALLINT"), - ("uint16", "INTEGER"), - ("UInt16", "INTEGER"), - ("int32", "INTEGER"), - ("Int32", "INTEGER"), - ("uint32", "BIGINT"), - ("UInt32", "BIGINT"), - ("int64", "BIGINT"), - ("Int64", "BIGINT"), - (int, "BIGINT" if np.dtype(int).name == "int64" else "INTEGER"), - ], - ) - def test_sqlalchemy_integer_mapping(self, integer, expected): - # GH35076 Map pandas integer to optimal SQLAlchemy integer type - df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(self.conn) - table = sql.SQLTable("test_type", db, frame=df) - - result = str(table.table.c.a.type) - assert result == expected - - @pytest.mark.parametrize("integer", ["uint64", "UInt64"]) - def test_sqlalchemy_integer_overload_mapping(self, integer): - # GH35076 Map pandas integer to optimal SQLAlchemy integer type - df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(self.conn) - with pytest.raises( - ValueError, match="Unsigned 64 bit integer datatype is not supported" - ): - sql.SQLTable("test_type", db, frame=df) - - def test_database_uri_string(self, test_frame1): - # Test read_sql and .to_sql method with a 
database URI (GH10654) - # db_uri = 'sqlite:///:memory:' # raises - # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near - # "iris": syntax error [SQL: 'iris'] - with tm.ensure_clean() as name: - db_uri = "sqlite:///" + name - table = "iris" - test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) - test_frame2 = sql.read_sql(table, db_uri) - test_frame3 = sql.read_sql_table(table, db_uri) - query = "SELECT * FROM iris" - test_frame4 = sql.read_sql_query(query, db_uri) - tm.assert_frame_equal(test_frame1, test_frame2) - tm.assert_frame_equal(test_frame1, test_frame3) - tm.assert_frame_equal(test_frame1, test_frame4) - - @td.skip_if_installed("pg8000") - def test_pg8000_sqlalchemy_passthrough_error(self): - # using driver that will not be installed on CI to trigger error - # in sqlalchemy.create_engine -> test passing of this error to user - db_uri = "postgresql+pg8000://user:pass@host/dbname" - with pytest.raises(ImportError, match="pg8000"): - sql.read_sql("select * from table", db_uri) - - def test_query_by_text_obj(self): - # WIP : GH10846 - from sqlalchemy import text - name_text = text("select * from iris where name=:name") - iris_df = sql.read_sql(name_text, self.conn, params={"name": "Iris-versicolor"}) - all_names = set(iris_df["Name"]) - assert all_names == {"Iris-versicolor"} - - def test_query_by_select_obj(self): - # WIP : GH10846 - from sqlalchemy import ( - bindparam, - select, - ) +@pytest.mark.parametrize("conn", all_connectable) +def test_read_table_index_col(conn, request, test_frame1): + # test columns argument in read_table + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) - iris = iris_table_metadata(self.flavor) - name_select = select(iris).where(iris.c.Name == bindparam("name")) - iris_df = sql.read_sql(name_select, self.conn, params={"name": "Iris-setosa"}) - all_names = set(iris_df["Name"]) - assert all_names == {"Iris-setosa"} + conn = request.getfixturevalue(conn) + sql.to_sql(test_frame1, "test_frame", conn) - def test_column_with_percentage(self): - # GH 37157 - df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]}) - df.to_sql(name="test_column_percentage", con=self.conn, index=False) + result = sql.read_sql_table("test_frame", conn, index_col="index") + assert result.index.names == ["index"] - res = sql.read_sql_table("test_column_percentage", self.conn) + result = sql.read_sql_table("test_frame", conn, index_col=["A", "B"]) + assert result.index.names == ["A", "B"] - tm.assert_frame_equal(res, df) + result = sql.read_sql_table( + "test_frame", conn, index_col=["A", "B"], columns=["C", "D"] + ) + assert result.index.names == ["A", "B"] + assert result.columns.tolist() == ["C", "D"] -class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): - """ - Test the public sqlite connection fallback API +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_delegate(conn, request): + if conn == "sqlite_buildin_iris": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite_buildin connection does not implement read_sql_table" + ) + ) - """ + conn = request.getfixturevalue(conn) + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) + + iris_frame1 = sql.read_sql_table("iris", conn) + iris_frame2 = sql.read_sql("iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - flavor = "sqlite" - mode = "fallback" - def 
connect(self, database=":memory:"): - return sqlite3.connect(database) +def test_not_reflect_all_tables(sqlite_conn): + conn = sqlite_conn + from sqlalchemy import text + from sqlalchemy.engine import Engine - def test_sql_open_close(self, test_frame3): - # Test if the IO in the database still work if the connection closed - # between the writing and reading (as in many real situations). + # create invalid table + query_list = [ + text("CREATE TABLE invalid (x INTEGER, y UNKNOWN);"), + text("CREATE TABLE other_table (x INTEGER, y INTEGER);"), + ] - with tm.ensure_clean() as name: - with closing(self.connect(name)) as conn: - assert ( - sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) - == 4 - ) + for query in query_list: + if isinstance(conn, Engine): + with conn.connect() as conn: + with conn.begin(): + conn.execute(query) + else: + with conn.begin(): + conn.execute(query) - with closing(self.connect(name)) as conn: - result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) + with tm.assert_produces_warning(None): + sql.read_sql_table("other_table", conn) + sql.read_sql_query("SELECT * FROM other_table", conn) - tm.assert_frame_equal(test_frame3, result) - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") - def test_con_string_import_error(self): - conn = "mysql://root@localhost/pandas" - msg = "Using URI string without sqlalchemy installed" - with pytest.raises(ImportError, match=msg): - sql.read_sql("SELECT * FROM iris", conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_warning_case_insensitive_table_name(conn, request, test_frame1): + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Does not raise warning")) - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") - def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed( - self, + conn = request.getfixturevalue(conn) + # see gh-7815 + with tm.assert_produces_warning( + UserWarning, + match=( + r"The provided table name 'TABLE1' is not found exactly as such in " + r"the database after writing the table, possibly due to case " + r"sensitivity issues. Consider using lower case table names." 
+ ), ): - class MockSqliteConnection: - def __init__(self, *args, **kwargs) -> None: - self.conn = sqlite3.Connection(*args, **kwargs) - - def __getattr__(self, name): - return getattr(self.conn, name) - - def close(self): - self.conn.close() - - with contextlib.closing(MockSqliteConnection(":memory:")) as conn: - with tm.assert_produces_warning(UserWarning): - sql.read_sql("SELECT 1", conn) - - def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) - - msg = "Execution failed on sql 'iris': near \"iris\": syntax error" - with pytest.raises(sql.DatabaseError, match=msg): - sql.read_sql("iris", self.conn) - - def test_get_schema2(self, test_frame1): - # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(test_frame1, "test") - assert "CREATE" in create_sql - - def _get_sqlite_column_type(self, schema, column): - for col in schema.split("\n"): - if col.split()[0].strip('"') == column: - return col.split()[1] - raise ValueError(f"Column {column} not found") - - def test_sqlite_type_mapping(self): - # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame( - {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} - ) - db = sql.SQLiteDatabase(self.conn) - table = sql.SQLiteTable("test_type", db, frame=df) - schema = table.sql_schema() - assert self._get_sqlite_column_type(schema, "time") == "TIMESTAMP" + sql.SQLDatabase(conn).check_case_sensitive("TABLE1", "") + # Test that the warning is certainly NOT triggered in a normal case. + with tm.assert_produces_warning(None): + test_frame1.to_sql(name="CaseSensitive", con=conn) -# ----------------------------------------------------------------------------- -# -- Database flavor specific tests +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_sqlalchemy_type_mapping(conn, request): + conn = request.getfixturevalue(conn) + from sqlalchemy import TIMESTAMP -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") -class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): - """ - Base class for testing the sqlalchemy backend. + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + df = DataFrame( + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} + ) + db = sql.SQLDatabase(conn) + table = sql.SQLTable("test_type", db, frame=df) + # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones + assert isinstance(table.table.c["time"].type, TIMESTAMP) - Subclasses for specific database types are created below. Tests that - deviate for each flavor are overwritten there. 
- """ +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize( + "integer, expected", + [ + ("int8", "SMALLINT"), + ("Int8", "SMALLINT"), + ("uint8", "SMALLINT"), + ("UInt8", "SMALLINT"), + ("int16", "SMALLINT"), + ("Int16", "SMALLINT"), + ("uint16", "INTEGER"), + ("UInt16", "INTEGER"), + ("int32", "INTEGER"), + ("Int32", "INTEGER"), + ("uint32", "BIGINT"), + ("UInt32", "BIGINT"), + ("int64", "BIGINT"), + ("Int64", "BIGINT"), + (int, "BIGINT" if np.dtype(int).name == "int64" else "INTEGER"), + ], +) +def test_sqlalchemy_integer_mapping(conn, request, integer, expected): + # GH35076 Map pandas integer to optimal SQLAlchemy integer type + conn = request.getfixturevalue(conn) + df = DataFrame([0, 1], columns=["a"], dtype=integer) + db = sql.SQLDatabase(conn) + table = sql.SQLTable("test_type", db, frame=df) - flavor: str + result = str(table.table.c.a.type) + assert result == expected - @classmethod - def setup_class(cls): - cls.setup_driver() - cls.setup_engine() - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - try: - self.conn = self.engine.connect() - self.pandasSQL = sql.SQLDatabase(self.conn) - except sqlalchemy.exc.OperationalError: - pytest.skip(f"Can't connect to {self.flavor} server") - self.load_iris_data(iris_path) - self.load_types_data(types_data) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize("integer", ["uint64", "UInt64"]) +def test_sqlalchemy_integer_overload_mapping(conn, request, integer): + conn = request.getfixturevalue(conn) + # GH35076 Map pandas integer to optimal SQLAlchemy integer type + df = DataFrame([0, 1], columns=["a"], dtype=integer) + db = sql.SQLDatabase(conn) + with pytest.raises( + ValueError, match="Unsigned 64 bit integer datatype is not supported" + ): + sql.SQLTable("test_type", db, frame=df) - @classmethod - def setup_driver(cls): - raise NotImplementedError() - @classmethod - def setup_engine(cls): - raise NotImplementedError() +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") +@pytest.mark.parametrize("conn", all_connectable) +def test_database_uri_string(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + # Test read_sql and .to_sql method with a database URI (GH10654) + # db_uri = 'sqlite:///:memory:' # raises + # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near + # "iris": syntax error [SQL: 'iris'] + with tm.ensure_clean() as name: + db_uri = "sqlite:///" + name + table = "iris" + test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) + test_frame2 = sql.read_sql(table, db_uri) + test_frame3 = sql.read_sql_table(table, db_uri) + query = "SELECT * FROM iris" + test_frame4 = sql.read_sql_query(query, db_uri) + tm.assert_frame_equal(test_frame1, test_frame2) + tm.assert_frame_equal(test_frame1, test_frame3) + tm.assert_frame_equal(test_frame1, test_frame4) + + +@td.skip_if_installed("pg8000") +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") +@pytest.mark.parametrize("conn", all_connectable) +def test_pg8000_sqlalchemy_passthrough_error(conn, request): + conn = request.getfixturevalue(conn) + # using driver that will not be installed on CI to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user + db_uri = "postgresql+pg8000://user:pass@host/dbname" + with pytest.raises(ImportError, match="pg8000"): + sql.read_sql("select * from table", db_uri) - def test_read_sql_parameter(self, sql_strings): - 
self._read_sql_iris_parameter(sql_strings) - def test_read_sql_named_parameter(self, sql_strings): - self._read_sql_iris_named_parameter(sql_strings) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_query_by_text_obj(conn, request): + # WIP : GH10846 + conn_name = conn + conn = request.getfixturevalue(conn) + from sqlalchemy import text - def test_to_sql_empty(self, test_frame1): - self._to_sql_empty(test_frame1) + if "postgres" in conn_name: + name_text = text('select * from iris where "Name"=:name') + else: + name_text = text("select * from iris where name=:name") + iris_df = sql.read_sql(name_text, conn, params={"name": "Iris-versicolor"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-versicolor"} - def test_create_table(self): - from sqlalchemy import inspect - temp_conn = self.connect() - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - with sql.SQLDatabase(temp_conn, need_transaction=True) as pandasSQL: - assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_query_by_select_obj(conn, request): + conn = request.getfixturevalue(conn) + # WIP : GH10846 + from sqlalchemy import ( + bindparam, + select, + ) - insp = inspect(temp_conn) - assert insp.has_table("temp_frame") + iris = iris_table_metadata("postgres") + name_select = select(iris).where(iris.c.Name == bindparam("name")) + iris_df = sql.read_sql(name_select, conn, params={"name": "Iris-setosa"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-setosa"} - # Cleanup - with sql.SQLDatabase(temp_conn, need_transaction=True) as pandasSQL: - pandasSQL.drop_table("temp_frame") - def test_drop_table(self): - from sqlalchemy import inspect +@pytest.mark.parametrize("conn", all_connectable) +def test_column_with_percentage(conn, request): + # GH 37157 + conn_name = conn + if conn_name == "sqlite_buildin": + request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) - temp_conn = self.connect() - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - pandasSQL = sql.SQLDatabase(temp_conn) - assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]}) + df.to_sql(name="test_column_percentage", con=conn, index=False) - insp = inspect(temp_conn) - assert insp.has_table("temp_frame") + res = sql.read_sql_table("test_column_percentage", conn) - pandasSQL.drop_table("temp_frame") - try: - insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior - except AttributeError: - pass - assert not insp.has_table("temp_frame") + tm.assert_frame_equal(res, df) - def test_roundtrip(self, test_frame1): - self._roundtrip(test_frame1) - def test_execute_sql(self): - self._execute_sql() +def test_sql_open_close(test_frame3): + # Test if the IO in the database still work if the connection closed + # between the writing and reading (as in many real situations). 
- def test_read_table(self): - iris_frame = sql.read_sql_table("iris", con=self.conn) - check_iris_frame(iris_frame) + with tm.ensure_clean() as name: + with closing(sqlite3.connect(name)) as conn: + assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4 - def test_read_table_columns(self): - iris_frame = sql.read_sql_table( - "iris", con=self.conn, columns=["SepalLength", "SepalLength"] - ) - tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) + with closing(sqlite3.connect(name)) as conn: + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) - def test_read_table_absent_raises(self): - msg = "Table this_doesnt_exist not found" - with pytest.raises(ValueError, match=msg): - sql.read_sql_table("this_doesnt_exist", con=self.conn) + tm.assert_frame_equal(test_frame3, result) - def test_default_type_conversion(self): - df = sql.read_sql_table("types", self.conn) - assert issubclass(df.FloatCol.dtype.type, np.floating) - assert issubclass(df.IntCol.dtype.type, np.integer) - assert issubclass(df.BoolCol.dtype.type, np.bool_) +@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +def test_con_string_import_error(): + conn = "mysql://root@localhost/pandas" + msg = "Using URI string without sqlalchemy installed" + with pytest.raises(ImportError, match=msg): + sql.read_sql("SELECT * FROM iris", conn) - # Int column with NA values stays as float - assert issubclass(df.IntColWithNull.dtype.type, np.floating) - # Bool column with NA values becomes object - assert issubclass(df.BoolColWithNull.dtype.type, object) - def test_bigint(self): - # int64 should be converted to BigInteger, GH7433 - df = DataFrame(data={"i64": [2**62]}) - assert df.to_sql(name="test_bigint", con=self.conn, index=False) == 1 - result = sql.read_sql_table("test_bigint", self.conn) - - tm.assert_frame_equal(df, result) - - def test_default_date_load(self): - df = sql.read_sql_table("types", self.conn) - - # IMPORTANT - sqlite has no native date type, so shouldn't parse, but - # MySQL SHOULD be converted. 
- assert issubclass(df.DateCol.dtype.type, np.datetime64) - - def test_datetime_with_timezone(self, request): - # edge case that converts postgresql datetime with time zone types - # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok - # but should be more natural, so coerce to datetime64[ns] for now - - def check(col): - # check that a column is either datetime64[ns] - # or datetime64[ns, UTC] - if lib.is_np_dtype(col.dtype, "M"): - # "2000-01-01 00:00:00-08:00" should convert to - # "2000-01-01 08:00:00" - assert col[0] == Timestamp("2000-01-01 08:00:00") - - # "2000-06-01 00:00:00-07:00" should convert to - # "2000-06-01 07:00:00" - assert col[1] == Timestamp("2000-06-01 07:00:00") - - elif isinstance(col.dtype, DatetimeTZDtype): - assert str(col.dt.tz) == "UTC" - - # "2000-01-01 00:00:00-08:00" should convert to - # "2000-01-01 08:00:00" - # "2000-06-01 00:00:00-07:00" should convert to - # "2000-06-01 07:00:00" - # GH 6415 - expected_data = [ - Timestamp("2000-01-01 08:00:00", tz="UTC"), - Timestamp("2000-06-01 07:00:00", tz="UTC"), - ] - expected = Series(expected_data, name=col.name) - tm.assert_series_equal(col, expected) +@pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") +def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed(): + class MockSqliteConnection: + def __init__(self, *args, **kwargs) -> None: + self.conn = sqlite3.Connection(*args, **kwargs) - else: - raise AssertionError( - f"DateCol loaded with incorrect type -> {col.dtype}" - ) + def __getattr__(self, name): + return getattr(self.conn, name) - # GH11216 - df = read_sql_query("select * from types", self.conn) - if not hasattr(df, "DateColWithTz"): - request.node.add_marker( - pytest.mark.xfail(reason="no column with datetime with time zone") - ) + def close(self): + self.conn.close() - # this is parsed on Travis (linux), but not on macosx for some reason - # even with the same versions of psycopg2 & sqlalchemy, possibly a - # Postgresql server version difference - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) + with contextlib.closing(MockSqliteConnection(":memory:")) as conn: + with tm.assert_produces_warning(UserWarning): + sql.read_sql("SELECT 1", conn) - df = read_sql_query( - "select * from types", self.conn, parse_dates=["DateColWithTz"] - ) - if not hasattr(df, "DateColWithTz"): - request.node.add_marker( - pytest.mark.xfail(reason="no column with datetime with time zone") - ) - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - assert str(col.dt.tz) == "UTC" - check(df.DateColWithTz) - - df = concat( - list(read_sql_query("select * from types", self.conn, chunksize=1)), - ignore_index=True, - ) - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - assert str(col.dt.tz) == "UTC" - expected = sql.read_sql_table("types", self.conn) - col = expected.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz) - - # xref #7139 - # this might or might not be converted depending on the postgres driver - df = sql.read_sql_table("types", self.conn) - check(df.DateColWithTz) - - def test_datetime_with_timezone_roundtrip(self): - # GH 9086 - # Write datetimetz data to a db and read it back - # For dbs that support timestamps with timezones, should get back UTC - # otherwise naive data should be returned - expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} - ) - assert 
expected.to_sql(name="test_datetime_tz", con=self.conn, index=False) == 3 - if self.flavor == "postgresql": - # SQLAlchemy "timezones" (i.e. offsets) are coerced to UTC - expected["A"] = expected["A"].dt.tz_convert("UTC") - else: - # Otherwise, timestamps are returned as local, naive - expected["A"] = expected["A"].dt.tz_localize(None) +def test_sqlite_read_sql_delegate(sqlite_buildin_iris): + conn = sqlite_buildin_iris + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - result = sql.read_sql_table("test_datetime_tz", self.conn) - tm.assert_frame_equal(result, expected) + msg = "Execution failed on sql 'iris': near \"iris\": syntax error" + with pytest.raises(sql.DatabaseError, match=msg): + sql.read_sql("iris", conn) - result = sql.read_sql_query("SELECT * FROM test_datetime_tz", self.conn) - if self.flavor == "sqlite": - # read_sql_query does not return datetime type like read_sql_table - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, expected) - def test_out_of_bounds_datetime(self): - # GH 26761 - data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) - assert data.to_sql(name="test_datetime_obb", con=self.conn, index=False) == 1 - result = sql.read_sql_table("test_datetime_obb", self.conn) - expected = DataFrame([pd.NaT], columns=["date"]) - tm.assert_frame_equal(result, expected) +def test_get_schema2(test_frame1): + # without providing a connection object (available for backwards comp) + create_sql = sql.get_schema(test_frame1, "test") + assert "CREATE" in create_sql - def test_naive_datetimeindex_roundtrip(self): - # GH 23510 - # Ensure that a naive DatetimeIndex isn't converted to UTC - dates = date_range("2018-01-01", periods=5, freq="6H")._with_freq(None) - expected = DataFrame({"nums": range(5)}, index=dates) - assert ( - expected.to_sql(name="foo_table", con=self.conn, index_label="info_date") - == 5 - ) - result = sql.read_sql_table("foo_table", self.conn, index_col="info_date") - # result index with gain a name from a set_index operation; expected - tm.assert_frame_equal(result, expected, check_names=False) - def test_date_parsing(self): - # No Parsing - df = sql.read_sql_table("types", self.conn) - expected_type = object if self.flavor == "sqlite" else np.datetime64 - assert issubclass(df.DateCol.dtype.type, expected_type) +def test_sqlite_type_mapping(sqlite_buildin): + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + conn = sqlite_buildin + df = DataFrame( + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} + ) + db = sql.SQLiteDatabase(conn) + table = sql.SQLiteTable("test_type", db, frame=df) + schema = table.sql_schema() + for col in schema.split("\n"): + if col.split()[0].strip('"') == "time": + assert col.split()[1] == "TIMESTAMP" - df = sql.read_sql_table("types", self.conn, parse_dates=["DateCol"]) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table( - "types", self.conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"} - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) +# ----------------------------------------------------------------------------- +# -- Database flavor specific tests - df = sql.read_sql_table( - "types", - self.conn, - parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types", 
self.conn, parse_dates=["IntDateCol"]) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_create_table(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") - df = sql.read_sql_table("types", self.conn, parse_dates={"IntDateCol": "s"}) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + conn = request.getfixturevalue(conn) - df = sql.read_sql_table( - "types", self.conn, parse_dates={"IntDateCol": {"unit": "s"}} - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + from sqlalchemy import inspect - def test_datetime(self): - df = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} - ) - assert df.to_sql(name="test_datetime", con=self.conn) == 3 + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - # with read_table -> type information from schema used - result = sql.read_sql_table("test_datetime", self.conn) - result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) + insp = inspect(conn) + assert insp.has_table("temp_frame") - # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) - result = result.drop("index", axis=1) - if self.flavor == "sqlite": - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + # Cleanup + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("temp_frame") - def test_datetime_NaT(self): - df = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} - ) - df.loc[1, "A"] = np.nan - assert df.to_sql(name="test_datetime", con=self.conn, index=False) == 3 - # with read_table -> type information from schema used - result = sql.read_sql_table("test_datetime", self.conn) - tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_drop_table(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") - # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) - if self.flavor == "sqlite": - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) - - def test_datetime_date(self): - # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - assert df.to_sql(name="test_date", con=self.conn, index=False) == 2 - res = read_sql_table("test_date", self.conn) - result = res["a"] - expected = to_datetime(df["a"]) - # comes back as datetime64 - tm.assert_series_equal(result, expected) - - def test_datetime_time(self, sqlite_buildin): - # test support for datetime.time - df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - assert df.to_sql(name="test_time", con=self.conn, index=False) == 2 - res = read_sql_table("test_time", self.conn) - tm.assert_frame_equal(res, df) - - # GH8341 - # first, use the fallback to have the sqlite adapter put in place - sqlite_conn = sqlite_buildin - assert sql.to_sql(df, "test_time2", sqlite_conn, index=False) == 2 - res = 
sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn) - ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(ref, res) # check if adapter is in place - # then test if sqlalchemy is unaffected by the sqlite adapter - assert sql.to_sql(df, "test_time3", self.conn, index=False) == 2 - if self.flavor == "sqlite": - res = sql.read_sql_query("SELECT * FROM test_time3", self.conn) - ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(ref, res) - res = sql.read_sql_table("test_time3", self.conn) - tm.assert_frame_equal(df, res) - - def test_mixed_dtype_insert(self): - # see GH6509 - s1 = Series(2**25 + 1, dtype=np.int32) - s2 = Series(0.0, dtype=np.float32) - df = DataFrame({"s1": s1, "s2": s2}) - - # write and read again - assert df.to_sql(name="test_read_write", con=self.conn, index=False) == 1 - df2 = sql.read_sql_table("test_read_write", self.conn) - - tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) - - def test_nan_numeric(self): - # NaNs in numeric float column - df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) - assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3 - - # with read_table - result = sql.read_sql_table("test_nan", self.conn) - tm.assert_frame_equal(result, df) + conn = request.getfixturevalue(conn) - # with read_sql - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) - tm.assert_frame_equal(result, df) + from sqlalchemy import inspect - def test_nan_fullcolumn(self): - # full NaN column (numeric float column) - df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) - assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3 + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + pandasSQL = sql.SQLDatabase(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - # with read_table - result = sql.read_sql_table("test_nan", self.conn) - tm.assert_frame_equal(result, df) + insp = inspect(conn) + assert insp.has_table("temp_frame") - # with read_sql -> not type info from table -> stays None - df["B"] = df["B"].astype("object") - df["B"] = None - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) - tm.assert_frame_equal(result, df) + with pandasSQL.run_transaction(): + pandasSQL.drop_table("temp_frame") + try: + insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior + except AttributeError: + pass + assert not insp.has_table("temp_frame") - def test_nan_string(self): - # NaNs in string column - df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]}) - assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3 - # NaNs are coming back as None - df.loc[2, "B"] = None +@pytest.mark.parametrize("conn", all_connectable) +def test_roundtrip(conn, request, test_frame1): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") - # with read_table - result = sql.read_sql_table("test_nan", self.conn) - tm.assert_frame_equal(result, df) + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 + result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") - # with read_sql - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) - tm.assert_frame_equal(result, df) + result.set_index("level_0", inplace=True) + # result.index.astype(int) - def _get_index_columns(self, tbl_name): - from sqlalchemy import 
inspect + result.index.name = None - insp = inspect(self.conn) + tm.assert_frame_equal(result, test_frame1) - ixs = insp.get_indexes(tbl_name) - ixs = [i["column_names"] for i in ixs] - return ixs - def test_to_sql_save_index(self): - self._to_sql_save_index() +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_execute_sql(conn, request): + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + iris_results = pandasSQL.execute("SELECT * FROM iris") + row = iris_results.fetchone() + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - def test_transactions(self): - self._transaction_test() - def test_get_schema_create_table(self, test_frame3): - # Use a dataframe without a bool column, since MySQL converts bool to - # TINYINT (which read_sql_table returns as an int and causes a dtype - # mismatch) - from sqlalchemy import text - from sqlalchemy.engine import Engine +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_sqlalchemy_read_table(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_table("iris", con=conn) + check_iris_frame(iris_frame) - tbl = "test_get_schema_create_table" - create_sql = sql.get_schema(test_frame3, tbl, con=self.conn) - blank_test_df = test_frame3.iloc[:0] - self.drop_table(tbl, self.conn) - create_sql = text(create_sql) - if isinstance(self.conn, Engine): - with self.conn.connect() as conn: - with conn.begin(): - conn.execute(create_sql) - else: - with self.conn.begin(): - self.conn.execute(create_sql) - returned_df = sql.read_sql_table(tbl, self.conn) - tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) - self.drop_table(tbl, self.conn) - - def test_dtype(self): - from sqlalchemy import ( - TEXT, - String, - ) - from sqlalchemy.schema import MetaData - - cols = ["A", "B"] - data = [(0.8, True), (0.9, None)] - df = DataFrame(data, columns=cols) - assert df.to_sql(name="dtype_test", con=self.conn) == 2 - assert df.to_sql(name="dtype_test2", con=self.conn, dtype={"B": TEXT}) == 2 - meta = MetaData() - meta.reflect(bind=self.conn) - sqltype = meta.tables["dtype_test2"].columns["B"].type - assert isinstance(sqltype, TEXT) - msg = "The type of B is not a SQLAlchemy type" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="error", con=self.conn, dtype={"B": str}) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_sqlalchemy_read_table_columns(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_table( + "iris", con=conn, columns=["SepalLength", "SepalLength"] + ) + tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) - # GH9083 - assert ( - df.to_sql(name="dtype_test3", con=self.conn, dtype={"B": String(10)}) == 2 - ) - meta.reflect(bind=self.conn) - sqltype = meta.tables["dtype_test3"].columns["B"].type - assert isinstance(sqltype, String) - assert sqltype.length == 10 - - # single dtype - assert df.to_sql(name="single_dtype_test", con=self.conn, dtype=TEXT) == 2 - meta.reflect(bind=self.conn) - sqltypea = meta.tables["single_dtype_test"].columns["A"].type - sqltypeb = meta.tables["single_dtype_test"].columns["B"].type - assert isinstance(sqltypea, TEXT) - assert isinstance(sqltypeb, TEXT) - - def test_notna_dtype(self): - from sqlalchemy import ( - Boolean, - DateTime, - Float, - Integer, - ) - from sqlalchemy.schema import MetaData - cols = { - "Bool": Series([True, None]), - "Date": Series([datetime(2012, 5, 1), None]), - "Int": 
Series([1, None], dtype="object"), - "Float": Series([1.1, None]), - } - df = DataFrame(cols) - - tbl = "notna_dtype_test" - assert df.to_sql(name=tbl, con=self.conn) == 2 - _ = sql.read_sql_table(tbl, self.conn) - meta = MetaData() - meta.reflect(bind=self.conn) - my_type = Integer if self.flavor == "mysql" else Boolean - col_dict = meta.tables[tbl].columns - assert isinstance(col_dict["Bool"].type, my_type) - assert isinstance(col_dict["Date"].type, DateTime) - assert isinstance(col_dict["Int"].type, Integer) - assert isinstance(col_dict["Float"].type, Float) - - def test_double_precision(self): - from sqlalchemy import ( - BigInteger, - Float, - Integer, - ) - from sqlalchemy.schema import MetaData +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_read_table_absent_raises(conn, request): + conn = request.getfixturevalue(conn) + msg = "Table this_doesnt_exist not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("this_doesnt_exist", con=conn) - V = 1.23456789101112131415 - df = DataFrame( - { - "f32": Series([V], dtype="float32"), - "f64": Series([V], dtype="float64"), - "f64_as_f32": Series([V], dtype="float64"), - "i32": Series([5], dtype="int32"), - "i64": Series([5], dtype="int64"), - } +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_sqlalchemy_default_type_conversion(conn, request): + conn_name = conn + if conn_name == "sqlite_str": + pytest.skip("types tables not created in sqlite_str fixture") + elif "mysql" in conn_name or "sqlite" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="boolean dtype not inferred properly") ) - assert ( - df.to_sql( - name="test_dtypes", - con=self.conn, - index=False, - if_exists="replace", - dtype={"f64_as_f32": Float(precision=23)}, - ) - == 1 - ) - res = sql.read_sql_table("test_dtypes", self.conn) - - # check precision of float64 - assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) - - # check sql types - meta = MetaData() - meta.reflect(bind=self.conn) - col_dict = meta.tables["test_dtypes"].columns - assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) - assert isinstance(col_dict["f32"].type, Float) - assert isinstance(col_dict["f64"].type, Float) - assert isinstance(col_dict["i32"].type, Integer) - assert isinstance(col_dict["i64"].type, BigInteger) - - def test_connectable_issue_example(self): - # This tests the example raised in issue - # https://github.com/pandas-dev/pandas/issues/10104 - from sqlalchemy.engine import Engine + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) - def test_select(connection): - query = "SELECT test_foo_data FROM test_foo_data" - return sql.read_sql_query(query, con=connection) + assert issubclass(df.FloatCol.dtype.type, np.floating) + assert issubclass(df.IntCol.dtype.type, np.integer) + assert issubclass(df.BoolCol.dtype.type, np.bool_) - def test_append(connection, data): - data.to_sql(name="test_foo_data", con=connection, if_exists="append") + # Int column with NA values stays as float + assert issubclass(df.IntColWithNull.dtype.type, np.floating) + # Bool column with NA values becomes object + assert issubclass(df.BoolColWithNull.dtype.type, object) - def test_connectable(conn): - # https://github.com/sqlalchemy/sqlalchemy/commit/ - # 00b5c10846e800304caa86549ab9da373b42fa5d#r48323973 - foo_data = test_select(conn) - test_append(conn, foo_data) - def main(connectable): - if isinstance(connectable, Engine): - with connectable.connect() as conn: - with 
conn.begin(): - test_connectable(conn) - else: - test_connectable(connectable) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_bigint(conn, request): + # int64 should be converted to BigInteger, GH7433 + conn = request.getfixturevalue(conn) + df = DataFrame(data={"i64": [2**62]}) + assert df.to_sql(name="test_bigint", con=conn, index=False) == 1 + result = sql.read_sql_table("test_bigint", conn) - assert ( - DataFrame({"test_foo_data": [0, 1, 2]}).to_sql( - name="test_foo_data", con=self.conn - ) - == 3 + tm.assert_frame_equal(df, result) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_default_date_load(conn, request): + conn_name = conn + if conn_name == "sqlite_str": + pytest.skip("types tables not created in sqlite_str fixture") + elif "sqlite" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="sqlite does not read date properly") ) - main(self.conn) - - @pytest.mark.parametrize( - "input", - [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}], - ) - def test_to_sql_with_negative_npinf(self, input, request): - # GH 34431 - - df = DataFrame(input) - - if self.flavor == "mysql": - # GH 36465 - # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error - # for pymysql version >= 0.10 - # TODO(GH#36465): remove this version check after GH 36465 is fixed - pymysql = pytest.importorskip("pymysql") - - if ( - Version(pymysql.__version__) < Version("1.0.3") - and "infe0" in df.columns - ): - mark = pytest.mark.xfail(reason="GH 36465") - request.node.add_marker(mark) - - msg = "inf cannot be used with MySQL" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="foobar", con=self.conn, index=False) + + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) + + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_datetime_with_timezone(conn, request): + # edge case that converts postgresql datetime with time zone types + # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok + # but should be more natural, so coerce to datetime64[ns] for now + + def check(col): + # check that a column is either datetime64[ns] + # or datetime64[ns, UTC] + if lib.is_np_dtype(col.dtype, "M"): + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + assert col[0] == Timestamp("2000-01-01 08:00:00") + + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + assert col[1] == Timestamp("2000-06-01 07:00:00") + + elif isinstance(col.dtype, DatetimeTZDtype): + assert str(col.dt.tz) == "UTC" + + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + # GH 6415 + expected_data = [ + Timestamp("2000-01-01 08:00:00", tz="UTC"), + Timestamp("2000-06-01 07:00:00", tz="UTC"), + ] + expected = Series(expected_data, name=col.name) + tm.assert_series_equal(col, expected) + else: - assert df.to_sql(name="foobar", con=self.conn, index=False) == 1 - res = sql.read_sql_table("foobar", self.conn) - tm.assert_equal(df, res) - - def test_temporary_table(self): - from sqlalchemy import ( - Column, - Integer, - Unicode, - select, + raise AssertionError(f"DateCol loaded with incorrect type -> {col.dtype}") + + # GH11216 + conn = request.getfixturevalue(conn) + df = read_sql_query("select * from types", conn) + if not hasattr(df, "DateColWithTz"): + request.node.add_marker( + 
pytest.mark.xfail(reason="no column with datetime with time zone") + ) + + # this is parsed on Travis (linux), but not on macosx for some reason + # even with the same versions of psycopg2 & sqlalchemy, possibly a + # Postgresql server version difference + col = df.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + + df = read_sql_query("select * from types", conn, parse_dates=["DateColWithTz"]) + if not hasattr(df, "DateColWithTz"): + request.node.add_marker( + pytest.mark.xfail(reason="no column with datetime with time zone") + ) + col = df.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + assert str(col.dt.tz) == "UTC" + check(df.DateColWithTz) + + df = concat( + list(read_sql_query("select * from types", conn, chunksize=1)), + ignore_index=True, + ) + col = df.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + assert str(col.dt.tz) == "UTC" + expected = sql.read_sql_table("types", conn) + col = expected.DateColWithTz + assert isinstance(col.dtype, DatetimeTZDtype) + tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz) + + # xref #7139 + # this might or might not be converted depending on the postgres driver + df = sql.read_sql_table("types", conn) + check(df.DateColWithTz) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_with_timezone_roundtrip(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + # GH 9086 + # Write datetimetz data to a db and read it back + # For dbs that support timestamps with timezones, should get back UTC + # otherwise naive data should be returned + expected = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + ) + assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 + + if "postgresql" in conn_name: + # SQLAlchemy "timezones" (i.e. 
offsets) are coerced to UTC + expected["A"] = expected["A"].dt.tz_convert("UTC") + else: + # Otherwise, timestamps are returned as local, naive + expected["A"] = expected["A"].dt.tz_localize(None) + + result = sql.read_sql_table("test_datetime_tz", conn) + tm.assert_frame_equal(result, expected) + + result = sql.read_sql_query("SELECT * FROM test_datetime_tz", conn) + if "sqlite" in conn_name: + # read_sql_query does not return datetime type like read_sql_table + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_out_of_bounds_datetime(conn, request): + # GH 26761 + conn = request.getfixturevalue(conn) + data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) + assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 + result = sql.read_sql_table("test_datetime_obb", conn) + expected = DataFrame([pd.NaT], columns=["date"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_naive_datetimeindex_roundtrip(conn, request): + # GH 23510 + # Ensure that a naive DatetimeIndex isn't converted to UTC + conn = request.getfixturevalue(conn) + dates = date_range("2018-01-01", periods=5, freq="6H")._with_freq(None) + expected = DataFrame({"nums": range(5)}, index=dates) + assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 + result = sql.read_sql_table("foo_table", conn, index_col="info_date") + # result index with gain a name from a set_index operation; expected + tm.assert_frame_equal(result, expected, check_names=False) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_date_parsing(conn, request): + # No Parsing + conn_name = conn + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) + expected_type = object if "sqlite" in conn_name else np.datetime64 + assert issubclass(df.DateCol.dtype.type, expected_type) + + df = sql.read_sql_table("types", conn, parse_dates=["DateCol"]) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table( + "types", + conn, + parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates=["IntDateCol"]) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": "s"}) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": {"unit": "s"}}) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + assert df.to_sql(name="test_datetime", con=conn) == 3 + + # with read_table -> type information from schema used + result = sql.read_sql_table("test_datetime", conn) + result = result.drop("index", axis=1) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query("SELECT * FROM test_datetime", conn) + result = 
result.drop("index", axis=1) + if "sqlite" in conn_name: + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, df) + else: + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_NaT(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.loc[1, "A"] = np.nan + assert df.to_sql(name="test_datetime", con=conn, index=False) == 3 + + # with read_table -> type information from schema used + result = sql.read_sql_table("test_datetime", conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query("SELECT * FROM test_datetime", conn) + if "sqlite" in conn_name: + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"], errors="coerce") + tm.assert_frame_equal(result, df) + else: + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_date(conn, request): + # test support for datetime.date + conn = request.getfixturevalue(conn) + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + assert df.to_sql(name="test_date", con=conn, index=False) == 2 + res = read_sql_table("test_date", conn) + result = res["a"] + expected = to_datetime(df["a"]) + # comes back as datetime64 + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_time(conn, request, sqlite_buildin): + # test support for datetime.time + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + assert df.to_sql(name="test_time", con=conn, index=False) == 2 + res = read_sql_table("test_time", conn) + tm.assert_frame_equal(res, df) + + # GH8341 + # first, use the fallback to have the sqlite adapter put in place + sqlite_conn = sqlite_buildin + assert sql.to_sql(df, "test_time2", sqlite_conn, index=False) == 2 + res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn) + ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(ref, res) # check if adapter is in place + # then test if sqlalchemy is unaffected by the sqlite adapter + assert sql.to_sql(df, "test_time3", conn, index=False) == 2 + if "sqlite" in conn_name: + res = sql.read_sql_query("SELECT * FROM test_time3", conn) + ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(ref, res) + res = sql.read_sql_table("test_time3", conn) + tm.assert_frame_equal(df, res) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_mixed_dtype_insert(conn, request): + # see GH6509 + conn = request.getfixturevalue(conn) + s1 = Series(2**25 + 1, dtype=np.int32) + s2 = Series(0.0, dtype=np.float32) + df = DataFrame({"s1": s1, "s2": s2}) + + # write and read again + assert df.to_sql(name="test_read_write", con=conn, index=False) == 1 + df2 = sql.read_sql_table("test_read_write", conn) + + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_nan_numeric(conn, request): + # NaNs in numeric float column + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + assert df.to_sql(name="test_nan", con=conn, index=False) == 3 + + # with read_table + result = 
sql.read_sql_table("test_nan", conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query("SELECT * FROM test_nan", conn) + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_nan_fullcolumn(conn, request): + # full NaN column (numeric float column) + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) + assert df.to_sql(name="test_nan", con=conn, index=False) == 3 + + # with read_table + result = sql.read_sql_table("test_nan", conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> not type info from table -> stays None + df["B"] = df["B"].astype("object") + df["B"] = None + result = sql.read_sql_query("SELECT * FROM test_nan", conn) + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_nan_string(conn, request): + # NaNs in string column + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]}) + assert df.to_sql(name="test_nan", con=conn, index=False) == 3 + + # NaNs are coming back as None + df.loc[2, "B"] = None + + # with read_table + result = sql.read_sql_table("test_nan", conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query("SELECT * FROM test_nan", conn) + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_save_index(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] + ) + + pandasSQL = pandasSQL_builder(conn) + tbl_name = "test_to_sql_saves_index" + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(df, tbl_name) == 2 + + if conn_name in {"sqlite_buildin", "sqlite_str"}: + ixs = sql.read_sql_query( + "SELECT * FROM sqlite_master WHERE type = 'index' " + f"AND tbl_name = '{tbl_name}'", + conn, ) - from sqlalchemy.orm import ( - Session, - declarative_base, + ix_cols = [] + for ix_name in ixs.name: + ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", conn) + ix_cols.append(ix_info.name.tolist()) + else: + from sqlalchemy import inspect + + insp = inspect(conn) + + ixs = insp.get_indexes(tbl_name) + ix_cols = [i["column_names"] for i in ixs] + + assert ix_cols == [["A"]] + + +@pytest.mark.parametrize("conn", all_connectable) +def test_transactions(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + + stmt = "CREATE TABLE test_trans (A INT, B TEXT)" + pandasSQL = pandasSQL_builder(conn) + if conn_name != "sqlite_buildin": + from sqlalchemy import text + + stmt = text(stmt) + + with pandasSQL.run_transaction() as trans: + trans.execute(stmt) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_transaction_rollback(conn, request): + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction() as trans: + stmt = "CREATE TABLE test_trans (A INT, B TEXT)" + if isinstance(pandasSQL, SQLiteDatabase): + trans.execute(stmt) + else: + from sqlalchemy import text + + stmt = text(stmt) + trans.execute(stmt) + + class DummyException(Exception): + pass + + # Make sure when transaction is rolled back, no rows get inserted + ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" + if isinstance(pandasSQL, SQLDatabase): + from sqlalchemy import text + + ins_sql = text(ins_sql) + try: + with pandasSQL.run_transaction() as trans: + 
trans.execute(ins_sql) + raise DummyException("error") + except DummyException: + # ignore raised exception + pass + with pandasSQL.run_transaction(): + res = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res) == 0 + + # Make sure when transaction is committed, rows do get inserted + with pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + res2 = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res2) == 1 + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_get_schema_create_table(conn, request, test_frame3): + # Use a dataframe without a bool column, since MySQL converts bool to + # TINYINT (which read_sql_table returns as an int and causes a dtype + # mismatch) + if conn == "sqlite_str": + request.node.add_marker( + pytest.mark.xfail(reason="test does not support sqlite_str fixture") ) - test_data = "Hello, World!" - expected = DataFrame({"spam": [test_data]}) - Base = declarative_base() - - class Temporary(Base): - __tablename__ = "temp_test" - __table_args__ = {"prefixes": ["TEMPORARY"]} - id = Column(Integer, primary_key=True) - spam = Column(Unicode(30), nullable=False) - - with Session(self.conn) as session: - with session.begin(): - conn = session.connection() - Temporary.__table__.create(conn) - session.add(Temporary(spam=test_data)) - session.flush() - df = sql.read_sql_query(sql=select(Temporary.spam), con=conn) - tm.assert_frame_equal(df, expected) - - # -- SQL Engine tests (in the base class for now) - def test_invalid_engine(self, test_frame1): - msg = "engine must be one of 'auto', 'sqlalchemy'" + conn = request.getfixturevalue(conn) + + from sqlalchemy import text + from sqlalchemy.engine import Engine + + tbl = "test_get_schema_create_table" + create_sql = sql.get_schema(test_frame3, tbl, con=conn) + blank_test_df = test_frame3.iloc[:0] + + create_sql = text(create_sql) + if isinstance(conn, Engine): + with conn.connect() as newcon: + with newcon.begin(): + newcon.execute(create_sql) + else: + conn.execute(create_sql) + returned_df = sql.read_sql_table(tbl, conn) + tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_dtype(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + TEXT, + String, + ) + from sqlalchemy.schema import MetaData + + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] + df = DataFrame(data, columns=cols) + assert df.to_sql(name="dtype_test", con=conn) == 2 + assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": TEXT}) == 2 + meta = MetaData() + meta.reflect(bind=conn) + sqltype = meta.tables["dtype_test2"].columns["B"].type + assert isinstance(sqltype, TEXT) + msg = "The type of B is not a SQLAlchemy type" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="error", con=conn, dtype={"B": str}) + + # GH9083 + assert df.to_sql(name="dtype_test3", con=conn, dtype={"B": String(10)}) == 2 + meta.reflect(bind=conn) + sqltype = meta.tables["dtype_test3"].columns["B"].type + assert isinstance(sqltype, String) + assert sqltype.length == 10 + + # single dtype + assert df.to_sql(name="single_dtype_test", con=conn, dtype=TEXT) == 2 + meta.reflect(bind=conn) + sqltypea = meta.tables["single_dtype_test"].columns["A"].type + sqltypeb = meta.tables["single_dtype_test"].columns["B"].type + assert isinstance(sqltypea, TEXT) + assert isinstance(sqltypeb, TEXT) + + 
+@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_notna_dtype(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn_name = conn + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + Boolean, + DateTime, + Float, + Integer, + ) + from sqlalchemy.schema import MetaData + + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + df = DataFrame(cols) + + tbl = "notna_dtype_test" + assert df.to_sql(name=tbl, con=conn) == 2 + _ = sql.read_sql_table(tbl, conn) + meta = MetaData() + meta.reflect(bind=conn) + my_type = Integer if "mysql" in conn_name else Boolean + col_dict = meta.tables[tbl].columns + assert isinstance(col_dict["Bool"].type, my_type) + assert isinstance(col_dict["Date"].type, DateTime) + assert isinstance(col_dict["Int"].type, Integer) + assert isinstance(col_dict["Float"].type, Float) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_double_precision(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + BigInteger, + Float, + Integer, + ) + from sqlalchemy.schema import MetaData + + V = 1.23456789101112131415 + + df = DataFrame( + { + "f32": Series([V], dtype="float32"), + "f64": Series([V], dtype="float64"), + "f64_as_f32": Series([V], dtype="float64"), + "i32": Series([5], dtype="int32"), + "i64": Series([5], dtype="int64"), + } + ) + + assert ( + df.to_sql( + name="test_dtypes", + con=conn, + index=False, + if_exists="replace", + dtype={"f64_as_f32": Float(precision=23)}, + ) + == 1 + ) + res = sql.read_sql_table("test_dtypes", conn) + + # check precision of float64 + assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) + + # check sql types + meta = MetaData() + meta.reflect(bind=conn) + col_dict = meta.tables["test_dtypes"].columns + assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) + assert isinstance(col_dict["f32"].type, Float) + assert isinstance(col_dict["f64"].type, Float) + assert isinstance(col_dict["i32"].type, Integer) + assert isinstance(col_dict["i64"].type, BigInteger) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_connectable_issue_example(conn, request): + conn = request.getfixturevalue(conn) + + # This tests the example raised in issue + # https://github.com/pandas-dev/pandas/issues/10104 + from sqlalchemy.engine import Engine + + def test_select(connection): + query = "SELECT test_foo_data FROM test_foo_data" + return sql.read_sql_query(query, con=connection) + + def test_append(connection, data): + data.to_sql(name="test_foo_data", con=connection, if_exists="append") + + def test_connectable(conn): + # https://github.com/sqlalchemy/sqlalchemy/commit/ + # 00b5c10846e800304caa86549ab9da373b42fa5d#r48323973 + foo_data = test_select(conn) + test_append(conn, foo_data) + + def main(connectable): + if isinstance(connectable, Engine): + with connectable.connect() as conn: + with conn.begin(): + test_connectable(conn) + else: + test_connectable(connectable) + + assert ( + DataFrame({"test_foo_data": [0, 1, 2]}).to_sql(name="test_foo_data", con=conn) + == 3 + ) + main(conn) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize( + "input", + [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}], +) +def 
test_to_sql_with_negative_npinf(conn, request, input): + # GH 34431 + + df = DataFrame(input) + conn_name = conn + conn = request.getfixturevalue(conn) + + if "mysql" in conn_name: + # GH 36465 + # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error + # for pymysql version >= 0.10 + # TODO(GH#36465): remove this version check after GH 36465 is fixed + pymysql = pytest.importorskip("pymysql") + + if Version(pymysql.__version__) < Version("1.0.3") and "infe0" in df.columns: + mark = pytest.mark.xfail(reason="GH 36465") + request.node.add_marker(mark) + + msg = "inf cannot be used with MySQL" with pytest.raises(ValueError, match=msg): - self._to_sql_with_sql_engine(test_frame1, "bad_engine") + df.to_sql(name="foobar", con=conn, index=False) + else: + assert df.to_sql(name="foobar", con=conn, index=False) == 1 + res = sql.read_sql_table("foobar", conn) + tm.assert_equal(df, res) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_temporary_table(conn, request): + if conn == "sqlite_str": + pytest.skip("test does not work with str connection") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + Column, + Integer, + Unicode, + select, + ) + from sqlalchemy.orm import ( + Session, + declarative_base, + ) + + test_data = "Hello, World!" + expected = DataFrame({"spam": [test_data]}) + Base = declarative_base() + + class Temporary(Base): + __tablename__ = "temp_test" + __table_args__ = {"prefixes": ["TEMPORARY"]} + id = Column(Integer, primary_key=True) + spam = Column(Unicode(30), nullable=False) - def test_options_sqlalchemy(self, test_frame1): - # use the set option - with pd.option_context("io.sql.engine", "sqlalchemy"): - self._to_sql_with_sql_engine(test_frame1) + with Session(conn) as session: + with session.begin(): + conn = session.connection() + Temporary.__table__.create(conn) + session.add(Temporary(spam=test_data)) + session.flush() + df = sql.read_sql_query(sql=select(Temporary.spam), con=conn) + tm.assert_frame_equal(df, expected) - def test_options_auto(self, test_frame1): - # use the set option - with pd.option_context("io.sql.engine", "auto"): - self._to_sql_with_sql_engine(test_frame1) - def test_options_get_engine(self): +@pytest.mark.parametrize("conn", all_connectable) +def test_invalid_engine(conn, request, test_frame1): + if conn == "sqlite_buildin": + request.node.add_marker( + pytest.mark.xfail(reason="SQLiteDatabase does not raise for bad engine") + ) + + conn = request.getfixturevalue(conn) + msg = "engine must be one of 'auto', 'sqlalchemy'" + pandasSQL = pandasSQL_builder(conn) + with pytest.raises(ValueError, match=msg): + pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine") + + +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_with_sql_engine(conn, request, test_frame1): + """`to_sql` with the `engine` param""" + # mostly copied from this class's `_to_sql()` method + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4 + assert pandasSQL.has_table("test_frame1") + + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_options_sqlalchemy(conn, request, test_frame1): + # use the set option + conn = request.getfixturevalue(conn) + with pd.option_context("io.sql.engine", "sqlalchemy"): + pandasSQL = pandasSQL_builder(conn) + 
with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") + + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries + + +@pytest.mark.parametrize("conn", all_connectable) +def test_options_auto(conn, request, test_frame1): + # use the set option + conn = request.getfixturevalue(conn) + with pd.option_context("io.sql.engine", "auto"): + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") + + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries + + +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="fails without SQLAlchemy") +def test_options_get_engine(): + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "sqlalchemy"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - with pd.option_context("io.sql.engine", "sqlalchemy"): - assert isinstance(get_engine("auto"), SQLAlchemyEngine) - assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + with pd.option_context("io.sql.engine", "auto"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + +def test_get_engine_auto_error_message(): + # Expect different error messages from get_engine(engine="auto") + # if engines aren't installed vs. are installed but bad version + pass + # TODO(GH#36893) fill this in when we add more engines + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) +def test_read_sql_dtype_backend( + conn, + request, + string_storage, + func, + dtype_backend, + dtype_backend_data, + dtype_backend_expected, +): + # GH#50048 + conn_name = conn + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + with pd.option_context("mode.string_storage", string_storage): + result = getattr(pd, func)( + f"Select * from {table}", conn, dtype_backend=dtype_backend + ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) + + with pd.option_context("mode.string_storage", string_storage): + iterator = getattr(pd, func)( + f"Select * from {table}", + con=conn, + dtype_backend=dtype_backend, + chunksize=3, + ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + for result in iterator: + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) +def test_read_sql_dtype_backend_table( + conn, + request, + string_storage, + func, + dtype_backend, + dtype_backend_data, + dtype_backend_expected, +): + if "sqlite" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason=( + "SQLite actually returns proper boolean values via " + "read_sql_table, but before pytest refactor was skipped" + ) + ) + ) + # GH#50048 + conn_name = conn + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + with pd.option_context("mode.string_storage", string_storage): + result = getattr(pd, func)(table, 
conn, dtype_backend=dtype_backend) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) + + with pd.option_context("mode.string_storage", string_storage): + iterator = getattr(pd, func)( + table, + conn, + dtype_backend=dtype_backend, + chunksize=3, + ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + for result in iterator: + tm.assert_frame_equal(result, expected) + - with pd.option_context("io.sql.engine", "auto"): - assert isinstance(get_engine("auto"), SQLAlchemyEngine) - assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) +def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend_data): + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") - def test_get_engine_auto_error_message(self): - # Expect different error messages from get_engine(engine="auto") - # if engines aren't installed vs. are installed but bad version - pass - # TODO(GH#36893) fill this in when we add more engines - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_dtype_backend(self, string_storage, func, dtype_backend): - # GH#50048 - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)( - f"Select * from {table}", self.conn, dtype_backend=dtype_backend - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - tm.assert_frame_equal(result, expected) + msg = ( + "dtype_backend numpy is invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." 
+ ) + with pytest.raises(ValueError, match=msg): + getattr(pd, func)(table, conn, dtype_backend="numpy") - with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - f"Select * from {table}", - con=self.conn, - dtype_backend=dtype_backend, - chunksize=3, - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_dtype_backend_table(self, string_storage, func, dtype_backend): - # GH#50048 - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)(table, self.conn, dtype_backend=dtype_backend) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - tm.assert_frame_equal(result, expected) - with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - table, - self.conn, - dtype_backend=dtype_backend, - chunksize=3, - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) - def test_read_sql_invalid_dtype_backend_table(self, func): - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - msg = ( - "dtype_backend numpy is invalid, only 'numpy_nullable' and " - "'pyarrow' are allowed." - ) - with pytest.raises(ValueError, match=msg): - getattr(pd, func)(table, self.conn, dtype_backend="numpy") +@pytest.fixture +def dtype_backend_data() -> DataFrame: + return DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], + } + ) - def dtype_backend_data(self) -> DataFrame: - return DataFrame( - { - "a": Series([1, np.nan, 3], dtype="Int64"), - "b": Series([1, 2, 3], dtype="Int64"), - "c": Series([1.5, np.nan, 2.5], dtype="Float64"), - "d": Series([1.5, 2.0, 2.5], dtype="Float64"), - "e": [True, False, None], - "f": [True, False, True], - "g": ["a", "b", "c"], - "h": ["a", "b", None], - } - ) - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: +@pytest.fixture +def dtype_backend_expected(): + def func(storage, dtype_backend, conn_name): string_array: StringArray | ArrowStringArray string_array_na: StringArray | ArrowStringArray if storage == "python": @@ -2786,549 +3195,396 @@ def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: for col in df.columns } ) - return df - - def test_chunksize_empty_dtypes(self): - # GH#50245 - dtypes = {"a": "int64", "b": "object"} - df = DataFrame(columns=["a", "b"]).astype(dtypes) - expected = df.copy() - df.to_sql(name="test", con=self.conn, index=False, if_exists="replace") - - for result in read_sql_query( - "SELECT * FROM test", - self.conn, - dtype=dtypes, - chunksize=1, - ): - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"]) - @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_dtype(self, func, dtype_backend): - # 
GH#50797 - table = "test" - df = DataFrame({"a": [1, 2, 3], "b": 5}) - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - result = getattr(pd, func)( - f"Select * from {table}", - self.conn, - dtype={"a": np.float64}, - dtype_backend=dtype_backend, - ) - expected = DataFrame( - { - "a": Series([1, 2, 3], dtype=np.float64), - "b": Series( - [5, 5, 5], - dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64", - ), - } - ) - tm.assert_frame_equal(result, expected) - - -class TestSQLiteAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an in-memory sqlite database. - """ - - flavor = "sqlite" - - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine("sqlite:///:memory:") - - @classmethod - def setup_driver(cls): - # sqlite3 is built-in - cls.driver = None - - def test_keyword_deprecation(self): - # GH 54397 - msg = ( - "tarting with pandas version 3.0 all arguments of to_sql except for the " - "argument 'name' will be keyword-only." - ) - df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) - - with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_sql("example", self.conn) + if "mysql" in conn_name or "sqlite" in conn_name: + if dtype_backend == "numpy_nullable": + df = df.astype({"e": "Int64", "f": "Int64"}) + else: + df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) - def test_default_type_conversion(self): - df = sql.read_sql_table("types", self.conn) + return df - assert issubclass(df.FloatCol.dtype.type, np.floating) - assert issubclass(df.IntCol.dtype.type, np.integer) + return func - # sqlite has no boolean type, so integer type is returned - assert issubclass(df.BoolCol.dtype.type, np.integer) - # Int column with NA values stays as float - assert issubclass(df.IntColWithNull.dtype.type, np.floating) +@pytest.mark.parametrize("conn", all_connectable) +def test_chunksize_empty_dtypes(conn, request): + # GH#50245 + conn = request.getfixturevalue(conn) + dtypes = {"a": "int64", "b": "object"} + df = DataFrame(columns=["a", "b"]).astype(dtypes) + expected = df.copy() + df.to_sql(name="test", con=conn, index=False, if_exists="replace") + + for result in read_sql_query( + "SELECT * FROM test", + conn, + dtype=dtypes, + chunksize=1, + ): + tm.assert_frame_equal(result, expected) - # Non-native Bool column with NA values stays as float - assert issubclass(df.BoolColWithNull.dtype.type, np.floating) - def test_default_date_load(self): - df = sql.read_sql_table("types", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"]) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) +def test_read_sql_dtype(conn, request, func, dtype_backend): + # GH#50797 + conn = request.getfixturevalue(conn) + table = "test" + df = DataFrame({"a": [1, 2, 3], "b": 5}) + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + result = getattr(pd, func)( + f"Select * from {table}", + conn, + dtype={"a": np.float64}, + dtype_backend=dtype_backend, + ) + expected = DataFrame( + { + "a": Series([1, 2, 3], dtype=np.float64), + "b": Series( + [5, 5, 5], + dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64", + ), + } + ) + tm.assert_frame_equal(result, expected) - # IMPORTANT - sqlite has no native date type, so shouldn't parse, but - assert not issubclass(df.DateCol.dtype.type, np.datetime64) - def test_bigint_warning(self): - # test no warning for BIGINT (to support int64) is 
raised (GH7433) - df = DataFrame({"a": [1, 2]}, dtype="int64") - assert df.to_sql(name="test_bigintwarning", con=self.conn, index=False) == 2 +def test_keyword_deprecation(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH 54397 + msg = ( + "Starting with pandas version 3.0 all arguments of to_sql except for the " + "arguments 'name' and 'con' will be keyword-only." + ) + df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) + df.to_sql("example", conn) - with tm.assert_produces_warning(None): - sql.read_sql_table("test_bigintwarning", self.conn) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_sql("example", conn, None, if_exists="replace") - def test_valueerror_exception(self): - df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) - with pytest.raises(ValueError, match="Empty table name specified"): - df.to_sql(name="", con=self.conn, if_exists="replace", index=False) - def test_row_object_is_named_tuple(self): - # GH 40682 - # Test for the is_named_tuple() function - # Placed here due to its usage of sqlalchemy +def test_bigint_warning(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # test no warning for BIGINT (to support int64) is raised (GH7433) + df = DataFrame({"a": [1, 2]}, dtype="int64") + assert df.to_sql(name="test_bigintwarning", con=conn, index=False) == 2 - from sqlalchemy import ( - Column, - Integer, - String, - ) - from sqlalchemy.orm import ( - declarative_base, - sessionmaker, - ) + with tm.assert_produces_warning(None): + sql.read_sql_table("test_bigintwarning", conn) - BaseModel = declarative_base() - - class Test(BaseModel): - __tablename__ = "test_frame" - id = Column(Integer, primary_key=True) - string_column = Column(String(50)) - - with self.conn.begin(): - BaseModel.metadata.create_all(self.conn) - Session = sessionmaker(bind=self.conn) - with Session() as session: - df = DataFrame({"id": [0, 1], "string_column": ["hello", "world"]}) - assert ( - df.to_sql( - name="test_frame", con=self.conn, index=False, if_exists="replace" - ) - == 2 - ) - session.commit() - test_query = session.query(Test.id, Test.string_column) - df = DataFrame(test_query) - assert list(df.columns) == ["id", "string_column"] +def test_valueerror_exception(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) + with pytest.raises(ValueError, match="Empty table name specified"): + df.to_sql(name="", con=conn, if_exists="replace", index=False) - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: - df = super().dtype_backend_expected(storage, dtype_backend) - if dtype_backend == "numpy_nullable": - df = df.astype({"e": "Int64", "f": "Int64"}) - else: - df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) - return df +def test_row_object_is_named_tuple(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH 40682 + # Test for the is_named_tuple() function + # Placed here due to its usage of sqlalchemy - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_dtype_backend_table(self, string_storage, func): - # GH#50048 Not supported for sqlite - pass + from sqlalchemy import ( + Column, + Integer, + String, + ) + from sqlalchemy.orm import ( + declarative_base, + sessionmaker, + ) - def test_read_sql_string_inference(self): - # GH#54430 - pa = pytest.importorskip("pyarrow") - table = "test" - df = DataFrame({"a": ["x", "y"]}) - df.to_sql(table, 
con=self.conn, index=False, if_exists="replace") + BaseModel = declarative_base() - with pd.option_context("future.infer_string", True): - result = read_sql_table(table, self.conn) + class Test(BaseModel): + __tablename__ = "test_frame" + id = Column(Integer, primary_key=True) + string_column = Column(String(50)) - dtype = pd.ArrowDtype(pa.string()) - expected = DataFrame( - {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + with conn.begin(): + BaseModel.metadata.create_all(conn) + Session = sessionmaker(bind=conn) + with Session() as session: + df = DataFrame({"id": [0, 1], "string_column": ["hello", "world"]}) + assert ( + df.to_sql(name="test_frame", con=conn, index=False, if_exists="replace") + == 2 ) + session.commit() + test_query = session.query(Test.id, Test.string_column) + df = DataFrame(test_query) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.db -class TestMySQLAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an MySQL database. + assert list(df.columns) == ["id", "string_column"] - """ - flavor = "mysql" - port = 3306 +def test_read_sql_string_inference(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH#54430 + pytest.importorskip("pyarrow") + table = "test" + df = DataFrame({"a": ["x", "y"]}) + df.to_sql(table, con=conn, index=False, if_exists="replace") - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine( - f"mysql+{cls.driver}://root@localhost:{cls.port}/pandas", - connect_args=cls.connect_args, - ) + with pd.option_context("future.infer_string", True): + result = read_sql_table(table, conn) - @classmethod - def setup_driver(cls): - pymysql = pytest.importorskip("pymysql") - cls.driver = "pymysql" - cls.connect_args = {"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS} + dtype = "string[pyarrow_numpy]" + expected = DataFrame( + {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) - def test_default_type_conversion(self): - pass + tm.assert_frame_equal(result, expected) - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: - df = super().dtype_backend_expected(storage, dtype_backend) - if dtype_backend == "numpy_nullable": - df = df.astype({"e": "Int64", "f": "Int64"}) - else: - df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) - return df +def test_roundtripping_datetimes(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + # GH#54877 + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", conn).iloc[0, 0] + assert result == "2020-12-31 12:00:00.000000" @pytest.mark.db -class TestPostgreSQLAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an PostgreSQL database. 
- - """ - - flavor = "postgresql" - port = 5432 - - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine( - f"postgresql+{cls.driver}://postgres:postgres@localhost:{cls.port}/pandas" - ) - - @classmethod - def setup_driver(cls): - pytest.importorskip("psycopg2") - cls.driver = "psycopg2" - - def test_schema_support(self): - from sqlalchemy.engine import Engine - - # only test this for postgresql (schema's not supported in - # mysql/sqlite) - df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) - - # create a schema - with self.conn.begin(): - self.conn.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") - self.conn.exec_driver_sql("CREATE SCHEMA other;") - - # write dataframe to different schema's - assert df.to_sql(name="test_schema_public", con=self.conn, index=False) == 2 - assert ( - df.to_sql( - name="test_schema_public_explicit", - con=self.conn, - index=False, - schema="public", - ) - == 2 - ) - assert ( - df.to_sql( - name="test_schema_other", con=self.conn, index=False, schema="other" - ) - == 2 - ) - - # read dataframes back in - res1 = sql.read_sql_table("test_schema_public", self.conn) - tm.assert_frame_equal(df, res1) - res2 = sql.read_sql_table("test_schema_public_explicit", self.conn) - tm.assert_frame_equal(df, res2) - res3 = sql.read_sql_table( - "test_schema_public_explicit", self.conn, schema="public" +def test_psycopg2_schema_support(postgresql_psycopg2_engine): + conn = postgresql_psycopg2_engine + + # only test this for postgresql (schema's not supported in + # mysql/sqlite) + df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) + + # create a schema + with conn.connect() as con: + with con.begin(): + con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") + con.exec_driver_sql("CREATE SCHEMA other;") + + # write dataframe to different schema's + assert df.to_sql(name="test_schema_public", con=conn, index=False) == 2 + assert ( + df.to_sql( + name="test_schema_public_explicit", + con=conn, + index=False, + schema="public", ) - tm.assert_frame_equal(df, res3) - res4 = sql.read_sql_table("test_schema_other", self.conn, schema="other") - tm.assert_frame_equal(df, res4) - msg = "Table test_schema_other not found" - with pytest.raises(ValueError, match=msg): - sql.read_sql_table("test_schema_other", self.conn, schema="public") - - # different if_exists options - - # create a schema - with self.conn.begin(): - self.conn.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") - self.conn.exec_driver_sql("CREATE SCHEMA other;") + == 2 + ) + assert ( + df.to_sql(name="test_schema_other", con=conn, index=False, schema="other") == 2 + ) - # write dataframe with different if_exists options - assert ( - df.to_sql( - name="test_schema_other", con=self.conn, schema="other", index=False - ) - == 2 - ) + # read dataframes back in + res1 = sql.read_sql_table("test_schema_public", conn) + tm.assert_frame_equal(df, res1) + res2 = sql.read_sql_table("test_schema_public_explicit", conn) + tm.assert_frame_equal(df, res2) + res3 = sql.read_sql_table("test_schema_public_explicit", conn, schema="public") + tm.assert_frame_equal(df, res3) + res4 = sql.read_sql_table("test_schema_other", conn, schema="other") + tm.assert_frame_equal(df, res4) + msg = "Table test_schema_other not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("test_schema_other", conn, schema="public") + + # different if_exists options + + # create a schema + with conn.connect() as con: + with con.begin(): + 
con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") + con.exec_driver_sql("CREATE SCHEMA other;") + + # write dataframe with different if_exists options + assert ( + df.to_sql(name="test_schema_other", con=conn, schema="other", index=False) == 2 + ) + df.to_sql( + name="test_schema_other", + con=conn, + schema="other", + index=False, + if_exists="replace", + ) + assert ( df.to_sql( name="test_schema_other", - con=self.conn, + con=conn, schema="other", index=False, - if_exists="replace", - ) - assert ( - df.to_sql( - name="test_schema_other", - con=self.conn, - schema="other", - index=False, - if_exists="append", - ) - == 2 + if_exists="append", ) - res = sql.read_sql_table("test_schema_other", self.conn, schema="other") - tm.assert_frame_equal(concat([df, df], ignore_index=True), res) - - # specifying schema in user-provided meta - - # The schema won't be applied on another Connection - # because of transactional schemas - if isinstance(self.conn, Engine): - engine2 = self.connect() - pdsql = sql.SQLDatabase(engine2, schema="other") - assert pdsql.to_sql(df, "test_schema_other2", index=False) == 2 - assert ( - pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="replace") - == 2 - ) - assert ( - pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="append") - == 2 - ) - res1 = sql.read_sql_table("test_schema_other2", self.conn, schema="other") - res2 = pdsql.read_table("test_schema_other2") - tm.assert_frame_equal(res1, res2) + == 2 + ) + res = sql.read_sql_table("test_schema_other", conn, schema="other") + tm.assert_frame_equal(concat([df, df], ignore_index=True), res) - def test_self_join_date_columns(self): - # GH 44421 - from sqlalchemy.engine import Engine - from sqlalchemy.sql import text - create_table = text( - """ - CREATE TABLE person - ( - id serial constraint person_pkey primary key, - created_dt timestamp with time zone - ); +@pytest.mark.db +def test_self_join_date_columns(postgresql_psycopg2_engine): + # GH 44421 + conn = postgresql_psycopg2_engine + from sqlalchemy.sql import text - INSERT INTO person - VALUES (1, '2021-01-01T00:00:00Z'); + create_table = text( """ - ) - if isinstance(self.conn, Engine): - with self.conn.connect() as con: - with con.begin(): - con.execute(create_table) - else: - with self.conn.begin(): - self.conn.execute(create_table) - - sql_query = ( - 'SELECT * FROM "person" AS p1 INNER JOIN "person" AS p2 ON p1.id = p2.id;' - ) - result = pd.read_sql(sql_query, self.conn) - expected = DataFrame( - [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 - ) - tm.assert_frame_equal(result, expected) - - # Cleanup - with sql.SQLDatabase(self.conn, need_transaction=True) as pandasSQL: - pandasSQL.drop_table("person") - - -# ----------------------------------------------------------------------------- -# -- Test Sqlite / MySQL fallback - - -class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): - """ - Test the fallback mode against an in-memory sqlite database. 
+ CREATE TABLE person + ( + id serial constraint person_pkey primary key, + created_dt timestamp with time zone + ); + INSERT INTO person + VALUES (1, '2021-01-01T00:00:00Z'); """ + ) + with conn.connect() as con: + with con.begin(): + con.execute(create_table) - flavor = "sqlite" - - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.pandasSQL = sql.SQLiteDatabase(self.conn) - - def test_read_sql_parameter(self, sql_strings): - self._read_sql_iris_parameter(sql_strings) - - def test_read_sql_named_parameter(self, sql_strings): - self._read_sql_iris_named_parameter(sql_strings) + sql_query = ( + 'SELECT * FROM "person" AS p1 INNER JOIN "person" AS p2 ON p1.id = p2.id;' + ) + result = pd.read_sql(sql_query, conn) + expected = DataFrame( + [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 + ) + tm.assert_frame_equal(result, expected) - def test_to_sql_empty(self, test_frame1): - self._to_sql_empty(test_frame1) + # Cleanup + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("person") - def test_create_and_drop_table(self): - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - assert self.pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 +def test_create_and_drop_table(sqlite_sqlalchemy_memory_engine): + conn = sqlite_sqlalchemy_memory_engine + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + pandasSQL = sql.SQLDatabase(conn) - assert self.pandasSQL.has_table("drop_test_frame") + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 - self.pandasSQL.drop_table("drop_test_frame") + assert pandasSQL.has_table("drop_test_frame") - assert not self.pandasSQL.has_table("drop_test_frame") + with pandasSQL.run_transaction(): + pandasSQL.drop_table("drop_test_frame") - def test_roundtrip(self, test_frame1): - self._roundtrip(test_frame1) + assert not pandasSQL.has_table("drop_test_frame") - def test_execute_sql(self): - self._execute_sql() - def test_datetime_date(self): - # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - assert df.to_sql(name="test_date", con=self.conn, index=False) == 2 - res = read_sql_query("SELECT * FROM test_date", self.conn) - if self.flavor == "sqlite": - # comes back as strings - tm.assert_frame_equal(res, df.astype(str)) - elif self.flavor == "mysql": - tm.assert_frame_equal(res, df) +def test_sqlite_datetime_date(sqlite_buildin): + conn = sqlite_buildin + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + assert df.to_sql(name="test_date", con=conn, index=False) == 2 + res = read_sql_query("SELECT * FROM test_date", conn) + # comes back as strings + tm.assert_frame_equal(res, df.astype(str)) - @pytest.mark.parametrize("tz_aware", [False, True]) - def test_datetime_time(self, tz_aware): - # test support for datetime.time, GH #8341 - if not tz_aware: - tz_times = [time(9, 0, 0), time(9, 1, 30)] - else: - tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") - tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) - df = DataFrame(tz_times, columns=["a"]) +@pytest.mark.parametrize("tz_aware", [False, True]) +def test_sqlite_datetime_time(tz_aware, sqlite_buildin): + conn = sqlite_buildin + # test support for datetime.time, GH #8341 + if not tz_aware: + tz_times = 
[time(9, 0, 0), time(9, 1, 30)] + else: + tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") + tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) + + df = DataFrame(tz_times, columns=["a"]) + + assert df.to_sql(name="test_time", con=conn, index=False) == 2 + res = read_sql_query("SELECT * FROM test_time", conn) + # comes back as strings + expected = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(res, expected) + + +def get_sqlite_column_type(conn, table, column): + recs = conn.execute(f"PRAGMA table_info({table})") + for cid, name, ctype, not_null, default, pk in recs: + if name == column: + return ctype + raise ValueError(f"Table {table}, column {column} not found") + + +def test_sqlite_test_dtype(sqlite_buildin): + conn = sqlite_buildin + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] + df = DataFrame(data, columns=cols) + assert df.to_sql(name="dtype_test", con=conn) == 2 + assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": "STRING"}) == 2 + + # sqlite stores Boolean values as INTEGER + assert get_sqlite_column_type(conn, "dtype_test", "B") == "INTEGER" + + assert get_sqlite_column_type(conn, "dtype_test2", "B") == "STRING" + msg = r"B \(\) not a string" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="error", con=conn, dtype={"B": bool}) + + # single dtype + assert df.to_sql(name="single_dtype_test", con=conn, dtype="STRING") == 2 + assert get_sqlite_column_type(conn, "single_dtype_test", "A") == "STRING" + assert get_sqlite_column_type(conn, "single_dtype_test", "B") == "STRING" + + +def test_sqlite_notna_dtype(sqlite_buildin): + conn = sqlite_buildin + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + df = DataFrame(cols) - assert df.to_sql(name="test_time", con=self.conn, index=False) == 2 - res = read_sql_query("SELECT * FROM test_time", self.conn) - if self.flavor == "sqlite": - # comes back as strings - expected = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(res, expected) + tbl = "notna_dtype_test" + assert df.to_sql(name=tbl, con=conn) == 2 - def _get_index_columns(self, tbl_name): - ixs = sql.read_sql_query( - "SELECT * FROM sqlite_master WHERE type = 'index' " - f"AND tbl_name = '{tbl_name}'", - self.conn, - ) - ix_cols = [] - for ix_name in ixs.name: - ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", self.conn) - ix_cols.append(ix_info.name.tolist()) - return ix_cols - - def test_to_sql_save_index(self): - self._to_sql_save_index() - - def test_transactions(self): - self._transaction_test() - - def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute(f"PRAGMA table_info({table})") - for cid, name, ctype, not_null, default, pk in recs: - if name == column: - return ctype - raise ValueError(f"Table {table}, column {column} not found") - - def test_dtype(self): - if self.flavor == "mysql": - pytest.skip("Not applicable to MySQL legacy") - cols = ["A", "B"] - data = [(0.8, True), (0.9, None)] - df = DataFrame(data, columns=cols) - assert df.to_sql(name="dtype_test", con=self.conn) == 2 - assert df.to_sql(name="dtype_test2", con=self.conn, dtype={"B": "STRING"}) == 2 - - # sqlite stores Boolean values as INTEGER - assert self._get_sqlite_column_type("dtype_test", "B") == "INTEGER" - - assert self._get_sqlite_column_type("dtype_test2", "B") == "STRING" - msg = r"B \(\) not a string" - with pytest.raises(ValueError, 
match=msg): - df.to_sql(name="error", con=self.conn, dtype={"B": bool}) - - # single dtype - assert df.to_sql(name="single_dtype_test", con=self.conn, dtype="STRING") == 2 - assert self._get_sqlite_column_type("single_dtype_test", "A") == "STRING" - assert self._get_sqlite_column_type("single_dtype_test", "B") == "STRING" - - def test_notna_dtype(self): - if self.flavor == "mysql": - pytest.skip("Not applicable to MySQL legacy") - - cols = { - "Bool": Series([True, None]), - "Date": Series([datetime(2012, 5, 1), None]), - "Int": Series([1, None], dtype="object"), - "Float": Series([1.1, None]), - } - df = DataFrame(cols) + assert get_sqlite_column_type(conn, tbl, "Bool") == "INTEGER" + assert get_sqlite_column_type(conn, tbl, "Date") == "TIMESTAMP" + assert get_sqlite_column_type(conn, tbl, "Int") == "INTEGER" + assert get_sqlite_column_type(conn, tbl, "Float") == "REAL" - tbl = "notna_dtype_test" - assert df.to_sql(name=tbl, con=self.conn) == 2 - assert self._get_sqlite_column_type(tbl, "Bool") == "INTEGER" - assert self._get_sqlite_column_type(tbl, "Date") == "TIMESTAMP" - assert self._get_sqlite_column_type(tbl, "Int") == "INTEGER" - assert self._get_sqlite_column_type(tbl, "Float") == "REAL" +def test_sqlite_illegal_names(sqlite_buildin): + # For sqlite, these should work fine + conn = sqlite_buildin + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - def test_illegal_names(self): - # For sqlite, these should work fine - df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + msg = "Empty table or column name specified" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="", con=conn) - msg = "Empty table or column name specified" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="", con=self.conn) - - for ndx, weird_name in enumerate( - [ - "test_weird_name]", - "test_weird_name[", - "test_weird_name`", - 'test_weird_name"', - "test_weird_name'", - "_b.test_weird_name_01-30", - '"_b.test_weird_name_01-30"', - "99beginswithnumber", - "12345", - "\xe9", - ] - ): - assert df.to_sql(name=weird_name, con=self.conn) == 2 - sql.table_exists(weird_name, self.conn) + for ndx, weird_name in enumerate( + [ + "test_weird_name]", + "test_weird_name[", + "test_weird_name`", + 'test_weird_name"', + "test_weird_name'", + "_b.test_weird_name_01-30", + '"_b.test_weird_name_01-30"', + "99beginswithnumber", + "12345", + "\xe9", + ] + ): + assert df.to_sql(name=weird_name, con=conn) == 2 + sql.table_exists(weird_name, conn) - df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) - c_tbl = f"test_weird_col_name{ndx:d}" - assert df2.to_sql(name=c_tbl, con=self.conn) == 2 - sql.table_exists(c_tbl, self.conn) + df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) + c_tbl = f"test_weird_col_name{ndx:d}" + assert df2.to_sql(name=c_tbl, con=conn) == 2 + sql.table_exists(c_tbl, conn) # ----------------------------------------------------------------------------- @@ -3367,227 +3623,222 @@ def tquery(query, con=None): return None if res is None else list(res) -class TestXSQLite: - def drop_table(self, table_name, conn): - cur = conn.cursor() - cur.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") - conn.commit() +def test_xsqlite_basic(sqlite_buildin): + frame = tm.makeTimeDataFrame() + assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 30 + result = sql.read_sql("select * from test_table", sqlite_buildin) - def test_basic(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - assert ( - sql.to_sql(frame, 
name="test_table", con=sqlite_buildin, index=False) == 30 - ) - result = sql.read_sql("select * from test_table", sqlite_buildin) + # HACK! Change this once indexes are handled properly. + result.index = frame.index - # HACK! Change this once indexes are handled properly. - result.index = frame.index + expected = frame + tm.assert_frame_equal(result, frame) - expected = frame - tm.assert_frame_equal(result, frame) + frame["txt"] = ["a"] * len(frame) + frame2 = frame.copy() + new_idx = Index(np.arange(len(frame2)), dtype=np.int64) + 10 + frame2["Idx"] = new_idx.copy() + assert sql.to_sql(frame2, name="test_table2", con=sqlite_buildin, index=False) == 30 + result = sql.read_sql("select * from test_table2", sqlite_buildin, index_col="Idx") + expected = frame.copy() + expected.index = new_idx + expected.index.name = "Idx" + tm.assert_frame_equal(expected, result) - frame["txt"] = ["a"] * len(frame) - frame2 = frame.copy() - new_idx = Index(np.arange(len(frame2)), dtype=np.int64) + 10 - frame2["Idx"] = new_idx.copy() - assert ( - sql.to_sql(frame2, name="test_table2", con=sqlite_buildin, index=False) - == 30 - ) - result = sql.read_sql( - "select * from test_table2", sqlite_buildin, index_col="Idx" - ) - expected = frame.copy() - expected.index = new_idx - expected.index.name = "Idx" - tm.assert_frame_equal(expected, result) - - def test_write_row_by_row(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - frame.iloc[0, 0] = np.nan - create_sql = sql.get_schema(frame, "test") - cur = sqlite_buildin.cursor() - cur.execute(create_sql) - ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" - for _, row in frame.iterrows(): - fmt_sql = format_query(ins, *row) - tquery(fmt_sql, con=sqlite_buildin) +def test_xsqlite_write_row_by_row(sqlite_buildin): + frame = tm.makeTimeDataFrame() + frame.iloc[0, 0] = np.nan + create_sql = sql.get_schema(frame, "test") + cur = sqlite_buildin.cursor() + cur.execute(create_sql) - sqlite_buildin.commit() + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for _, row in frame.iterrows(): + fmt_sql = format_query(ins, *row) + tquery(fmt_sql, con=sqlite_buildin) - result = sql.read_sql("select * from test", con=sqlite_buildin) - result.index = frame.index - tm.assert_frame_equal(result, frame, rtol=1e-3) + sqlite_buildin.commit() - def test_execute(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, "test") - cur = sqlite_buildin.cursor() - cur.execute(create_sql) - ins = "INSERT INTO test VALUES (?, ?, ?, ?)" - - row = frame.iloc[0] - with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: - pandas_sql.execute(ins, tuple(row)) - sqlite_buildin.commit() - - result = sql.read_sql("select * from test", sqlite_buildin) - result.index = frame.index[:1] - tm.assert_frame_equal(result, frame[:1]) - - def test_schema(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, "test") - lines = create_sql.splitlines() - for line in lines: - tokens = line.split(" ") - if len(tokens) == 2 and tokens[0] == "A": - assert tokens[1] == "DATETIME" - - create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) - lines = create_sql.splitlines() - assert 'PRIMARY KEY ("A", "B")' in create_sql - cur = sqlite_buildin.cursor() - cur.execute(create_sql) + result = sql.read_sql("select * from test", con=sqlite_buildin) + result.index = frame.index + tm.assert_frame_equal(result, frame, rtol=1e-3) - def test_execute_fail(self, sqlite_buildin): - create_sql = """ - CREATE TABLE test - ( - a TEXT, - b TEXT, - c REAL, - 
PRIMARY KEY (a, b) - ); - """ - cur = sqlite_buildin.cursor() + +def test_xsqlite_execute(sqlite_buildin): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test") + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (?, ?, ?, ?)" + + row = frame.iloc[0] + with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + pandas_sql.execute(ins, tuple(row)) + sqlite_buildin.commit() + + result = sql.read_sql("select * from test", sqlite_buildin) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) + + +def test_xsqlite_schema(sqlite_buildin): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_schema(frame, "test") + lines = create_sql.splitlines() + for line in lines: + tokens = line.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" + + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) + lines = create_sql.splitlines() + assert 'PRIMARY KEY ("A", "B")' in create_sql + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + + +def test_xsqlite_execute_fail(sqlite_buildin): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + + with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') + pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') + + with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): + pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + + +def test_xsqlite_execute_closed_connection(): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + with contextlib.closing(sqlite3.connect(":memory:")) as conn: + cur = conn.cursor() cur.execute(create_sql) - with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + with sql.pandasSQL_builder(conn) as pandas_sql: pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') - with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + msg = "Cannot operate on a closed database." + with pytest.raises(sqlite3.ProgrammingError, match=msg): + tquery("select * from test", con=conn) - def test_execute_closed_connection(self): - create_sql = """ - CREATE TABLE test - ( - a TEXT, - b TEXT, - c REAL, - PRIMARY KEY (a, b) - ); - """ - with contextlib.closing(sqlite3.connect(":memory:")) as conn: - cur = conn.cursor() - cur.execute(create_sql) - - with sql.pandasSQL_builder(conn) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - - msg = "Cannot operate on a closed database." 
- with pytest.raises(sqlite3.ProgrammingError, match=msg): - tquery("select * from test", con=conn) - - def test_keyword_as_column_names(self, sqlite_buildin): - df = DataFrame({"From": np.ones(5)}) - assert sql.to_sql(df, con=sqlite_buildin, name="testkeywords", index=False) == 5 - - def test_onecolumn_of_integer(self, sqlite_buildin): - # GH 3628 - # a column_of_integers dataframe should transfer well to sql - - mono_df = DataFrame([1, 2], columns=["c0"]) - assert sql.to_sql(mono_df, con=sqlite_buildin, name="mono_df", index=False) == 2 - # computing the sum via sql - con_x = sqlite_buildin - the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) - # it should not fail, and gives 3 ( Issue #3628 ) - assert the_sum == 3 - - result = sql.read_sql("select * from mono_df", con_x) - tm.assert_frame_equal(result, mono_df) - - def test_if_exists(self, sqlite_buildin): - df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) - df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) - table_name = "table_if_exists" - sql_select = f"SELECT * FROM {table_name}" - - msg = "'notvalidvalue' is not valid for if_exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="notvalidvalue", - ) - self.drop_table(table_name, sqlite_buildin) - # test if_exists='fail' +def test_xsqlite_keyword_as_column_names(sqlite_buildin): + df = DataFrame({"From": np.ones(5)}) + assert sql.to_sql(df, con=sqlite_buildin, name="testkeywords", index=False) == 5 + + +def test_xsqlite_onecolumn_of_integer(sqlite_buildin): + # GH 3628 + # a column_of_integers dataframe should transfer well to sql + + mono_df = DataFrame([1, 2], columns=["c0"]) + assert sql.to_sql(mono_df, con=sqlite_buildin, name="mono_df", index=False) == 2 + # computing the sum via sql + con_x = sqlite_buildin + the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) + # it should not fail, and gives 3 ( Issue #3628 ) + assert the_sum == 3 + + result = sql.read_sql("select * from mono_df", con_x) + tm.assert_frame_equal(result, mono_df) + + +def test_xsqlite_if_exists(sqlite_buildin): + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" + sql_select = f"SELECT * FROM {table_name}" + + msg = "'notvalidvalue' is not valid for if_exists" + with pytest.raises(ValueError, match=msg): sql.to_sql( - frame=df_if_exists_1, con=sqlite_buildin, name=table_name, if_exists="fail" + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="notvalidvalue", ) - msg = "Table 'table_if_exists' already exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="fail", - ) - # test if_exists='replace' + drop_table(table_name, sqlite_buildin) + + # test if_exists='fail' + sql.to_sql( + frame=df_if_exists_1, con=sqlite_buildin, name=table_name, if_exists="fail" + ) + msg = "Table 'table_if_exists' already exists" + with pytest.raises(ValueError, match=msg): sql.to_sql( frame=df_if_exists_1, con=sqlite_buildin, name=table_name, + if_exists="fail", + ) + # test if_exists='replace' + sql.to_sql( + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] + assert ( + sql.to_sql( + 
frame=df_if_exists_2, + con=sqlite_buildin, + name=table_name, if_exists="replace", index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] - assert ( - sql.to_sql( - frame=df_if_exists_2, - con=sqlite_buildin, - name=table_name, - if_exists="replace", - index=False, - ) - == 3 - ) - assert tquery(sql_select, con=sqlite_buildin) == [(3, "C"), (4, "D"), (5, "E")] - self.drop_table(table_name, sqlite_buildin) + == 3 + ) + assert tquery(sql_select, con=sqlite_buildin) == [(3, "C"), (4, "D"), (5, "E")] + drop_table(table_name, sqlite_buildin) - # test if_exists='append' - assert ( - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="fail", - index=False, - ) - == 2 + # test if_exists='append' + assert ( + sql.to_sql( + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="fail", + index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] - assert ( - sql.to_sql( - frame=df_if_exists_2, - con=sqlite_buildin, - name=table_name, - if_exists="append", - index=False, - ) - == 3 + == 2 + ) + assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] + assert ( + sql.to_sql( + frame=df_if_exists_2, + con=sqlite_buildin, + name=table_name, + if_exists="append", + index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [ - (1, "A"), - (2, "B"), - (3, "C"), - (4, "D"), - (5, "E"), - ] - self.drop_table(table_name, sqlite_buildin) + == 3 + ) + assert tquery(sql_select, con=sqlite_buildin) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] + drop_table(table_name, sqlite_buildin) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 7459aa1df8f3e..cd504616b6c5d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -798,7 +798,7 @@ def test_missing_value_generator(self): expected_values.insert(0, ".") for t in types: offset = valid_range[t][1] - for i in range(0, 27): + for i in range(27): val = StataMissingValue(offset + 1 + i) assert val.string == expected_values[i] diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 11008646f0ad4..47114cab47619 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1759,7 +1759,7 @@ def test_errorbar_with_partial_columns_dti(self): df_err = DataFrame( np.abs(np.random.default_rng(2).standard_normal((10, 2))), columns=[0, 2] ) - ix = date_range("1/1/2000", periods=10, freq="M") + ix = date_range("1/1/2000", periods=10, freq="ME") df.set_index(ix, inplace=True) df_err.set_index(ix, inplace=True) ax = _check_plot_works(df.plot, yerr=df_err, kind="line") @@ -1780,7 +1780,7 @@ def test_errorbar_timeseries(self, kind): d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} # check time-series plots - ix = date_range("1/1/2000", "1/1/2001", freq="M") + ix = date_range("1/1/2000", "1/1/2001", freq="ME") tdf = DataFrame(d, index=ix) tdf_err = DataFrame(d_err, index=ix) diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index bce00600f6615..4d8d8fa4cdee3 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -89,7 +89,7 @@ def test_subplots_no_legend(self, kind): @pytest.mark.parametrize("kind", ["line", "area"]) def test_subplots_timeseries(self, kind): - idx = date_range(start="2014-07-01", freq="M", periods=10) + idx = 
date_range(start="2014-07-01", freq="ME", periods=10) df = DataFrame(np.random.default_rng(2).random((10, 3)), index=idx) axes = df.plot(kind=kind, subplots=True, sharex=True) @@ -112,7 +112,7 @@ def test_subplots_timeseries(self, kind): @pytest.mark.parametrize("kind", ["line", "area"]) def test_subplots_timeseries_rot(self, kind): - idx = date_range(start="2014-07-01", freq="M", periods=10) + idx = date_range(start="2014-07-01", freq="ME", periods=10) df = DataFrame(np.random.default_rng(2).random((10, 3)), index=idx) axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) for ax in axes: @@ -361,7 +361,7 @@ def test_subplots_ts_share_axes(self): mpl.pyplot.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) df = DataFrame( np.random.default_rng(2).standard_normal((10, 9)), - index=date_range(start="2014-07-01", freq="M", periods=10), + index=date_range(start="2014-07-01", freq="ME", periods=10), ) for i, ax in enumerate(axes.ravel()): df[i].plot(ax=ax, fontsize=5) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 555b9fd0c82c2..76f7fa1f22eec 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -329,6 +329,22 @@ def test_plot_xlabel_ylabel(self, vert): assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel + @pytest.mark.parametrize("vert", [True, False]) + def test_plot_box(self, vert): + # GH 54941 + rng = np.random.default_rng(2) + df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + + xlabel, ylabel = "x", "y" + _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) + df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) + df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + for ax in axs: + assert ax.get_xlabel() == xlabel + assert ax.get_ylabel() == ylabel + mpl.pyplot.close() + @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 56d7900e2907d..7d574b86cef36 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -10,6 +10,8 @@ import pandas._config.config as cf +from pandas._libs.tslibs import to_offset + from pandas import ( Index, Period, @@ -255,7 +257,7 @@ def test_time_formatter(self, time, format_expected): result = converter.TimeFormatter(None)(time) assert result == format_expected - @pytest.mark.parametrize("freq", ("B", "L", "S")) + @pytest.mark.parametrize("freq", ("B", "ms", "s")) def test_dateindex_conversion(self, freq, dtc): rtol = 10**-9 dateindex = tm.makeDateIndex(k=10, freq=freq) @@ -390,7 +392,7 @@ def test_quarterly_finder(year_span): pytest.skip("the quarterly finder is only invoked if the span is >= 45") nyears = span / 4 (min_anndef, maj_anndef) = converter._get_default_annual_spacing(nyears) - result = converter._quarterly_finder(vmin, vmax, "Q") + result = converter._quarterly_finder(vmin, vmax, to_offset("Q")) quarters = PeriodIndex( arrays.PeriodArray(np.array([x[0] for x in result]), dtype="period[Q]") ) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index b3ae25ac9168f..220741cf1ec3d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -14,6 +14,7 @@ 
BaseOffset, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas import ( DataFrame, @@ -86,7 +87,7 @@ def test_frame_inferred(self): def test_frame_inferred_n_gt_1(self): # N > 1 - idx = date_range("2008-1-1 00:15:00", freq="15T", periods=10) + idx = date_range("2008-1-1 00:15:00", freq="15min", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx @@ -116,7 +117,7 @@ def test_nonnumeric_exclude_error(self): with pytest.raises(TypeError, match=msg): df["A"].plot() - @pytest.mark.parametrize("freq", ["S", "T", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "A"]) def test_tsplot_period(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) @@ -124,7 +125,7 @@ def test_tsplot_period(self, freq): _check_plot_works(ser.plot, ax=ax) @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_tsplot_datetime(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -186,14 +187,14 @@ def check_format_of_first_point(ax, expected_string): daily.plot(ax=ax) check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.parametrize("freq", ["S", "T", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "A"]) def test_line_plot_period_series(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @pytest.mark.parametrize( - "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3A"] ) def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the @@ -203,14 +204,14 @@ def test_line_plot_period_mlt_series(self, frqncy): _check_plot_works(s.plot, s.index.freq.rule_code) @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) - @pytest.mark.parametrize("freq", ["S", "T", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "ME", "Q", "A"]) def test_line_plot_period_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) df = DataFrame( @@ -221,7 +222,7 @@ def test_line_plot_period_frame(self, freq): _check_plot_works(df.plot, df.index.freq) @pytest.mark.parametrize( - "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3A"] ) def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) @@ -233,12 +234,13 @@ def test_line_plot_period_mlt_frame(self, frqncy): index=idx, columns=["A", "B", "C"], ) - freq = df.index.asfreq(df.index.freq.rule_code).freq + freq = freq_to_period_freqstr(1, df.index.freq.rule_code) + freq = df.index.asfreq(freq).freq _check_plot_works(df.plot, freq) 
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -247,11 +249,12 @@ def test_line_plot_datetime_frame(self, freq): index=idx, columns=["A", "B", "C"], ) - freq = df.index.to_period(df.index.freq.rule_code).freq + freq = freq_to_period_freqstr(1, df.index.freq.rule_code) + freq = df.index.to_period(freq).freq _check_plot_works(df.plot, freq) @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -288,7 +291,7 @@ def test_plot_multiple_inferred_freq(self): def test_uhf(self): import pandas.plotting._matplotlib.converter as conv - idx = date_range("2012-6-22 21:59:51.960928", freq="L", periods=500) + idx = date_range("2012-6-22 21:59:51.960928", freq="ms", periods=500) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx ) @@ -306,7 +309,7 @@ def test_uhf(self): assert xp == rs def test_irreg_hf(self): - idx = date_range("2012-6-22 21:59:51", freq="S", periods=10) + idx = date_range("2012-6-22 21:59:51", freq="s", periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx ) @@ -320,7 +323,7 @@ def test_irreg_hf(self): assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all() def test_irreg_hf_object(self): - idx = date_range("2012-6-22 21:59:51", freq="S", periods=10) + idx = date_range("2012-6-22 21:59:51", freq="s", periods=10) df2 = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx ) @@ -435,7 +438,7 @@ def test_get_finder(self): assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder - assert conv.get_finder(to_offset("M")) == conv._monthly_finder + assert conv.get_finder(to_offset("ME")) == conv._monthly_finder assert conv.get_finder(to_offset("Q")) == conv._quarterly_finder assert conv.get_finder(to_offset("A")) == conv._annual_finder assert conv.get_finder(to_offset("W")) == conv._daily_finder @@ -801,7 +804,7 @@ def test_mixed_freq_irregular_first_df(self): def test_mixed_freq_hf_first(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -815,7 +818,7 @@ def test_mixed_freq_alignment(self): ts_data = np.random.default_rng(2).standard_normal(12) ts = Series(ts_data, index=ts_ind) - ts2 = ts.asfreq("T").interpolate() + ts2 = ts.asfreq("min").interpolate() _, ax = mpl.pyplot.subplots() ax = ts.plot(ax=ax) @@ -825,7 +828,7 @@ def test_mixed_freq_alignment(self): def test_mixed_freq_lf_first(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ 
-838,7 +841,7 @@ def test_mixed_freq_lf_first(self): mpl.pyplot.close(ax.get_figure()) def test_mixed_freq_lf_first_hourly(self): - idxh = date_range("1/1/1999", periods=240, freq="T") + idxh = date_range("1/1/1999", periods=240, freq="min") idxl = date_range("1/1/1999", periods=4, freq="H") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) @@ -846,7 +849,7 @@ def test_mixed_freq_lf_first_hourly(self): low.plot(ax=ax) high.plot(ax=ax) for line in ax.get_lines(): - assert PeriodIndex(data=line.get_xdata()).freq == "T" + assert PeriodIndex(data=line.get_xdata()).freq == "min" @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_mixed_freq_irreg_period(self): @@ -862,7 +865,7 @@ def test_mixed_freq_irreg_period(self): def test_mixed_freq_shared_ax(self): # GH13341, using sharex=True - idx1 = date_range("2015-01-01", periods=3, freq="M") + idx1 = date_range("2015-01-01", periods=3, freq="ME") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -877,7 +880,7 @@ def test_mixed_freq_shared_ax(self): def test_mixed_freq_shared_ax_twin_x(self): # GH13341, using sharex=True - idx1 = date_range("2015-01-01", periods=3, freq="M") + idx1 = date_range("2015-01-01", periods=3, freq="ME") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -915,7 +918,7 @@ def test_nat_handling(self): def test_to_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -926,7 +929,7 @@ def test_to_weekly_resampling(self): def test_from_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -949,7 +952,7 @@ def test_from_weekly_resampling(self): @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")]) def test_from_resampling_area_line_mixed(self, kind1, kind2): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = DataFrame( np.random.default_rng(2).random((len(idxh), 3)), index=idxh, @@ -1005,7 +1008,7 @@ def test_from_resampling_area_line_mixed(self, kind1, kind2): @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")]) def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = DataFrame( np.random.default_rng(2).random((len(idxh), 3)), index=idxh, @@ -1058,8 +1061,8 @@ def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="S", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100L", periods=500) + idxh = 
date_range("2014-07-01 09:00", freq="s", periods=50) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # high to low @@ -1068,12 +1071,12 @@ def test_mixed_freq_second_millisecond(self): low.plot(ax=ax) assert len(ax.get_lines()) == 2 for line in ax.get_lines(): - assert PeriodIndex(data=line.get_xdata()).freq == "L" + assert PeriodIndex(data=line.get_xdata()).freq == "ms" def test_mixed_freq_second_millisecond_low_to_high(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="S", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100L", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=50) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # low to high @@ -1082,7 +1085,7 @@ def test_mixed_freq_second_millisecond_low_to_high(self): high.plot(ax=ax) assert len(ax.get_lines()) == 2 for line in ax.get_lines(): - assert PeriodIndex(data=line.get_xdata()).freq == "L" + assert PeriodIndex(data=line.get_xdata()).freq == "ms" def test_irreg_dtypes(self): # date @@ -1211,7 +1214,7 @@ def test_time_musec(self): def test_secondary_upsample(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -1329,7 +1332,7 @@ def test_secondary_legend_nonts_multi_col(self): @pytest.mark.xfail(reason="Api changed in 3.6.0") def test_format_date_axis(self): - rng = date_range("1/1/2012", periods=12, freq="M") + rng = date_range("1/1/2012", periods=12, freq="ME") df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng) _, ax = mpl.pyplot.subplots() ax = df.plot(ax=ax) @@ -1632,8 +1635,10 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): if orig_axfreq is None: assert ax.freq == dfreq + if freq is not None: + ax_freq = to_offset(ax.freq, is_period=True) if freq is not None and orig_axfreq is None: - assert ax.freq == freq + assert ax_freq == freq ax = fig.add_subplot(212) kwargs["ax"] = ax diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6f0afab53c267..e77dc0b305171 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -715,7 +715,7 @@ def test_errorbar_plot_yerr_0(self): ) def test_errorbar_plot_ts(self, yerr): # test time series plotting - ix = date_range("1/1/2000", "1/1/2001", freq="M") + ix = date_range("1/1/2000", "1/1/2001", freq="ME") ts = Series(np.arange(12), index=ix, name="x") yerr.index = ix diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 87892a81cef3d..021252500e814 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1078,6 +1078,25 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() + def test_any_all_pyarrow_string(self): + # GH#54591 + pytest.importorskip("pyarrow") + ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = 
Series([None, "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, ""], dtype="string[pyarrow_numpy]") + assert not ser.any() + assert not ser.all() + + ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() + def test_timedelta64_analytics(self): # index min/max dti = date_range("2012-1-1", periods=3, freq="D") diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 55d78c516b6f3..3cea39fa75ece 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -41,7 +41,7 @@ def test_dt64_mean(self, tz_naive_fixture, box): assert obj.mean(skipna=False) is pd.NaT @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) - @pytest.mark.parametrize("freq", ["S", "H", "D", "W", "B"]) + @pytest.mark.parametrize("freq", ["s", "H", "D", "W", "B"]) def test_period_mean(self, box, freq): # GH#24757 dti = pd.date_range("2001-01-01", periods=11) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 7a76a21a6c579..ae28a010d8435 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -83,8 +83,8 @@ def test_asfreq_fill_value(series, create_index): def test_resample_interpolate(frame): # GH#12925 df = frame - result = df.resample("1T").asfreq().interpolate() - expected = df.resample("1T").interpolate() + result = df.resample("1min").asfreq().interpolate() + expected = df.resample("1min").interpolate() tm.assert_frame_equal(result, expected) @@ -100,12 +100,12 @@ def test_raises_on_non_datetimelike_index(): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 ser = empty_series_dti - if freq == "M" and isinstance(ser.index, TimedeltaIndex): + if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. '24H' or '3D', not " @@ -113,7 +113,9 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): with pytest.raises(ValueError, match=msg): ser.resample(freq) return - + elif freq == "ME" and isinstance(ser.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" rs = ser.resample(freq) result = getattr(rs, resample_method)() @@ -136,7 +138,7 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): @pytest.mark.parametrize( "freq", [ - pytest.param("M", marks=pytest.mark.xfail(reason="Don't know why this fails")), + pytest.param("ME", marks=pytest.mark.xfail(reason="Don't know why this fails")), "D", "H", ], @@ -162,12 +164,12 @@ def test_resample_nat_index_series(freq, series, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) def test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 ser = empty_series_dti - if freq == "M" and isinstance(ser.index, TimedeltaIndex): + if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. 
'24H' or '3D', not " @@ -175,7 +177,9 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): with pytest.raises(ValueError, match=msg): ser.resample(freq) return - + elif freq == "ME" and isinstance(ser.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" rs = ser.resample(freq) result = getattr(rs, resample_method)() @@ -188,12 +192,12 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti # count retains dimensions too - if freq == "M" and isinstance(df.index, TimedeltaIndex): + if freq == "ME" and isinstance(df.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. '24H' or '3D', not " @@ -201,7 +205,9 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): with pytest.raises(ValueError, match=msg): df.resample(freq, group_keys=False) return - + elif freq == "ME" and isinstance(df.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" rs = df.resample(freq, group_keys=False) result = getattr(rs, resample_method)() if resample_method == "ohlc": @@ -228,13 +234,13 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_count_empty_dataframe(freq, empty_frame_dti): # GH28427 empty_frame_dti["a"] = [] - if freq == "M" and isinstance(empty_frame_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. '24H' or '3D', not " @@ -242,7 +248,9 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) return - + elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" result = empty_frame_dti.resample(freq).count() index = _asfreq_compat(empty_frame_dti.index, freq) @@ -253,13 +261,13 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_resample_size_empty_dataframe(freq, empty_frame_dti): # GH28427 empty_frame_dti["a"] = [] - if freq == "M" and isinstance(empty_frame_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. 
'24H' or '3D', not " @@ -267,7 +275,9 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) return - + elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" result = empty_frame_dti.resample(freq).size() index = _asfreq_compat(empty_frame_dti.index, freq) @@ -298,12 +308,12 @@ def test_resample_empty_dtypes(index, dtype, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "H"]) def test_apply_to_empty_series(empty_series_dti, freq): # GH 14313 ser = empty_series_dti - if freq == "M" and isinstance(empty_series_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_series_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " "e.g. '24H' or '3D', not " @@ -311,7 +321,9 @@ def test_apply_to_empty_series(empty_series_dti, freq): with pytest.raises(ValueError, match=msg): empty_series_dti.resample(freq) return - + elif freq == "ME" and isinstance(empty_series_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" result = ser.resample(freq, group_keys=False).apply(lambda x: 1) expected = ser.resample(freq).apply("sum") diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index dbda751e82113..d929dfa6e1e59 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -82,7 +82,7 @@ def test_custom_grouper(index, unit): arr = [1] + [5] * 2592 idx = dti[0:-1:5] idx = idx.append(dti[-1:]) - idx = DatetimeIndex(idx, freq="5T").as_unit(unit) + idx = DatetimeIndex(idx, freq="5min").as_unit(unit) expect = Series(arr, index=idx) # GH2763 - return input dtype if we can @@ -140,21 +140,21 @@ def test_resample_integerarray(unit): # GH 25580, resample on IntegerArray ts = Series( range(9), - index=date_range("1/1/2000", periods=9, freq="T").as_unit(unit), + index=date_range("1/1/2000", periods=9, freq="min").as_unit(unit), dtype="Int64", ) - result = ts.resample("3T").sum() + result = ts.resample("3min").sum() expected = Series( [3, 12, 21], - index=date_range("1/1/2000", periods=3, freq="3T").as_unit(unit), + index=date_range("1/1/2000", periods=3, freq="3min").as_unit(unit), dtype="Int64", ) tm.assert_series_equal(result, expected) - result = ts.resample("3T").mean() + result = ts.resample("3min").mean() expected = Series( [1, 4, 7], - index=date_range("1/1/2000", periods=3, freq="3T").as_unit(unit), + index=date_range("1/1/2000", periods=3, freq="3min").as_unit(unit), dtype="Float64", ) tm.assert_series_equal(result, expected) @@ -255,11 +255,11 @@ class FnClass: def __call__(self, x): return str(type(x)) - df_standard = df.resample("M").apply(fn) - df_lambda = df.resample("M").apply(lambda x: str(type(x))) - df_partial = df.resample("M").apply(partial(fn)) - df_partial2 = df.resample("M").apply(partial(fn, a=2)) - df_class = df.resample("M").apply(FnClass()) + df_standard = df.resample("ME").apply(fn) + df_lambda = df.resample("ME").apply(lambda x: str(type(x))) + df_partial = df.resample("ME").apply(partial(fn)) + df_partial2 = df.resample("ME").apply(partial(fn, a=2)) + df_class = df.resample("ME").apply(FnClass()) tm.assert_frame_equal(df_standard, df_lambda) tm.assert_frame_equal(df_standard, df_partial) @@ -428,14 +428,14 @@ def 
test_resample_frame_basic_cy_funcs(f, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) - b = Grouper(freq="M") + b = Grouper(freq="ME") g = df.groupby(b) # check all cython functions work g._cython_agg_general(f, alt=None, numeric_only=True) -@pytest.mark.parametrize("freq", ["A", "M"]) +@pytest.mark.parametrize("freq", ["A", "ME"]) def test_resample_frame_basic_M_A(freq, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) @@ -443,7 +443,7 @@ def test_resample_frame_basic_M_A(freq, unit): tm.assert_series_equal(result["A"], df["A"].resample(freq).mean()) -@pytest.mark.parametrize("freq", ["W-WED", "M"]) +@pytest.mark.parametrize("freq", ["W-WED", "ME"]) def test_resample_frame_basic_kind(freq, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) @@ -493,7 +493,7 @@ def test_resample_how_method(unit): ), ) expected.index = expected.index.as_unit(unit) - tm.assert_series_equal(s.resample("10S").mean(), expected) + tm.assert_series_equal(s.resample("10s").mean(), expected) def test_resample_extra_index_point(unit): @@ -508,16 +508,16 @@ def test_resample_extra_index_point(unit): def test_upsample_with_limit(unit): - rng = date_range("1/1/2000", periods=3, freq="5t").as_unit(unit) + rng = date_range("1/1/2000", periods=3, freq="5min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) - result = ts.resample("t").ffill(limit=2) + result = ts.resample("min").ffill(limit=2) expected = ts.reindex(result.index, method="ffill", limit=2) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("freq", ["5D", "10H", "5Min", "10S"]) -@pytest.mark.parametrize("rule", ["Y", "3M", "15D", "30H", "15Min", "30S"]) +@pytest.mark.parametrize("freq", ["5D", "10H", "5Min", "10s"]) +@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30H", "15Min", "30s"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): # GH 33939 rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture).as_unit( @@ -560,10 +560,10 @@ def test_resample_ohlc_result(unit): index = index.union(date_range("4-15-2000", "5-15-2000", freq="h").as_unit(unit)) s = Series(range(len(index)), index=index) - a = s.loc[:"4-15-2000"].resample("30T").ohlc() + a = s.loc[:"4-15-2000"].resample("30min").ohlc() assert isinstance(a, DataFrame) - b = s.loc[:"4-14-2000"].resample("30T").ohlc() + b = s.loc[:"4-14-2000"].resample("30min").ohlc() assert isinstance(b, DataFrame) @@ -670,7 +670,7 @@ def test_resample_reresample(unit): [ ["A-DEC", {"start": "1990", "end": "2000", "freq": "a-dec"}], ["A-JUN", {"start": "1990", "end": "2000", "freq": "a-jun"}], - ["M", {"start": "1990-01", "end": "2000-01", "freq": "M"}], + ["ME", {"start": "1990-01", "end": "2000-01", "freq": "M"}], ], ) def test_resample_timestamp_to_period( @@ -710,7 +710,7 @@ def test_downsample_non_unique(unit): rng2 = rng.repeat(5).values ts = Series(np.random.default_rng(2).standard_normal(len(rng2)), index=rng2) - result = ts.resample("M").mean() + result = ts.resample("ME").mean() expected = ts.groupby(lambda x: x.month).mean() assert len(result) == 2 @@ -739,12 +739,12 @@ def test_resample_axis1(unit): warning_msg = "DataFrame.resample with axis=1 is deprecated." 
with tm.assert_produces_warning(FutureWarning, match=warning_msg): - result = df.resample("M", axis=1).mean() - expected = df.T.resample("M").mean().T + result = df.resample("ME", axis=1).mean() + expected = df.T.resample("ME").mean().T tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("freq", ["t", "5t", "15t", "30t", "4h", "12h"]) +@pytest.mark.parametrize("freq", ["min", "5min", "15min", "30min", "4h", "12h"]) def test_resample_anchored_ticks(freq, unit): # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should # "anchor" the origin at midnight so we get regular intervals rather @@ -765,7 +765,7 @@ def test_resample_single_group(end, unit): rng = date_range("2000-1-1", f"2000-{end}-10", freq="D").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - tm.assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) + tm.assert_series_equal(ts.resample("ME").sum(), ts.resample("ME").apply(mysum)) def test_resample_single_group_std(unit): @@ -1030,7 +1030,7 @@ def _create_series(values, timestamps, freq="D"): def test_resample_daily_anchored(unit): - rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T").as_unit(unit) + rng = date_range("1/1/2000 0:00:00", periods=10000, freq="min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ts[:2] = np.nan # so results are the same @@ -1045,7 +1045,7 @@ def test_resample_to_period_monthly_buglet(unit): rng = date_range("1/1/2000", "12/31/2000").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - result = ts.resample("M", kind="period").mean() + result = ts.resample("ME", kind="period").mean() exp_index = period_range("Jan-2000", "Dec-2000", freq="M") tm.assert_index_equal(result.index, exp_index) @@ -1077,8 +1077,12 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - result = df.groupby("ID").resample("5min").sum() - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("ID").resample("5min").sum() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1097,7 +1101,9 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1132,7 +1138,7 @@ def test_monthly_resample_error(unit): dates = date_range("4/16/2012 20:00", periods=5000, freq="h").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates) # it works! 
- ts.resample("M") + ts.resample("ME") def test_nanosecond_resample_error(): @@ -1140,12 +1146,12 @@ def test_nanosecond_resample_error(): # Resampling using pd.tseries.offsets.Nano as period start = 1443707890427 exp_start = 1443707890400 - indx = date_range(start=pd.to_datetime(start), periods=10, freq="100n") + indx = date_range(start=pd.to_datetime(start), periods=10, freq="100ns") ts = Series(range(len(indx)), index=indx) r = ts.resample(pd.tseries.offsets.Nano(100)) result = r.agg("mean") - exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n") + exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100ns") exp = Series(range(len(exp_indx)), index=exp_indx, dtype=float) tm.assert_series_equal(result, exp) @@ -1157,20 +1163,20 @@ def test_resample_anchored_intraday(simple_date_range_series, unit): rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit) df = DataFrame(rng.month, index=rng) - result = df.resample("M").mean() - expected = df.resample("M", kind="period").mean().to_timestamp(how="end") + result = df.resample("ME").mean() + expected = df.resample("ME", kind="period").mean().to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") expected.index = expected.index.as_unit(unit)._with_freq("infer") - assert expected.index.freq == "M" + assert expected.index.freq == "ME" tm.assert_frame_equal(result, expected) - result = df.resample("M", closed="left").mean() - exp = df.shift(1, freq="D").resample("M", kind="period").mean() + result = df.resample("ME", closed="left").mean() + exp = df.shift(1, freq="D").resample("ME", kind="period").mean() exp = exp.to_timestamp(how="end") exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") exp.index = exp.index.as_unit(unit)._with_freq("infer") - assert exp.index.freq == "M" + assert exp.index.freq == "ME" tm.assert_frame_equal(result, exp) rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit) @@ -1195,7 +1201,7 @@ def test_resample_anchored_intraday(simple_date_range_series, unit): ts = simple_date_range_series("2012-04-29 23:00", "2012-04-30 5:00", freq="h") ts.index = ts.index.as_unit(unit) - resampled = ts.resample("M").mean() + resampled = ts.resample("ME").mean() assert len(resampled) == 1 @@ -1214,25 +1220,25 @@ def test_resample_anchored_multiday(label, sec): # # See: https://github.com/pandas-dev/pandas/issues/8683 - index1 = date_range("2014-10-14 23:06:23.206", periods=3, freq="400L") - index2 = date_range("2014-10-15 23:00:00", periods=2, freq="2200L") + index1 = date_range("2014-10-14 23:06:23.206", periods=3, freq="400ms") + index2 = date_range("2014-10-15 23:00:00", periods=2, freq="2200ms") index = index1.union(index2) s = Series(np.random.default_rng(2).standard_normal(5), index=index) # Ensure left closing works - result = s.resample("2200L", label=label).mean() + result = s.resample("2200ms", label=label).mean() assert result.index[-1] == Timestamp(f"2014-10-15 23:00:{sec}00") def test_corner_cases(unit): # miscellaneous test coverage - rng = date_range("1/1/2000", periods=12, freq="t").as_unit(unit) + rng = date_range("1/1/2000", periods=12, freq="min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - result = ts.resample("5t", closed="right", label="left").mean() - ex_index = date_range("1999-12-31 23:55", periods=4, freq="5t").as_unit(unit) + result = ts.resample("5min", closed="right", label="left").mean() + ex_index = date_range("1999-12-31 23:55", periods=4, 
freq="5min").as_unit(unit) tm.assert_index_equal(result.index, ex_index) @@ -1248,7 +1254,7 @@ def test_corner_cases_date(simple_date_range_series, unit): # resample to periods ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") ts.index = ts.index.as_unit(unit) - result = ts.resample("M", kind="period").mean() + result = ts.resample("ME", kind="period").mean() assert len(result) == 1 assert result.index[0] == Period("2000-04", freq="M") @@ -1302,12 +1308,12 @@ def test_resample_median_bug_1688(dtype): dtype=dtype, ) - result = df.resample("T").apply(lambda x: x.mean()) - exp = df.asfreq("T") + result = df.resample("min").apply(lambda x: x.mean()) + exp = df.asfreq("min") tm.assert_frame_equal(result, exp) - result = df.resample("T").median() - exp = df.asfreq("T") + result = df.resample("min").median() + exp = df.asfreq("min") tm.assert_frame_equal(result, exp) @@ -1315,23 +1321,23 @@ def test_how_lambda_functions(simple_date_range_series, unit): ts = simple_date_range_series("1/1/2000", "4/1/2000") ts.index = ts.index.as_unit(unit) - result = ts.resample("M").apply(lambda x: x.mean()) - exp = ts.resample("M").mean() + result = ts.resample("ME").apply(lambda x: x.mean()) + exp = ts.resample("ME").mean() tm.assert_series_equal(result, exp) - foo_exp = ts.resample("M").mean() + foo_exp = ts.resample("ME").mean() foo_exp.name = "foo" - bar_exp = ts.resample("M").std() + bar_exp = ts.resample("ME").std() bar_exp.name = "bar" - result = ts.resample("M").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) + result = ts.resample("ME").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) result.columns = ["foo", "bar"] tm.assert_series_equal(result["foo"], foo_exp) tm.assert_series_equal(result["bar"], bar_exp) # this is a MI Series, so comparing the names of the results # doesn't make sense - result = ts.resample("M").aggregate( + result = ts.resample("ME").aggregate( {"foo": lambda x: x.mean(), "bar": lambda x: x.std(ddof=1)} ) tm.assert_series_equal(result["foo"], foo_exp, check_names=False) @@ -1354,12 +1360,12 @@ def test_resample_consistency(unit): # GH 6418 # resample with bfill / limit / reindex consistency - i30 = date_range("2002-02-02", periods=4, freq="30T").as_unit(unit) + i30 = date_range("2002-02-02", periods=4, freq="30min").as_unit(unit) s = Series(np.arange(4.0), index=i30) s.iloc[2] = np.nan # Upsample by factor 3 with reindex() and resample() methods: - i10 = date_range(i30[0], i30[-1], freq="10T").as_unit(unit) + i10 = date_range(i30[0], i30[-1], freq="10min").as_unit(unit) s10 = s.reindex(index=i10, method="bfill") s10_2 = s.reindex(index=i10, method="bfill", limit=2) @@ -1392,10 +1398,10 @@ def test_resample_consistency(unit): def test_resample_timegrouper(dates): # GH 7227 df = DataFrame({"A": dates, "B": np.arange(len(dates))}) - result = df.set_index("A").resample("M").count() + result = df.set_index("A").resample("ME").count() exp_idx = DatetimeIndex( ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], - freq="M", + freq="ME", name="A", ) expected = DataFrame({"B": [1, 0, 2, 2, 1]}, index=exp_idx) @@ -1403,11 +1409,11 @@ def test_resample_timegrouper(dates): expected.index = expected.index._with_freq(None) tm.assert_frame_equal(result, expected) - result = df.groupby(Grouper(freq="M", key="A")).count() + result = df.groupby(Grouper(freq="ME", key="A")).count() tm.assert_frame_equal(result, expected) df = DataFrame({"A": dates, "B": np.arange(len(dates)), "C": np.arange(len(dates))}) - result = 
df.set_index("A").resample("M").count() + result = df.set_index("A").resample("ME").count() expected = DataFrame( {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, index=exp_idx, @@ -1417,7 +1423,7 @@ def test_resample_timegrouper(dates): expected.index = expected.index._with_freq(None) tm.assert_frame_equal(result, expected) - result = df.groupby(Grouper(freq="M", key="A")).count() + result = df.groupby(Grouper(freq="ME", key="A")).count() tm.assert_frame_equal(result, expected) @@ -1479,7 +1485,7 @@ def test_resample_nunique_with_date_gap(func, unit): index2 = date_range("4-15-2000", "5-15-2000", freq="h").as_unit(unit) index3 = index.append(index2) s = Series(range(len(index3)), index=index3, dtype="int64") - r = s.resample("M") + r = s.resample("ME") result = r.count() expected = func(r) tm.assert_series_equal(result, expected) @@ -1493,11 +1499,13 @@ def test_resample_group_info(n, k, unit): # use a fixed seed to always have the same uniques prng = np.random.default_rng(2) - dr = date_range(start="2015-08-27", periods=n // 10, freq="T").as_unit(unit) + dr = date_range(start="2015-08-27", periods=n // 10, freq="min").as_unit(unit) ts = Series(prng.integers(0, n // k, n).astype("int64"), index=prng.choice(dr, n)) - left = ts.resample("30T").nunique() - ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30T").as_unit(unit) + left = ts.resample("30min").nunique() + ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30min").as_unit( + unit + ) vals = ts.values bins = np.searchsorted(ix.values, ts.index, side="right") @@ -1516,14 +1524,16 @@ def test_resample_group_info(n, k, unit): def test_resample_size(unit): n = 10000 - dr = date_range("2015-09-19", periods=n, freq="T").as_unit(unit) + dr = date_range("2015-09-19", periods=n, freq="min").as_unit(unit) ts = Series( np.random.default_rng(2).standard_normal(n), index=np.random.default_rng(2).choice(dr, n), ) - left = ts.resample("7T").size() - ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7T").as_unit(unit) + left = ts.resample("7min").size() + ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7min").as_unit( + unit + ) bins = np.searchsorted(ix.values, ts.index.values, side="right") val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype("int64", copy=False) @@ -1819,8 +1829,12 @@ def f(data, add_arg): # Testing dataframe df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) @@ -1828,13 +1842,13 @@ def f(data, add_arg): @pytest.mark.parametrize( "n1, freq1, n2, freq2", [ - (30, "S", 0.5, "Min"), - (60, "S", 1, "Min"), - (3600, "S", 1, "H"), + (30, "s", 0.5, "Min"), + (60, "s", 1, "Min"), + (3600, "s", 1, "H"), (60, "Min", 1, "H"), - (21600, "S", 0.25, "D"), - (86400, "S", 1, "D"), - (43200, "S", 0.5, "D"), + (21600, "s", 0.25, "D"), + (86400, "s", 1, "D"), + (43200, "s", 0.5, "D"), (1440, "Min", 1, "D"), (12, "H", 0.5, "D"), (24, "H", 1, "D"), @@ -1858,9 +1872,9 @@ def 
test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): ("19910905", "19920406", "D", "19910905", "19920407"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), - ("19910906", "19920406", "M", "19910831", "19920430"), - ("19910831", "19920430", "M", "19910831", "19920531"), - ("1991-08", "1992-04", "M", "19910831", "19920531"), + ("19910906", "19920406", "ME", "19910831", "19920430"), + ("19910831", "19920430", "ME", "19910831", "19920531"), + ("1991-08", "1992-04", "ME", "19910831", "19920531"), ], ) def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit): @@ -1881,7 +1895,7 @@ def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit) @pytest.mark.parametrize("duplicates", [True, False]) def test_resample_apply_product(duplicates, unit): # GH 5586 - index = date_range(start="2012-01-31", freq="M", periods=12).as_unit(unit) + index = date_range(start="2012-01-31", freq="ME", periods=12).as_unit(unit) ts = Series(range(12), index=index) df = DataFrame({"A": ts, "B": ts + 2}) @@ -1954,7 +1968,7 @@ def test_resample_calendar_day_with_dst( @pytest.mark.parametrize("func", ["min", "max", "first", "last"]) def test_resample_aggregate_functions_min_count(func, unit): # GH#37768 - index = date_range(start="2020", freq="M", periods=3).as_unit(unit) + index = date_range(start="2020", freq="ME", periods=3).as_unit(unit) ser = Series([1, np.nan, np.nan], index) result = getattr(ser.resample("Q"), func)(min_count=2) expected = Series( @@ -2027,3 +2041,13 @@ def test_resample_empty_series_with_tz(): ) expected = Series([], index=expected_idx, name="values", dtype="float64") tm.assert_series_equal(result, expected) + + +def test_resample_M_deprecated(): + depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + + s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + expected = s.resample("2ME").mean() + with tm.assert_produces_warning(UserWarning, match=depr_msg): + result = s.resample("2M").mean() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 7559a85de7a6b..db3804a6600b9 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -103,17 +103,19 @@ def test_selection(self, index, freq, kind, kwargs): @pytest.mark.parametrize("month", MONTHS) @pytest.mark.parametrize("meth", ["ffill", "bfill"]) @pytest.mark.parametrize("conv", ["start", "end"]) - @pytest.mark.parametrize("targ", ["D", "B", "M"]) + @pytest.mark.parametrize( + ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M")] + ) def test_annual_upsample_cases( - self, targ, conv, meth, month, simple_period_range_series + self, offset, period, conv, meth, month, simple_period_range_series ): ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"A-{month}") - warn = FutureWarning if targ == "B" else None + warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): - result = getattr(ts.resample(targ, convention=conv), meth)() - expected = result.to_timestamp(targ, how=conv) - expected = expected.asfreq(targ, meth).to_period() + result = getattr(ts.resample(period, convention=conv), meth)() + expected = result.to_timestamp(period, how=conv) + expected = expected.asfreq(offset, meth).to_period() 
tm.assert_series_equal(result, expected) def test_basic_downsample(self, simple_period_range_series): @@ -182,19 +184,21 @@ def test_annual_upsample(self, simple_period_range_series): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("month", MONTHS) - @pytest.mark.parametrize("target", ["D", "B", "M"]) @pytest.mark.parametrize("convention", ["start", "end"]) + @pytest.mark.parametrize( + ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M")] + ) def test_quarterly_upsample( - self, month, target, convention, simple_period_range_series + self, month, offset, period, convention, simple_period_range_series ): freq = f"Q-{month}" ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) - warn = FutureWarning if target == "B" else None + warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): - result = ts.resample(target, convention=convention).ffill() - expected = result.to_timestamp(target, how=convention) - expected = expected.asfreq(target, "ffill").to_period() + result = ts.resample(period, convention=convention).ffill() + expected = result.to_timestamp(period, how=convention) + expected = expected.asfreq(offset, "ffill").to_period() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("target", ["D", "B"]) @@ -219,13 +223,13 @@ def test_resample_basic(self): ) s[10:30] = np.nan index = PeriodIndex( - [Period("2013-01-01 00:00", "T"), Period("2013-01-01 00:01", "T")], + [Period("2013-01-01 00:00", "min"), Period("2013-01-01 00:01", "min")], name="idx", ) expected = Series([34.5, 79.5], index=index) - result = s.to_period().resample("T", kind="period").mean() + result = s.to_period().resample("min", kind="period").mean() tm.assert_series_equal(result, expected) - result2 = s.resample("T", kind="period").mean() + result2 = s.resample("min", kind="period").mean() tm.assert_series_equal(result2, expected) @pytest.mark.parametrize( @@ -325,11 +329,11 @@ def test_with_local_timezone_dateutil(self): def test_resample_nonexistent_time_bin_edge(self): # GH 19375 - index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15T") + index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15min") s = Series(np.zeros(len(index)), index=index) expected = s.tz_localize("US/Pacific") - expected.index = pd.DatetimeIndex(expected.index, freq="900S") - result = expected.resample("900S").mean() + expected.index = pd.DatetimeIndex(expected.index, freq="900s") + result = expected.resample("900s").mean() tm.assert_series_equal(result, expected) # GH 23742 @@ -350,10 +354,13 @@ def test_resample_nonexistent_time_bin_edge(self): def test_resample_ambiguous_time_bin_edge(self): # GH 10117 idx = date_range( - "2014-10-25 22:00:00", "2014-10-26 00:30:00", freq="30T", tz="Europe/London" + "2014-10-25 22:00:00", + "2014-10-26 00:30:00", + freq="30min", + tz="Europe/London", ) expected = Series(np.zeros(len(idx)), index=idx) - result = expected.resample("30T").mean() + result = expected.resample("30min").mean() tm.assert_series_equal(result, expected) def test_fill_method_and_how_upsample(self): @@ -362,8 +369,8 @@ def test_fill_method_and_how_upsample(self): np.arange(9, dtype="int64"), index=date_range("2010-01-01", periods=9, freq="Q"), ) - last = s.resample("M").ffill() - both = s.resample("M").ffill().resample("M").last().astype("int64") + last = s.resample("ME").ffill() + both = s.resample("ME").ffill().resample("ME").last().astype("int64") tm.assert_series_equal(last, both) 
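For context, the test updates in these hunks track the frequency-alias rename in pandas ("M" -> "ME" for month end, and "T"/"S"/"L" -> "min"/"s"/"ms"). A minimal sketch of the equivalence those tests exercise, assuming pandas 2.2+ where the new aliases exist and the old ones still resolve to the same bins but emit a deprecation warning:

import warnings
import numpy as np
import pandas as pd

idx = pd.date_range("2000-01-01", periods=365, freq="D")
ser = pd.Series(np.arange(365.0), index=idx)

# New spelling: "ME" means month end (likewise "min"/"s"/"ms" replace "T"/"S"/"L").
new = ser.resample("ME").mean()

# Old spelling still produces the same bins, but warns on recent versions.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    old = ser.resample("M").mean()

pd.testing.assert_series_equal(old, new)
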
@pytest.mark.parametrize("day", DAYS) @@ -438,7 +445,7 @@ def test_cant_fill_missing_dups(self): @pytest.mark.parametrize("freq", ["5min"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_resample_5minute(self, freq, kind): - rng = period_range("1/1/2000", "1/5/2000", freq="T") + rng = period_range("1/1/2000", "1/5/2000", freq="min") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) expected = ts.to_timestamp().resample(freq).mean() if kind != "timestamp": @@ -505,7 +512,7 @@ def test_resample_tz_localized(self): # #2245 idx = date_range( - "2001-09-20 15:59", "2001-09-20 16:00", freq="T", tz="Australia/Sydney" + "2001-09-20 15:59", "2001-09-20 16:00", freq="min", tz="Australia/Sydney" ) s = Series([1, 2], index=idx) @@ -627,7 +634,7 @@ def test_resample_bms_2752(self): @pytest.mark.xfail(reason="Commented out for more than 3 years. Should this work?") def test_monthly_convention_span(self): - rng = period_range("2000-01", periods=3, freq="M") + rng = period_range("2000-01", periods=3, freq="ME") ts = Series(np.arange(3), index=rng) # hacky way to get same thing @@ -640,7 +647,7 @@ def test_monthly_convention_span(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "from_freq, to_freq", [("D", "M"), ("Q", "A"), ("M", "Q"), ("D", "W")] + "from_freq, to_freq", [("D", "ME"), ("Q", "A"), ("ME", "Q"), ("D", "W")] ) def test_default_right_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -653,7 +660,7 @@ def test_default_right_closed_label(self, from_freq, to_freq): @pytest.mark.parametrize( "from_freq, to_freq", - [("D", "MS"), ("Q", "AS"), ("M", "QS"), ("H", "D"), ("T", "H")], + [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("H", "D"), ("min", "H")], ) def test_default_left_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -800,7 +807,7 @@ def test_upsampling_ohlc(self, freq, period_mult, kind): ) def test_resample_with_nat(self, periods, values, freq, expected_values): # GH 13224 - index = PeriodIndex(periods, freq="S") + index = PeriodIndex(periods, freq="s") frame = DataFrame(values, index=index) expected_index = period_range( @@ -812,7 +819,7 @@ def test_resample_with_nat(self, periods, values, freq, expected_values): def test_resample_with_only_nat(self): # GH 13224 - pi = PeriodIndex([pd.NaT] * 3, freq="S") + pi = PeriodIndex([pd.NaT] * 3, freq="s") frame = DataFrame([2, 3, 5], index=pi, columns=["a"]) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index, columns=["a"], dtype="float64") @@ -834,7 +841,6 @@ def test_resample_with_only_nat(self): ("19910905 12:00", "19910909 12:00", "H", "24H", "34H"), ("19910905 12:00", "19910909 12:00", "H", "17H", "10H"), ("19910905 12:00", "19910909 12:00", "H", "17H", "3H"), - ("19910905 12:00", "19910909 1:00", "H", "M", "3H"), ("19910905", "19910913 06:00", "2H", "24H", "10H"), ("19910905", "19910905 01:39", "Min", "5Min", "3Min"), ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"), @@ -848,43 +854,54 @@ def test_resample_with_offset(self, start, end, start_freq, end_freq, offset): result = result.to_timestamp(end_freq) expected = ser.to_timestamp().resample(end_freq, offset=offset).mean() - if end_freq == "M": - # TODO: is non-tick the relevant characteristic? 
(GH 33815) - expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + def test_resample_with_offset_month(self): + # GH 23882 & 31809 + pi = period_range("19910905 12:00", "19910909 1:00", freq="H") + ser = Series(np.arange(len(pi)), index=pi) + result = ser.resample("M", offset="3H").mean() + result = result.to_timestamp("M") + expected = ser.to_timestamp().resample("ME", offset="3H").mean() + # TODO: is non-tick the relevant characteristic? (GH 33815) + expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "first,last,freq,exp_first,exp_last", + "first,last,freq,freq_to_offset,exp_first,exp_last", [ - ("19910905", "19920406", "D", "19910905", "19920406"), - ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), + ("19910905", "19920406", "D", "D", "19910905", "19920406"), + ("19910905 00:00", "19920406 06:00", "D", "D", "19910905", "19920406"), ( "19910905 06:00", "19920406 06:00", "H", + "H", "19910905 06:00", "19920406 06:00", ), - ("19910906", "19920406", "M", "1991-09", "1992-04"), - ("19910831", "19920430", "M", "1991-08", "1992-04"), - ("1991-08", "1992-04", "M", "1991-08", "1992-04"), + ("19910906", "19920406", "M", "ME", "1991-09", "1992-04"), + ("19910831", "19920430", "M", "ME", "1991-08", "1992-04"), + ("1991-08", "1992-04", "M", "ME", "1991-08", "1992-04"), ], ) - def test_get_period_range_edges(self, first, last, freq, exp_first, exp_last): + def test_get_period_range_edges( + self, first, last, freq, freq_to_offset, exp_first, exp_last + ): first = Period(first) last = Period(last) exp_first = Period(exp_first, freq=freq) exp_last = Period(exp_last, freq=freq) - freq = pd.tseries.frequencies.to_offset(freq) + freq = pd.tseries.frequencies.to_offset(freq_to_offset) result = _get_period_range_edges(first, last, freq) expected = (exp_first, exp_last) assert result == expected def test_sum_min_count(self): # GH 19974 - index = date_range(start="2018", freq="M", periods=6) + index = date_range(start="2018", freq="ME", periods=6) data = np.ones(6) data[3:6] = np.nan s = Series(data, index).to_period() @@ -893,3 +910,30 @@ def test_sum_min_count(self): [3.0, np.nan], index=PeriodIndex(["2018Q1", "2018Q2"], freq="Q-DEC") ) tm.assert_series_equal(result, expected) + + def test_resample_t_l_deprecated(self): + # GH 52536 + msg_t = "'T' is deprecated and will be removed in a future version." + msg_l = "'L' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg_l): + rng_l = period_range( + "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="L" + ) + ser = Series(np.arange(len(rng_l)), index=rng_l) + + rng = period_range( + "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="min" + ) + expected = Series([29999.5, 60000.0], index=rng) + with tm.assert_produces_warning(FutureWarning, match=msg_t): + result = ser.resample("T").mean() + tm.assert_series_equal(result, expected) + + +def test_resample_frequency_ME_error_message(series_and_frame): + msg = "Invalid frequency: 2ME" + + obj = series_and_frame + with pytest.raises(ValueError, match=msg): + obj.resample("2ME") diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 1cfcf555355b5..86a7439410d8b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -77,7 +77,9 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) @@ -171,7 +173,7 @@ def test_attribute_access(test_frame): def test_api_compat_before_use(attr): # make sure that we are setting the binner # on these attributes - rng = date_range("1/1/2012", periods=100, freq="S") + rng = date_range("1/1/2012", periods=100, freq="s") ts = Series(np.arange(len(rng)), index=rng) rs = ts.resample("30s") @@ -201,7 +203,7 @@ def tests_raises_on_nuisance(test_frame): def test_downsample_but_actually_upsampling(): # this is reindex / asfreq - rng = date_range("1/1/2012", periods=100, freq="S") + rng = date_range("1/1/2012", periods=100, freq="s") ts = Series(np.arange(len(rng), dtype="int64"), index=rng) result = ts.resample("20s").asfreq() expected = Series( @@ -216,7 +218,7 @@ def test_combined_up_downsampling_of_irregular(): # ts2.resample('2s').mean().ffill() # preserve these semantics - rng = date_range("1/1/2012", periods=100, freq="S") + rng = date_range("1/1/2012", periods=100, freq="s") ts = Series(np.arange(len(rng)), index=rng) ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]] @@ -260,7 +262,7 @@ def test_combined_up_downsampling_of_irregular(): "2012-01-01 00:00:30", ], dtype="datetime64[ns]", - freq="2S", + freq="2s", ), ) tm.assert_series_equal(result, expected) @@ -294,7 +296,7 @@ def test_transform_frame(on): def test_fillna(): # need to upsample here - rng = date_range("1/1/2012", periods=10, freq="2S") + rng = date_range("1/1/2012", periods=10, freq="2s") ts = Series(np.arange(len(rng), dtype="int64"), index=rng) r = ts.resample("s") @@ -344,11 +346,11 @@ def test_agg_consistency(): # similar aggregations with and w/o selection list df = DataFrame( np.random.default_rng(2).standard_normal((1000, 3)), - index=date_range("1/1/2012", freq="S", periods=1000), + index=date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) - r = df.resample("3T") + r = df.resample("3min") msg = r"Column\(s\) \['r1', 'r2'\] do not exist" with pytest.raises(KeyError, match=msg): @@ -359,11 +361,11 @@ def test_agg_consistency_int_str_column_mix(): # GH#39025 df = DataFrame( 
np.random.default_rng(2).standard_normal((1000, 2)), - index=date_range("1/1/2012", freq="S", periods=1000), + index=date_range("1/1/2012", freq="s", periods=1000), columns=[1, "a"], ) - r = df.resample("3T") + r = df.resample("3min") msg = r"Column\(s\) \[2, 'b'\] do not exist" with pytest.raises(KeyError, match=msg): @@ -597,7 +599,7 @@ def test_multi_agg_axis_1_raises(func): ).T warning_msg = "DataFrame.resample with axis=1 is deprecated." with tm.assert_produces_warning(FutureWarning, match=warning_msg): - res = df.resample("M", axis=1) + res = df.resample("ME", axis=1) with pytest.raises( NotImplementedError, match="axis other than 0 is not supported" ): @@ -650,7 +652,7 @@ def test_try_aggregate_non_existing_column(): # Error as we don't have 'z' column msg = r"Column\(s\) \['z'\] do not exist" with pytest.raises(KeyError, match=msg): - df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) + df.resample("30min").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) def test_agg_list_like_func_with_args(): @@ -1021,7 +1023,7 @@ def test_df_axis_param_depr(): # Deprecation error when axis=1 is explicitly passed warning_msg = "DataFrame.resample with axis=1 is deprecated." with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("M", axis=1) + df.resample("ME", axis=1) # Deprecation error when axis=0 is explicitly passed df = df.T @@ -1030,7 +1032,7 @@ def test_df_axis_param_depr(): "will be removed in a future version." ) with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("M", axis=0) + df.resample("ME", axis=0) def test_series_axis_param_depr(_test_series): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 62b0bc2012af1..f394e3a25dc0f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -68,8 +68,12 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - expected = df.groupby("id").apply(f_0) - result = df.set_index("date").groupby("id").resample("D").asfreq() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("id").apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -83,8 +87,12 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - expected = df.groupby("group").apply(f_1) - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.groupby("group").apply(f_1) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -99,7 +107,9 @@ def test_getitem(test_frame): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - result = g.resample("2s").mean().B + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -184,7 +194,7 @@ def 
test_groupby_with_origin(): def test_nearest(): # GH 17496 # Resample nearest - index = date_range("1/1/2000", periods=3, freq="T") + index = date_range("1/1/2000", periods=3, freq="min") result = Series(range(3), index=index).resample("20s").nearest() expected = Series( @@ -200,7 +210,7 @@ def test_nearest(): "2000-01-01 00:02:00", ], dtype="datetime64[ns]", - freq="20S", + freq="20s", ), ) tm.assert_series_equal(result, expected) @@ -230,8 +240,12 @@ def test_methods(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -248,8 +262,12 @@ def test_methods_nunique(test_frame): def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = getattr(r, f)(ddof=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -258,18 +276,24 @@ def test_apply(test_frame): r = g.resample("2s") # reduction - expected = g.resample("2s").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - result = r.apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - result = g.apply(f_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -286,14 +310,14 @@ def f(x): s = Series([1, 2], index=["a", "b"]) return s - expected = df.groupby(pd.Grouper(freq="M")).apply(f) + expected = df.groupby(pd.Grouper(freq="ME")).apply(f) - result = df.resample("M").apply(f) + result = df.resample("ME").apply(f) tm.assert_frame_equal(result, expected) # A case for series - expected = df["col1"].groupby(pd.Grouper(freq="M"), group_keys=False).apply(f) - result = df["col1"].resample("M").apply(f) + expected = df["col1"].groupby(pd.Grouper(freq="ME"), group_keys=False).apply(f) + result = df["col1"].resample("ME").apply(f) tm.assert_series_equal(result, expected) @@ -321,7 +345,7 @@ def weighted_quantile(series, weights, q): cutoff = cumsum.iloc[-1] * q return series[cumsum >= cutoff].iloc[0] - times = date_range("2017-6-23 18:00", periods=8, freq="15T", tz="UTC") + times = date_range("2017-6-23 18:00", periods=8, freq="15min", tz="UTC") data = Series([1.0, 1, 1, 1, 1, 2, 2, 0], index=times) weights = 
Series([160.0, 91, 65, 43, 24, 10, 1, 0], index=times) @@ -337,7 +361,9 @@ def test_resample_groupby_with_label(): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - result = df.groupby("col0").resample("1W", label="left").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -357,7 +383,9 @@ def test_consistency_with_window(test_frame): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - result = df.groupby("A").resample("2s").mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -441,7 +469,7 @@ def test_resample_groupby_agg_listlike(): # GH 42905 ts = Timestamp("2021-02-28 00:00:00") df = DataFrame({"class": ["beta"], "value": [69]}, index=Index([ts], name="date")) - resampled = df.groupby("class").resample("M")["value"] + resampled = df.groupby("class").resample("ME")["value"] result = resampled.agg(["sum", "size"]) expected = DataFrame( [[69, 1]], @@ -455,7 +483,9 @@ def test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) @@ -478,7 +508,8 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - result = df.groupby(["key"]).resample("W", on="date").min() + with tm.assert_produces_warning(FutureWarning): + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -530,7 +561,9 @@ def test_resample_no_index(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) expected = expected.set_index("date", append=True, drop=True) @@ -577,7 +610,9 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="H", periods=12), ) - result = df.groupby("A").resample("D").size() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").resample("D").size() expected = Series( 3, index=pd.MultiIndex.from_tuples( diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 
8c06f1e8a1e38..3e0922228cb74 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -70,7 +70,7 @@ def test_apply_iteration(): N = 1000 ind = date_range(start="2000-01-01", freq="D", periods=N) df = DataFrame({"open": 1, "close": 2}, index=ind) - tg = Grouper(freq="M") + tg = Grouper(freq="ME") grouper, _ = tg._get_grouper(df) @@ -305,10 +305,10 @@ def test_repr(): ) def test_upsample_sum(method, method_args, expected_values): s = Series(1, index=date_range("2017", periods=2, freq="H")) - resampled = s.resample("30T") + resampled = s.resample("30min") index = pd.DatetimeIndex( ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"], - freq="30T", + freq="30min", ) result = methodcaller(method, **method_args)(resampled) expected = Series(expected_values, index=index) @@ -323,12 +323,14 @@ def test_groupby_resample_interpolate(): df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") - ) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ( + df.set_index("week_starting") + .groupby("volume") + .resample("1D") + .interpolate(method="linear") + ) expected_ind = pd.MultiIndex.from_tuples( [ diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index a119a911e5fbe..79b13673e70c6 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -14,10 +14,10 @@ def test_asfreq_bug(): df = DataFrame(data=[1, 3], index=[timedelta(), timedelta(minutes=3)]) - result = df.resample("1T").asfreq() + result = df.resample("1min").asfreq() expected = DataFrame( data=[1, np.nan, np.nan, 3], - index=timedelta_range("0 day", periods=4, freq="1T"), + index=timedelta_range("0 day", periods=4, freq="1min"), ) tm.assert_frame_equal(result, expected) @@ -28,19 +28,19 @@ def test_resample_with_nat(): result = DataFrame({"value": [2, 3, 5]}, index).resample("1s").mean() expected = DataFrame( {"value": [2.5, np.nan, 5.0]}, - index=timedelta_range("0 day", periods=3, freq="1S"), + index=timedelta_range("0 day", periods=3, freq="1s"), ) tm.assert_frame_equal(result, expected) def test_resample_as_freq_with_subperiod(): # GH 13022 - index = timedelta_range("00:00:00", "00:10:00", freq="5T") + index = timedelta_range("00:00:00", "00:10:00", freq="5min") df = DataFrame(data={"value": [1, 5, 10]}, index=index) - result = df.resample("2T").asfreq() + result = df.resample("2min").asfreq() expected_data = {"value": [1, np.nan, np.nan, np.nan, np.nan, 10]} expected = DataFrame( - data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2T") + data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2min") ) tm.assert_frame_equal(result, expected) @@ -71,9 +71,9 @@ def test_resample_single_period_timedelta(): def test_resample_timedelta_idempotency(): # GH 12072 - index = timedelta_range("0", periods=9, freq="10L") + index = timedelta_range("0", periods=9, freq="10ms") series = Series(range(9), index=index) - result = series.resample("10L").mean() + result = series.resample("10ms").mean() expected = series.astype(float) tm.assert_series_equal(result, expected) @@ -128,13 +128,13 @@ def test_resample_timedelta_values(): @pytest.mark.parametrize( "start, end, freq, resample_freq", [ - ("8H", "21h59min50s", "10S", "3H"), # GH 30353 example + 
("8H", "21h59min50s", "10s", "3H"), # GH 30353 example ("3H", "22H", "1H", "5H"), ("527D", "5006D", "3D", "10D"), ("1D", "10D", "1D", "2D"), # GH 13022 example # tests that worked before GH 33498: - ("8H", "21h59min50s", "10S", "2H"), - ("0H", "21h59min50s", "10S", "3H"), + ("8H", "21h59min50s", "10s", "2H"), + ("0H", "21h59min50s", "10s", "3H"), ("10D", "85D", "D", "2D"), ], ) @@ -155,7 +155,7 @@ def test_resample_with_timedelta_yields_no_empty_groups(duplicates): # GH 10603 df = DataFrame( np.random.default_rng(2).normal(size=(10000, 4)), - index=timedelta_range(start="0s", periods=10000, freq="3906250n"), + index=timedelta_range(start="0s", periods=10000, freq="3906250ns"), ) if duplicates: # case with non-unique columns @@ -196,11 +196,11 @@ def test_resample_closed_right(): # GH#45414 idx = pd.Index([pd.Timedelta(seconds=120 + i * 30) for i in range(10)]) ser = Series(range(10), index=idx) - result = ser.resample("T", closed="right", label="right").sum() + result = ser.resample("min", closed="right", label="right").sum() expected = Series( [0, 3, 7, 11, 15, 9], index=pd.TimedeltaIndex( - [pd.Timedelta(seconds=120 + i * 60) for i in range(6)], freq="T" + [pd.Timedelta(seconds=120 + i * 60) for i in range(6)], freq="min" ), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 3efcd930af581..5dde863f246d1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -858,3 +858,12 @@ def test_concat_multiindex_with_category(): ) expected = expected.set_index(["c1", "c2"]) tm.assert_frame_equal(result, expected) + + +def test_concat_ea_upcast(): + # GH#54848 + df1 = DataFrame(["a"], dtype="string") + df2 = DataFrame([1], dtype="Int64") + result = concat([df1, df2]) + expected = DataFrame(["a", 1], index=[0, 0]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index a06fc5eede55c..12d28c388d508 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -77,23 +77,23 @@ def test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 15:00:00+00:00", - "2010-12-31 16:00:00+00:00", - "2010-12-31 17:00:00+00:00", "2010-12-31 23:00:00+00:00", "2011-01-01 00:00:00+00:00", "2011-01-01 01:00:00+00:00", + "2010-12-31 15:00:00+00:00", + "2010-12-31 16:00:00+00:00", + "2010-12-31 17:00:00+00:00", ] ) expected = DataFrame( [ - [np.nan, 1], - [np.nan, 2], - [np.nan, 3], [1, np.nan], [2, np.nan], [3, np.nan], + [np.nan, 1], + [np.nan, 2], + [np.nan, 3], ], index=exp_idx, columns=["a", "b"], @@ -112,7 +112,7 @@ def test_concat_datetime_timezone(self): def test_concat_datetimeindex_freq(self): # GH 3232 # Monotonic index result - dr = date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") + dr = date_range("01-Jan-2013", periods=100, freq="50ms", tz="UTC") data = list(range(100)) expected = DataFrame(data, index=dr) result = concat([expected[:50], expected[50:]]) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index cb6bdba4c7f54..20daa388c2c88 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -813,9 +813,7 @@ def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix left_grouped = left.groupby(join_col) right_grouped = right.groupby(join_col) - for group_key, group in 
result.groupby( - join_col if len(join_col) > 1 else join_col[0] - ): + for group_key, group in result.groupby(join_col): l_joined = _restrict_to_columns(group, left.columns, lsuffix) r_joined = _restrict_to_columns(group, right.columns, rsuffix) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 91510fae0a9b1..d889ae2e4806b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -26,7 +26,6 @@ TimedeltaIndex, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import ( MergeError, @@ -582,11 +581,11 @@ def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): df_empty = df[:0] expected = DataFrame( { - "value_x": Series(dtype=df.dtypes["value"]), "key": Series(dtype=df.dtypes["key"]), + "value_x": Series(dtype=df.dtypes["value"]), "value_y": Series(dtype=df.dtypes["value"]), }, - columns=["value_x", "key", "value_y"], + columns=["key", "value_x", "value_y"], ) actual = df_empty.merge(df, on="key") tm.assert_frame_equal(actual, expected) @@ -889,13 +888,13 @@ def test_merge_on_datetime64tz_empty(self): result = left.merge(right, on="date") expected = DataFrame( { + "date": Series(dtype=dtz), "value_x": Series(dtype=float), "date2_x": Series(dtype=dtz), - "date": Series(dtype=dtz), "value_y": Series(dtype=float), "date2_y": Series(dtype=dtz), }, - columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + columns=["date", "value_x", "date2_x", "value_y", "date2_y"], ) tm.assert_frame_equal(result, expected) @@ -1462,13 +1461,14 @@ def test_merge_readonly(self): def _check_merge(x, y): for how in ["inner", "left", "outer"]: - result = x.join(y, how=how) + for sort in [True, False]: + result = x.join(y, how=how, sort=sort) - expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True) - expected = expected.set_index("index") + expected = merge(x.reset_index(), y.reset_index(), how=how, sort=sort) + expected = expected.set_index("index") - # TODO check_names on merge? - tm.assert_frame_equal(result, expected, check_names=False) + # TODO check_names on merge? 
+ tm.assert_frame_equal(result, expected, check_names=False) class TestMergeDtypes: @@ -1751,7 +1751,7 @@ def test_merge_string_dtype(self, how, expected_data, any_string_dtype): "how, expected_data", [ ("inner", [[True, 1, 4], [False, 5, 3]]), - ("outer", [[True, 1, 4], [False, 5, 3]]), + ("outer", [[False, 5, 3], [True, 1, 4]]), ("left", [[True, 1, 4], [False, 5, 3]]), ("right", [[False, 5, 3], [True, 1, 4]]), ], @@ -1826,11 +1826,9 @@ def test_merge_empty(self, left_empty, how, exp): if exp == "left": expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]}) elif exp == "right": - expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]}) + expected = DataFrame({"A": [1], "B": [np.nan], "C": [5]}) elif exp == "empty": expected = DataFrame(columns=["A", "B", "C"], dtype="int64") - if left_empty: - expected = expected[["B", "A", "C"]] elif exp == "empty_cross": expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64") @@ -1843,7 +1841,7 @@ def left(): { "X": Series( np.random.default_rng(2).choice(["foo", "bar"], size=(10,)) - ).astype(CDT(["foo", "bar"])), + ).astype(CategoricalDtype(["foo", "bar"])), "Y": np.random.default_rng(2).choice(["one", "two", "three"], size=(10,)), } ) @@ -1852,7 +1850,10 @@ def left(): @pytest.fixture def right(): return DataFrame( - {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]} + { + "X": Series(["foo", "bar"]).astype(CategoricalDtype(["foo", "bar"])), + "Z": [1, 2], + } ) @@ -2003,8 +2004,8 @@ def test_other_columns(self, left, right): "change", [ lambda x: x, - lambda x: x.astype(CDT(["foo", "bar", "bah"])), - lambda x: x.astype(CDT(ordered=True)), + lambda x: x.astype(CategoricalDtype(["foo", "bar", "bah"])), + lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) def test_dtype_on_merged_different(self, change, join_type, left, right): @@ -2111,11 +2112,13 @@ def test_merging_with_bool_or_int_cateorical_column( # GH 17187 # merging with a boolean/int categorical column df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) - df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) + df1["cat"] = df1["cat"].astype(CategoricalDtype(categories, ordered=ordered)) df2 = DataFrame({"id": [2, 4], "num": [1, 9]}) result = df1.merge(df2) expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]}) - expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) + expected["cat"] = expected["cat"].astype( + CategoricalDtype(categories, ordered=ordered) + ) tm.assert_frame_equal(expected, result) def test_merge_on_int_array(self): @@ -2331,9 +2334,9 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): "outer", DataFrame( { - "A": [100, 200, 1, 300], - "B1": [60, 70, 80, np.nan], - "B2": [600, 700, np.nan, 800], + "A": [1, 100, 200, 300], + "B1": [80, 60, 70, np.nan], + "B2": [np.nan, 600, 700, 800], } ), ), @@ -2480,14 +2483,12 @@ def test_merge_multiindex_columns(): result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))) # Constructing the expected results - expected_labels = [letter + l_suf for letter in letters] + [ - letter + r_suf for letter in letters - ] - expected_index = MultiIndex.from_product( - [expected_labels, numbers], names=["outer", "inner"] - ) + tuples = [(letter + l_suf, num) for letter in letters for num in numbers] + tuples += [("id", "")] + tuples += [(letter + r_suf, num) for letter in letters for num in numbers] + + expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = 
DataFrame(columns=expected_index) - expected["id"] = "" tm.assert_frame_equal(result, expected) @@ -2752,9 +2753,9 @@ def test_merge_outer_with_NaN(dtype): result = merge(right, left, on="key", how="outer") expected = DataFrame( { - "key": [np.nan, np.nan, 1, 2], - "col2": [3, 4, np.nan, np.nan], - "col1": [np.nan, np.nan, 1, 2], + "key": [1, 2, np.nan, np.nan], + "col2": [np.nan, np.nan, 3, 4], + "col1": [1, 2, np.nan, np.nan], }, dtype=dtype, ) @@ -2847,3 +2848,137 @@ def test_merge_multiindex_single_level(): result = df.merge(df2, left_on=["col"], right_index=True, how="left") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("on_index", [True, False]) +@pytest.mark.parametrize("left_unique", [True, False]) +@pytest.mark.parametrize("left_monotonic", [True, False]) +@pytest.mark.parametrize("right_unique", [True, False]) +@pytest.mark.parametrize("right_monotonic", [True, False]) +def test_merge_combinations( + how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic +): + # GH 54611 + left = [2, 3] + if left_unique: + left.append(4 if left_monotonic else 1) + else: + left.append(3 if left_monotonic else 2) + + right = [2, 3] + if right_unique: + right.append(4 if right_monotonic else 1) + else: + right.append(3 if right_monotonic else 2) + + left = DataFrame({"key": left}) + right = DataFrame({"key": right}) + + if on_index: + left = left.set_index("key") + right = right.set_index("key") + on_kwargs = {"left_index": True, "right_index": True} + else: + on_kwargs = {"on": "key"} + + result = merge(left, right, how=how, sort=sort, **on_kwargs) + + if on_index: + left = left.reset_index() + right = right.reset_index() + + if how in ["left", "right", "inner"]: + if how in ["left", "inner"]: + expected, other, other_unique = left, right, right_unique + else: + expected, other, other_unique = right, left, left_unique + if how == "inner": + keep_values = set(left["key"].values).intersection(right["key"].values) + keep_mask = expected["key"].isin(keep_values) + expected = expected[keep_mask] + if sort: + expected = expected.sort_values("key") + if not other_unique: + other_value_counts = other["key"].value_counts() + repeats = other_value_counts.reindex(expected["key"].values, fill_value=1) + repeats = repeats.astype(np.intp) + expected = expected["key"].repeat(repeats.values) + expected = expected.to_frame() + elif how == "outer": + if on_index and left_unique and left["key"].equals(right["key"]): + expected = DataFrame({"key": left["key"]}) + else: + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + expected = expected_counts.index.values.repeat(expected_counts.values) + expected = DataFrame({"key": expected}) + expected = expected.sort_values("key") + + if on_index: + expected = expected.set_index("key") + else: + expected = expected.reset_index(drop=True) + + tm.assert_frame_equal(result, expected) + + +def test_merge_ea_int_and_float_numpy(): + # GH#46178 + df1 = DataFrame([1.0, np.nan], dtype=pd.Int64Dtype()) + df2 = DataFrame([1.5]) + expected = DataFrame(columns=[0], dtype="Int64") + + with tm.assert_produces_warning(UserWarning, match="You are merging"): + result = df1.merge(df2) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning, 
match="You are merging"): + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) + + df2 = DataFrame([1.0]) + expected = DataFrame([1], columns=[0], dtype="Int64") + result = df1.merge(df2) + tm.assert_frame_equal(result, expected) + + result = df2.merge(df1) + tm.assert_frame_equal(result, expected.astype("float64")) + + +def test_merge_arrow_string_index(any_string_dtype): + # GH#54894 + pytest.importorskip("pyarrow") + left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype) + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) + result = left.merge(right, left_on="a", right_index=True, how="left") + expected = DataFrame( + {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("left_empty", [True, False]) +@pytest.mark.parametrize("right_empty", [True, False]) +def test_merge_empty_frames_column_order(left_empty, right_empty): + # GH 51929 + df1 = DataFrame(1, index=[0], columns=["A", "B"]) + df2 = DataFrame(1, index=[0], columns=["A", "C", "D"]) + + if left_empty: + df1 = df1.iloc[:0] + if right_empty: + df2 = df2.iloc[:0] + + result = merge(df1, df2, on=["A"], how="outer") + expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + if left_empty and right_empty: + expected = expected.iloc[:0] + elif left_empty: + expected.loc[:, "B"] = np.nan + elif right_empty: + expected.loc[:, ["C", "D"]] = np.nan + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 088d1e7e3c85e..ab010bdb909f1 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -741,10 +741,8 @@ def test_join_multi_levels2(self): expected = ( DataFrame( { - "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + "household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4], "asset_id": [ - "nl0000301109", - "nl0000301109", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", @@ -754,11 +752,11 @@ def test_join_multi_levels2(self): "lu0197800237", "lu0197800237", "nl0000289965", + "nl0000301109", + "nl0000301109", None, ], "t": [ - None, - None, 233, 234, 235, @@ -769,10 +767,10 @@ def test_join_multi_levels2(self): 181, None, None, + None, + None, ], "share": [ - 1.0, - 0.4, 0.6, 0.6, 0.6, @@ -783,10 +781,10 @@ def test_join_multi_levels2(self): 0.6, 0.25, 1.0, + 0.4, + 1.0, ], "log_return": [ - None, - None, 0.09604978, -0.06524096, 0.03532373, @@ -797,6 +795,8 @@ def test_join_multi_levels2(self): 0.036997, None, None, + None, + None, ], } ) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index b2a6ac49fdff2..8c4c51289870b 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -21,7 +21,7 @@ to_datetime, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype import pandas.core.reshape.tile as tmod @@ -359,7 +359,7 @@ def test_cut_return_intervals(): IntervalIndex.from_breaks(exp_bins, closed="right").take( [0, 0, 0, 1, 1, 1, 2, 2, 2] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -370,7 +370,7 @@ def test_series_ret_bins(): expected = Series( IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -445,7 
+445,7 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) @@ -491,19 +491,37 @@ def test_datetime_cut(data): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(Series(result), expected) -@pytest.mark.parametrize( - "bins", - [ - 3, +@pytest.mark.parametrize("box", [list, np.array, Index, Series]) +def test_datetime_tz_cut_mismatched_tzawareness(box): + # GH#54964 + bins = box( [ Timestamp("2013-01-01 04:57:07.200000"), Timestamp("2013-01-01 21:00:00"), Timestamp("2013-01-02 13:00:00"), Timestamp("2013-01-03 05:00:00"), + ] + ) + ser = Series(date_range("20130101", periods=3, tz="US/Eastern")) + + msg = "Cannot use timezone-naive bins with timezone-aware values" + with pytest.raises(ValueError, match=msg): + cut(ser, bins) + + +@pytest.mark.parametrize( + "bins", + [ + 3, + [ + Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"), ], ], ) @@ -534,7 +552,7 @@ def test_datetime_tz_cut(bins, box): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -700,7 +718,7 @@ def test_cut_with_duplicated_index_lowest_included(): def test_cut_with_nonexact_categorical_indices(): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46da18445e135..8435f4a189c56 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -23,7 +23,7 @@ date_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype from pandas.core.reshape import reshape as reshape_lib from pandas.core.reshape.pivot import pivot_table @@ -33,7 +33,7 @@ def dropna(request): return request.param -@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))]) +@pytest.fixture(params=[([0] * 4, [1] * 4), (range(3), range(1, 4))]) def interval_values(request, closed): left, right = request.param return Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) @@ -215,14 +215,16 @@ def test_pivot_table_dropna_categoricals(self, dropna): { "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], - "C": range(0, 9), + "C": range(9), } ) - df["A"] = df["A"].astype(CDT(categories, ordered=False)) + df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False)) result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) expected_columns = Series(["a", "b", "c"], name="A") - expected_columns = expected_columns.astype(CDT(categories, ordered=False)) + expected_columns = expected_columns.astype( + CategoricalDtype(categories, ordered=False) + ) expected_index = Series([1, 2, 3], name="B") expected = DataFrame( [[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]], @@ -434,7 +436,7 @@ def test_pivot_no_values(self): }, index=idx, ) - res = df.pivot_table(index=df.index.month, columns=Grouper(key="dt", freq="M")) + 
res = df.pivot_table(index=df.index.month, columns=Grouper(key="dt", freq="ME")) exp_columns = MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) exp_columns.names = [None, "dt"] exp = DataFrame( @@ -443,7 +445,7 @@ def test_pivot_no_values(self): tm.assert_frame_equal(res, exp) res = df.pivot_table( - index=Grouper(freq="A"), columns=Grouper(key="dt", freq="M") + index=Grouper(freq="A"), columns=Grouper(key="dt", freq="ME") ) exp = DataFrame( [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns @@ -1434,8 +1436,8 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, - index=Grouper(freq="M", key="Date"), - columns=Grouper(freq="M", key="PayDay"), + index=Grouper(freq="ME", key="Date"), + columns=Grouper(freq="ME", key="PayDay"), values="Quantity", aggfunc="sum", ) @@ -1467,7 +1469,7 @@ def test_pivot_timegrouper_double(self): datetime(2013, 11, 30), datetime(2013, 12, 31), ], - freq="M", + freq="ME", ), columns=pd.DatetimeIndex( [ @@ -1476,7 +1478,7 @@ def test_pivot_timegrouper_double(self): datetime(2013, 11, 30), datetime(2013, 12, 31), ], - freq="M", + freq="ME", ), ) expected.index.name = "Date" @@ -1486,8 +1488,8 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, - index=Grouper(freq="M", key="PayDay"), - columns=Grouper(freq="M", key="Date"), + index=Grouper(freq="ME", key="PayDay"), + columns=Grouper(freq="ME", key="Date"), values="Quantity", aggfunc="sum", ) @@ -1513,7 +1515,7 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, - index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + index=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")], columns=["Branch"], values="Quantity", aggfunc="sum", @@ -1523,7 +1525,7 @@ def test_pivot_timegrouper_double(self): result = pivot_table( df, index=["Branch"], - columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + columns=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")], values="Quantity", aggfunc="sum", ) @@ -1729,7 +1731,7 @@ def test_daily(self, i): @pytest.mark.parametrize("i", range(1, 13)) def test_monthly(self, i): - rng = date_range("1/1/2000", "12/31/2004", freq="M") + rng = date_range("1/1/2000", "12/31/2004", freq="ME") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) annual = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month) @@ -1768,7 +1770,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): { "item": ["bacon", "cheese", "bacon", "cheese"], "cost": [2.5, 4.5, 3.2, 3.3], - "day": ["M", "M", "T", "T"], + "day": ["ME", "ME", "T", "T"], } ) table = costs.pivot_table( @@ -1780,10 +1782,10 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): ) ix = Index(["bacon", "cheese", margins_name], dtype="object", name="item") tups = [ - ("mean", "cost", "M"), + ("mean", "cost", "ME"), ("mean", "cost", "T"), ("mean", "cost", margins_name), - ("max", "cost", "M"), + ("max", "cost", "ME"), ("max", "cost", "T"), ("max", "cost", margins_name), ] diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 907eeca6e9b5e..bcfbe5ed1aa20 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -20,7 +20,7 @@ timedelta_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype from pandas.tseries.offsets import ( Day, @@ -129,7 +129,9 @@ def test_qcut_return_intervals(): exp_levels = 
np.array( [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] ) - exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( + CategoricalDtype(ordered=True) + ) tm.assert_series_equal(res, exp) @@ -199,7 +201,7 @@ def test_single_quantile(data, start, end, length, labels): if labels is None: intervals = IntervalIndex([Interval(start, end)] * length, closed="right") - expected = Series(intervals).astype(CDT(ordered=True)) + expected = Series(intervals).astype(CategoricalDtype(ordered=True)) else: expected = Series([0] * length, dtype=np.intp) @@ -249,7 +251,7 @@ def test_datetime_tz_qcut(bins): ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index 192aaacbac2b5..a02dbf0a0413f 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -81,7 +81,7 @@ def test_hash(self, interval): (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")), (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")), (Timedelta("1H10min"), Timedelta("5H5min"), Timedelta("3H55min")), - (Timedelta("5S"), Timedelta("1H"), Timedelta("59min55S")), + (Timedelta("5s"), Timedelta("1H"), Timedelta("59min55s")), ], ) def test_length(self, left, right, expected): diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index f6c1675766210..00285148a3c90 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -50,13 +50,13 @@ def test_to_timestamp_out_of_bounds(self): def test_asfreq_corner(self): val = Period(freq="A", year=2007) - result1 = val.asfreq("5t") - result2 = val.asfreq("t") - expected = Period("2007-12-31 23:59", freq="t") + result1 = val.asfreq("5min") + result2 = val.asfreq("min") + expected = Period("2007-12-31 23:59", freq="min") assert result1.ordinal == expected.ordinal - assert result1.freqstr == "5T" + assert result1.freqstr == "5min" assert result2.ordinal == expected.ordinal - assert result2.freqstr == "T" + assert result2.freqstr == "min" def test_conv_annual(self): # frequency conversion tests: from Annual Frequency @@ -87,10 +87,10 @@ def test_conv_annual(self): freq="Min", year=2007, month=12, day=31, hour=23, minute=59 ) ival_A_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_A_to_S_end = Period( - freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=12, day=31, hour=23, minute=59, second=59 ) ival_AJAN_to_D_end = Period(freq="D", year=2007, month=1, day=31) @@ -100,33 +100,37 @@ def test_conv_annual(self): ival_ANOV_to_D_end = Period(freq="D", year=2007, month=11, day=30) ival_ANOV_to_D_start = Period(freq="D", year=2006, month=12, day=1) - assert ival_A.asfreq("Q", "S") == ival_A_to_Q_start + assert ival_A.asfreq("Q", "s") == ival_A_to_Q_start assert ival_A.asfreq("Q", "e") == ival_A_to_Q_end assert ival_A.asfreq("M", "s") == ival_A_to_M_start assert ival_A.asfreq("M", "E") == ival_A_to_M_end - assert ival_A.asfreq("W", "S") == ival_A_to_W_start + assert ival_A.asfreq("W", "s") == ival_A_to_W_start assert ival_A.asfreq("W", "E") == ival_A_to_W_end with tm.assert_produces_warning(FutureWarning, 
match=bday_msg): - assert ival_A.asfreq("B", "S") == ival_A_to_B_start + assert ival_A.asfreq("B", "s") == ival_A_to_B_start assert ival_A.asfreq("B", "E") == ival_A_to_B_end - assert ival_A.asfreq("D", "S") == ival_A_to_D_start + assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - assert ival_A.asfreq("H", "S") == ival_A_to_H_start + assert ival_A.asfreq("H", "s") == ival_A_to_H_start assert ival_A.asfreq("H", "E") == ival_A_to_H_end - assert ival_A.asfreq("min", "S") == ival_A_to_T_start + assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end - assert ival_A.asfreq("T", "S") == ival_A_to_T_start - assert ival_A.asfreq("T", "E") == ival_A_to_T_end - assert ival_A.asfreq("S", "S") == ival_A_to_S_start - assert ival_A.asfreq("S", "E") == ival_A_to_S_end - - assert ival_AJAN.asfreq("D", "S") == ival_AJAN_to_D_start + msg = "'T' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ival_A.asfreq("T", "s") == ival_A_to_T_start + assert ival_A.asfreq("T", "E") == ival_A_to_T_end + msg = "'S' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ival_A.asfreq("S", "S") == ival_A_to_S_start + assert ival_A.asfreq("S", "E") == ival_A_to_S_end + + assert ival_AJAN.asfreq("D", "s") == ival_AJAN_to_D_start assert ival_AJAN.asfreq("D", "E") == ival_AJAN_to_D_end - assert ival_AJUN.asfreq("D", "S") == ival_AJUN_to_D_start + assert ival_AJUN.asfreq("D", "s") == ival_AJUN_to_D_start assert ival_AJUN.asfreq("D", "E") == ival_AJUN_to_D_end - assert ival_ANOV.asfreq("D", "S") == ival_ANOV_to_D_start + assert ival_ANOV.asfreq("D", "s") == ival_ANOV_to_D_start assert ival_ANOV.asfreq("D", "E") == ival_ANOV_to_D_end assert ival_A.asfreq("A") == ival_A @@ -159,10 +163,10 @@ def test_conv_quarterly(self): freq="Min", year=2007, month=3, day=31, hour=23, minute=59 ) ival_Q_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_Q_to_S_end = Period( - freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=3, day=31, hour=23, minute=59, second=59 ) ival_QEJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) @@ -174,25 +178,25 @@ def test_conv_quarterly(self): assert ival_Q.asfreq("A") == ival_Q_to_A assert ival_Q_end_of_year.asfreq("A") == ival_Q_to_A - assert ival_Q.asfreq("M", "S") == ival_Q_to_M_start + assert ival_Q.asfreq("M", "s") == ival_Q_to_M_start assert ival_Q.asfreq("M", "E") == ival_Q_to_M_end - assert ival_Q.asfreq("W", "S") == ival_Q_to_W_start + assert ival_Q.asfreq("W", "s") == ival_Q_to_W_start assert ival_Q.asfreq("W", "E") == ival_Q_to_W_end with tm.assert_produces_warning(FutureWarning, match=bday_msg): - assert ival_Q.asfreq("B", "S") == ival_Q_to_B_start + assert ival_Q.asfreq("B", "s") == ival_Q_to_B_start assert ival_Q.asfreq("B", "E") == ival_Q_to_B_end - assert ival_Q.asfreq("D", "S") == ival_Q_to_D_start + assert ival_Q.asfreq("D", "s") == ival_Q_to_D_start assert ival_Q.asfreq("D", "E") == ival_Q_to_D_end - assert ival_Q.asfreq("H", "S") == ival_Q_to_H_start + assert ival_Q.asfreq("H", "s") == ival_Q_to_H_start assert ival_Q.asfreq("H", "E") == ival_Q_to_H_end - assert ival_Q.asfreq("Min", "S") == ival_Q_to_T_start + assert ival_Q.asfreq("Min", "s") == ival_Q_to_T_start assert 
ival_Q.asfreq("Min", "E") == ival_Q_to_T_end - assert ival_Q.asfreq("S", "S") == ival_Q_to_S_start - assert ival_Q.asfreq("S", "E") == ival_Q_to_S_end + assert ival_Q.asfreq("s", "s") == ival_Q_to_S_start + assert ival_Q.asfreq("s", "E") == ival_Q_to_S_end - assert ival_QEJAN.asfreq("D", "S") == ival_QEJAN_to_D_start + assert ival_QEJAN.asfreq("D", "s") == ival_QEJAN_to_D_start assert ival_QEJAN.asfreq("D", "E") == ival_QEJAN_to_D_end - assert ival_QEJUN.asfreq("D", "S") == ival_QEJUN_to_D_start + assert ival_QEJUN.asfreq("D", "s") == ival_QEJUN_to_D_start assert ival_QEJUN.asfreq("D", "E") == ival_QEJUN_to_D_end assert ival_Q.asfreq("Q") == ival_Q @@ -221,10 +225,10 @@ def test_conv_monthly(self): freq="Min", year=2007, month=1, day=31, hour=23, minute=59 ) ival_M_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_M_to_S_end = Period( - freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=31, hour=23, minute=59, second=59 ) assert ival_M.asfreq("A") == ival_M_to_A @@ -232,19 +236,19 @@ def test_conv_monthly(self): assert ival_M.asfreq("Q") == ival_M_to_Q assert ival_M_end_of_quarter.asfreq("Q") == ival_M_to_Q - assert ival_M.asfreq("W", "S") == ival_M_to_W_start + assert ival_M.asfreq("W", "s") == ival_M_to_W_start assert ival_M.asfreq("W", "E") == ival_M_to_W_end with tm.assert_produces_warning(FutureWarning, match=bday_msg): - assert ival_M.asfreq("B", "S") == ival_M_to_B_start + assert ival_M.asfreq("B", "s") == ival_M_to_B_start assert ival_M.asfreq("B", "E") == ival_M_to_B_end - assert ival_M.asfreq("D", "S") == ival_M_to_D_start + assert ival_M.asfreq("D", "s") == ival_M_to_D_start assert ival_M.asfreq("D", "E") == ival_M_to_D_end - assert ival_M.asfreq("H", "S") == ival_M_to_H_start + assert ival_M.asfreq("H", "s") == ival_M_to_H_start assert ival_M.asfreq("H", "E") == ival_M_to_H_end - assert ival_M.asfreq("Min", "S") == ival_M_to_T_start + assert ival_M.asfreq("Min", "s") == ival_M_to_T_start assert ival_M.asfreq("Min", "E") == ival_M_to_T_end - assert ival_M.asfreq("S", "S") == ival_M_to_S_start - assert ival_M.asfreq("S", "E") == ival_M_to_S_end + assert ival_M.asfreq("s", "s") == ival_M_to_S_start + assert ival_M.asfreq("s", "E") == ival_M_to_S_end assert ival_M.asfreq("M") == ival_M @@ -311,10 +315,10 @@ def test_conv_weekly(self): freq="Min", year=2007, month=1, day=7, hour=23, minute=59 ) ival_W_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_W_to_S_end = Period( - freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=7, hour=23, minute=59, second=59 ) assert ival_W.asfreq("A") == ival_W_to_A @@ -327,33 +331,33 @@ def test_conv_weekly(self): assert ival_W_end_of_month.asfreq("M") == ival_W_to_M_end_of_month with tm.assert_produces_warning(FutureWarning, match=bday_msg): - assert ival_W.asfreq("B", "S") == ival_W_to_B_start + assert ival_W.asfreq("B", "s") == ival_W_to_B_start assert ival_W.asfreq("B", "E") == ival_W_to_B_end - assert ival_W.asfreq("D", "S") == ival_W_to_D_start + assert ival_W.asfreq("D", "s") == ival_W_to_D_start assert ival_W.asfreq("D", "E") == ival_W_to_D_end - assert ival_WSUN.asfreq("D", "S") == ival_WSUN_to_D_start + assert ival_WSUN.asfreq("D", "s") == ival_WSUN_to_D_start assert ival_WSUN.asfreq("D", "E") == ival_WSUN_to_D_end - 
assert ival_WSAT.asfreq("D", "S") == ival_WSAT_to_D_start + assert ival_WSAT.asfreq("D", "s") == ival_WSAT_to_D_start assert ival_WSAT.asfreq("D", "E") == ival_WSAT_to_D_end - assert ival_WFRI.asfreq("D", "S") == ival_WFRI_to_D_start + assert ival_WFRI.asfreq("D", "s") == ival_WFRI_to_D_start assert ival_WFRI.asfreq("D", "E") == ival_WFRI_to_D_end - assert ival_WTHU.asfreq("D", "S") == ival_WTHU_to_D_start + assert ival_WTHU.asfreq("D", "s") == ival_WTHU_to_D_start assert ival_WTHU.asfreq("D", "E") == ival_WTHU_to_D_end - assert ival_WWED.asfreq("D", "S") == ival_WWED_to_D_start + assert ival_WWED.asfreq("D", "s") == ival_WWED_to_D_start assert ival_WWED.asfreq("D", "E") == ival_WWED_to_D_end - assert ival_WTUE.asfreq("D", "S") == ival_WTUE_to_D_start + assert ival_WTUE.asfreq("D", "s") == ival_WTUE_to_D_start assert ival_WTUE.asfreq("D", "E") == ival_WTUE_to_D_end - assert ival_WMON.asfreq("D", "S") == ival_WMON_to_D_start + assert ival_WMON.asfreq("D", "s") == ival_WMON_to_D_start assert ival_WMON.asfreq("D", "E") == ival_WMON_to_D_end - assert ival_W.asfreq("H", "S") == ival_W_to_H_start + assert ival_W.asfreq("H", "s") == ival_W_to_H_start assert ival_W.asfreq("H", "E") == ival_W_to_H_end - assert ival_W.asfreq("Min", "S") == ival_W_to_T_start + assert ival_W.asfreq("Min", "s") == ival_W_to_T_start assert ival_W.asfreq("Min", "E") == ival_W_to_T_end - assert ival_W.asfreq("S", "S") == ival_W_to_S_start - assert ival_W.asfreq("S", "E") == ival_W_to_S_end + assert ival_W.asfreq("s", "s") == ival_W_to_S_start + assert ival_W.asfreq("s", "E") == ival_W_to_S_end assert ival_W.asfreq("W") == ival_W @@ -404,10 +408,10 @@ def test_conv_business(self): freq="Min", year=2007, month=1, day=1, hour=23, minute=59 ) ival_B_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_B_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) assert ival_B.asfreq("A") == ival_B_to_A @@ -421,12 +425,12 @@ def test_conv_business(self): assert ival_B.asfreq("D") == ival_B_to_D - assert ival_B.asfreq("H", "S") == ival_B_to_H_start + assert ival_B.asfreq("H", "s") == ival_B_to_H_start assert ival_B.asfreq("H", "E") == ival_B_to_H_end - assert ival_B.asfreq("Min", "S") == ival_B_to_T_start + assert ival_B.asfreq("Min", "s") == ival_B_to_T_start assert ival_B.asfreq("Min", "E") == ival_B_to_T_end - assert ival_B.asfreq("S", "S") == ival_B_to_S_start - assert ival_B.asfreq("S", "E") == ival_B_to_S_end + assert ival_B.asfreq("s", "s") == ival_B_to_S_start + assert ival_B.asfreq("s", "E") == ival_B_to_S_end with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_B.asfreq("B") == ival_B @@ -470,10 +474,10 @@ def test_conv_daily(self): freq="Min", year=2007, month=1, day=1, hour=23, minute=59 ) ival_D_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_D_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) assert ival_D.asfreq("A") == ival_D_to_A @@ -494,17 +498,17 @@ def test_conv_daily(self): with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_D_friday.asfreq("B") == ival_B_friday - assert ival_D_saturday.asfreq("B", "S") == ival_B_friday + assert 
ival_D_saturday.asfreq("B", "s") == ival_B_friday assert ival_D_saturday.asfreq("B", "E") == ival_B_monday - assert ival_D_sunday.asfreq("B", "S") == ival_B_friday + assert ival_D_sunday.asfreq("B", "s") == ival_B_friday assert ival_D_sunday.asfreq("B", "E") == ival_B_monday - assert ival_D.asfreq("H", "S") == ival_D_to_H_start + assert ival_D.asfreq("H", "s") == ival_D_to_H_start assert ival_D.asfreq("H", "E") == ival_D_to_H_end - assert ival_D.asfreq("Min", "S") == ival_D_to_T_start + assert ival_D.asfreq("Min", "s") == ival_D_to_T_start assert ival_D.asfreq("Min", "E") == ival_D_to_T_end - assert ival_D.asfreq("S", "S") == ival_D_to_S_start - assert ival_D.asfreq("S", "E") == ival_D_to_S_end + assert ival_D.asfreq("s", "s") == ival_D_to_S_start + assert ival_D.asfreq("s", "E") == ival_D_to_S_end assert ival_D.asfreq("D") == ival_D @@ -534,10 +538,10 @@ def test_conv_hourly(self): freq="Min", year=2007, month=1, day=1, hour=0, minute=59 ) ival_H_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_H_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=0, minute=59, second=59 ) assert ival_H.asfreq("A") == ival_H_to_A @@ -554,10 +558,10 @@ def test_conv_hourly(self): assert ival_H.asfreq("B") == ival_H_to_B assert ival_H_end_of_bus.asfreq("B") == ival_H_to_B - assert ival_H.asfreq("Min", "S") == ival_H_to_T_start + assert ival_H.asfreq("Min", "s") == ival_H_to_T_start assert ival_H.asfreq("Min", "E") == ival_H_to_T_end - assert ival_H.asfreq("S", "S") == ival_H_to_S_start - assert ival_H.asfreq("S", "E") == ival_H_to_S_end + assert ival_H.asfreq("s", "s") == ival_H_to_S_start + assert ival_H.asfreq("s", "E") == ival_H_to_S_end assert ival_H.asfreq("H") == ival_H @@ -597,10 +601,10 @@ def test_conv_minutely(self): ival_T_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) ival_T_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_T_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=59 ) assert ival_T.asfreq("A") == ival_T_to_A @@ -619,38 +623,38 @@ def test_conv_minutely(self): assert ival_T.asfreq("H") == ival_T_to_H assert ival_T_end_of_hour.asfreq("H") == ival_T_to_H - assert ival_T.asfreq("S", "S") == ival_T_to_S_start - assert ival_T.asfreq("S", "E") == ival_T_to_S_end + assert ival_T.asfreq("s", "s") == ival_T_to_S_start + assert ival_T.asfreq("s", "E") == ival_T_to_S_end assert ival_T.asfreq("Min") == ival_T def test_conv_secondly(self): # frequency conversion tests: from Secondly Frequency" - ival_S = Period(freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0) + ival_S = Period(freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0) ival_S_end_of_year = Period( - freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=12, day=31, hour=23, minute=59, second=59 ) ival_S_end_of_quarter = Period( - freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=3, day=31, hour=23, minute=59, second=59 ) ival_S_end_of_month = Period( - freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=31, hour=23, minute=59, second=59 ) ival_S_end_of_week = Period( 
- freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=7, hour=23, minute=59, second=59 ) ival_S_end_of_day = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) ival_S_end_of_bus = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) ival_S_end_of_hour = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=0, minute=59, second=59 ) ival_S_end_of_minute = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=59 ) ival_S_to_A = Period(freq="A", year=2007) @@ -681,12 +685,12 @@ def test_conv_secondly(self): assert ival_S.asfreq("Min") == ival_S_to_T assert ival_S_end_of_minute.asfreq("Min") == ival_S_to_T - assert ival_S.asfreq("S") == ival_S + assert ival_S.asfreq("s") == ival_S def test_conv_microsecond(self): # GH#31475 Avoid floating point errors dropping the start_time to # before the beginning of the Period - per = Period("2020-01-30 15:57:27.576166", freq="U") + per = Period("2020-01-30 15:57:27.576166", freq="us") assert per.ordinal == 1580399847576166 start = per.start_time @@ -733,7 +737,7 @@ def test_asfreq_mult(self): assert result.freq == expected.freq # ordinal will not change for freq in ["A", offsets.YearEnd()]: - result = p.asfreq(freq, how="S") + result = p.asfreq(freq, how="s") expected = Period("2007", freq="A") assert result == expected @@ -749,7 +753,7 @@ def test_asfreq_mult(self): assert result.ordinal == expected.ordinal assert result.freq == expected.freq for freq in ["2M", offsets.MonthEnd(2)]: - result = p.asfreq(freq, how="S") + result = p.asfreq(freq, how="s") expected = Period("2007-01", freq="2M") assert result == expected @@ -765,7 +769,7 @@ def test_asfreq_mult(self): assert result.ordinal == expected.ordinal assert result.freq == expected.freq for freq in ["2M", offsets.MonthEnd(2)]: - result = p.asfreq(freq, how="S") + result = p.asfreq(freq, how="s") expected = Period("2007-01", freq="2M") assert result == expected diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index b1fb657bb2051..7d07f327e3978 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -69,10 +69,7 @@ def test_construction(self): assert i1 == i3 i4 = Period("2005", freq="M") - i5 = Period("2005", freq="m") - assert i1 != i4 - assert i4 == i5 i1 = Period.now(freq="Q") i2 = Period(datetime.now(), freq="Q") @@ -102,17 +99,17 @@ def test_construction(self): assert i1 == i3 i1 = Period("2007-01-01 09:00:00.001") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="ms") assert i1 == expected - expected = Period("2007-01-01 09:00:00.001", freq="L") + expected = Period("2007-01-01 09:00:00.001", freq="ms") assert i1 == expected i1 = Period("2007-01-01 09:00:00.00101") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="us") assert i1 == expected - expected = Period("2007-01-01 09:00:00.00101", freq="U") + expected = Period("2007-01-01 09:00:00.00101", freq="us") assert i1 == expected msg = "Must supply freq for ordinal value" @@ -282,17 +279,17 @@ def 
test_period_constructor_offsets(self): assert i1 == i5 i1 = Period("2007-01-01 09:00:00.001") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="ms") assert i1 == expected - expected = Period("2007-01-01 09:00:00.001", freq="L") + expected = Period("2007-01-01 09:00:00.001", freq="ms") assert i1 == expected i1 = Period("2007-01-01 09:00:00.00101") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="us") assert i1 == expected - expected = Period("2007-01-01 09:00:00.00101", freq="U") + expected = Period("2007-01-01 09:00:00.00101", freq="us") assert i1 == expected def test_invalid_arguments(self): @@ -346,21 +343,21 @@ def test_constructor_infer_freq(self): assert p.freq == "H" p = Period("2007-01-01 07:10") - assert p.freq == "T" + assert p.freq == "min" p = Period("2007-01-01 07:10:15") - assert p.freq == "S" + assert p.freq == "s" p = Period("2007-01-01 07:10:15.123") - assert p.freq == "L" + assert p.freq == "ms" # We see that there are 6 digits after the decimal, so get microsecond # even though they are all zeros. p = Period("2007-01-01 07:10:15.123000") - assert p.freq == "U" + assert p.freq == "us" p = Period("2007-01-01 07:10:15.123400") - assert p.freq == "U" + assert p.freq == "us" def test_multiples(self): result1 = Period("1989", freq="2A") @@ -638,7 +635,7 @@ def test_to_timestamp(self): assert end_ts == p.to_timestamp("D", how=a) assert end_ts == p.to_timestamp("3D", how=a) - from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "S"] + from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "s"] def _ex(p): if p.freq == "B": @@ -664,10 +661,10 @@ def _ex(p): result = p.to_timestamp("3H", how="end") assert result == expected - result = p.to_timestamp("T", how="end") + result = p.to_timestamp("min", how="end") expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp("2T", how="end") + result = p.to_timestamp("2min", how="end") assert result == expected result = p.to_timestamp(how="end") @@ -677,13 +674,13 @@ def _ex(p): expected = datetime(1985, 1, 1) result = p.to_timestamp("H", how="start") assert result == expected - result = p.to_timestamp("T", how="start") + result = p.to_timestamp("min", how="start") assert result == expected - result = p.to_timestamp("S", how="start") + result = p.to_timestamp("s", how="start") assert result == expected result = p.to_timestamp("3H", how="start") assert result == expected - result = p.to_timestamp("5S", how="start") + result = p.to_timestamp("5s", how="start") assert result == expected def test_to_timestamp_business_end(self): @@ -724,16 +721,16 @@ def test_to_timestamp_microsecond(self, ts, expected, freq): ("2000-12-15", None, "2000-12-15", "D"), ( "2000-12-15 13:45:26.123456789", - "N", + "ns", "2000-12-15 13:45:26.123456789", - "N", + "ns", ), - ("2000-12-15 13:45:26.123456789", "U", "2000-12-15 13:45:26.123456", "U"), - ("2000-12-15 13:45:26.123456", None, "2000-12-15 13:45:26.123456", "U"), - ("2000-12-15 13:45:26.123456789", "L", "2000-12-15 13:45:26.123", "L"), - ("2000-12-15 13:45:26.123", None, "2000-12-15 13:45:26.123", "L"), - ("2000-12-15 13:45:26", "S", "2000-12-15 13:45:26", "S"), - ("2000-12-15 13:45:26", "T", "2000-12-15 13:45", "T"), + ("2000-12-15 13:45:26.123456789", "us", "2000-12-15 13:45:26.123456", "us"), + ("2000-12-15 13:45:26.123456", None, "2000-12-15 13:45:26.123456", "us"), + ("2000-12-15 13:45:26.123456789", 
"ms", "2000-12-15 13:45:26.123", "ms"), + ("2000-12-15 13:45:26.123", None, "2000-12-15 13:45:26.123", "ms"), + ("2000-12-15 13:45:26", "s", "2000-12-15 13:45:26", "s"), + ("2000-12-15 13:45:26", "min", "2000-12-15 13:45", "min"), ("2000-12-15 13:45:26", "H", "2000-12-15 13:00", "H"), ("2000-12-15", "Y", "2000", "A-DEC"), ("2000-12-15", "Q", "2000Q4", "Q-DEC"), @@ -757,7 +754,7 @@ def test_repr_nat(self): def test_strftime(self): # GH#3363 - p = Period("2000-1-1 12:34:12", freq="S") + p = Period("2000-1-1 12:34:12", freq="s") res = p.strftime("%Y-%m-%d %H:%M:%S") assert res == "2000-01-01 12:34:12" assert isinstance(res, str) @@ -801,7 +798,7 @@ def test_quarterly_negative_ordinals(self): def test_freq_str(self): i1 = Period("1982", freq="Min") assert i1.freq == offsets.Minute() - assert i1.freqstr == "T" + assert i1.freqstr == "min" @pytest.mark.filterwarnings( "ignore:Period with BDay freq is deprecated:FutureWarning" @@ -812,11 +809,11 @@ def test_period_deprecated_freq(self): "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], - "T": ["minute", "MINUTE", "MINUTELY", "minutely"], - "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], - "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], - "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], - "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"], + "min": ["minute", "MINUTE", "MINUTELY", "minutely"], + "s": ["sec", "SEC", "SECOND", "SECONDLY", "second"], + "ms": ["MILLISECOND", "MILLISECONDLY", "millisecond"], + "us": ["MICROSECOND", "MICROSECONDLY", "microsecond"], + "ns": ["NANOSECOND", "NANOSECONDLY", "nanosecond"], } msg = INVALID_FREQ_ERR_MSG @@ -858,13 +855,13 @@ def test_outer_bounds_start_and_end_time(self, bound, offset, period_property): def test_inner_bounds_start_and_end_time(self, bound, offset, period_property): # GH #13346 period = TestPeriodProperties._period_constructor(bound, -offset) - expected = period.to_timestamp().round(freq="S") - assert getattr(period, period_property).round(freq="S") == expected - expected = (bound - offset * Timedelta(1, unit="S")).floor("S") - assert getattr(period, period_property).floor("S") == expected + expected = period.to_timestamp().round(freq="s") + assert getattr(period, period_property).round(freq="s") == expected + expected = (bound - offset * Timedelta(1, unit="s")).floor("s") + assert getattr(period, period_property).floor("s") == expected def test_start_time(self): - freq_lst = ["A", "Q", "M", "D", "H", "T", "S"] + freq_lst = ["A", "Q", "M", "D", "H", "min", "s"] xp = datetime(2012, 1, 1) for f in freq_lst: p = Period("2012", freq=f) @@ -1592,7 +1589,7 @@ def test_small_year_parsing(): def test_negone_ordinals(): - freqs = ["A", "M", "Q", "D", "H", "T", "S"] + freqs = ["A", "M", "Q", "D", "H", "min", "s"] period = Period(ordinal=-1, freq="D") for freq in freqs: @@ -1614,3 +1611,9 @@ def test_invalid_frequency_error_message(): msg = "Invalid frequency: " with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="WOM-1MON") + + +def test_invalid_frequency_period_error_message(): + msg = "Invalid frequency: ME" + with pytest.raises(ValueError, match=msg): + Period("2012-01-02", freq="ME") diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 701cfdf157d26..b1483342eb6e4 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -511,7 
+511,6 @@ def test_nat_converters(self): "seconds", "sec", "second", - "S", "Seconds", "Sec", "Second", @@ -576,32 +575,40 @@ def test_unit_parser(self, unit, np_unit, wrapper): dtype="m8[ns]", ) # TODO(2.0): the desired output dtype may have non-nano resolution - result = to_timedelta(wrapper(range(5)), unit=unit) - tm.assert_index_equal(result, expected) - result = TimedeltaIndex(wrapper(range(5)), unit=unit) - tm.assert_index_equal(result, expected) - - str_repr = [f"{x}{unit}" for x in np.arange(5)] - result = to_timedelta(wrapper(str_repr)) - tm.assert_index_equal(result, expected) - result = to_timedelta(wrapper(str_repr)) - tm.assert_index_equal(result, expected) - - # scalar - expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) - result = to_timedelta(2, unit=unit) - assert result == expected - result = Timedelta(2, unit=unit) - assert result == expected + msg = f"'{unit}' is deprecated and will be removed in a future version." - result = to_timedelta(f"2{unit}") - assert result == expected - result = Timedelta(f"2{unit}") - assert result == expected + if (unit, np_unit) in (("u", "us"), ("U", "us"), ("n", "ns"), ("N", "ns")): + warn = FutureWarning + else: + warn = None + with tm.assert_produces_warning(warn, match=msg): + result = to_timedelta(wrapper(range(5)), unit=unit) + tm.assert_index_equal(result, expected) + result = TimedeltaIndex(wrapper(range(5)), unit=unit) + tm.assert_index_equal(result, expected) + + str_repr = [f"{x}{unit}" for x in np.arange(5)] + result = to_timedelta(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + result = to_timedelta(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + + # scalar + expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) + result = to_timedelta(2, unit=unit) + assert result == expected + result = Timedelta(2, unit=unit) + assert result == expected + + result = to_timedelta(f"2{unit}") + assert result == expected + result = Timedelta(f"2{unit}") + assert result == expected @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_raises(self, unit): msg = "Units 'M', 'Y', and 'y' are no longer supported" + with pytest.raises(ValueError, match=msg): Timedelta(10, unit) @@ -647,25 +654,25 @@ def test_to_numpy_alias(self): [ # This first case has s1, s2 being the same as t1,t2 below ( - "N", + "ns", Timedelta("1 days 02:34:56.789123456"), Timedelta("-1 days 02:34:56.789123456"), ), ( - "U", + "us", Timedelta("1 days 02:34:56.789123000"), Timedelta("-1 days 02:34:56.789123000"), ), ( - "L", + "ms", Timedelta("1 days 02:34:56.789000000"), Timedelta("-1 days 02:34:56.789000000"), ), - ("S", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), - ("2S", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), - ("5S", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), - ("T", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), - ("12T", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), + ("s", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), + ("2s", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), + ("5s", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), + ("min", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), + ("12min", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), ("d", Timedelta("1 days"), Timedelta("-1 days")), ], @@ -684,7 +691,7 @@ def test_round_invalid(self): for freq, 
msg in [ ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), ("foobar", "Invalid frequency: foobar"), ]: with pytest.raises(ValueError, match=msg): @@ -921,7 +928,6 @@ def test_timedelta_hash_equality(self): @pytest.mark.xfail( reason="pd.Timedelta violates the Python hash invariant (GH#44504).", - raises=AssertionError, ) @given( st.integers( @@ -976,21 +982,21 @@ def test_implementation_limits(self): def test_total_seconds_precision(self): # GH 19458 - assert Timedelta("30S").total_seconds() == 30.0 + assert Timedelta("30s").total_seconds() == 30.0 assert Timedelta("0").total_seconds() == 0.0 - assert Timedelta("-2S").total_seconds() == -2.0 - assert Timedelta("5.324S").total_seconds() == 5.324 - assert (Timedelta("30S").total_seconds() - 30.0) < 1e-20 - assert (30.0 - Timedelta("30S").total_seconds()) < 1e-20 + assert Timedelta("-2s").total_seconds() == -2.0 + assert Timedelta("5.324s").total_seconds() == 5.324 + assert (Timedelta("30s").total_seconds() - 30.0) < 1e-20 + assert (30.0 - Timedelta("30s").total_seconds()) < 1e-20 def test_resolution_string(self): assert Timedelta(days=1).resolution_string == "D" assert Timedelta(days=1, hours=6).resolution_string == "H" - assert Timedelta(days=1, minutes=6).resolution_string == "T" - assert Timedelta(days=1, seconds=6).resolution_string == "S" - assert Timedelta(days=1, milliseconds=6).resolution_string == "L" - assert Timedelta(days=1, microseconds=6).resolution_string == "U" - assert Timedelta(days=1, nanoseconds=6).resolution_string == "N" + assert Timedelta(days=1, minutes=6).resolution_string == "min" + assert Timedelta(days=1, seconds=6).resolution_string == "s" + assert Timedelta(days=1, milliseconds=6).resolution_string == "ms" + assert Timedelta(days=1, microseconds=6).resolution_string == "us" + assert Timedelta(days=1, nanoseconds=6).resolution_string == "ns" def test_resolution_deprecated(self): # GH#21344 @@ -1007,8 +1013,8 @@ def test_resolution_deprecated(self): @pytest.mark.parametrize( "value, expected", [ - (Timedelta("10S"), True), - (Timedelta("-10S"), True), + (Timedelta("10s"), True), + (Timedelta("-10s"), True), (Timedelta(10, unit="ns"), True), (Timedelta(0, unit="ns"), False), (Timedelta(-10, unit="ns"), True), @@ -1032,3 +1038,23 @@ def test_timedelta_attribute_precision(): result += td.nanoseconds expected = td._value assert result == expected + + +@pytest.mark.parametrize( + "unit,unit_depr", + [ + ("min", "T"), + ("s", "S"), + ("ms", "L"), + ("ns", "N"), + ("us", "U"), + ], +) +def test_units_t_l_u_n_deprecated(unit, unit_depr): + # GH 52536 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." 
+ + expected = Timedelta(1, unit=unit) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Timedelta(1, unit=unit_depr) + tm.assert_equal(result, expected) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 0a43db87674af..e501bd93bc1c6 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -46,7 +46,7 @@ def test_round_division_by_zero_raises(self): ("20130104 12:00:00", "D", "20130105"), ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), ("2000-01-05 05:09:15.13", "H", "2000-01-05 05:00:00"), - ("2000-01-05 05:09:15.13", "S", "2000-01-05 05:09:15"), + ("2000-01-05 05:09:15.13", "s", "2000-01-05 05:09:15"), ], ) def test_round_frequencies(self, timestamp, freq, expected): @@ -137,10 +137,10 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): "test_input, freq, expected", [ ("2018-01-01 00:02:06", "2s", "2018-01-01 00:02:06"), - ("2018-01-01 00:02:00", "2T", "2018-01-01 00:02:00"), - ("2018-01-01 00:04:00", "4T", "2018-01-01 00:04:00"), - ("2018-01-01 00:15:00", "15T", "2018-01-01 00:15:00"), - ("2018-01-01 00:20:00", "20T", "2018-01-01 00:20:00"), + ("2018-01-01 00:02:00", "2min", "2018-01-01 00:02:00"), + ("2018-01-01 00:04:00", "4min", "2018-01-01 00:04:00"), + ("2018-01-01 00:15:00", "15min", "2018-01-01 00:15:00"), + ("2018-01-01 00:20:00", "20min", "2018-01-01 00:20:00"), ("2018-01-01 03:00:00", "3H", "2018-01-01 03:00:00"), ], ) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index dd810a31c25af..fba16749c026e 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -253,7 +253,7 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # tzaware - ser = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + ser = Series(date_range("2015-01-01", "2016-01-01", freq="min"), name="xxx") ser = ser.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(ser) tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) @@ -270,11 +270,11 @@ def test_dt_accessor_limited_display_api(self): def test_dt_accessor_ambiguous_freq_conversions(self): # GH#11295 # ambiguous time error on the conversions - ser = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + ser = Series(date_range("2015-01-01", "2016-01-01", freq="min"), name="xxx") ser = ser.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") exp_values = date_range( - "2015-01-01", "2016-01-01", freq="T", tz="UTC" + "2015-01-01", "2016-01-01", freq="min", tz="UTC" ).tz_convert("America/Chicago") # freq not preserved by tz_localize above exp_values = exp_values._with_freq(None) @@ -385,7 +385,7 @@ def test_dt_round_tz_nonexistent(self, method, ts_str, freq): with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): getattr(ser.dt, method)(freq, nonexistent="raise") - @pytest.mark.parametrize("freq", ["ns", "U", "1000U"]) + @pytest.mark.parametrize("freq", ["ns", "us", "1000us"]) def test_dt_round_nonnano_higher_resolution_no_op(self, freq): # GH 52761 ser = Series( @@ -499,7 +499,7 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.day_name(locale=time_locale).iloc[-1]) - ser = Series(date_range(freq="M", 
start="2012", end="2013")) + ser = Series(date_range(freq="ME", start="2012", end="2013")) result = ser.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) @@ -611,7 +611,7 @@ def test_strftime_period_hours(self): tm.assert_series_equal(result, expected) def test_strftime_period_minutes(self): - ser = Series(period_range("20130101", periods=4, freq="L")) + ser = Series(period_range("20130101", periods=4, freq="ms")) result = ser.dt.strftime("%Y/%m/%d %H:%M:%S.%l") expected = Series( [ @@ -784,8 +784,8 @@ class TestSeriesPeriodValuesDtAccessor: Period("2016-01-01 00:01:00", freq="M"), ], [ - Period("2016-01-01 00:00:00", freq="S"), - Period("2016-01-01 00:00:01", freq="S"), + Period("2016-01-01 00:00:00", freq="s"), + Period("2016-01-01 00:00:01", freq="s"), ], ], ) diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py new file mode 100644 index 0000000000000..c645bb6807052 --- /dev/null +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -0,0 +1,147 @@ +import re + +import pytest + +from pandas import ( + ArrowDtype, + DataFrame, + Index, + Series, +) +import pandas._testing as tm +from pandas.core.arrays.arrow.accessors import StructAccessor + +pa = pytest.importorskip("pyarrow") + + +def test_struct_accessor_dtypes(): + ser = Series( + [], + dtype=ArrowDtype( + pa.struct( + [ + ("int_col", pa.int64()), + ("string_col", pa.string()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ] + ), + ), + ] + ) + ), + ) + actual = ser.struct.dtypes + expected = Series( + [ + ArrowDtype(pa.int64()), + ArrowDtype(pa.string()), + ArrowDtype( + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ] + ) + ), + ], + index=Index(["int_col", "string_col", "struct_col"]), + ) + tm.assert_series_equal(actual, expected) + + +def test_struct_accessor_field(): + index = Index([-100, 42, 123]) + ser = Series( + [ + {"rice": 1.0, "maize": -1, "wheat": "a"}, + {"rice": 2.0, "maize": 0, "wheat": "b"}, + {"rice": 3.0, "maize": 1, "wheat": "c"}, + ], + dtype=ArrowDtype( + pa.struct( + [ + ("rice", pa.float64()), + ("maize", pa.int64()), + ("wheat", pa.string()), + ] + ) + ), + index=index, + ) + by_name = ser.struct.field("maize") + by_name_expected = Series( + [-1, 0, 1], + dtype=ArrowDtype(pa.int64()), + index=index, + name="maize", + ) + tm.assert_series_equal(by_name, by_name_expected) + + by_index = ser.struct.field(2) + by_index_expected = Series( + ["a", "b", "c"], + dtype=ArrowDtype(pa.string()), + index=index, + name="wheat", + ) + tm.assert_series_equal(by_index, by_index_expected) + + +def test_struct_accessor_field_with_invalid_name_or_index(): + ser = Series([], dtype=ArrowDtype(pa.struct([("field", pa.int64())]))) + + with pytest.raises(ValueError, match="name_or_index must be an int or str"): + ser.struct.field(1.1) + + +def test_struct_accessor_explode(): + index = Index([-100, 42, 123]) + ser = Series( + [ + {"painted": 1, "snapping": {"sea": "green"}}, + {"painted": 2, "snapping": {"sea": "leatherback"}}, + {"painted": 3, "snapping": {"sea": "hawksbill"}}, + ], + dtype=ArrowDtype( + pa.struct( + [ + ("painted", pa.int64()), + ("snapping", pa.struct([("sea", pa.string())])), + ] + ) + ), + index=index, + ) + actual = ser.struct.explode() + expected = DataFrame( + { + "painted": Series([1, 2, 3], index=index, dtype=ArrowDtype(pa.int64())), + "snapping": Series( + [{"sea": "green"}, {"sea": 
"leatherback"}, {"sea": "hawksbill"}], + index=index, + dtype=ArrowDtype(pa.struct([("sea", pa.string())])), + ), + }, + ) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize( + "invalid", + [ + pytest.param(Series([1, 2, 3], dtype="int64"), id="int64"), + pytest.param( + Series(["a", "b", "c"], dtype="string[pyarrow]"), id="string-pyarrow" + ), + ], +) +def test_struct_accessor_api_for_invalid(invalid): + msg = re.escape(StructAccessor._validation_msg.format(dtype=invalid.dtype)) + + with pytest.raises(AttributeError, match=msg): + invalid.struct diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 8daaf087085c6..317967bcbb7ff 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -355,7 +355,7 @@ def test_indexing_over_size_cutoff_period_index(monkeypatch): monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 1000) n = 1100 - idx = period_range("1/1/2000", freq="T", periods=n) + idx = period_range("1/1/2000", freq="min", periods=n) assert idx._engine.over_size_threshold s = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) @@ -411,7 +411,7 @@ def compare(slobj): def test_indexing_unordered2(): # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, freq="M") + rng = date_range(datetime(2005, 1, 1), periods=20, freq="ME") ts = Series(np.arange(len(rng)), index=rng) ts = ts.take(np.random.default_rng(2).permutation(20)) @@ -421,7 +421,7 @@ def test_indexing_unordered2(): def test_indexing(): - idx = date_range("2001-1-1", periods=20, freq="M") + idx = date_range("2001-1-1", periods=20, freq="ME") ts = Series(np.random.default_rng(2).random(len(idx)), index=idx) # getting @@ -455,7 +455,7 @@ def test_getitem_str_month_with_datetimeindex(): expected = ts["2013-05"] tm.assert_series_equal(expected, ts) - idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:59", freq="S") + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:59", freq="s") ts = Series(range(len(idx)), index=idx) expected = ts["2013-05"] tm.assert_series_equal(expected, ts) @@ -486,3 +486,12 @@ def test_getitem_str_second_with_datetimeindex(): msg = r"Timestamp\('2012-01-02 18:01:02-0600', tz='US/Central'\)" with pytest.raises(KeyError, match=msg): df[df.index[2]] + + +def test_compare_datetime_with_all_none(): + # GH#54870 + ser = Series(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]") + ser2 = Series([None, None]) + result = ser > ser2 + expected = Series([False, False]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 7b857a487db78..0fa28920d41bd 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -19,6 +19,7 @@ Timestamp, concat, date_range, + isna, period_range, timedelta_range, ) @@ -456,25 +457,25 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key): class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. 
- def _check_setitem_invalid(self, ser, invalid, indexer): + def _check_setitem_invalid(self, ser, invalid, indexer, warn): msg = "Setting an item of incompatible dtype is deprecated" msg = re.escape(msg) orig_ser = ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser.iloc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser.loc[indexer] = invalid ser = orig_ser.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning(warn, match=msg): ser[:] = invalid _invalid_scalars = [ @@ -494,16 +495,20 @@ def _check_setitem_invalid(self, ser, invalid, indexer): @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_bool(self, invalid, indexer): ser = Series([True, False, False], dtype="bool") - self._check_setitem_invalid(ser, invalid, indexer) + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer) + if isna(invalid) and invalid is not NaT: + warn = None + else: + warn = FutureWarning + self._check_setitem_invalid(ser, invalid, indexer, warn) @pytest.mark.parametrize("invalid", _invalid_scalars + [True]) @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer): ser = Series([1, 2, None], dtype=float_numpy_dtype) - self._check_setitem_invalid(ser, invalid, indexer) + self._check_setitem_invalid(ser, invalid, indexer, FutureWarning) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 1e091db21ff83..f419ff9384042 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -191,14 +191,11 @@ def test_setitem_series_object_dtype(self, indexer, ser_index): expected = Series([Series([42], index=[ser_index]), 0], dtype="object") tm.assert_series_equal(ser, expected) - @pytest.mark.parametrize( - "index, exp_value, warn", [(0, 42, None), (1, np.nan, FutureWarning)] - ) - def test_setitem_series(self, index, exp_value, warn): + @pytest.mark.parametrize("index, exp_value", [(0, 42), (1, np.nan)]) + def test_setitem_series(self, index, exp_value): # GH#38303 ser = Series([0, 0]) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): - ser.loc[0] = Series([42], index=[index]) + ser.loc[0] = Series([42], index=[index]) expected = Series([exp_value, 0]) tm.assert_series_equal(ser, expected) @@ -575,7 +572,7 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): [ (NA, NA, "Int64", "Int64", 1, None), (NA, NA, "Int64", "Int64", 2, None), - (NA, np.nan, "int64", "float64", 1, FutureWarning), + (NA, np.nan, "int64", "float64", 1, None), (NA, np.nan, "int64", "float64", 2, None), (NaT, NaT, "int64", "object", 1, FutureWarning), (NaT, NaT, "int64", "object", 2, None), @@ -583,7 +580,7 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): (np.nan, NA, "Int64", "Int64", 
2, None), (np.nan, NA, "Float64", "Float64", 1, None), (np.nan, NA, "Float64", "Float64", 2, None), - (np.nan, np.nan, "int64", "float64", 1, FutureWarning), + (np.nan, np.nan, "int64", "float64", 1, None), (np.nan, np.nan, "int64", "float64", 2, None), ], ) @@ -592,7 +589,7 @@ def test_setitem_enlarge_with_na( ): # GH#32346 ser = Series([1, 2], dtype=dtype) - with tm.assert_produces_warning(warn, match="item of incompatible dtype"): + with tm.assert_produces_warning(warn, match="incompatible dtype"): ser[indexer] = na expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] expected = Series(expected_values, dtype=target_dtype) @@ -884,7 +881,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series([2, 3, 4, 5, 6, 7, 8, 9, 10]), Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]), slice(None, None, 2), - FutureWarning, + None, id="int_series_slice_key_step", ), pytest.param( @@ -899,7 +896,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series(np.arange(10)), Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]), slice(None, 5), - FutureWarning, + None, id="int_series_slice_key", ), pytest.param( @@ -907,7 +904,7 @@ def test_index_putmask(self, obj, key, expected, warn, val): Series([1, 2, 3]), Series([np.nan, 2, 3]), 0, - FutureWarning, + None, id="int_series_int_key", ), pytest.param( @@ -1134,7 +1131,7 @@ def warn(self): "obj,expected,warn", [ # For numeric series, we should coerce to NaN. - (Series([1, 2, 3]), Series([np.nan, 2, 3]), FutureWarning), + (Series([1, 2, 3]), Series([np.nan, 2, 3]), None), (Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None), # For datetime series, we should coerce to NaT. ( @@ -1584,13 +1581,11 @@ def test_20643_comment(): expected = Series([np.nan, 1, 2], index=["a", "b", "c"]) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - ser.iat[0] = None + ser.iat[0] = None tm.assert_series_equal(ser, expected) ser = orig.copy() - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - ser.iloc[0] = None + ser.iloc[0] = None tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 4e002420dadfc..7c1507ce423ad 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -393,16 +393,21 @@ def test_where_datetimelike_coerce(dtype): expected = Series([10, 10]) mask = np.array([False, False]) - rs = ser.where(mask, [10, 10]) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, [10, 10]) tm.assert_series_equal(rs, expected) - rs = ser.where(mask, 10) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, 10) tm.assert_series_equal(rs, expected) - rs = ser.where(mask, 10.0) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, 10.0) tm.assert_series_equal(rs, expected) - rs = ser.where(mask, [10.0, 10.0]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, [10.0, 10.0]) tm.assert_series_equal(rs, expected) rs = ser.where(mask, [10.0, np.nan]) diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index da144261ea5b4..130b77db0a1fb 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -69,8 +69,15 @@ def 
test_clip_with_na_args(self): tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) # GH#19992 - tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3])) - tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1])) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + # TODO: avoid this warning here? seems like we should never be upcasting + # in the first place? + with tm.assert_produces_warning(FutureWarning, match=msg): + res = s.clip(lower=[0, 4, np.nan]) + tm.assert_series_equal(res, Series([1, 4, 3])) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = s.clip(upper=[1, np.nan, 1]) + tm.assert_series_equal(res, Series([1, 2, 1])) # GH#40420 s = Series([1, 2, 3]) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index d2d8eab1cb38b..aabed794ac557 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -16,7 +16,7 @@ class TestCombineFirst: def test_combine_first_period_datetime(self): # GH#3367 - didx = date_range(start="1950-01-31", end="1950-07-31", freq="M") + didx = date_range(start="1950-01-31", end="1950-07-31", freq="ME") pidx = period_range(start=Period("1950-1"), end=Period("1950-7"), freq="M") # check to be consistent with DatetimeIndex for idx in [didx, pidx]: diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 9cd0ce250b5df..d1c79d0f00365 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -206,17 +206,7 @@ def test_convert_dtypes( # Test that it is a copy copy = series.copy(deep=True) - if result.notna().sum() > 0 and result.dtype in [ - "int8", - "uint8", - "int16", - "uint16", - "int32", - "uint32", - "int64", - "uint64", - "interval[int64, right]", - ]: + if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]: with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): result[result.notna()] = np.nan else: diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 96c2e1ba6d9bb..10b2e98586365 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( Categorical, Series, @@ -249,3 +250,18 @@ def test_drop_duplicates_ignore_index(self): result = ser.drop_duplicates(ignore_index=True) expected = Series([1, 2, 3]) tm.assert_series_equal(result, expected) + + def test_duplicated_arrow_dtype(self): + pytest.importorskip("pyarrow") + ser = Series([True, False, None, False], dtype="bool[pyarrow]") + result = ser.drop_duplicates() + expected = Series([True, False, None], dtype="bool[pyarrow]") + tm.assert_series_equal(result, expected) + + def test_drop_duplicates_arrow_strings(self): + # GH#54904 + pa = pytest.importorskip("pyarrow") + ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string())) + result = ser.drop_duplicates() + expected = Series(["a"], dtype=pd.ArrowDtype(pa.string())) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index c8a9eb6f89fde..5a0188585ef30 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py
@@ -163,3 +163,13 @@ def test_explode_pyarrow_list_type(ignore_index): dtype=pd.ArrowDtype(pa.int64()), ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ignore_index", [True, False]) +def test_explode_pyarrow_non_list_type(ignore_index): + pa = pytest.importorskip("pyarrow") + data = [1, 2, 3] + ser = pd.Series(data, dtype=pd.ArrowDtype(pa.int64())) + result = ser.explode(ignore_index=ignore_index) + expected = pd.Series([1, 2, 3], dtype="int64[pyarrow]", index=[0, 1, 2]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 619690f400d98..549f429f09d35 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -858,3 +858,11 @@ def test_interpolate_asfreq_raises(self): with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=msg2): ser.interpolate(method="asfreq") + + def test_interpolate_fill_value(self): + # GH#54920 + pytest.importorskip("scipy") + ser = Series([np.nan, 0, 1, np.nan, 3, np.nan]) + result = ser.interpolate(method="nearest", fill_value=0) + expected = Series([np.nan, 0, 1, 1, 3, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 4dabf7b87e2cd..6740b8756853e 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -107,3 +107,11 @@ def test_pct_change_with_duplicated_indices(fill_method): expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) tm.assert_series_equal(result, expected) + + +def test_pct_change_no_warning_na_beginning(): + # GH#54981 + ser = Series([None, None, 1, 2, 3]) + result = ser.pct_change() + expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 2ab1cd13a31d8..f3075c116883a 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -25,12 +25,7 @@ def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) - # __array_interface__ is not defined for older numpies - # and on some pythons - try: - assert np.may_share_memory(string_series.index, identity.index) - except AttributeError: - pass + assert np.may_share_memory(string_series.index, identity.index) assert identity.index.is_(string_series.index) assert identity.index.identical(string_series.index) @@ -157,16 +152,20 @@ def test_reindex_inference(): # inference of new dtype s = Series([True, False, False, True], index=list("abcd")) new_index = "agc" - result = s.reindex(list(new_index)).ffill() + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.reindex(list(new_index)).ffill() expected = Series([True, True, False], index=list(new_index)) tm.assert_series_equal(result, expected) def test_reindex_downcasting(): # GH4618 shifted series downcasting - s = Series(False, index=range(0, 5)) - result = s.shift(1).bfill() - expected = Series(False, index=range(0, 5)) + s = Series(False, index=range(5)) + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.shift(1).bfill() + expected = Series(False, index=range(5)) 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index ec9db8c3830d6..f08966c3816c0 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -76,7 +76,9 @@ def test_replace(self): ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -84,7 +86,8 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -92,11 +95,13 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() @@ -294,7 +299,9 @@ def test_replace2(self): ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -302,7 +309,8 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -310,11 +318,13 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -373,7 +383,9 @@ def test_replace_unicode_with_number(self): def test_replace_mixed_types_with_string(self): # Testing mixed s = pd.Series([1, 2, 3, "4", 4, 5]) - result = s.replace([2, "4"], np.nan) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.replace([2, "4"], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) @@ -387,7 +399,9 @@ def test_replace_mixed_types_with_string(self): def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(categorical) - result = 
ser.replace({"A": 1, "B": 2}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present @@ -710,7 +724,9 @@ def test_replace_regex_dtype_series(self, regex): # GH-48644 series = pd.Series(["0"]) expected = pd.Series([1]) - result = series.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index be63d9500ce73..a39b3ff7e6f2b 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -203,6 +203,7 @@ def test_series_datetimelike_attribute_access_invalid(self): with pytest.raises(AttributeError, match=msg): ser.weekday + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "kernel, has_numeric_only", [ diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 80fd2fd7c0a06..55fc77fb5705f 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -639,10 +639,12 @@ def test_comparison_operators_with_nas(self, comparison_op): result = comparison_op(ser, val) expected = comparison_op(ser.dropna(), val).reindex(ser.index) - if comparison_op is operator.ne: - expected = expected.fillna(True).astype(bool) - else: - expected = expected.fillna(False).astype(bool) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + if comparison_op is operator.ne: + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) tm.assert_series_equal(result, expected) @@ -777,7 +779,7 @@ class TestNamePreservation: @pytest.mark.parametrize("box", [list, tuple, np.array, Index, Series, pd.array]) @pytest.mark.parametrize("flex", [True, False]) def test_series_ops_name_retention(self, flex, box, names, all_binary_operators): - # GH#33930 consistent name renteiton + # GH#33930 consistent name-retention op = all_binary_operators left = Series(range(10), name=names[0]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 611f4a7f790a6..d45c655a4c0a2 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1022,7 +1022,7 @@ def test_constructor_dtype_datetime64_8(self): def test_constructor_dtype_datetime64_7(self): # GH6529 # coerce datetime64 non-ns properly - dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M") + dates = date_range("01-Jan-2015", "01-Dec-2015", freq="ME") values2 = dates.view(np.ndarray).astype("datetime64[ns]") expected = Series(values2, index=dates) @@ -2077,8 +2077,8 @@ def test_series_from_index_dtype_equal_does_not_copy(self): def test_series_string_inference(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", "b"], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", "b"]) @@ -2092,8 +2092,8 @@ def 
test_series_string_inference(self): @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) def test_series_string_with_na_inference(self, na_value): # GH#54430 - pa = pytest.importorskip("pyarrow") - dtype = pd.ArrowDtype(pa.string()) + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" expected = Series(["a", na_value], dtype=dtype) with pd.option_context("future.infer_string", True): ser = Series(["a", na_value]) @@ -2101,12 +2101,28 @@ def test_series_string_with_na_inference(self, na_value): def test_series_string_inference_scalar(self): # GH#54430 - pa = pytest.importorskip("pyarrow") - expected = Series("a", index=[1], dtype=pd.ArrowDtype(pa.string())) + pytest.importorskip("pyarrow") + expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") with pd.option_context("future.infer_string", True): ser = Series("a", index=[1]) tm.assert_series_equal(ser, expected) + def test_series_string_inference_array_string_dtype(self): + # GH#54496 + pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + ser = Series(np.array(["a", "b"])) + tm.assert_series_equal(ser, expected) + + def test_series_string_inference_storage_definition(self): + # GH#54793 + pytest.importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="string") + tm.assert_series_equal(result, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 26046ef9ba295..2146e154dc7fa 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -15,6 +15,7 @@ class TestSeriesLogicalOps: + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) def test_bool_operators_with_nas(self, bool_op): # boolean &, |, ^ should work with object arrays and propagate NAs diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 9a7622b4f1cd8..01b49b5e5b633 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -1,2 +1,15 @@ -# Needed for new arrow string dtype -object_pyarrow_numpy = ("object",) +import numpy as np + +import pandas as pd + +object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + + +def _convert_na_value(ser, expected): + if ser.dtype != object: + if ser.dtype.storage == "pyarrow_numpy": + expected = expected.fillna(np.nan) + else: + # GH#18463 + expected = expected.fillna(pd.NA) + return expected diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index ced941187f548..1dee25e631648 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -278,6 +278,11 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): + if any_string_dtype == "string[pyarrow_numpy]": + pytest.skip( + "Arrow logic is different, " + "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", + ) s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 
a6303610b2037..3e620b7664335 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -106,7 +106,7 @@ def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep): # Series/Index with Series having different Index t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "aa", "bb", "bb"]) + expected = Index(["aa", "aa", "bb", "bb", "aa"]) expected = expected if box == Index else Series(expected, index=expected.str[:1]) result = s.str.cat(t, sep=sep) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index bcb8db96b37fa..78f0730d730e8 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -11,7 +11,10 @@ Series, _testing as tm, ) -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) # -------------------------------------------------------------------------------------- # str.contains @@ -19,7 +22,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]",) + return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") def test_contains(any_string_dtype): @@ -220,6 +223,8 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) @@ -758,9 +763,7 @@ def test_findall(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype) result = ser.str.findall("BAD[_]*") expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) - if ser.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(ser, expected) tm.assert_series_equal(result, expected) @@ -806,7 +809,7 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 0298694ccaf71..0a7d409773dd6 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -12,6 +12,10 @@ Series, _testing as tm, ) +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) @pytest.mark.parametrize("method", ["split", "rsplit"]) @@ -20,9 +24,7 @@ def test_split(any_string_dtype, method): result = getattr(values.str, method)("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -32,9 +34,7 @@ def test_split_more_than_one_char(any_string_dtype, method): values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = getattr(values.str, method)("__") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) 
tm.assert_series_equal(result, exp) result = getattr(values.str, method)("__", expand=False) @@ -46,9 +46,7 @@ def test_split_more_regex_split(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.split("[,_]") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -118,8 +116,8 @@ def test_split_object_mixed(expand, method): def test_split_n(any_string_dtype, method, n): s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) - result = getattr(s.str, method)(" ", n=n) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -128,9 +126,7 @@ def test_rsplit(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -139,9 +135,7 @@ def test_rsplit_max_number(any_string_dtype): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_", n=1) exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -390,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) @@ -455,9 +449,7 @@ def test_partition_series_more_than_one_char(method, exp, any_string_dtype): s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype) result = getattr(s.str, method)("__", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -480,9 +472,7 @@ def test_partition_series_none(any_string_dtype, method, exp): s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype) result = getattr(s.str, method)(expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -505,9 +495,7 @@ def test_partition_series_not_split(any_string_dtype, method, exp): s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype) result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -531,9 +519,7 @@ def test_partition_series_unicode(any_string_dtype, method, exp): result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 
856c31b9ccb06..661290fb00d13 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -17,7 +17,7 @@ is_integer_dtype, is_object_dtype, ) -from pandas.core.dtypes.dtypes import CategoricalDtype as CDT +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -1182,7 +1182,7 @@ def test_value_counts(self): with tm.assert_produces_warning(FutureWarning, match=msg): result = algos.value_counts(factor) breaks = [-1.606, -1.018, -0.431, 0.155, 0.741] - index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) + index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True)) expected = Series([1, 0, 2, 1], index=index, name="count") tm.assert_series_equal(result.sort_index(), expected.sort_index()) @@ -1412,6 +1412,19 @@ def test_value_counts_uint64(self): tm.assert_series_equal(result, expected) + def test_value_counts_series(self): + # GH#54857 + values = np.array([3, 1, 2, 3, 4, np.nan]) + result = Series(values).value_counts(bins=3) + expected = Series( + [2, 2, 1], + index=IntervalIndex.from_tuples( + [(0.996, 2.0), (2.0, 3.0), (3.0, 4.0)], dtype="interval[float64, right]" + ), + name="count", + ) + tm.assert_series_equal(result, expected) + class TestDuplicated: def test_duplicated_with_nas(self): diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index e961fdc295c96..417bf0e90201b 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -8,10 +8,12 @@ ) from pandas._libs.tslibs.dtypes import _attrname_to_abbrevs +import pandas._testing as tm + @pytest.mark.parametrize( "freqstr,exp_freqstr", - [("D", "D"), ("W", "D"), ("M", "D"), ("S", "S"), ("T", "S"), ("H", "S")], + [("D", "D"), ("W", "D"), ("ME", "D"), ("s", "s"), ("min", "s"), ("H", "s")], ) def test_get_to_timestamp_base(freqstr, exp_freqstr): off = to_offset(freqstr) @@ -30,18 +32,18 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): ("M", "month"), ("D", "day"), ("H", "hour"), - ("T", "minute"), - ("S", "second"), - ("L", "millisecond"), - ("U", "microsecond"), - ("N", "nanosecond"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), + ("ns", "nanosecond"), ], ) def test_get_attrname_from_abbrev(freqstr, expected): assert Resolution.get_reso_from_freqstr(freqstr).attrname == expected -@pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize("freq", ["D", "H", "min", "s", "ms", "us", "ns"]) def test_get_freq_roundtrip2(freq): obj = Resolution.get_reso_from_freqstr(freq) result = _attrname_to_abbrevs[obj.attrname] @@ -51,12 +53,12 @@ def test_get_freq_roundtrip2(freq): @pytest.mark.parametrize( "args,expected", [ - ((1.5, "T"), (90, "S")), - ((62.4, "T"), (3744, "S")), - ((1.04, "H"), (3744, "S")), + ((1.5, "min"), (90, "s")), + ((62.4, "min"), (3744, "s")), + ((1.04, "H"), (3744, "s")), ((1, "D"), (1, "D")), - ((0.342931, "H"), (1234551600, "U")), - ((1.2345, "D"), (106660800, "L")), + ((0.342931, "H"), (1234551600, "us")), + ((1.2345, "D"), (106660800, "ms")), ], ) def test_resolution_bumping(args, expected): @@ -69,7 +71,7 @@ def test_resolution_bumping(args, expected): @pytest.mark.parametrize( "args", [ - (0.5, "N"), + (0.5, "ns"), # Too much precision in the input can prevent. 
(0.3429324798798269273987982, "H"), ], @@ -87,7 +89,7 @@ def test_cat(args): ("1H", "2021-01-01T09:00:00"), ("1D", "2021-01-02T08:00:00"), ("1W", "2021-01-03T08:00:00"), - ("1M", "2021-01-31T08:00:00"), + ("1ME", "2021-01-31T08:00:00"), ("1Y", "2021-12-31T08:00:00"), ], ) @@ -95,3 +97,12 @@ def test_compatibility(freqstr, expected): ts_np = np.datetime64("2021-01-01T08:00:00.00") do = to_offset(freqstr) assert ts_np + do == np.datetime64(expected) + + +@pytest.mark.parametrize("freq", ["T", "S", "L", "N", "U"]) +def test_units_t_l_deprecated_from__attrname_to_abbrevs(freq): + # GH 52536 + msg = f"'{freq}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index d811b2cf12c19..82cceeac2cd25 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -39,11 +39,11 @@ params=[ (timedelta(1), "D"), (timedelta(hours=1), "H"), - (timedelta(minutes=1), "T"), - (timedelta(seconds=1), "S"), - (np.timedelta64(1, "ns"), "N"), - (timedelta(microseconds=1), "U"), - (timedelta(microseconds=1000), "L"), + (timedelta(minutes=1), "min"), + (timedelta(seconds=1), "s"), + (np.timedelta64(1, "ns"), "ns"), + (timedelta(microseconds=1), "us"), + (timedelta(microseconds=1000), "ms"), ] ) def base_delta_code_pair(request): @@ -53,7 +53,7 @@ def base_delta_code_pair(request): freqs = ( [f"Q-{month}" for month in MONTHS] + [f"{annual}-{month}" for annual in ["A", "BA"] for month in MONTHS] - + ["M", "BM", "BMS"] + + ["ME", "BM", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] ) @@ -162,7 +162,7 @@ def test_fifth_week_of_month(): def test_monthly_ambiguous(): rng = DatetimeIndex(["1/31/2000", "2/29/2000", "3/31/2000"]) - assert rng.inferred_freq == "M" + assert rng.inferred_freq == "ME" def test_annual_ambiguous(): @@ -217,7 +217,7 @@ def test_infer_freq_index(freq, expected): { "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], - "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], + "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], "H": [ @@ -254,7 +254,8 @@ def test_infer_freq_tz_series(tz_naive_fixture): ], ) @pytest.mark.parametrize( - "freq", ["H", "3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N"] + "freq", + ["H", "3H", "10min", "3601s", "3600001ms", "3600000001us", "3600000000001ns"], ) def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): # see gh-8772 @@ -437,7 +438,7 @@ def test_series_inconvertible_string(): frequencies.infer_freq(Series(["foo", "bar"])) -@pytest.mark.parametrize("freq", [None, "L"]) +@pytest.mark.parametrize("freq", [None, "ms"]) def test_series_period_index(freq): # see gh-6407 # @@ -449,7 +450,7 @@ def test_series_period_index(freq): frequencies.infer_freq(s) -@pytest.mark.parametrize("freq", ["M", "L", "S"]) +@pytest.mark.parametrize("freq", ["ME", "ms", "s"]) def test_series_datetime_index(freq): s = Series(date_range("20130101", periods=10, freq=freq)) inferred = frequencies.infer_freq(s) @@ -530,12 +531,12 @@ def test_infer_freq_non_nano(): arr = 
np.arange(10).astype(np.int64).view("M8[s]") dta = DatetimeArray._simple_new(arr, dtype=arr.dtype) res = frequencies.infer_freq(dta) - assert res == "S" + assert res == "s" arr2 = arr.view("m8[ms]") tda = TimedeltaArray._simple_new(arr2, dtype=arr2.dtype) res2 = frequencies.infer_freq(tda) - assert res2 == "L" + assert res2 == "ms" def test_infer_freq_non_nano_tzaware(tz_aware_fixture): diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index ee83ca144d38a..e751339ca2cd0 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -3,7 +3,10 @@ import pytest from pytz import utc -from pandas import DatetimeIndex +from pandas import ( + DatetimeIndex, + Series, +) import pandas._testing as tm from pandas.tseries.holiday import ( @@ -17,6 +20,7 @@ HolidayCalendarFactory, Timestamp, USColumbusDay, + USFederalHolidayCalendar, USLaborDay, USMartinLutherKingJr, USMemorialDay, @@ -311,3 +315,17 @@ class TestHolidayCalendar(AbstractHolidayCalendar): tm.assert_index_equal(date_interval_low, expected_results) tm.assert_index_equal(date_window_edge, expected_results) tm.assert_index_equal(date_interval_high, expected_results) + + +def test_holidays_with_timezone_specified_but_no_occurences(): + # GH 54580 + # _apply_rule() in holiday.py was silently dropping timezones if you passed it + # an empty list of holiday dates that had timezone information + start_date = Timestamp("2018-01-01", tz="America/Chicago") + end_date = Timestamp("2018-01-11", tz="America/Chicago") + test_case = USFederalHolidayCalendar().holidays( + start_date, end_date, return_name=True + ) + expected_results = Series("New Year's Day", index=[start_date]) + + tm.assert_equal(test_case, expected_results) diff --git a/pandas/tests/tseries/offsets/test_business_month.py b/pandas/tests/tseries/offsets/test_business_month.py index 9f7fb990d238a..a14451e60aa89 100644 --- a/pandas/tests/tseries/offsets/test_business_month.py +++ b/pandas/tests/tseries/offsets/test_business_month.py @@ -31,7 +31,7 @@ ) def test_apply_index(cls, n): offset = cls(n=n) - rng = pd.date_range(start="1/1/2000", periods=100000, freq="T") + rng = pd.date_range(start="1/1/2000", periods=100000, freq="min") ser = pd.Series(rng) res = rng + offset diff --git a/pandas/tests/tseries/offsets/test_index.py b/pandas/tests/tseries/offsets/test_index.py index ad3478b319898..7a62944556d11 100644 --- a/pandas/tests/tseries/offsets/test_index.py +++ b/pandas/tests/tseries/offsets/test_index.py @@ -44,7 +44,7 @@ ) def test_apply_index(cls, n): offset = cls(n=n) - rng = date_range(start="1/1/2000", periods=100000, freq="T") + rng = date_range(start="1/1/2000", periods=100000, freq="min") ser = Series(rng) res = rng + offset diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 5139331bebaf7..a7e9854c38f18 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -811,7 +811,7 @@ def test_alias_equality(self): assert k == v.copy() def test_rule_code(self): - lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "T", "S", "L", "U"] + lst = ["ME", "MS", "BM", "BMS", "D", "B", "H", "min", "s", "ms", "us"] for k in lst: assert k == _get_offset(k).rule_code # should be cached - this is kind of an internals test... 
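For context on the alias migration that runs through the hunks above and below: the single-letter frequency/unit aliases "T", "S", "L", "U", "N" (and month-end "M") are replaced by "min", "s", "ms", "us", "ns" (and "ME"). The following is a minimal usage sketch of the new spellings, illustrative only and not part of the patch, assuming a pandas build that carries this deprecation:

import pandas as pd

# Preferred spellings exercised by the updated tests
pd.Timedelta("30s")                                 # previously "30S"
pd.Timedelta(1, unit="min")                         # previously unit="T"
pd.date_range("2021-01-01", periods=3, freq="min")  # previously freq="T"
pd.date_range("2021-01-31", periods=3, freq="ME")   # previously freq="M" (month end)

# On such a build, the old aliases (e.g. unit="T", freq="M") are expected to
# emit a FutureWarning rather than fail outright.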
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 2c8a6827a3bf1..ec3579109e7a4 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -168,7 +168,7 @@ def test_parsers_quarter_invalid(date_str): [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))], ) def test_parsers_month_freq(date_str, expected): - result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="M") + result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="ME") assert result == expected diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index 7c9047b3e7c60..99f0a82d6711e 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -15,7 +15,7 @@ def get_freq_code(freqstr: str) -> int: - off = to_offset(freqstr) + off = to_offset(freqstr, is_period=True) # error: "BaseOffset" has no attribute "_period_dtype_code" code = off._period_dtype_code # type: ignore[attr-defined] return code @@ -25,26 +25,26 @@ def get_freq_code(freqstr: str) -> int: "freq1,freq2,expected", [ ("D", "H", 24), - ("D", "T", 1440), - ("D", "S", 86400), - ("D", "L", 86400000), - ("D", "U", 86400000000), - ("D", "N", 86400000000000), - ("H", "T", 60), - ("H", "S", 3600), - ("H", "L", 3600000), - ("H", "U", 3600000000), - ("H", "N", 3600000000000), - ("T", "S", 60), - ("T", "L", 60000), - ("T", "U", 60000000), - ("T", "N", 60000000000), - ("S", "L", 1000), - ("S", "U", 1000000), - ("S", "N", 1000000000), - ("L", "U", 1000), - ("L", "N", 1000000), - ("U", "N", 1000), + ("D", "min", 1440), + ("D", "s", 86400), + ("D", "ms", 86400000), + ("D", "us", 86400000000), + ("D", "ns", 86400000000000), + ("H", "min", 60), + ("H", "s", 3600), + ("H", "ms", 3600000), + ("H", "us", 3600000000), + ("H", "ns", 3600000000000), + ("min", "s", 60), + ("min", "ms", 60000), + ("min", "us", 60000000), + ("min", "ns", 60000000000), + ("s", "ms", 1000), + ("s", "us", 1000000), + ("s", "ns", 1000000000), + ("ms", "us", 1000), + ("ms", "ns", 1000000), + ("us", "ns", 1000), ], ) def test_intra_day_conversion_factors(freq1, freq2, expected): diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 27ddbb82f49a9..bc3e06646b235 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -20,12 +20,12 @@ ("2h 60min", offsets.Hour(3)), ("2h 20.5min", offsets.Second(8430)), ("1.5min", offsets.Second(90)), - ("0.5S", offsets.Milli(500)), - ("15l500u", offsets.Micro(15500)), - ("10s75L", offsets.Milli(10075)), + ("0.5s", offsets.Milli(500)), + ("15ms500us", offsets.Micro(15500)), + ("10s75ms", offsets.Milli(10075)), ("1s0.25ms", offsets.Micro(1000250)), - ("1s0.25L", offsets.Micro(1000250)), - ("2800N", offsets.Nano(2800)), + ("1s0.25ms", offsets.Micro(1000250)), + ("2800ns", offsets.Nano(2800)), ("2SM", offsets.SemiMonthEnd(2)), ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), @@ -38,7 +38,7 @@ def test_to_offset(freq_input, expected): @pytest.mark.parametrize( - "freqstr,expected", [("-1S", -1), ("-2SM", -2), ("-1SMS", -1), ("-5min10s", -310)] + "freqstr,expected", [("-1s", -1), ("-2SM", -2), ("-1SMS", -1), ("-5min10s", -310)] ) def test_to_offset_negative(freqstr, expected): result = to_offset(freqstr) @@ -49,12 +49,12 @@ def test_to_offset_negative(freqstr, expected): "freqstr", [ "2h20m", - "U1", - "-U", - "3U1", - "-2-3U", + "us1", 
+ "-us", + "3us1", + "-2-3us", "-2D:3H", - "1.5.0S", + "1.5.0s", "2SMS-15-15", "2SMS-15D", "100foo", @@ -119,7 +119,7 @@ def test_to_offset_whitespace(freqstr, expected): @pytest.mark.parametrize( - "freqstr,expected", [("00H 00T 01S", 1), ("-00H 03T 14S", -194)] + "freqstr,expected", [("00H 00min 01s", 1), ("-00H 03min 14s", -194)] ) def test_to_offset_leading_zero(freqstr, expected): result = to_offset(freqstr) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index ab00e18fc4812..b8e0173ee131f 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -99,7 +99,9 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -113,7 +115,9 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -129,9 +133,11 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -174,7 +180,9 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -190,7 +198,9 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -235,7 +245,9 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -466,20 +478,23 @@ def test_groupby_rolling_subset_with_closed(self): # GH 35549 
df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) result = ( df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -490,10 +505,14 @@ def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) @@ -503,10 +522,9 @@ def test_groupby_subset_rolling_subset_with_closed(self): .sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -778,9 +796,13 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - expected = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - result = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -954,11 +976,13 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -977,9 +1001,13 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.set_index("B") + .groupby("A") + .apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1009,7 +1037,9 @@ def test_expanding(self, f, frame): r = g.expanding() result = getattr(r, 
f)() - expected = g.apply(lambda x: getattr(x.expanding(), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1023,7 +1053,9 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1039,9 +1071,11 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1059,7 +1093,9 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - expected = g.apply(func_0) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1074,7 +1110,9 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - expected = g.apply(func_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1083,7 +1121,11 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 70b7534b296f3..3fe922539780d 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -100,9 +100,9 @@ def test_freq_window_not_implemented(window): index=date_range("2015-12-24", periods=10, freq="D"), ) with pytest.raises( - NotImplementedError, match="step is not supported with frequency windows" + NotImplementedError, match="^step (not implemented|is not supported)" ): - df.rolling("3D", step=3) + df.rolling(window, step=3).sum() @pytest.mark.parametrize("agg", ["cov", "corr"]) @@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering( tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 
21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]), + ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]), + ], +) +def test_variable_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + result = df.rolling("2D", closed=closed).sum() + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]), + ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]), + ], +) +def test_variable_offset_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + offset = BusinessDay(2) + indexer = VariableOffsetWindowIndexer(index=index, offset=offset) + result = df.rolling(indexer, closed=closed, min_periods=1).sum() + + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) @@ -920,7 +990,7 @@ def test_rolling_numerical_accuracy_kahan_mean(add): result = ( df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() ) - dates = date_range("19700101 09:00:00", periods=7, freq="S") + dates = date_range("19700101 09:00:00", periods=7, freq="s") expected = DataFrame( { "A": [ @@ -1065,11 +1135,13 @@ def test_rolling_on_df_transposed(): ("index", "window"), [ ( - period_range(start="2020-01-01 08:00", end="2020-01-01 08:08", freq="T"), - "2T", + period_range(start="2020-01-01 08:00", end="2020-01-01 08:08", freq="min"), + "2min", ), ( - period_range(start="2020-01-01 08:00", end="2020-01-01 12:00", freq="30T"), + period_range( + start="2020-01-01 08:00", end="2020-01-01 12:00", freq="30min" + ), "1h", ), ], diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index 940f0845befa2..51f801ab3761b 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -388,7 +388,7 @@ def test_rolling_max_resample(step): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -425,7 +425,7 @@ def test_rolling_min_resample(step): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda 
x: float(x)) # Sort chronologically @@ -445,7 +445,7 @@ def test_rolling_median_resample(): # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index caa34a067ac69..7aa245341cbdd 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -19,6 +19,10 @@ MONTHS, int_to_weekday, ) +from pandas._libs.tslibs.dtypes import ( + OFFSET_TO_PERIOD_FREQSTR, + freq_to_period_freqstr, +) from pandas._libs.tslibs.fields import ( build_field_sarray, month_position_check, @@ -53,58 +57,29 @@ ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin # --------------------------------------------------------------------- -# Offset names ("time rules") and related functions - -_offset_to_period_map = { - "WEEKDAY": "D", - "EOM": "M", - "BM": "M", - "BQS": "Q", - "QS": "Q", - "BQ": "Q", - "BA": "A", - "AS": "A", - "BAS": "A", - "MS": "M", - "D": "D", - "B": "B", - "T": "T", - "S": "S", - "L": "L", - "U": "U", - "N": "N", - "H": "H", - "Q": "Q", - "A": "A", - "W": "W", - "M": "M", - "Y": "A", - "BY": "A", - "YS": "A", - "BYS": "A", -} +# Offset related functions _need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"] for _prefix in _need_suffix: for _m in MONTHS: key = f"{_prefix}-{_m}" - _offset_to_period_map[key] = _offset_to_period_map[_prefix] + OFFSET_TO_PERIOD_FREQSTR[key] = OFFSET_TO_PERIOD_FREQSTR[_prefix] for _prefix in ["A", "Q"]: for _m in MONTHS: _alias = f"{_prefix}-{_m}" - _offset_to_period_map[_alias] = _alias + OFFSET_TO_PERIOD_FREQSTR[_alias] = _alias for _d in DAYS: - _offset_to_period_map[f"W-{_d}"] = f"W-{_d}" + OFFSET_TO_PERIOD_FREQSTR[f"W-{_d}"] = f"W-{_d}" def get_period_alias(offset_str: str) -> str | None: """ Alias to closest period strings BQ->Q etc. 
""" - return _offset_to_period_map.get(offset_str, None) + return OFFSET_TO_PERIOD_FREQSTR.get(offset_str, None) # --------------------------------------------------------------------- @@ -271,19 +246,19 @@ def get_freq(self) -> str | None: return _maybe_add_count("H", delta / pph) elif _is_multiple(delta, ppm): # Minutes - return _maybe_add_count("T", delta / ppm) + return _maybe_add_count("min", delta / ppm) elif _is_multiple(delta, pps): # Seconds - return _maybe_add_count("S", delta / pps) + return _maybe_add_count("s", delta / pps) elif _is_multiple(delta, (pps // 1000)): # Milliseconds - return _maybe_add_count("L", delta / (pps // 1000)) + return _maybe_add_count("ms", delta / (pps // 1000)) elif _is_multiple(delta, (pps // 1_000_000)): # Microseconds - return _maybe_add_count("U", delta / (pps // 1_000_000)) + return _maybe_add_count("us", delta / (pps // 1_000_000)) else: # Nanoseconds - return _maybe_add_count("N", delta) + return _maybe_add_count("ns", delta) @cache_readonly def day_deltas(self) -> list[int]: @@ -394,7 +369,7 @@ def _get_monthly_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check) + return {"cs": "MS", "bs": "BMS", "ce": "ME", "be": "BM"}.get(pos_check) def _is_business_daily(self) -> bool: # quick check: cannot be business daily @@ -472,7 +447,6 @@ def is_subperiod(source, target) -> bool: ------- bool """ - if target is None or source is None: return False source = _maybe_coerce_freq(source) @@ -483,31 +457,31 @@ def is_subperiod(source, target) -> bool: return _quarter_months_conform( get_rule_month(source), get_rule_month(target) ) - return source in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return source in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} elif _is_quarterly(target): - return source in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return source in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} elif _is_monthly(target): - return source in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return source in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} elif _is_weekly(target): - return source in {target, "D", "C", "B", "H", "T", "S", "L", "U", "N"} + return source in {target, "D", "C", "B", "H", "min", "s", "ms", "us", "ns"} elif target == "B": - return source in {"B", "H", "T", "S", "L", "U", "N"} + return source in {"B", "H", "min", "s", "ms", "us", "ns"} elif target == "C": - return source in {"C", "H", "T", "S", "L", "U", "N"} + return source in {"C", "H", "min", "s", "ms", "us", "ns"} elif target == "D": - return source in {"D", "H", "T", "S", "L", "U", "N"} + return source in {"D", "H", "min", "s", "ms", "us", "ns"} elif target == "H": - return source in {"H", "T", "S", "L", "U", "N"} - elif target == "T": - return source in {"T", "S", "L", "U", "N"} - elif target == "S": - return source in {"S", "L", "U", "N"} - elif target == "L": - return source in {"L", "U", "N"} - elif target == "U": - return source in {"U", "N"} - elif target == "N": - return source in {"N"} + return source in {"H", "min", "s", "ms", "us", "ns"} + elif target == "min": + return source in {"min", "s", "ms", "us", "ns"} + elif target == "s": + return source in {"s", "ms", "us", "ns"} + elif target == "ms": + return source in {"ms", "us", "ns"} + elif target == "us": + return source in {"us", "ns"} + elif target == "ns": + return source in {"ns"} else: return False @@ -541,31 +515,31 @@ def is_superperiod(source, target) -> bool: smonth = 
get_rule_month(source) tmonth = get_rule_month(target) return _quarter_months_conform(smonth, tmonth) - return target in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} elif _is_quarterly(source): - return target in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} elif _is_monthly(source): - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} elif _is_weekly(source): - return target in {source, "D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {source, "D", "C", "B", "H", "min", "s", "ms", "us", "ns"} elif source == "B": - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} elif source == "C": - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} elif source == "D": - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} elif source == "H": - return target in {"H", "T", "S", "L", "U", "N"} - elif source == "T": - return target in {"T", "S", "L", "U", "N"} - elif source == "S": - return target in {"S", "L", "U", "N"} - elif source == "L": - return target in {"L", "U", "N"} - elif source == "U": - return target in {"U", "N"} - elif source == "N": - return target in {"N"} + return target in {"H", "min", "s", "ms", "us", "ns"} + elif source == "min": + return target in {"min", "s", "ms", "us", "ns"} + elif source == "s": + return target in {"s", "ms", "us", "ns"} + elif source == "ms": + return target in {"ms", "us", "ns"} + elif source == "us": + return target in {"us", "ns"} + elif source == "ns": + return target in {"ns"} else: return False @@ -585,8 +559,11 @@ def _maybe_coerce_freq(code) -> str: """ assert code is not None if isinstance(code, DateOffset): - code = code.rule_code - return code.upper() + code = freq_to_period_freqstr(1, code.name) + if code in {"min", "s", "ms", "us", "ns"}: + return code + else: + return code.upper() def _quarter_months_conform(source: str, target: str) -> bool: diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 44c21bc284121..3c429a960b451 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -283,11 +283,11 @@ def dates( holiday_dates = self._apply_rule(dates) if self.days_of_week is not None: holiday_dates = holiday_dates[ - np.in1d( + np.isin( # error: "DatetimeIndex" has no attribute "dayofweek" holiday_dates.dayofweek, # type: ignore[attr-defined] self.days_of_week, - ) + ).ravel() ] if self.start_date is not None: @@ -354,7 +354,7 @@ def _apply_rule(self, dates: DatetimeIndex) -> DatetimeIndex: Dates with rules applied """ if dates.empty: - return DatetimeIndex([]) + return dates.copy() if self.observance is not None: return dates.map(lambda d: self.observance(d)) diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index 8fe928ed6c5cf..82b3aa56c653c 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -23,3 +23,7 @@ def __getattr__(key: str): return cache_readonly raise AttributeError(f"module 'pandas.util' has no attribute '{key}'") + + +def capitalize_first_letter(s): + return s[:1].upper() + s[1:] diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 
03011a1ffe622..9be0c3edaa998 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -38,6 +38,9 @@ def test_foo(): if TYPE_CHECKING: from pandas._typing import F + +from pandas._config.config import _get_option + from pandas.compat import ( IS64, is_platform_windows, @@ -230,12 +233,12 @@ def mark_array_manager_not_yet_implemented(request) -> None: skip_array_manager_not_yet_implemented = pytest.mark.xfail( - get_option("mode.data_manager") == "array", + _get_option("mode.data_manager", silent=True) == "array", reason="Not yet implemented for ArrayManager", ) skip_array_manager_invalid_test = pytest.mark.skipif( - get_option("mode.data_manager") == "array", + _get_option("mode.data_manager", silent=True) == "array", reason="Test that relies on BlockManager internals or specific behaviour", ) diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index c72598eb50410..3a5efbbb09c1e 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -4,8 +4,7 @@ # 04/30/2021 # This file is dual licensed under the terms of the Apache License, Version -# 2.0, and the BSD License. See the LICENSE file in the root of this repository -# for complete details. +# 2.0, and the BSD License. Licence at LICENSES/PACKAGING_LICENSE from __future__ import annotations import collections diff --git a/pyproject.toml b/pyproject.toml index 9af86a6bdcd16..4e1c77413efda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,14 +3,14 @@ # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ "meson-python==0.13.1", - "meson==1.0.1", + "meson==1.2.1", "wheel", "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json # Note: numpy 1.25 has a backwards compatible C API by default # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.22.4; python_version>='3.12'", + "numpy>=1.26.0; python_version>='3.12'", "versioneer[toml]" ] @@ -30,7 +30,8 @@ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ "numpy>=1.22.4; python_version<'3.11'", - "numpy>=1.23.2; python_version>='3.11'", + "numpy>=1.23.2; python_version=='3.11'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", "tzdata>=2022.1" @@ -66,7 +67,7 @@ computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] fss = ['fsspec>=2022.05.0'] aws = ['s3fs>=2022.05.0'] gcp = ['gcsfs>=2022.05.0', 'pandas-gbq>=0.17.5'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.10', 'pyxlsb>=1.0.9', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.10', 'python-calamine>=0.1.6', 'pyxlsb>=1.0.9', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -109,6 +110,7 @@ all = ['beautifulsoup4>=4.11.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.6', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', @@ -227,7 +229,7 @@ select = [ # flake8-gettext "INT", # pylint - "PLC", "PLE", "PLR", "PLW", + "PL", # misc lints "PIE", # flake8-pyi @@ -250,6 +252,10 @@ select = [ "NPY002", # Perflint "PERF", + # flynt + "FLY", + # flake8-logging-format + "G", ] ignore = [ @@ -354,7 +360,7 @@ exclude = [ "asv_bench/*" = ["TID", "NPY002"] # to be enabled gradually 
"pandas/core/*" = ["PLR5501"] -"pandas/tests/*" = ["B028"] +"pandas/tests/*" = ["B028", "FLY"] "scripts/*" = ["B028"] # Keep this one enabled "pandas/_typing.py" = ["TCH"] diff --git a/requirements-dev.txt b/requirements-dev.txt index be02007a36333..98339f45a5052 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ pip versioneer[toml] cython==0.29.33 -meson[ninja]==1.0.1 +meson[ninja]==1.2.1 meson-python==0.13.1 pytest>=7.3.2 pytest-cov @@ -25,7 +25,7 @@ gcsfs>=2022.05.0 ipython jinja2>=3.1.2 lxml>=4.8.0 -matplotlib>=3.6.1 +matplotlib>=3.6.1, <3.8 numba>=0.55.2 numexpr>=2.8.0 openpyxl>=3.0.10 @@ -36,6 +36,7 @@ pyarrow>=7.0.0 pymysql>=1.0.2 pyreadstat>=1.1.5 tables>=3.7.0 +python-calamine>=0.1.6 pyxlsb>=1.0.9 s3fs>=2022.05.0 scipy>=1.8.1 @@ -59,7 +60,7 @@ gitdb google-auth natsort numpydoc -pydata-sphinx-theme +pydata-sphinx-theme==0.13 pytest-cython sphinx sphinx-design @@ -76,7 +77,6 @@ ipywidgets nbformat notebook>=6.0.3 ipykernel -jinja2 markdown feedparser pyyaml diff --git a/scripts/no_bool_in_generic.py b/scripts/no_bool_in_generic.py index 3368b485662ad..15bc2247469c7 100644 --- a/scripts/no_bool_in_generic.py +++ b/scripts/no_bool_in_generic.py @@ -9,6 +9,7 @@ The function `visit` is adapted from a function by the same name in pyupgrade: https://github.com/asottile/pyupgrade/blob/5495a248f2165941c5d3b82ac3226ba7ad1fa59d/pyupgrade/_data.py#L70-L113 +Licence at LICENSES/PYUPGRADE_LICENSE """ from __future__ import annotations @@ -64,7 +65,7 @@ def replace_bool_with_bool_t(to_replace, content: str) -> str: + replaced_line[col_offset + 4 :] ) new_lines.append(replaced_line) - return "\n".join(new_lines) + return "\n".join(new_lines) + "\n" def check_for_bool_in_generic(content: str) -> tuple[bool, str]: diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index c70025f8f019d..1ede20f5cc0d8 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -44,6 +44,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.6 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index b43815a982139..501ec4f061f17 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -62,7 +62,7 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.6', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -103,6 +103,7 @@ all = ['beautifulsoup4>=5.9.3', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0', + 'python-calamine>=0.1.6', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 503eb3c7c7734..14bedd1025bf8 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -44,6 +44,7 @@ dependencies: - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.6 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff 
--git a/scripts/tests/test_no_bool_in_generic.py b/scripts/tests/test_no_bool_in_generic.py index 0bc91c5d1cf1e..a57612548dcc5 100644 --- a/scripts/tests/test_no_bool_in_generic.py +++ b/scripts/tests/test_no_bool_in_generic.py @@ -1,7 +1,7 @@ from scripts.no_bool_in_generic import check_for_bool_in_generic -BAD_FILE = "def foo(a: bool) -> bool:\n return bool(0)" -GOOD_FILE = "def foo(a: bool_t) -> bool_t:\n return bool(0)" +BAD_FILE = "def foo(a: bool) -> bool:\n return bool(0)\n" +GOOD_FILE = "def foo(a: bool_t) -> bool_t:\n return bool(0)\n" def test_bad_file_with_replace(): diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index aab29fce89abe..ffe1e9acd1185 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -110,10 +110,10 @@ def _import_path(self, klass=None, func=None): base_path = "scripts.tests.test_validate_docstrings" if klass: - base_path = ".".join([base_path, klass]) + base_path = f"{base_path}.{klass}" if func: - base_path = ".".join([base_path, func]) + base_path = f"{base_path}.{func}" return base_path diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index b1b63b469ec3b..0a6a852bb0f85 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -143,7 +143,7 @@ def get_api_items(api_doc_fd): func = getattr(func, part) yield ( - ".".join([current_module, line_stripped]), + f"{current_module}.{line_stripped}", func, current_section, current_subsection, diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 47534226f972f..d765d7bc7dcb9 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -33,6 +33,7 @@ "_agg_template_series", "_agg_template_frame", "_pipe_template", + "_apply_groupings_depr", "__main__", "_transform_template", "_use_inf_as_na", @@ -50,6 +51,8 @@ "_chained_assignment_msg", "_chained_assignment_method_msg", "_version_meson", + # TODO(3.0): GH#55043 - remove upon removal of ArrayManager + "_get_option", } diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index e9f5c8643b493..561503de416a5 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -511,9 +511,16 @@ assumptions about your datasets and check that they're *actually* true. Pandas provides an interface for defining [extension types](https://pandas.pydata.org/docs/development/extending.html#extension-types) to extend NumPy's type system. -The following librariesimplement that interface to provide types not found in NumPy or pandas, +The following libraries implement that interface to provide types not found in NumPy or pandas, which work well with pandas' data containers. +### [awkward-pandas](https://awkward-pandas.readthedocs.io/) + +Awkward-pandas provides an extension type for storing [Awkward +Arrays](https://awkward-array.org/) inside pandas' Series and +DataFrame. It also provides an accessor for using awkward functions +on Series that are of awkward type. + ### [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) Cyberpandas provides an extension type for storing arrays of IP @@ -553,6 +560,7 @@ authors to coordinate on the namespace. 
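A minimal sketch of how a third-party library claims one of these namespaces, assuming only the public `pandas.api.extensions.register_series_accessor` decorator and a hypothetical `geo` accessor name (illustrative only, not one of the registered accessors):

```python
import pandas as pd


@pd.api.extensions.register_series_accessor("geo")  # "geo" is a hypothetical namespace
class GeoAccessor:
    def __init__(self, pandas_obj: pd.Series) -> None:
        # pandas hands the Series being accessed to the constructor
        self._obj = pandas_obj

    @property
    def is_northern(self) -> pd.Series:
        # Hypothetical helper: latitudes >= 0 lie in the northern hemisphere
        return self._obj >= 0


lat = pd.Series([48.85, -33.87, 35.68])
print(lat.geo.is_northern)
```

The accessors registered by the libraries in the table below follow the same pattern.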
| Library | Accessor | Classes | | -------------------------------------------------------------------- | ---------- | --------------------- | + | [awkward-pandas](https://awkward-pandas.readthedocs.io/en/latest/) | `ak` | `Series` | | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` | | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` | diff --git a/web/pandas/versions.json b/web/pandas/versions.json index 4be8f10a88334..43efaf8ebe259 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -5,10 +5,15 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.0 (stable)", - "version": "2.0", + "name": "2.1 (stable)", + "version": "2.1", "url": "https://pandas.pydata.org/docs/" }, + { + "name": "2.0", + "version": "2.0", + "url": "https://pandas.pydata.org/pandas-docs/version/2.0/" + }, { "name": "1.5", "version": "1.5",
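The test updates above revolve around the lowercase frequency aliases that replace the old single-letter sub-daily codes (and, in `test_parsing.py` and `_get_monthly_rule`, the month-end rename from "M" to "ME"). A minimal sketch of the new spellings, reusing expectations taken directly from the updated `test_to_offset.py` cases:

```python
from pandas.tseries import offsets
from pandas.tseries.frequencies import to_offset

# Lowercase aliases replace the old single-letter sub-daily codes:
# "T" -> "min", "S" -> "s", "L" -> "ms", "U" -> "us", "N" -> "ns".
assert to_offset("2min") == offsets.Minute(2)
assert to_offset("10s75ms") == offsets.Milli(10075)  # expectation from test_to_offset.py above
assert to_offset("2800ns") == offsets.Nano(2800)     # likewise
```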