diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ca0c75f9de94f6..a5a802c678e208 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,9 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Looking for unwanted patterns run: ci/code_checks.sh patterns @@ -94,7 +96,9 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Set up pandas uses: ./.github/actions/setup @@ -147,7 +151,9 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Set up pandas uses: ./.github/actions/setup diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index 69f2e689c0228e..b15889351386a5 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -56,10 +56,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Cache conda - uses: actions/cache@v1 + uses: actions/cache@v2 env: CACHE_NUMBER: 0 with: diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index 34e6c2c9d94ce7..3a4d3c106f8512 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -44,10 +44,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Cache conda - uses: actions/cache@v1 + uses: actions/cache@v2 env: CACHE_NUMBER: 0 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1fbd3cf85383e8..3078619ecac355 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,14 +19,14 @@ repos: types_or: [python, rst, markdown] files: ^(pandas|doc)/ - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.0.1 hooks: - id: debug-statements - id: end-of-file-fixer exclude: \.txt$ - id: trailing-whitespace - repo: https://github.com/cpplint/cpplint - rev: f7061b1 # the latest tag does not have the hook + rev: 1.5.5 hooks: - id: cpplint # We don't lint all C files because we don't want to lint any that are built @@ -57,7 +57,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.12.0 + rev: v2.18.3 hooks: - id: pyupgrade args: [--py37-plus] @@ -72,7 +72,7 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/asottile/yesqa - rev: v1.2.2 + rev: v1.2.3 hooks: - id: yesqa additional_dependencies: diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 540cd026a43d56..00000000000000 --- a/.travis.yml +++ /dev/null @@ -1,73 +0,0 @@ -language: python -python: 3.7 - -addons: - apt: - update: true - packages: - - xvfb - -services: - - xvfb - -# To turn off cached cython files and compiler cache -# set NOCACHE-true -# To delete caches go to https://travis-ci.org/OWNER/REPOSITORY/caches or run -# travis cache --delete inside the project directory from the travis command line client -# The cache directories will be deleted if anything in ci/ changes in a commit -cache: - apt: true - ccache: true - directories: - - $HOME/.cache # cython cache - -env: - global: - # create a github personal access token - # cd pandas-dev/pandas - # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas - - secure: 
"EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" - -git: - depth: false - -matrix: - fast_finish: true - - include: - - arch: arm64-graviton2 - virt: lxd - group: edge - env: - - JOB="3.7, arm64" PYTEST_WORKERS="auto" ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" - -before_install: - - echo "before_install" - # Use blocking IO on travis. Ref: https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024 - - python -c 'import os,sys,fcntl; flags = fcntl.fcntl(sys.stdout, fcntl.F_GETFL); fcntl.fcntl(sys.stdout, fcntl.F_SETFL, flags&~os.O_NONBLOCK);' - - source ci/travis_process_gbq_encryption.sh - - export PATH="$HOME/miniconda3/bin:$PATH" - - df -h - - pwd - - uname -a - - git --version - - ./ci/check_git_tags.sh - -install: - - echo "install start" - - ci/prep_cython_cache.sh - - ci/setup_env.sh - - ci/submit_cython_cache.sh - - echo "install done" - -script: - - echo "script start" - - echo "$JOB" - - source activate pandas-dev - - ci/run_tests.sh - -after_script: - - echo "after_script start" - - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - ci/print_skipped.py - - echo "after_script done" diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index a01610a69278bc..82166a2a95c763 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -20,19 +20,19 @@ def setup(self, cols, rows): def time_apply_render(self, cols, rows): self._style_apply() - self.st._render_html() + self.st._render_html(True, True) def peakmem_apply_render(self, cols, rows): self._style_apply() - self.st._render_html() + self.st._render_html(True, True) def time_classes_render(self, cols, rows): self._style_classes() - self.st._render_html() + self.st._render_html(True, True) def peakmem_classes_render(self, cols, rows): self._style_classes() - self.st._render_html() + self.st._render_html(True, True) def time_format_render(self, cols, rows): self._style_format() diff --git a/ci/check_git_tags.sh b/ci/check_git_tags.sh deleted file mode 100755 index 9dbcd4f98683e9..00000000000000 --- a/ci/check_git_tags.sh +++ /dev/null @@ -1,28 +0,0 @@ -set -e - -if [[ ! 
$(git tag) ]]; then - echo "No git tags in clone, please sync your git tags with upstream using:" - echo " git fetch --tags upstream" - echo " git push --tags origin" - echo "" - echo "If the issue persists, the clone depth needs to be increased in .travis.yml" - exit 1 -fi - -# This will error if there are no tags and we omit --always -DESCRIPTION=$(git describe --long --tags) -echo "$DESCRIPTION" - -if [[ "$DESCRIPTION" == *"untagged"* ]]; then - echo "Unable to determine most recent tag, aborting build" - exit 1 -else - if [[ "$DESCRIPTION" != *"g"* ]]; then - # A good description will have the hash prefixed by g, a bad one will be - # just the hash - echo "Unable to determine most recent tag, aborting build" - exit 1 - else - echo "$(git tag)" - fi -fi diff --git a/ci/deps/actions-37-db-min.yaml b/ci/deps/actions-37-db-min.yaml index 65c4c5769b1a3d..cae4361ca37a78 100644 --- a/ci/deps/actions-37-db-min.yaml +++ b/ci/deps/actions-37-db-min.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index fa58f412cebf41..e568f8615a8dfe 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 @@ -25,7 +25,7 @@ dependencies: - flask - nomkl - numexpr - - numpy=1.16.* + - numpy=1.17.* - odfpy - openpyxl - pandas-gbq diff --git a/ci/deps/actions-37-locale_slow.yaml b/ci/deps/actions-37-locale_slow.yaml index d9ad1f538908e6..c6eb3b00a63aca 100644 --- a/ci/deps/actions-37-locale_slow.yaml +++ b/ci/deps/actions-37-locale_slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 @@ -17,13 +17,13 @@ dependencies: - bottleneck=1.2.* - lxml - matplotlib=3.0.0 - - numpy=1.16.* + - numpy=1.17.* - openpyxl=3.0.0 - python-dateutil - python-blosc - pytz=2017.3 - scipy - - sqlalchemy=1.2.8 + - sqlalchemy=1.3.0 - xlrd=1.2.0 - xlsxwriter=1.0.2 - xlwt=1.3.0 diff --git a/ci/deps/actions-37-minimum_versions.yaml b/ci/deps/actions-37-minimum_versions.yaml index aa5284e4f35d12..b97601d18917c1 100644 --- a/ci/deps/actions-37-minimum_versions.yaml +++ b/ci/deps/actions-37-minimum_versions.yaml @@ -17,7 +17,7 @@ dependencies: - bottleneck=1.2.1 - jinja2=2.10 - numba=0.46.0 - - numexpr=2.6.8 + - numexpr=2.7.0 - numpy=1.17.3 - openpyxl=3.0.0 - pytables=3.5.1 diff --git a/ci/deps/actions-37-slow.yaml b/ci/deps/actions-37-slow.yaml index 573ff7f02c162b..166f2237dcad3d 100644 --- a/ci/deps/actions-37-slow.yaml +++ b/ci/deps/actions-37-slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index a209a9099d2bb7..0effe6f80df867 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml index 629804c71e7262..34a6860936550d 100644 --- a/ci/deps/actions-38-locale.yaml +++ b/ci/deps/actions-38-locale.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - 
pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - pytest-asyncio>=0.12.0 @@ -20,7 +20,7 @@ dependencies: - jinja2 - jedi<0.18.0 - lxml - - matplotlib <3.3.0 + - matplotlib<3.3.0 - moto - nomkl - numexpr diff --git a/ci/deps/actions-38-numpydev.yaml b/ci/deps/actions-38-numpydev.yaml index e7ee6ccfd7bac7..6eed2daac0c3b2 100644 --- a/ci/deps/actions-38-numpydev.yaml +++ b/ci/deps/actions-38-numpydev.yaml @@ -5,14 +5,14 @@ dependencies: - python=3.8.* # tools - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 # pandas dependencies - pytz - - pip=20.2 + - pip - pip: - cython==0.29.21 # GH#34014 - "git+git://github.com/dateutil/dateutil.git" diff --git a/ci/deps/actions-38-slow.yaml b/ci/deps/actions-38-slow.yaml index 2106f487555603..afba60e451b901 100644 --- a/ci/deps/actions-38-slow.yaml +++ b/ci/deps/actions-38-slow.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index e2660d07c35580..11daa92046eb4a 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 36e8bf528fc3ea..b74f1af8ee0f65 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index a0b1cdc684d2c3..63e858eac433fa 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 8266e3bc4d07db..5cbc029f8c03d0 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 200e695a69d1fd..7fdecae626f9de 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/circle-37-arm64.yaml similarity index 93% rename from ci/deps/travis-37-arm64.yaml rename to ci/deps/circle-37-arm64.yaml index 8df6104f43a504..995ebda1f97e71 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/circle-37-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh deleted file mode 100755 index 18d9388327ddc4..00000000000000 --- a/ci/prep_cython_cache.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -ls "$HOME/.cache/" - -PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` -pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx" -o -name "*.pxd" -o -name 
"*.pxi.in"` - -CACHE_File="$HOME/.cache/cython_files.tar" - -# Clear the cython cache 0 = NO, 1 = YES -clear_cache=0 - -pyx_files=`echo "$pyx_file_list" | wc -l` -pyx_cache_files=`echo "$pyx_cache_file_list" | wc -l` - -if [[ pyx_files -ne pyx_cache_files ]] -then - echo "Different number of pyx files" - clear_cache=1 -fi - -home_dir=$(pwd) - -if [ -f "$CACHE_File" ] && [ -z "$NOCACHE" ] && [ -d "$PYX_CACHE_DIR" ]; then - - echo "Cache available - checking pyx diff" - - for i in ${pyx_file_list} - do - diff=`diff -u $i $PYX_CACHE_DIR${i}` - if [[ $? -eq 2 ]] - then - echo "${i##*/} can't be diffed; probably not in cache" - clear_cache=1 - fi - if [[ ! -z $diff ]] - then - echo "${i##*/} has changed:" - echo $diff - clear_cache=1 - fi - done - - if [ "$TRAVIS_PULL_REQUEST" == "false" ] - then - echo "Not a PR" - # Uncomment next 2 lines to turn off cython caching not in a PR - # echo "Non PR cython caching is disabled" - # clear_cache=1 - else - echo "In a PR" - # Uncomment next 2 lines to turn off cython caching in a PR - # echo "PR cython caching is disabled" - # clear_cache=1 - fi - -fi - -if [ $clear_cache -eq 0 ] && [ -z "$NOCACHE" ] -then - # No and nocache is not set - echo "Will reuse cached cython file" - cd / - tar xvmf $CACHE_File - cd $home_dir -else - echo "Rebuilding cythonized files" - echo "No cache = $NOCACHE" - echo "Clear cache (1=YES) = $clear_cache" -fi - - -exit 0 diff --git a/ci/setup_env.sh b/ci/setup_env.sh index c36422884f2ec1..2e16bc65451613 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -12,41 +12,30 @@ if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo fi -MINICONDA_DIR="$HOME/miniconda3" - - -if [ -d "$MINICONDA_DIR" ]; then - echo - echo "rm -rf "$MINICONDA_DIR"" - rm -rf "$MINICONDA_DIR" -fi echo "Install Miniconda" -UNAME_OS=$(uname) -if [[ "$UNAME_OS" == 'Linux' ]]; then +DEFAULT_CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest" +if [[ "$(uname -m)" == 'aarch64' ]]; then + CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.10.1-4/Miniforge3-4.10.1-4-Linux-aarch64.sh" +elif [[ "$(uname)" == 'Linux' ]]; then if [[ "$BITS32" == "yes" ]]; then - CONDA_OS="Linux-x86" + CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86.sh" else - CONDA_OS="Linux-x86_64" + CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86_64.sh" fi -elif [[ "$UNAME_OS" == 'Darwin' ]]; then - CONDA_OS="MacOSX-x86_64" +elif [[ "$(uname)" == 'Darwin' ]]; then + CONDA_URL="$DEFAULT_CONDA_URL-MacOSX-x86_64.sh" else - echo "OS $UNAME_OS not supported" + echo "OS $(uname) not supported" exit 1 fi - -if [ "${TRAVIS_CPU_ARCH}" == "arm64" ]; then - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.5-1/Miniforge3-4.8.5-1-Linux-aarch64.sh" -else - CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-$CONDA_OS.sh" -fi +echo "Downloading $CONDA_URL" wget -q $CONDA_URL -O miniconda.sh chmod +x miniconda.sh -# Installation path is required for ARM64 platform as miniforge script installs in path $HOME/miniforge3. 
+MINICONDA_DIR="$HOME/miniconda3" +rm -rf $MINICONDA_DIR ./miniconda.sh -b -p $MINICONDA_DIR - export PATH=$MINICONDA_DIR/bin:$PATH echo @@ -63,29 +52,6 @@ conda update -n base conda echo "conda info -a" conda info -a -echo -echo "set the compiler cache to work" -if [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then - echo "Using ccache" - export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - GCC=$(which gcc) - echo "gcc: $GCC" - CCACHE=$(which ccache) - echo "ccache: $CCACHE" - export CC='ccache gcc' -elif [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "osx" ]; then - echo "Install ccache" - brew install ccache > /dev/null 2>&1 - echo "Using ccache" - export PATH=/usr/local/opt/ccache/libexec:$PATH - gcc=$(which gcc) - echo "gcc: $gcc" - CCACHE=$(which ccache) - echo "ccache: $CCACHE" -else - echo "Not using ccache" -fi - echo "source deactivate" source deactivate diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh deleted file mode 100755 index b87acef0ba11c6..00000000000000 --- a/ci/submit_cython_cache.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -CACHE_File="$HOME/.cache/cython_files.tar" -PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` - -rm -rf $CACHE_File -rm -rf $PYX_CACHE_DIR - -home_dir=$(pwd) - -mkdir -p $PYX_CACHE_DIR -rsync -Rv $pyx_file_list $PYX_CACHE_DIR - -echo "pyx files:" -echo $pyx_file_list - -tar cf ${CACHE_File} --files-from /dev/null - -for i in ${pyx_file_list} -do - f=${i%.pyx} - ls $f.{c,cpp} | tar rf ${CACHE_File} -T - -done - -echo "Cython files in cache tar:" -tar tvf ${CACHE_File} - -exit 0 diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh deleted file mode 100755 index 7d5692d9520af3..00000000000000 --- a/ci/travis_encrypt_gbq.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -GBQ_JSON_FILE=$1 - -if [[ $# -ne 1 ]]; then - echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\ - "" - exit 1 -fi - -if [[ $GBQ_JSON_FILE != *.json ]]; then - echo "ERROR: Expected *.json file" - exit 1 -fi - -if [[ ! -f $GBQ_JSON_FILE ]]; then - echo "ERROR: File $GBQ_JSON_FILE does not exist" - exit 1 -fi - -echo "Encrypting $GBQ_JSON_FILE..." -read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file -r pandas-dev/pandas $GBQ_JSON_FILE \ -travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); - -echo "Adding your secure key to travis_gbq_config.txt ..." -echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY"\ -> travis_gbq_config.txt - -echo "Done. 
Removing file $GBQ_JSON_FILE" -rm $GBQ_JSON_FILE - -echo -e "Created encrypted credentials file travis_gbq.json.enc.\n"\ - "NOTE: Do NOT commit the *.json file containing your unencrypted" \ - "private key" diff --git a/ci/travis_gbq.json.enc b/ci/travis_gbq.json.enc deleted file mode 100644 index 6e0b6cee4048c7..00000000000000 Binary files a/ci/travis_gbq.json.enc and /dev/null differ diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt deleted file mode 100644 index dc857c450331c7..00000000000000 --- a/ci/travis_gbq_config.txt +++ /dev/null @@ -1,2 +0,0 @@ -TRAVIS_IV_ENV=encrypted_e05c934e101e_iv -TRAVIS_KEY_ENV=encrypted_e05c934e101e_key diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh deleted file mode 100755 index b5118ad5defc6d..00000000000000 --- a/ci/travis_process_gbq_encryption.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -source ci/travis_gbq_config.txt - -if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then - echo "${SERVICE_ACCOUNT_KEY}" > ci/travis_gbq.json; -elif [[ -n ${!TRAVIS_IV_ENV} ]]; then - openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ - -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; - export GBQ_PROJECT_ID='pandas-gbq-tests'; - echo 'Successfully decrypted gbq credentials' -fi diff --git a/doc/README.rst b/doc/README.rst deleted file mode 100644 index 5423e7419d03ba..00000000000000 --- a/doc/README.rst +++ /dev/null @@ -1 +0,0 @@ -See `contributing.rst `_ in this repo. diff --git a/doc/source/_static/style/latex_1.png b/doc/source/_static/style/latex_1.png new file mode 100644 index 00000000000000..8b901878a0ec9e Binary files /dev/null and b/doc/source/_static/style/latex_1.png differ diff --git a/doc/source/_static/style/latex_2.png b/doc/source/_static/style/latex_2.png new file mode 100644 index 00000000000000..7d6baa681575eb Binary files /dev/null and b/doc/source/_static/style/latex_2.png differ diff --git a/doc/source/_static/style/tg_ax0.png b/doc/source/_static/style/tg_ax0.png new file mode 100644 index 00000000000000..3460329352282e Binary files /dev/null and b/doc/source/_static/style/tg_ax0.png differ diff --git a/doc/source/_static/style/tg_axNone.png b/doc/source/_static/style/tg_axNone.png new file mode 100644 index 00000000000000..00357f7eb016b1 Binary files /dev/null and b/doc/source/_static/style/tg_axNone.png differ diff --git a/doc/source/_static/style/tg_axNone_gmap.png b/doc/source/_static/style/tg_axNone_gmap.png new file mode 100644 index 00000000000000..d06a4b244a23d1 Binary files /dev/null and b/doc/source/_static/style/tg_axNone_gmap.png differ diff --git a/doc/source/_static/style/tg_axNone_lowhigh.png b/doc/source/_static/style/tg_axNone_lowhigh.png new file mode 100644 index 00000000000000..bc3fb16ee8e405 Binary files /dev/null and b/doc/source/_static/style/tg_axNone_lowhigh.png differ diff --git a/doc/source/_static/style/tg_axNone_vminvmax.png b/doc/source/_static/style/tg_axNone_vminvmax.png new file mode 100644 index 00000000000000..42579c2840fb9a Binary files /dev/null and b/doc/source/_static/style/tg_axNone_vminvmax.png differ diff --git a/doc/source/_static/style/tg_gmap.png b/doc/source/_static/style/tg_gmap.png new file mode 100644 index 00000000000000..fb735295441801 Binary files /dev/null and b/doc/source/_static/style/tg_gmap.png differ diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index bc2325f15852c0..ee061e7b7d3e63 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -75,12 +75,12 @@ Statsmodels leverages pandas 
objects as the underlying data container for comput Use pandas DataFrames in your `scikit-learn `__ ML pipeline. -`Featuretools `__ +`Featuretools `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. -`Compose `__ +`Compose `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. @@ -551,11 +551,12 @@ Library Accessor Classes Description ================== ============ ==================================== =============================================================================== `cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses. `pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. -`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame`` Provides common operations for quality control and analysis of genomics data +`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame`` Provides common operations for quality control and analysis of genomics data. `pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. `pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. `composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing. `datatest`_ ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers. +`woodwork`_ ``ww`` ``Series``, ``DataFrame`` Provides physical, logical, and semantic data typing information for Series and DataFrames. ================== ============ ==================================== =============================================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest @@ -565,5 +566,6 @@ Library Accessor Classes Description .. _pandas_path: https://github.com/drivendataorg/pandas-path/ .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html .. _pint-pandas: https://github.com/hgrecco/pint-pandas -.. _composeml: https://github.com/FeatureLabs/compose +.. _composeml: https://github.com/alteryx/compose .. _datatest: https://datatest.readthedocs.io/ +.. _woodwork: https://github.com/alteryx/woodwork diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ce35e9e15976f9..be9c0da34f8a97 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -234,7 +234,7 @@ Recommended dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.6.8 or higher. + If installed, must be Version 2.7.0 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. 
If installed, diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 8c443f3ae9bb61..0d743b5fe8b8b6 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -24,6 +24,7 @@ Styler properties Styler.env Styler.template_html + Styler.template_latex Styler.loader Style application @@ -55,6 +56,7 @@ Builtin styles Styler.highlight_min Styler.highlight_between Styler.background_gradient + Styler.text_gradient Styler.bar Style export and import @@ -66,3 +68,4 @@ Style export and import Styler.export Styler.use Styler.to_excel + Styler.to_latex diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ef6d45fa0140bd..7a55acbd3031df 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda functions: .. ipython:: python + :okwarning: grouped = df.groupby("A") grouped.agg(lambda x: x.std()) @@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the ability to "dispatch" method calls to the groups: .. ipython:: python + :okwarning: grouped.std() diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7f0cd613726dc0..b4e35d1f22840c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -22,6 +22,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;Fixed-Width Text File;:ref:`read_fwf` text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` + text;`LaTeX `__;;:ref:`Styler.to_latex` text;`XML `__;:ref:`read_xml`;:ref:`to_xml` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` @@ -343,16 +344,33 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None`` Error handling ++++++++++++++ -error_bad_lines : boolean, default ``True`` +error_bad_lines : boolean, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no ``DataFrame`` will be returned. If ``False``, then these "bad lines" will dropped from the ``DataFrame`` that is returned. See :ref:`bad lines ` below. -warn_bad_lines : boolean, default ``True`` + + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line instead. +warn_bad_lines : boolean, default ``None`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output. + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line instead. +on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are : + + - 'error', raise an ParserError when a bad line is encountered. + - 'warn', print a warning when a bad line is encountered and skip that line. + - 'skip', skip bad lines without raising or warning when they are encountered. + + .. versionadded:: 1.3 + .. _io.dtypes: Specifying column data types @@ -1244,7 +1262,7 @@ You can elect to skip bad lines: .. 
code-block:: ipython - In [29]: pd.read_csv(StringIO(data), error_bad_lines=False) + In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn") Skipping line 3: expected 3 fields, saw 4 Out[29]: @@ -1896,7 +1914,7 @@ Writing in ISO date format: dfd = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) dfd["date"] = pd.Timestamp("20130101") - dfd = dfd.sort_index(1, ascending=False) + dfd = dfd.sort_index(axis=1, ascending=False) json = dfd.to_json(date_format="iso") json @@ -2830,7 +2848,42 @@ parse HTML tables in the top-level pandas io function ``read_html``. .. |lxml| replace:: **lxml** .. _lxml: https://lxml.de +.. _io.latex: + +LaTeX +----- + +.. versionadded:: 1.3.0 + +Currently there are no methods to read from LaTeX, only output methods. + +Writing to LaTeX files +'''''''''''''''''''''' + +.. note:: + + DataFrame *and* Styler objects currently have a ``to_latex`` method. We recommend + using the `Styler.to_latex() <../reference/api/pandas.io.formats.style.Styler.to_latex.rst>`__ method + over `DataFrame.to_latex() <../reference/api/pandas.DataFrame.to_latex.rst>`__ due to the former's greater flexibility with + conditional styling, and the latter's possible future deprecation. + +Review the documentation for `Styler.to_latex <../reference/api/pandas.io.formats.style.Styler.to_latex.rst>`__, +which gives examples of conditional styling and explains the operation of its keyword +arguments. + +For simple application the following pattern is sufficient. + +.. ipython:: python + + df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["c", "d"]) + print(df.style.to_latex()) +To format values before output, chain the `Styler.format <../reference/api/pandas.io.formats.style.Styler.format.rst>`__ +method. + +.. ipython:: python + + print(df.style.format("€ {}").to_latex()) XML --- @@ -3648,15 +3701,6 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. df1.to_excel(writer, sheet_name="Sheet1") df2.to_excel(writer, sheet_name="Sheet2") -.. note:: - - Wringing a little more performance out of ``read_excel`` - Internally, Excel stores all numeric data as floats. Because this can - produce unexpected behavior when reading in data, pandas defaults to trying - to convert integers to floats if it doesn't lose information (``1.0 --> - 1``). You can pass ``convert_float=False`` to disable this behavior, which - may give a slight performance improvement. - .. _io.excel_writing_buffer: Writing Excel files to memory diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 278eb907102ed5..aa8a8fae417bee 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -482,6 +482,11 @@ plotting.backend matplotlib Change the plotting backend like Bokeh, Altair, etc. plotting.matplotlib.register_converters True Register custom converters with matplotlib. Set to False to de-register. +styler.sparse.index True "Sparsify" MultiIndex display for rows + in Styler output (don't display repeated + elements in outer levels within groups). +styler.sparse.columns True "Sparsify" MultiIndex display for columns + in Styler output. 
======================================= ============ ================================== diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 86696cc909764c..7d8d8e90dfbdac 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1012,7 +1012,8 @@ " - [.highlight_min][minfunc] and [.highlight_max][maxfunc]: for use with identifying extremeties in data.\n", " - [.highlight_between][betweenfunc] and [.highlight_quantile][quantilefunc]: for use with identifying classes within data.\n", " - [.background_gradient][bgfunc]: a flexible method for highlighting cells based or their, or other, values on a numeric scale.\n", - " - [.bar][barfunc]: to display mini-charts within cell backgrounds.\n", + " - [.text_gradient][textfunc]: similar method for highlighting text based on their, or other, values on a numeric scale.\n", + " - [.bar][barfunc]: to display mini-charts within cell backgrounds.\n", " \n", "The individual documentation on each function often gives more examples of their arguments.\n", "\n", @@ -1022,6 +1023,7 @@ "[betweenfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_between.rst\n", "[quantilefunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_quantile.rst\n", "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst\n", + "[textfunc]: ../reference/api/pandas.io.formats.style.Styler.text_gradient.rst\n", "[barfunc]: ../reference/api/pandas.io.formats.style.Styler.bar.rst" ] }, @@ -1098,14 +1100,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Background Gradient" + "### Background Gradient and Text Gradient" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can create \"heatmaps\" with the `background_gradient` method. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap." + "You can create \"heatmaps\" with the `background_gradient` and `text_gradient` methods. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap." ] }, { @@ -1120,19 +1122,31 @@ "df2.style.background_gradient(cmap=cm)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.style.text_gradient(cmap=cm)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "[.background_gradient][bgfunc] has a number of keyword arguments to customise the gradients and colors. See its documentation.\n", + "[.background_gradient][bgfunc] and [.text_gradient][textfunc] have a number of keyword arguments to customise the gradients and colors. See the documentation.\n", "\n", - "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst" + "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst\n", + "[textfunc]: ../reference/api/pandas.io.formats.style.Styler.text_gradient.rst" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "### Set properties\n", + "\n", "Use `Styler.set_properties` when the style doesn't actually depend on the values. This is just a simple wrapper for `.applymap` where the function returns the same properties for all cells." 
] }, @@ -1448,7 +1462,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.format(escape=True)" + "df4.style.format(escape=\"html\")" ] }, { @@ -1457,7 +1471,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.format('{}', escape=True)" + "df4.style.format('{}', escape=\"html\")" ] }, { diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6dd011c5887023..36b591c3c31423 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -381,6 +381,7 @@ this pathological behavior (:issue:`37827`): *New behavior*: .. ipython:: python + :okwarning: df.mean() @@ -394,6 +395,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:` :issue:`28949`, :issue:`21020`). .. ipython:: python + :okwarning: ser = pd.Series([0, 1], dtype="category", name="A") df = ser.to_frame() @@ -411,6 +413,7 @@ instead of casting to a NumPy array which may have different semantics (:issue:` *New behavior*: .. ipython:: python + :okwarning: df.any() diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 60e146b2212eb7..500030e1304c65 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -15,8 +15,9 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) +- Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) - Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) -- +- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index e33b40225bef15..b36499c340fd97 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -120,8 +120,8 @@ to allow custom CSS highlighting instead of default background coloring (:issue: Enhancements to other built-in methods include extending the :meth:`.Styler.background_gradient` method to shade elements based on a given gradient map and not be restricted only to values in the DataFrame (:issue:`39930` :issue:`22727` :issue:`28901`). Additional -built-in methods such as :meth:`.Styler.highlight_between` and :meth:`.Styler.highlight_quantile` -have been added (:issue:`39821` and :issue:`40926`). +built-in methods such as :meth:`.Styler.highlight_between`, :meth:`.Styler.highlight_quantile` +and :math:`.Styler.text_gradient` have been added (:issue:`39821`, :issue:`40926`, :issue:`41098`). The :meth:`.Styler.apply` now consistently allows functions with ``ndarray`` output to allow more flexible development of UDFs when ``axis`` is ``None`` ``0`` or ``1`` (:issue:`39393`). @@ -139,6 +139,10 @@ precision, and perform HTML escaping (:issue:`40437` :issue:`40134`). There have properly format HTML and eliminate some inconsistencies (:issue:`39942` :issue:`40356` :issue:`39807` :issue:`39889` :issue:`39627`) :class:`.Styler` has also been compatible with non-unique index or columns, at least for as many features as are fully compatible, others made only partially compatible (:issue:`41269`). 
+One also has greater control of the display through separate sparsification of the index or columns, using the new 'styler' options context (:issue:`41142`). + +We have added an extension to allow LaTeX styling as an alternative to CSS styling and a method :meth:`.Styler.to_latex` +which renders the necessary LaTeX format including built-up styles. An additional file io function :meth:`Styler.to_html` has been added for convenience (:issue:`40312`). Documentation has also seen major revisions in light of new features (:issue:`39720` :issue:`39317` :issue:`40493`) @@ -197,7 +201,7 @@ Other enhancements - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) -- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) +- :func:`pandas.read_excel` can now auto detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) - :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) @@ -229,7 +233,7 @@ Other enhancements - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) -- Read and write :class:`DataFrame` and :class:`Series` attrs to parquet with pyarrow engine (:issue:`20521`) +- Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) .. --------------------------------------------------------------------------- @@ -334,6 +338,31 @@ values as measured by ``np.allclose``. Now no such casting occurs. df.groupby('key').agg(lambda x: x.sum()) +``float`` result for :meth:`.GroupBy.mean`, :meth:`.GroupBy.median`, and :meth:`.GroupBy.var` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, these methods could result in different dtypes depending on the input values. +Now, these methods will always return a float dtype. (:issue:`41137`) + +.. ipython:: python + + df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]}) + +*pandas 1.2.x* + +.. code-block:: ipython + + In [5]: df.groupby(df.index).mean() + Out[5]: + a b c + 0 True 1 1.0 + +*pandas 1.3.0* + +.. 
ipython:: python + + df.groupby(df.index).mean() + Try operating inplace when setting values with ``loc`` and ``iloc`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -548,7 +577,7 @@ If installed, we now require: +-----------------+-----------------+----------+---------+ | bottleneck | 1.2.1 | | | +-----------------+-----------------+----------+---------+ -| numexpr | 2.6.8 | | | +| numexpr | 2.7.0 | | X | +-----------------+-----------------+----------+---------+ | pytest (dev) | 6.0 | | X | +-----------------+-----------------+----------+---------+ @@ -590,7 +619,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | scipy | 1.2.0 | | +-----------------+-----------------+---------+ -| sqlalchemy | 1.2.8 | | +| sqlalchemy | 1.3.0 | X | +-----------------+-----------------+---------+ | tabulate | 0.8.7 | X | +-----------------+-----------------+---------+ @@ -614,6 +643,7 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) +- Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) Build ===== @@ -627,6 +657,7 @@ Build Deprecations ~~~~~~~~~~~~ - Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`) +- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`) - Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`) - Deprecated ``astype`` of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`) - Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`) @@ -640,6 +671,7 @@ Deprecations - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:``read_csv`` and :meth:``read_table`` in favor of argument ``on_bad_lines`` (:issue:`15122`) - Deprecated support for 
``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) - Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`) - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) @@ -647,8 +679,114 @@ Deprecations - The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` is deprecated and will be removed in a future version (:issue:`37643`) - Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) - Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`) +- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) - Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`) - Deprecated using ``usecols`` with out of bounds indices for ``read_csv`` with ``engine="c"`` (:issue:`25623`) +- Deprecated passing arguments as positional (except for ``"codes"``) in :meth:`MultiIndex.codes` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) (:issue:`41485`) +- Deprecated passing arguments (apart from ``cond`` and ``other``) as positional in :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``"upper"`` and ``"lower"``) (:issue:`41485`) +- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`) +- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. 
In a future version, this will raise instead of being silently ignored (:issue:`24435`) +- Deprecated passing arguments as positional (except for ``"method"``) in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.sort_values` (other than ``"by"``) and :meth:`Series.sort_values` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.dropna` and :meth:`Series.dropna` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.set_index` (other than ``"keys"``) (:issue:`41485`) +- Deprecated passing arguments as positional (except for ``"levels"``) in :meth:`MultiIndex.set_levels` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates`(:issue:`41485`) +- Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`) +- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`) +- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) +- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) +- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) +- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.where` and :meth:`Series.where` (other than ``"cond"`` and ``"other"``) (:issue:`41485`) +- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_csv` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.drop` (other than ``"labels"``) and :meth:`Series.drop` (:issue:`41485`) +- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_table` (:issue:`41485`) + + +.. _whatsnew_130.deprecations.nuisance_columns: + +Deprecated Dropping Nuisance Columns in DataFrame Reductions and DataFrameGroupBy Operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The default of calling a reduction (.min, .max, .sum, ...) 
on a :class:`DataFrame` with +``numeric_only=None`` (the default, columns on which the reduction raises ``TypeError`` +are silently ignored and dropped from the result. + +This behavior is deprecated. In a future version, the ``TypeError`` will be raised, +and users will need to select only valid columns before calling the function. + +For example: + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)}) + df + +*Old behavior*: + +.. code-block:: ipython + + In [3]: df.prod() + Out[3]: + Out[3]: + A 24 + dtype: int64 + +*Future behavior*: + +.. code-block:: ipython + + In [4]: df.prod() + ... + TypeError: 'DatetimeArray' does not implement reduction 'prod' + + In [5]: df[["A"]].prod() + Out[5]: + A 24 + dtype: int64 + + +Similarly, when applying a function to :class:`DataFrameGroupBy`, columns on which +the function raises ``TypeError`` are currently silently ignored and dropped +from the result. + +This behavior is deprecated. In a future version, the ``TypeError`` +will be raised, and users will need to select only valid columns before calling +the function. + +For example: + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)}) + gb = df.groupby([1, 1, 2, 2]) + +*Old behavior*: + +.. code-block:: ipython + + In [4]: gb.prod(numeric_only=False) + Out[4]: + A + 1 2 + 2 12 + +.. code-block:: ipython + + In [5]: gb.prod(numeric_only=False) + ... + TypeError: datetime64 type does not support prod operations + + In [6]: gb[["A"]].prod(numeric_only=False) + Out[6]: + A + 1 2 + 2 12 .. --------------------------------------------------------------------------- @@ -749,6 +887,8 @@ Conversion - Bug in :func:`factorize` where, when given an array with a numeric numpy dtype lower than int64, uint64 and float64, the unique values did not keep their original dtype (:issue:`41132`) - Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`) - Bug in :meth:`qcut` raising error when taking ``Float64DType`` as input (:issue:`40730`) +- Bug in :class:`DataFrame` and :class:`Series` construction with ``datetime64[ns]`` data and ``dtype=object`` resulting in ``datetime`` objects instead of :class:`Timestamp` objects (:issue:`41599`) +- Bug in :class:`DataFrame` and :class:`Series` construction with ``timedelta64[ns]`` data and ``dtype=object`` resulting in ``np.timedelta64`` objects instead of :class:`Timedelta` objects (:issue:`41599`) Strings ^^^^^^^ @@ -756,6 +896,7 @@ Strings - Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` ignoring replacements with ``regex=True`` for ``StringDType`` data (:issue:`41333`, :issue:`35977`) - Bug in :meth:`Series.str.extract` with :class:`~arrays.StringArray` returning object dtype for empty :class:`DataFrame` (:issue:`41441`) +- Bug in :meth:`Series.str.replace` where the ``case`` argument was ignored when ``regex=False`` (:issue:`41602`) Interval ^^^^^^^^ @@ -767,8 +908,9 @@ Interval Indexing ^^^^^^^^ -- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) +- Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or 
``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) +- Bug in :meth:`Series.loc` raising ``ValueError`` when input was filtered with a boolean list and values to set were a list with lower dimension (:issue:`20438`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) - Bug in :meth:`DataFrame.loc`, :meth:`Series.loc`, :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` returning incorrect elements for non-monotonic :class:`DatetimeIndex` for string slices (:issue:`33146`) @@ -797,9 +939,12 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` and :meth:`DataFrame.iloc.__setitem__` raising ``ValueError`` when trying to index with a row-slice and setting a list as values (:issue:`40440`) - Bug in :meth:`DataFrame.loc` not raising ``KeyError`` when key was not found in :class:`MultiIndex` when levels contain more values than used (:issue:`41170`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) +- Bug in :meth:`DataFrame.loc.__getitem__` with :class:`MultiIndex` casting to float when at least one column is from has float dtype and we retrieve a scalar (:issue:`41369`) - Bug in :meth:`DataFrame.loc` incorrectly matching non-boolean index elements (:issue:`20432`) - Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`) +- Bug in :meth:`DataFrame.loc` returning :class:`MultiIndex` in wrong order if indexer has duplicates (:issue:`40978`) - Bug in :meth:`DataFrame.__setitem__` raising ``TypeError`` when using a str subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`) +- Bug in :meth:`PeriodIndex.get_loc` failing to raise ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`) Missing ^^^^^^^ @@ -808,6 +953,7 @@ Missing - Bug in :func:`isna`, and :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna` (and the corresponding ``notna`` functions) not recognizing ``Decimal("NaN")`` objects (:issue:`39409`) - Bug in :meth:`DataFrame.fillna` not accepting dictionary for ``downcast`` keyword (:issue:`40809`) - Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`) +- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`) MultiIndex ^^^^^^^^^^ @@ -817,6 +963,7 @@ MultiIndex - Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when :class:`MultiIndex` containing ``NaN`` even when they are differently ordered (:issue:`38439`) - Bug in :meth:`MultiIndex.intersection` always returning empty when intersecting with :class:`CategoricalIndex` (:issue:`38653`) - Bug in :meth:`MultiIndex.reindex` raising ``ValueError`` with empty MultiIndex and indexing only a specific level (:issue:`41170`) +- Bug in :meth:`MultiIndex.reindex` raising ``TypeError`` when reindexing against a flat :class:`Index` (:issue:`41707`) I/O ^^^ @@ -844,12 +991,16 @@ I/O - Bug in :func:`read_excel` dropping empty values from single-column 
spreadsheets (:issue:`39808`) - Bug in :func:`read_excel` loading trailing empty rows/columns for some filetypes (:issue:`41167`) - Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`) -- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) +- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40904`) +- Bug in :meth:`DataFrame.to_string` adding an extra dot and misaligning the truncation row when ``index=False`` (:issue:`40904`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) - Bug in :func:`read_csv` and :func:`read_table` silently ignoring ``prefix`` if ``names`` and ``prefix`` are defined, now raising ``ValueError`` (:issue:`39123`) - Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`) +- Bug in :func:`read_csv` silently ignoring ``sep`` if ``delimiter`` and ``sep`` are defined, now raising ``ValueError`` (:issue:`39823`) - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) +- Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) +- Bug in :func:`read_clipboard` copying from an excel file shifts values into the wrong column if there are null values in first column (:issue:`41108`) Period ^^^^^^ @@ -909,6 +1060,10 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`) - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`) - Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`) +- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. 
``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`) +- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`) +- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`) +- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`) Reshaping ^^^^^^^^^ @@ -924,6 +1079,7 @@ Reshaping - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`) +- Bug in :meth:`DataFrame.append` with a :class:`DataFrame` with a :class:`MultiIndex` and appending a :class:`Series` whose :class:`Index` is not a :class:`MultiIndex` (:issue:`41707`) - Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on and empty ``DataFrame`` (:issue:`13483`) - Allow :class:`Index` to be passed to the :func:`numpy.all` function (:issue:`40180`) - Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`) @@ -972,9 +1128,13 @@ Other - Bug in :func:`pandas.testing.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`) - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`) +- Let Pandas compile on z/OS when using `xlc `_ (:issue:`35826`) - Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`) +- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggragation functions when one or more aggregation function fails to produce results (:issue:`33634`) - Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) - Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) +- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) +- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) .. --------------------------------------------------------------------------- diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 2ec0b515ea95c1..b0b430ed6a8665 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -54,7 +54,7 @@ def get_authors(revision_range): pat = "^.*\\t(.*)$" - lst_release, cur_release = [r.strip() for r in revision_range.split("..")] + lst_release, cur_release = (r.strip() for r in revision_range.split("..")) if "|" in cur_release: # e.g. 
v1.0.1|HEAD @@ -119,7 +119,7 @@ def get_pull_requests(repo, revision_range): def build_components(revision_range, heading="Contributors"): - lst_release, cur_release = [r.strip() for r in revision_range.split("..")] + lst_release, cur_release = (r.strip() for r in revision_range.split("..")) authors = get_authors(revision_range) return { diff --git a/environment.yml b/environment.yml index 56a36c593a458d..bb96235123af30 100644 --- a/environment.yml +++ b/environment.yml @@ -81,7 +81,7 @@ dependencies: - ipython>=7.11.1 - jinja2<3.0.0 # pandas.Styler - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - - numexpr>=2.6.8 + - numexpr>=2.7.0 - scipy>=1.2 - numba>=0.46.0 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 455f800073c152..37f5a5730439d4 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -157,7 +157,7 @@ def _describe_option(pat: str = "", _print_desc: bool = True): if len(keys) == 0: raise OptionError("No such keys(s)") - s = "\n".join([_build_option_description(k) for k in keys]) + s = "\n".join(_build_option_description(k) for k in keys) if _print_desc: print(s) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7a286188c4e749..b72b927b3c2a81 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out, val = values[i, j] # not nan - if val == val: + if not checknull(val): nobs[lab, j] += 1 if nobs[lab, j] == 1: diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 5e1cc612bed570..f91b96dc1b1dc0 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -11,7 +11,10 @@ from typing import ( import numpy as np -from pandas._typing import ArrayLike +from pandas._typing import ( + ArrayLike, + DtypeObj, +) # placeholder until we can specify np.ndarray[object, ndim=2] ndarray_obj_2d = np.ndarray @@ -52,8 +55,6 @@ def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... -def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... - def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... @@ -73,6 +74,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[False] = ..., convert_to_nullable_integer: Literal[False] = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> np.ndarray: ... @overload @@ -85,6 +87,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: Literal[True] = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -97,6 +100,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -109,6 +113,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[True] = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -121,6 +126,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... 
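The new ``dtype_if_all_nat`` keyword added to these stubs only comes into play for object arrays that contain nothing but NaT-like values, where datetime64 and timedelta64 would otherwise be ambiguous and the array is kept as object dtype. A minimal sketch of the intended effect, assuming the private helper is called directly (pandas normally reaches it through the Series/DataFrame constructors, so the explicit call below is illustrative only):

.. code-block:: python

    import numpy as np
    import pandas as pd
    from pandas._libs import lib

    arr = np.array([pd.NaT, pd.NaT], dtype=object)

    # With both conversions enabled, the hint resolves the all-NaT ambiguity;
    # without it the array is left as object dtype.
    converted = lib.maybe_convert_objects(
        arr,
        convert_datetime=True,
        convert_timedelta=True,
        dtype_if_all_nat=np.dtype("M8[ns]"),
    )
    converted.dtype  # expected: datetime64[ns]
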
@overload @@ -153,7 +159,7 @@ def ensure_string_array( def infer_datetimelike_array( arr: np.ndarray # np.ndarray[object] -) -> str: ... +) -> tuple[str, bool]: ... def astype_intsafe( arr: np.ndarray, # np.ndarray[object] @@ -185,7 +191,7 @@ def maybe_indices_to_slice( ) -> slice | np.ndarray: ... # np.ndarray[np.uint8] def clean_index_list(obj: list) -> tuple[ - list | np.ndarray, # np.ndarray[object] | np.ndarray[np.int64] + list | np.ndarray, # np.ndarray[object | np.int64 | np.uint64] bool, ]: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cbef4ed44dc069..4b5ef3e909a009 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -84,6 +84,10 @@ from pandas._libs.util cimport ( ) from pandas._libs.tslib import array_to_datetime +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + OutOfBoundsTimedelta, +) from pandas._libs.tslibs.period import Period from pandas._libs.missing cimport ( @@ -291,7 +295,7 @@ def item_from_zerodim(val: object) -> object: @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple(list arrays, sort: bool = True) -> list: +def fast_unique_multiple(list arrays, sort: bool = True): """ Generate a list of unique values from a list of arrays. @@ -747,10 +751,14 @@ def clean_index_list(obj: list): object val bint all_arrays = True + # First check if we have a list of arraylikes, in which case we will + # pass them to MultiIndex.from_arrays for i in range(n): val = obj[i] if not (isinstance(val, list) or util.is_array(val) or hasattr(val, '_data')): + # TODO: EA? + # exclude tuples, frozensets as they may be contained in an Index all_arrays = False break @@ -762,11 +770,21 @@ def clean_index_list(obj: list): if inferred in ['string', 'bytes', 'mixed', 'mixed-integer']: return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: - # TODO: we infer an integer but it *could* be a uint64 - try: - return np.asarray(obj, dtype='int64'), 0 - except OverflowError: - return np.asarray(obj, dtype='object'), 0 + # we infer an integer but it *could* be a uint64 + + arr = np.asarray(obj) + if arr.dtype.kind not in ["i", "u"]: + # eg [0, uint64max] gets cast to float64, + # but then we know we have either uint64 or object + if (arr < 0).any(): + # TODO: similar to maybe_cast_to_integer_array + return np.asarray(obj, dtype="object"), 0 + + # GH#35481 + guess = np.asarray(obj, dtype="uint64") + return guess, 0 + + return arr, 0 return np.asarray(obj), 0 @@ -1187,6 +1205,7 @@ cdef class Seen: bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz bint period_ # seen_period + bint interval_ # seen_interval def __cinit__(self, bint coerce_numeric=False): """ @@ -1212,6 +1231,7 @@ cdef class Seen: self.timedelta_ = False self.datetimetz_ = False self.period_ = False + self.interval_ = False self.coerce_numeric = coerce_numeric cdef inline bint check_uint64_conflict(self) except -1: @@ -1461,7 +1481,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: for i in range(n): val = values[i] - # do not use is_nul_datetimelike to keep + # do not use is_null_datetimelike to keep # np.datetime64('nat') and np.timedelta64('nat') if val is None or util.is_nan(val): pass @@ -1550,15 +1570,13 @@ def infer_dtype(value: object, skipna: bool = True) -> str: for i in range(n): val = values[i] - if (util.is_integer_object(val) and - not util.is_timedelta64_object(val) and - not util.is_datetime64_object(val)): + if util.is_integer_object(val): return "mixed-integer" return "mixed" -def infer_datetimelike_array(arr: 
ndarray[object]) -> str: +def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: """ Infer if we have a datetime or timedelta array. - date: we have *only* date and maybe strings, nulls @@ -1576,12 +1594,13 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: Returns ------- str: {datetime, timedelta, date, nat, mixed} + bool """ cdef: Py_ssize_t i, n = len(arr) bint seen_timedelta = False, seen_date = False, seen_datetime = False bint seen_tz_aware = False, seen_tz_naive = False - bint seen_nat = False + bint seen_nat = False, seen_str = False list objs = [] object v @@ -1589,6 +1608,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: v = arr[i] if isinstance(v, str): objs.append(v) + seen_str = True if len(objs) == 3: break @@ -1609,7 +1629,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: seen_tz_aware = True if seen_tz_naive and seen_tz_aware: - return "mixed" + return "mixed", seen_str elif util.is_datetime64_object(v): # np.datetime64 seen_datetime = True @@ -1619,16 +1639,16 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: # timedelta, or timedelta64 seen_timedelta = True else: - return "mixed" + return "mixed", seen_str if seen_date and not (seen_datetime or seen_timedelta): - return "date" + return "date", seen_str elif seen_datetime and not seen_timedelta: - return "datetime" + return "datetime", seen_str elif seen_timedelta and not seen_datetime: - return "timedelta" + return "timedelta", seen_str elif seen_nat: - return "nat" + return "nat", seen_str # short-circuit by trying to # actually convert these strings @@ -1636,15 +1656,16 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: # convert *every* string array if len(objs): try: - array_to_datetime(objs, errors="raise") - return "datetime" + # require_iso8601 as in maybe_infer_to_datetimelike + array_to_datetime(objs, errors="raise", require_iso8601=True) + return "datetime", seen_str except (ValueError, TypeError): pass # we are *not* going to infer from strings # for timedelta as too much ambiguity - return 'mixed' + return "mixed", seen_str cdef inline bint is_timedelta(object o): @@ -2029,16 +2050,58 @@ cdef bint is_period_array(ndarray[object] values): return True -cdef class IntervalValidator(Validator): - cdef inline bint is_value_typed(self, object value) except -1: - return is_interval(value) - - cpdef bint is_interval_array(ndarray values): + """ + Is this an ndarray of Interval (or np.nan) with a single dtype? + """ cdef: - IntervalValidator validator = IntervalValidator(len(values), - skipna=True) - return validator.validate(values) + Py_ssize_t i, n = len(values) + str closed = None + bint numeric = False + bint dt64 = False + bint td64 = False + object val + + if len(values) == 0: + return False + + for val in values: + if is_interval(val): + if closed is None: + closed = val.closed + numeric = ( + util.is_float_object(val.left) + or util.is_integer_object(val.left) + ) + td64 = is_timedelta(val.left) + dt64 = PyDateTime_Check(val.left) + elif val.closed != closed: + # mismatched closedness + return False + elif numeric: + if not ( + util.is_float_object(val.left) + or util.is_integer_object(val.left) + ): + # i.e. 
datetime64 or timedelta64 + return False + elif td64: + if not is_timedelta(val.left): + return False + elif dt64: + if not PyDateTime_Check(val.left): + return False + else: + raise ValueError(val) + elif util.is_nan(val) or val is None: + pass + else: + return False + + if closed is None: + # we saw all-NAs, no actual Intervals + return False + return True @cython.boundscheck(False) @@ -2275,7 +2338,9 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_datetime=False, bint convert_timedelta=False, bint convert_period=False, - bint convert_to_nullable_integer=False) -> "ArrayLike": + bint convert_interval=False, + bint convert_to_nullable_integer=False, + object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2298,9 +2363,14 @@ def maybe_convert_objects(ndarray[object] objects, convert_period : bool, default False If an array-like object contains only (homogeneous-freq) Period values or NaT, whether to convert and return a PeriodArray. + convert_interval : bool, default False + If an array-like object contains only Interval objects (with matching + dtypes and closedness) or NaN, whether to convert to IntervalArray. convert_to_nullable_integer : bool, default False If an array-like object contains only integer values (and NaN) is encountered, whether to convert and return an IntegerArray. + dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None + Dtype to cast to if we have all-NaT. Returns ------- @@ -2369,8 +2439,12 @@ def maybe_convert_objects(ndarray[object] objects, seen.float_ = True elif is_timedelta(val): if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") seen.timedelta_ = True + try: + itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") + except OutOfBoundsTimedelta: + seen.object_ = True + break else: seen.object_ = True break @@ -2407,8 +2481,12 @@ def maybe_convert_objects(ndarray[object] objects, break else: seen.datetime_ = True - idatetimes[i] = convert_to_tsobject( - val, None, None, 0, 0).value + try: + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value + except OutOfBoundsDatetime: + seen.object_ = True + break else: seen.object_ = True break @@ -2428,6 +2506,13 @@ def maybe_convert_objects(ndarray[object] objects, except (ValueError, TypeError): seen.object_ = True break + elif is_interval(val): + if convert_interval: + seen.interval_ = True + break + else: + seen.object_ = True + break else: seen.object_ = True break @@ -2449,6 +2534,17 @@ def maybe_convert_objects(ndarray[object] objects, # unbox to PeriodArray return pi._data + seen.object_ = True + + if seen.interval_: + if is_interval_array(objects): + from pandas import IntervalIndex + ii = IntervalIndex(objects) + + # unbox to IntervalArray + return ii._data + + seen.object_ = True if not seen.object_: result = None @@ -2478,8 +2574,13 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: - # TODO: array full of NaT ambiguity resolve here needed - pass + dtype = dtype_if_all_nat + if dtype is not None: + # otherwise we keep object dtype + result = _infer_all_nats( + dtype, datetimes, timedeltas + ) + elif convert_datetime: result = datetimes elif convert_timedelta: @@ -2518,8 +2619,13 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: - # TODO: array full of NaT ambiguity resolve here needed - pass + 
dtype = dtype_if_all_nat + if dtype is not None: + # otherwise we keep object dtype + result = _infer_all_nats( + dtype, datetimes, timedeltas + ) + elif convert_datetime: result = datetimes elif convert_timedelta: @@ -2550,6 +2656,26 @@ def maybe_convert_objects(ndarray[object] objects, return objects +cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas): + """ + If we have all-NaT values, cast these to the given dtype. + """ + if isinstance(dtype, np.dtype): + if dtype == "M8[ns]": + result = datetimes + elif dtype == "m8[ns]": + result = timedeltas + else: + raise ValueError(dtype) + else: + # ExtensionDtype + cls = dtype.construct_array_type() + i8vals = np.empty(len(datetimes), dtype="i8") + i8vals.fill(NPY_NAT) + result = cls(i8vals, dtype=dtype) + return result + + class NoDefault(Enum): # We make this an Enum # 1) because it round-trips through pickle correctly (see GH#40397) @@ -2773,25 +2899,3 @@ def to_object_array_tuples(rows: object) -> np.ndarray: result[i, j] = row[j] return result - - -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: - cdef: - Py_ssize_t i, n = len(keys) - object val - ndarray[object] output = np.empty(n, dtype='O') - - if n == 0: - # kludge, for Series - return np.empty(0, dtype='f8') - - for i in range(n): - val = keys[i] - if val in mapping: - output[i] = mapping[val] - else: - output[i] = default - - return maybe_convert_objects(output) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b2d548e04eab4c..7d7074988e5f0b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -146,6 +146,11 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW + ctypedef enum BadLineHandleMethod: + ERROR, + WARN, + SKIP + ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) ctypedef int (*io_cleanup)(void *src) @@ -198,8 +203,7 @@ cdef extern from "parser/tokenizer.h": int usecols int expected_fields - int error_bad_lines - int warn_bad_lines + BadLineHandleMethod on_bad_lines # floating point options char decimal @@ -351,8 +355,7 @@ cdef class TextReader: thousands=None, # bytes | str dtype=None, usecols=None, - bint error_bad_lines=True, - bint warn_bad_lines=True, + on_bad_lines = ERROR, bint na_filter=True, na_values=None, na_fvalues=None, @@ -435,9 +438,7 @@ cdef class TextReader: raise ValueError('Only length-1 comment characters supported') self.parser.commentchar = ord(comment) - # error handling of bad lines - self.parser.error_bad_lines = int(error_bad_lines) - self.parser.warn_bad_lines = int(warn_bad_lines) + self.parser.on_bad_lines = on_bad_lines self.skiprows = skiprows if skiprows is not None: @@ -454,8 +455,7 @@ cdef class TextReader: # XXX if skipfooter > 0: - self.parser.error_bad_lines = 0 - self.parser.warn_bad_lines = 0 + self.parser.on_bad_lines = SKIP self.delimiter = delimiter @@ -570,9 +570,6 @@ cdef class TextReader: kh_destroy_str_starts(self.false_set) self.false_set = NULL - def set_error_bad_lines(self, int status) -> None: - self.parser.error_bad_lines = status - def _set_quoting(self, quote_char: str | bytes | None, quoting: int): if not isinstance(quoting, int): raise TypeError('"quoting" must be an integer') diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index c28db9b669a4bb..d730084692dd44 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,4 +1,3 @@ -from copy import copy from 
libc.stdlib cimport ( free, @@ -307,13 +306,11 @@ cpdef inline extract_result(object res): # Preserve EA res = res._values if res.ndim == 1 and len(res) == 1: + # see test_agg_lambda_with_timezone, test_resampler_grouper.py::test_apply res = res[0] - if hasattr(res, 'values') and is_array(res.values): - res = res.values if is_array(res): - if res.ndim == 0: - res = res.item() - elif res.ndim == 1 and len(res) == 1: + if res.ndim == 1 and len(res) == 1: + # see test_resampler_grouper.py::test_apply res = res[0] return res @@ -386,7 +383,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if not piece.index is chunk.index: + if piece.index is not chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int @@ -397,7 +394,7 @@ def apply_frame_axis0(object frame, object f, object names, try: piece = piece.copy(deep="all") except (TypeError, AttributeError): - piece = copy(piece) + pass results.append(piece) diff --git a/pandas/_libs/src/headers/cmath b/pandas/_libs/src/headers/cmath index 632e1fc2390d0b..9e7540cfefc130 100644 --- a/pandas/_libs/src/headers/cmath +++ b/pandas/_libs/src/headers/cmath @@ -25,6 +25,18 @@ namespace std { __inline int isnan(double x) { return _isnan(x); } __inline int notnan(double x) { return x == x; } } +#elif defined(__MVS__) +#include + +#define _signbit signbit +#undef signbit +#undef isnan + +namespace std { + __inline int notnan(double x) { return x == x; } + __inline int signbit(double num) { return _signbit(num); } + __inline int isnan(double x) { return isnan(x); } +} #else #include diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49eb1e78550984..49797eea59ddc6 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) { self->allow_embedded_newline = 1; self->expected_fields = -1; - self->error_bad_lines = 0; - self->warn_bad_lines = 0; + self->on_bad_lines = ERROR; self->commentchar = '#'; self->thousands = '\0'; @@ -457,7 +456,7 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; // file_lines is now the actual file line number (starting at 1) - if (self->error_bad_lines) { + if (self->on_bad_lines == ERROR) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", @@ -468,7 +467,7 @@ static int end_line(parser_t *self) { return -1; } else { // simply skip bad lines - if (self->warn_bad_lines) { + if (self->on_bad_lines == WARN) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f69fee4993d341..623d3690f252a0 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -84,6 +84,12 @@ typedef enum { QUOTE_NONE } QuoteStyle; +typedef enum { + ERROR, + WARN, + SKIP +} BadLineHandleMethod; + typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); typedef int (*io_cleanup)(void *src); @@ -136,8 +142,7 @@ typedef struct parser_t { int usecols; // Boolean: 1: usecols provided, 0: none provided int expected_fields; - int error_bad_lines; - int warn_bad_lines; + BadLineHandleMethod on_bad_lines; // floating point options char decimal; diff --git 
a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index 0f81dcb4b2df14..5a2985d0e815b0 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -1,8 +1,14 @@ -from datetime import datetime +from datetime import ( + datetime, + timedelta, +) +from typing import Any import numpy as np +from pandas._libs.tslibs.period import Period + NaT: NaTType iNaT: int nat_strings: set[str] @@ -133,3 +139,31 @@ class NaTType(datetime): # inject Period properties @property def qyear(self) -> float: ... + + def __eq__(self, other: Any) -> bool: ... + def __ne__(self, other: Any) -> bool: ... + # https://github.com/python/mypy/issues/9015 + # error: Argument 1 of "__lt__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __lt__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__le__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __le__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__gt__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __gt__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__ge__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __ge__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi index 04a1b391dc30a1..a631191f8b005d 100644 --- a/pandas/_libs/tslibs/timezones.pyi +++ b/pandas/_libs/tslibs/timezones.pyi @@ -2,31 +2,22 @@ from datetime import ( datetime, tzinfo, ) -from typing import ( - Callable, - Optional, - Union, -) +from typing import Callable import numpy as np # imported from dateutil.tz dateutil_gettz: Callable[[str], tzinfo] - def tz_standardize(tz: tzinfo) -> tzinfo: ... - -def tz_compare(start: Optional[tzinfo], end: Optional[tzinfo]) -> bool: ... - +def tz_compare(start: tzinfo | None, end: tzinfo | None) -> bool: ... def infer_tzinfo( - start: Optional[datetime], end: Optional[datetime], -) -> Optional[tzinfo]: ... + start: datetime | None, + end: datetime | None, +) -> tzinfo | None: ... # ndarrays returned are both int64_t def get_dst_info(tz: tzinfo) -> tuple[np.ndarray, np.ndarray, str]: ... - -def maybe_get_tz(tz: Optional[Union[str, int, np.int64, tzinfo]]) -> Optional[tzinfo]: ... - -def get_timezone(tz: tzinfo) -> Union[tzinfo, str]: ... - -def is_utc(tz: Optional[tzinfo]) -> bool: ... +def maybe_get_tz(tz: str | int | np.int64 | tzinfo | None) -> tzinfo | None: ... +def get_timezone(tz: tzinfo) -> tzinfo | str: ... +def is_utc(tz: tzinfo | None) -> bool: ... diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index f47885a2e3306a..1cbe55320099b0 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -2,11 +2,7 @@ from datetime import ( timedelta, tzinfo, ) -from typing import ( - Iterable, - Optional, - Union, -) +from typing import Iterable import numpy as np @@ -14,12 +10,10 @@ def tz_convert_from_utc( vals: np.ndarray, # const int64_t[:] tz: tzinfo, ) -> np.ndarray: ... 
# np.ndarray[np.int64] - def tz_convert_from_utc_single(val: np.int64, tz: tzinfo) -> np.int64: ... - def tz_localize_to_utc( vals: np.ndarray, # np.ndarray[np.int64] - tz: Optional[tzinfo], - ambiguous: Optional[Union[str, bool, Iterable[bool]]] = None, - nonexistent: Optional[Union[str, timedelta, np.timedelta64]] = None, + tz: tzinfo | None, + ambiguous: str | bool | Iterable[bool] | None = None, + nonexistent: str | timedelta | np.timedelta64 | None = None, ) -> np.ndarray: ... # np.ndarray[np.int64] diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index 6ed1e10ef23532..2a23289cdf61b8 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -3,10 +3,6 @@ For cython types that cannot be represented precisely, closest-available python equivalents are used, and the precise types kept as adjacent comments. """ from datetime import tzinfo -from typing import ( - Optional, - Union, -) import numpy as np @@ -16,32 +12,24 @@ from pandas._libs.tslibs.offsets import BaseOffset def dt64arr_to_periodarr( stamps: np.ndarray, # const int64_t[:] freq: int, - tz: Optional[tzinfo], + tz: tzinfo | None, ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] - - def is_date_array_normalized( stamps: np.ndarray, # const int64_t[:] - tz: Optional[tzinfo] = None, + tz: tzinfo | None = None, ) -> bool: ... - - def normalize_i8_timestamps( stamps: np.ndarray, # const int64_t[:] - tz: Optional[tzinfo], + tz: tzinfo | None, ) -> np.ndarray: ... # np.ndarray[np.int64] - - def get_resolution( stamps: np.ndarray, # const int64_t[:] - tz: Optional[tzinfo] = None, + tz: tzinfo | None = None, ) -> Resolution: ... - - def ints_to_pydatetime( arr: np.ndarray, # const int64_t[:}] - tz: Optional[tzinfo] = None, - freq: Optional[Union[str, BaseOffset]] = None, + tz: tzinfo | None = None, + freq: str | BaseOffset | None = None, fold: bool = False, box: str = "datetime", ) -> np.ndarray: ... # np.ndarray[object] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 8d64bf8852946c..369832e9bc05cc 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -92,6 +92,18 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" +def is_platform_arm() -> bool: + """ + Checking if he running platform use ARM architecture. + + Returns + ------- + bool + True if the running platform uses ARM architecture. + """ + return platform.machine() in ("arm64", "aarch64") + + def import_lzma(): """ Importing the `lzma` module. 
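A short usage sketch for the new ``is_platform_arm`` helper: like the existing ``is_platform_*`` functions it is meant for gating platform-specific behavior, for example skipping or xfailing tests on ARM machines. The test below is hypothetical and only illustrates the pattern:

.. code-block:: python

    import pytest

    from pandas.compat import is_platform_arm


    @pytest.mark.xfail(is_platform_arm(), reason="hypothetical ARM-specific failure")
    def test_arm_sensitive_behavior():
        # a real test body would go here
        ...
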
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index f8eccfeb2c60a3..941c59592dbbd9 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -17,16 +17,16 @@ "gcsfs": "0.6.0", "lxml.etree": "4.3.0", "matplotlib": "2.2.3", - "numexpr": "2.6.8", + "numexpr": "2.7.0", "odfpy": "1.3.0", "openpyxl": "3.0.0", "pandas_gbq": "0.12.0", "pyarrow": "0.17.0", - "pytest": "5.0.1", + "pytest": "6.0", "pyxlsb": "1.0.6", "s3fs": "0.4.0", "scipy": "1.2.0", - "sqlalchemy": "1.2.8", + "sqlalchemy": "1.3.0", "tables": "3.5.1", "tabulate": "0.8.7", "xarray": "0.12.3", diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 63ea5554e32d73..69dc3ac4175109 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -22,10 +22,7 @@ Union, ) -from numpy import ( - __version__, - ndarray, -) +from numpy import ndarray from pandas._libs.lib import ( is_bool, @@ -38,8 +35,6 @@ validate_kwargs, ) -from pandas.util.version import Version - class CompatValidator: def __init__( @@ -128,10 +123,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None - -if Version(__version__) >= Version("1.17.0"): - # GH-26361. NumPy added radix sort and changed default to None. - ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["kind"] = None validate_argsort = CompatValidator( diff --git a/pandas/conftest.py b/pandas/conftest.py index f948dc11bc0140..329023ed7ba6a5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,6 +66,11 @@ MultiIndex, ) +# Until https://github.com/numpy/numpy/issues/19078 is sorted out, just suppress +suppress_npdev_promotion_warning = pytest.mark.filterwarnings( + "ignore:Promotion of numbers and bools:FutureWarning" +) + # ---------------------------------------------------------------- # Configuration / Settings # ---------------------------------------------------------------- @@ -112,6 +117,8 @@ def pytest_collection_modifyitems(items): if "/frame/" in item.nodeid: item.add_marker(pytest.mark.arraymanager) + item.add_marker(suppress_npdev_promotion_warning) + # Hypothesis hypothesis.settings.register_profile( diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f8f5e5e05bc353..30f42435ad1773 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1266,14 +1266,14 @@ def compute(self, method: str) -> Series: return dropped.sort_values(ascending=ascending).head(n) # fast method - arr, pandas_dtype = _ensure_data(dropped.values) + arr, new_dtype = _ensure_data(dropped.values) if method == "nlargest": arr = -arr - if is_integer_dtype(pandas_dtype): + if is_integer_dtype(new_dtype): # GH 21426: ensure reverse ordering at boundaries arr -= 1 - elif is_bool_dtype(pandas_dtype): + elif is_bool_dtype(new_dtype): # GH 26154: ensure False is smaller than True arr = 1 - (-arr) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d0c6a1a841edb6..00b49c2f4f9511 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -376,12 +376,10 @@ def agg_list_like(self) -> FrameOrSeriesUnion: raise ValueError("no results") try: - return concat(results, keys=keys, axis=1, sort=False) + concatenated = concat(results, keys=keys, axis=1, sort=False) except TypeError as err: - # we are concatting non-NDFrame objects, # e.g. 
a list of scalars - from pandas import Series result = Series(results, index=keys, name=obj.name) @@ -390,6 +388,16 @@ def agg_list_like(self) -> FrameOrSeriesUnion: "cannot combine transform and aggregation operations" ) from err return result + else: + # Concat uses the first index to determine the final indexing order. + # The union of a shorter first index with the other indices causes + # the index sorting to be different from the order of the aggregating + # functions. Reindex if this is the case. + index_size = concatenated.index.size + full_ordered_index = next( + result.index for result in results if result.index.size == index_size + ) + return concatenated.reindex(full_ordered_index, copy=False) def agg_dict_like(self) -> FrameOrSeriesUnion: """ diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index efa36a5bd3ae9d..32c50ed38eba01 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -37,7 +37,18 @@ def quantile_compat(values: ArrayLike, qs: np.ndarray, interpolation: str) -> Ar mask = isna(values) return _quantile_with_mask(values, mask, fill_value, qs, interpolation) else: - return _quantile_ea_compat(values, qs, interpolation) + # In general we don't want to import from arrays here; + # this is temporary pending discussion in GH#41428 + from pandas.core.arrays import BaseMaskedArray + + if isinstance(values, BaseMaskedArray): + # e.g. IntegerArray, does not implement _from_factorized + out = _quantile_ea_fallback(values, qs, interpolation) + + else: + out = _quantile_ea_compat(values, qs, interpolation) + + return out def _quantile_with_mask( @@ -144,3 +155,31 @@ def _quantile_ea_compat( # error: Incompatible return value type (got "ndarray", expected "ExtensionArray") return result # type: ignore[return-value] + + +def _quantile_ea_fallback( + values: ExtensionArray, qs: np.ndarray, interpolation: str +) -> ExtensionArray: + """ + quantile compatibility for ExtensionArray subclasses that do not + implement `_from_factorized`, e.g. IntegerArray. + + Notes + ----- + We assume that all impacted cases are 1D-only. + """ + mask = np.atleast_2d(np.asarray(values.isna())) + npvalues = np.atleast_2d(np.asarray(values)) + + res = _quantile_with_mask( + npvalues, + mask=mask, + fill_value=values.dtype.na_value, + qs=qs, + interpolation=interpolation, + ) + assert res.ndim == 2 + assert res.shape[0] == 1 + res = res[0] + out = type(values)._from_sequence(res, dtype=values.dtype) + return out diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f337589c355832..7dddb9f3d6f25a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -493,8 +493,7 @@ def size(self) -> int: """ The number of elements in the array. """ - # error: Incompatible return value type (got "number", expected "int") - return np.prod(self.shape) # type: ignore[return-value] + return np.prod(self.shape) @property def ndim(self) -> int: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cb8a08f5668ac3..47779dd6dba253 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -439,12 +439,6 @@ def __init__( "explicitly specify the categories order " "by passing in a categories argument." 
) from err - except ValueError as err: - - # TODO(EA2D) - raise NotImplementedError( - "> 1 ndim Categorical are not supported at this time" - ) from err # we're inferring from values dtype = CategoricalDtype(categories, dtype.ordered) @@ -2453,7 +2447,9 @@ def replace(self, to_replace, value, inplace: bool = False): # ------------------------------------------------------------------------ # String methods interface - def _str_map(self, f, na_value=np.nan, dtype=np.dtype("object")): + def _str_map( + self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a3c58b6c6ae154..ff46715d0a5275 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -599,7 +599,9 @@ def _validate_shift_value(self, fill_value): "will raise in a future version, pass " f"{self._scalar_type.__name__} instead.", FutureWarning, - stacklevel=8, + # There is no way to hard-code the level since this might be + # reached directly or called from the Index or Block method + stacklevel=find_stack_level(), ) fill_value = new_fill diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index f07a04b8087e09..020f7086063534 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1119,14 +1119,14 @@ def to_period(self, freq=None) -> PeriodArray: ... "2000-08-31 00:00:00"])) >>> df.index.to_period("M") PeriodIndex(['2000-03', '2000-05', '2000-08'], - dtype='period[M]', freq='M') + dtype='period[M]') Infer the daily frequency >>> idx = pd.date_range("2017-01-01", periods=2) >>> idx.to_period() PeriodIndex(['2017-01-01', '2017-01-02'], - dtype='period[D]', freq='D') + dtype='period[D]') """ from pandas.core.arrays import PeriodArray @@ -1175,6 +1175,7 @@ def to_perioddelta(self, freq) -> TimedeltaArray: "future version. " "Use `dtindex - dtindex.to_period(freq).to_timestamp()` instead", FutureWarning, + # stacklevel chosen to be correct for when called from DatetimeIndex stacklevel=3, ) from pandas.core.arrays.timedeltas import TimedeltaArray @@ -2103,7 +2104,6 @@ def sequence_to_dt64ns( result = data.view(DT64NS_DTYPE) if copy: - # TODO: should this be deepcopy? 
result = result.copy() assert isinstance(result, np.ndarray), type(result) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index a99bf245a60739..4aa3bab168ac62 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -32,7 +32,6 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender -from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_dtype, @@ -1650,4 +1649,6 @@ def _maybe_convert_platform_interval(values) -> ArrayLike: else: values = extract_array(values, extract_numpy=True) - return maybe_convert_platform(values) + if not hasattr(values, "dtype"): + return np.asarray(values) + return values diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 101209be30b40f..d8c1b9cef468aa 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -564,15 +564,15 @@ def asfreq(self, freq=None, how: str = "E") -> PeriodArray: >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A') >>> pidx PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], - dtype='period[A-DEC]', freq='A-DEC') + dtype='period[A-DEC]') >>> pidx.asfreq('M') PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', - '2015-12'], dtype='period[M]', freq='M') + '2015-12'], dtype='period[M]') >>> pidx.asfreq('M', how='S') PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01', - '2015-01'], dtype='period[M]', freq='M') + '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) @@ -866,7 +866,7 @@ def start_time(self) -> DatetimeArray: def end_time(self) -> DatetimeArray: return self.to_timestamp(how="end") - def _require_matching_freq(self, other, base=False): + def _require_matching_freq(self, other, base: bool = False) -> None: # See also arrays.period.raise_on_incompatible if isinstance(other, BaseOffset): other_freq = other @@ -1057,7 +1057,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): Returns ------- - ordinals : ndarray[int] + ordinals : ndarray[int64] freq : Tick The frequency extracted from the Series or DatetimeIndex if that's used. diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index d4faea4fbc42c1..8efdfb719bbfac 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -354,9 +354,8 @@ def density(self) -> float: """ Ratio of non-sparse points to total (dense) data points. """ - # error: Incompatible return value type (got "number", expected "float") tmp = np.mean([column.array.density for _, column in self._parent.items()]) - return tmp # type: ignore[return-value] + return tmp @staticmethod def _prep_index(data, index, columns): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4847372f182391..6ab296b314615c 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1397,7 +1397,7 @@ def max(self, axis=0, *args, **kwargs): # This condition returns a nan if there are no valid values in the array. if self.size > 0 and self._valid_sp_values.size == 0: - return np.nan + return self.fill_value else: return np.nanmax(self, axis) @@ -1406,7 +1406,7 @@ def min(self, axis=0, *args, **kwargs): # This condition returns a nan if there are no valid values in the array. 
if self.size > 0 and self._valid_sp_values.size == 0: - return np.nan + return self.fill_value else: return np.nanmin(self, axis) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 74ca5130ca322b..ab1dadf4d2dfa0 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -410,7 +410,9 @@ def _cmp_method(self, other, op): # String methods interface _str_na_value = StringDtype.na_value - def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): from pandas.arrays import BooleanArray if dtype is None: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d5ee28eb7017e9..3cf471e381da90 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Callable # noqa: PDF001 import re from typing import ( TYPE_CHECKING, @@ -22,6 +23,7 @@ type_t, ) from pandas.compat import ( + pa_version_under1p0, pa_version_under2p0, pa_version_under3p0, pa_version_under4p0, @@ -29,14 +31,17 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_dtype_equal, is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna @@ -46,39 +51,41 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import ( check_array_indexer, validate_indices, ) from pandas.core.strings.object_array import ObjectStringArrayMixin -from pandas.util.version import Version -try: +# PyArrow backed StringArrays are available starting at 1.0.0, but this +# file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute +# and its compute functions existed. GH38801 +if not pa_version_under1p0: import pyarrow as pa -except ImportError: - pa = None -else: - # PyArrow backed StringArrays are available starting at 1.0.0, but this - # file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute - # and its compute functions existed. GH38801 - if Version(pa.__version__) >= Version("1.0.0"): - import pyarrow.compute as pc - - ARROW_CMP_FUNCS = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, - } + import pyarrow.compute as pc + + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } if TYPE_CHECKING: from pandas import Series +def _chk_pyarrow_available() -> None: + if pa_version_under1p0: + msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." 
+ raise ImportError(msg) + + @register_extension_dtype class ArrowStringDtype(StringDtype): """ @@ -111,6 +118,9 @@ class ArrowStringDtype(StringDtype): #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + def __init__(self): + _chk_pyarrow_available() + @property def type(self) -> type[str]: return str @@ -212,10 +222,8 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): Length: 4, dtype: arrow_string """ - _dtype = ArrowStringDtype() - def __init__(self, values): - self._chk_pyarrow_available() + self._dtype = ArrowStringDtype() if isinstance(values, pa.Array): self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): @@ -228,19 +236,11 @@ def __init__(self, values): "ArrowStringArray requires a PyArrow (chunked) array of string type" ) - @classmethod - def _chk_pyarrow_available(cls) -> None: - # TODO: maybe update import_optional_dependency to allow a minimum - # version to be specified rather than use the global minimum - if pa is None or Version(pa.__version__) < Version("1.0.0"): - msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." - raise ImportError(msg) - @classmethod def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): from pandas.core.arrays.masked import BaseMaskedArray - cls._chk_pyarrow_available() + _chk_pyarrow_available() if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -289,10 +289,14 @@ def to_numpy( # type: ignore[override] """ # TODO: copy argument is ignored - if na_value is lib.no_default: - na_value = self._dtype.na_value - result = self._data.__array__(dtype=dtype) - result[isna(result)] = na_value + result = np.array(self._data, dtype=dtype) + if self._data.null_count > 0: + if na_value is lib.no_default: + if dtype and np.issubdtype(dtype, np.floating): + return result + na_value = self._dtype.na_value + mask = self.isna() + result[mask] = na_value return result def __len__(self) -> int: @@ -736,12 +740,32 @@ def value_counts(self, dropna: bool = True) -> Series: return Series(counts, index=index).astype("Int64") + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if is_dtype_equal(dtype, self.dtype): + if copy: + return self.copy() + return self + + elif isinstance(dtype, NumericDtype): + data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) + return dtype.__from_arrow__(data) + + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(self, dtype=dtype, copy=copy) + + return super().astype(dtype, copy) + # ------------------------------------------------------------------------ # String methods interface _str_na_value = ArrowStringDtype.na_value - def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): # TODO: de-duplicate with StringArray method. This method is moreless copy and # paste. 
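The ``astype`` override added above lets an arrow-backed string array cast to a nullable numeric dtype through pyarrow. A minimal sketch, assuming pyarrow>=1.0.0 is installed and using made-up values:

.. code-block:: python

    import pandas as pd

    arr = pd.array(["1", "2", None], dtype="arrow_string")

    # The cast goes through pyarrow and returns a nullable integer array.
    arr.astype("Int64")  # expected: IntegerArray([1, 2, <NA>])
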
@@ -834,6 +858,28 @@ def _str_endswith(self, pat: str, na=None): pat = re.escape(pat) + "$" return self._str_contains(pat, na=na, regex=True) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if ( + pa_version_under4p0 + or isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + ): + return super()._str_replace(pat, repl, n, case, flags, regex) + + func = pc.replace_substring_regex if regex else pc.replace_substring + result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) + return type(self)(result) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): diff --git a/pandas/core/common.py b/pandas/core/common.py index 04ff2d2c4618f6..c0e44a437f59ee 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -142,11 +142,8 @@ def is_bool_indexer(key: Any) -> bool: elif is_bool_dtype(key.dtype): return True elif isinstance(key, list): - try: - arr = np.asarray(key) - return arr.dtype == np.bool_ and len(arr) == len(key) - except TypeError: # pragma: no cover - return False + arr = np.asarray(key) + return arr.dtype == np.bool_ and len(arr) == len(key) return False diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 231beb40e9630c..8758565cf9f2a7 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -27,7 +27,6 @@ result_type_many, ) from pandas.core.computation.scope import DEFAULT_GLOBALS -from pandas.util.version import Version from pandas.io.formats.printing import ( pprint_thing, @@ -616,18 +615,8 @@ def __repr__(self) -> str: class FuncNode: def __init__(self, name: str): - from pandas.core.computation.check import ( - NUMEXPR_INSTALLED, - NUMEXPR_VERSION, - ) - - if name not in MATHOPS or ( - NUMEXPR_INSTALLED - and Version(NUMEXPR_VERSION) < Version("2.6.9") - and name in ("floor", "ceil") - ): + if name not in MATHOPS: raise ValueError(f'"{name}" is not a supported function') - self.name = name self.func = getattr(np, name) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index baac872a6a4663..a88bc8900ccdd4 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -726,3 +726,26 @@ def register_converter_cb(key): validator=is_one_of_factory(["auto", True, False]), cb=register_converter_cb, ) + +# ------ +# Styler +# ------ + +styler_sparse_index_doc = """ +: bool + Whether to sparsify the display of a hierarchical index. Setting to False will + display each explicit level element in a hierarchical key for each row. +""" + +styler_sparse_columns_doc = """ +: bool + Whether to sparsify the display of hierarchical columns. Setting to False will + display each explicit level element in a hierarchical key for each column. 
+""" + +with cf.config_prefix("styler"): + cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=bool) + + cf.register_option( + "sparse.columns", True, styler_sparse_columns_doc, validator=bool + ) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f3133480108a6f..edaa53cd550428 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -6,22 +6,18 @@ """ from __future__ import annotations -from collections import abc from typing import ( TYPE_CHECKING, Any, Sequence, cast, ) +import warnings import numpy as np import numpy.ma as ma from pandas._libs import lib -from pandas._libs.tslibs import ( - IncompatibleFrequency, - OutOfBoundsDatetime, -) from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -39,9 +35,10 @@ construct_1d_object_array_from_listlike, maybe_cast_to_datetime, maybe_cast_to_integer_array, - maybe_castable, maybe_convert_platform, + maybe_infer_to_datetimelike, maybe_upcast, + sanitize_to_nanoseconds, ) from pandas.core.dtypes.common import ( is_datetime64_ns_dtype, @@ -292,9 +289,9 @@ def array( IntegerArray, IntervalArray, PandasArray, + PeriodArray, StringArray, TimedeltaArray, - period_array, ) if lib.is_scalar(data): @@ -318,19 +315,10 @@ def array( if dtype is None: inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == "period": - try: - return period_array(data, copy=copy) - except IncompatibleFrequency: - # We may have a mixture of frequencies. - # We choose to return an ndarray, rather than raising. - pass + return PeriodArray._from_sequence(data, copy=copy) + elif inferred_dtype == "interval": - try: - return IntervalArray(data, copy=copy) - except ValueError: - # We may have a mixture of `closed` here. - # We choose to return an ndarray, rather than raising. - pass + return IntervalArray(data, copy=copy) elif inferred_dtype.startswith("datetime"): # datetime, datetime64 @@ -468,6 +456,8 @@ def sanitize_array( dtype: DtypeObj | None = None, copy: bool = False, raise_cast_failure: bool = True, + *, + allow_2d: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, @@ -480,6 +470,8 @@ def sanitize_array( dtype : np.dtype, ExtensionDtype, or None, default None copy : bool, default False raise_cast_failure : bool, default True + allow_2d : bool, default False + If False, raise if we have a 2D Arraylike. Returns ------- @@ -501,6 +493,16 @@ def sanitize_array( if dtype is None: dtype = data.dtype data = lib.item_from_zerodim(data) + elif isinstance(data, range): + # GH#16804 + data = range_to_ndarray(data) + copy = False + + if not is_list_like(data): + if index is None: + raise ValueError("index must be specified when data is not list-like") + data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data # GH#846 if isinstance(data, np.ndarray): @@ -525,39 +527,25 @@ def sanitize_array( subarr = subarr.copy() return subarr - elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: - # TODO: deque, array.array + else: if isinstance(data, (set, frozenset)): # Raise only for unordered sets, e.g., not for dict_keys raise TypeError(f"'{type(data).__name__}' type is unordered") + + # materialize e.g. generators, convert e.g. tuples, abc.ValueView + # TODO: non-standard array-likes we can convert to ndarray more efficiently? data = list(data) - if dtype is not None: + if dtype is not None or len(data) == 0: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: + # TODO: copy? 
subarr = maybe_convert_platform(data) - # error: Incompatible types in assignment (expression has type - # "Union[ExtensionArray, ndarray, List[Any]]", variable has type - # "ExtensionArray") - subarr = maybe_cast_to_datetime(subarr, dtype) # type: ignore[assignment] - - elif isinstance(data, range): - # GH#16804 - arr = np.arange(data.start, data.stop, data.step, dtype="int64") - subarr = _try_cast(arr, dtype, copy, raise_cast_failure) - - elif not is_list_like(data): - if index is None: - raise ValueError("index must be specified when data is not list-like") - subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) - - else: - # realize e.g. generators - # TODO: non-standard array-likes we can convert to ndarray more efficiently? - data = list(data) - subarr = _try_cast(data, dtype, copy, raise_cast_failure) + if subarr.dtype == object: + subarr = cast(np.ndarray, subarr) + subarr = maybe_infer_to_datetimelike(subarr) - subarr = _sanitize_ndim(subarr, data, dtype, index) + subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d) if not ( isinstance(subarr.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype) @@ -574,8 +562,32 @@ def sanitize_array( return subarr +def range_to_ndarray(rng: range) -> np.ndarray: + """ + Cast a range object to ndarray. + """ + # GH#30171 perf avoid realizing range as a list in np.array + try: + arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64") + except OverflowError: + # GH#30173 handling for ranges that overflow int64 + if (rng.start >= 0 and rng.step > 0) or (rng.stop >= 0 and rng.step < 0): + try: + arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64") + except OverflowError: + arr = construct_1d_object_array_from_listlike(list(rng)) + else: + arr = construct_1d_object_array_from_listlike(list(rng)) + return arr + + def _sanitize_ndim( - result: ArrayLike, data, dtype: DtypeObj | None, index: Index | None + result: ArrayLike, + data, + dtype: DtypeObj | None, + index: Index | None, + *, + allow_2d: bool = False, ) -> ArrayLike: """ Ensure we have a 1-dimensional result array. @@ -589,13 +601,13 @@ def _sanitize_ndim( elif result.ndim > 1: if isinstance(data, np.ndarray): + if allow_2d: + return result raise ValueError("Data must be 1-dimensional") if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype): # i.e. PandasDtype("O") - # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type - # "Type[object]"; expected "Union[str, dtype[Any], None]" - result = com.asarray_tuplesafe(data, dtype=object) # type: ignore[arg-type] + result = com.asarray_tuplesafe(data, dtype=np.dtype("object")) cls = dtype.construct_array_type() result = cls._from_sequence(result, dtype=dtype) else: @@ -661,26 +673,52 @@ def _try_cast( ------- np.ndarray or ExtensionArray """ - # perf shortcut as this is the most common case - if ( - isinstance(arr, np.ndarray) - and maybe_castable(arr.dtype) - and not copy - and dtype is None - ): - return arr + is_ndarray = isinstance(arr, np.ndarray) + + if dtype is None: + # perf shortcut as this is the most common case + if is_ndarray: + arr = cast(np.ndarray, arr) + if arr.dtype != object: + return sanitize_to_nanoseconds(arr, copy=copy) - if isinstance(dtype, ExtensionDtype) and not isinstance(dtype, DatetimeTZDtype): + out = maybe_infer_to_datetimelike(arr) + if out is arr and copy: + out = out.copy() + return out + + else: + # i.e. 
list + varr = np.array(arr, copy=False) + # filter out cases that we _dont_ want to go through + # maybe_infer_to_datetimelike + if varr.dtype != object or varr.size == 0: + return varr + return maybe_infer_to_datetimelike(varr) + + elif isinstance(dtype, ExtensionDtype): # create an extension array from its dtype - # DatetimeTZ case needs to go through maybe_cast_to_datetime but - # SparseDtype does not + if isinstance(dtype, DatetimeTZDtype): + # We can't go through _from_sequence because it handles dt64naive + # data differently; _from_sequence treats naive as wall times, + # while maybe_cast_to_datetime treats it as UTC + # see test_maybe_promote_any_numpy_dtype_with_datetimetz + + return maybe_cast_to_datetime(arr, dtype) + # TODO: copy? + array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) return subarr - if is_object_dtype(dtype) and not isinstance(arr, np.ndarray): - subarr = construct_1d_object_array_from_listlike(arr) - return subarr + elif is_object_dtype(dtype): + if not is_ndarray: + subarr = construct_1d_object_array_from_listlike(arr) + return subarr + return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy) + + elif dtype.kind in ["m", "M"]: + return maybe_cast_to_datetime(arr, dtype) try: # GH#15832: Check if we are requesting a numeric dtype and @@ -688,29 +726,32 @@ def _try_cast( if is_integer_dtype(dtype): # this will raise if we have e.g. floats - # error: Argument 2 to "maybe_cast_to_integer_array" has incompatible type - # "Union[dtype, ExtensionDtype, None]"; expected "Union[ExtensionDtype, str, - # dtype, Type[str], Type[float], Type[int], Type[complex], Type[bool], - # Type[object]]" - maybe_cast_to_integer_array(arr, dtype) # type: ignore[arg-type] + maybe_cast_to_integer_array(arr, dtype) subarr = arr else: - subarr = maybe_cast_to_datetime(arr, dtype) - if dtype is not None and dtype.kind == "M": - return subarr + subarr = arr if not isinstance(subarr, ABCExtensionArray): + # 4 tests fail if we move this to a try/except/else; see + # test_constructor_compound_dtypes, test_constructor_cast_failure + # test_constructor_dict_cast2, test_loc_setitem_dtype subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) - except OutOfBoundsDatetime: - # in case of out of bound datetime64 -> always raise - raise - except (ValueError, TypeError) as err: - if dtype is not None and raise_cast_failure: - raise - elif "Cannot cast" in str(err): - # via _disallow_mismatched_datetimelike + + except (ValueError, TypeError): + if raise_cast_failure: raise else: + # we only get here with raise_cast_failure False, which means + # called via the DataFrame constructor + # GH#24435 + warnings.warn( + f"Could not cast to {dtype}, falling back to object. This " + "behavior is deprecated. 
In a future version, when a dtype is " + "passed to 'DataFrame', either all columns will be cast to that " + "dtype, or a TypeError will be raised", + FutureWarning, + stacklevel=7, + ) subarr = np.array(arr, dtype=object, copy=copy) return subarr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 783474c53f3048..161572f3f1ac36 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -31,7 +31,6 @@ Timedelta, Timestamp, conversion, - ints_to_pydatetime, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 from pandas._typing import ( @@ -40,12 +39,12 @@ DtypeObj, Scalar, ) +from pandas.errors import IntCastingNaNError from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( DT64NS_DTYPE, - POSSIBLY_CAST_DTYPES, TD64NS_DTYPE, ensure_int8, ensure_int16, @@ -58,7 +57,6 @@ is_complex, is_complex_dtype, is_datetime64_dtype, - is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, @@ -70,10 +68,8 @@ is_numeric_dtype, is_object_dtype, is_scalar, - is_sparse, is_string_dtype, is_timedelta64_dtype, - is_timedelta64_ns_dtype, is_unsigned_integer_dtype, pandas_dtype, ) @@ -127,9 +123,8 @@ def maybe_convert_platform( arr = values if arr.dtype == object: - # error: Argument 1 to "maybe_convert_objects" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "ndarray" - arr = lib.maybe_convert_objects(arr) # type: ignore[arg-type] + arr = cast(np.ndarray, arr) + arr = lib.maybe_convert_objects(arr) return arr @@ -219,6 +214,8 @@ def maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar: elif isinstance(value, Timestamp): if value.tz is None: value = value.to_datetime64() + elif not isinstance(dtype, DatetimeTZDtype): + raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype") elif isinstance(value, Timedelta): value = value.to_timedelta64() @@ -783,22 +780,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, return dtype, val -def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: - """ - Convert datetimelike-keyed dicts to a Timestamp-keyed dict. - - Parameters - ---------- - d: dict-like object - - Returns - ------- - dict - - """ - return {maybe_box_datetimelike(key): value for key, value in d.items()} - - def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> tuple[DtypeObj, ArrayLike]: @@ -1167,9 +1148,7 @@ def astype_nansafe( raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): - - if not np.isfinite(arr).all(): - raise ValueError("Cannot convert non-finite values (NA or inf) to integer") + return astype_float_to_int_nansafe(arr, dtype, copy) elif is_object_dtype(arr): @@ -1207,6 +1186,19 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) +def astype_float_to_int_nansafe( + values: np.ndarray, dtype: np.dtype, copy: bool +) -> np.ndarray: + """ + astype with a check preventing converting NaN to an meaningless integer value. + """ + if not np.isfinite(values).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + return values.astype(dtype, copy=copy) + + def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: """ Cast array (ndarray or ExtensionArray) to the new dtype. 
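The `astype_float_to_int_nansafe` helper introduced in the hunk above moves the float-to-integer NaN check out of `astype_nansafe` and switches the exception to the newly imported `pandas.errors.IntCastingNaNError`. A minimal doctest-style sketch of its contract, read directly from the added code (the import path assumes the helper lives in `pandas/core/dtypes/cast.py`, the file this diff patches):

>>> import numpy as np
>>> from pandas.core.dtypes.cast import astype_float_to_int_nansafe
>>> # Finite floats cast losslessly, exactly as values.astype(dtype, copy=copy)
>>> astype_float_to_int_nansafe(np.array([1.0, 2.0]), np.dtype("int64"), copy=True)
array([1, 2])
>>> # NaN/inf now surface as IntCastingNaNError instead of the previous plain ValueError
>>> astype_float_to_int_nansafe(np.array([1.0, np.nan]), np.dtype("int64"), copy=True)
Traceback (most recent call last):
    ...
IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer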
@@ -1240,13 +1232,12 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra return values.copy() return values - if isinstance(values, ABCExtensionArray): + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray values = values.astype(dtype, copy=copy) else: - # error: Argument 1 to "astype_nansafe" has incompatible type "ExtensionArray"; - # expected "ndarray" - values = astype_nansafe(values, dtype, copy=copy) # type: ignore[arg-type] + values = astype_nansafe(values, dtype, copy=copy) # in pandas we don't store numpy str dtypes, so convert to object if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): @@ -1376,7 +1367,7 @@ def convert_dtypes( convert_integer: bool = True, convert_boolean: bool = True, convert_floating: bool = True, -) -> Dtype: +) -> DtypeObj: """ Convert objects to best possible type, and optionally, to types supporting ``pd.NA``. @@ -1397,23 +1388,28 @@ def convert_dtypes( Returns ------- - str, np.dtype, or ExtensionDtype - dtype - new dtype + np.dtype, or ExtensionDtype """ - inferred_dtype: str | np.dtype | ExtensionDtype - # TODO: rule out str + inferred_dtype: str | DtypeObj if ( convert_string or convert_integer or convert_boolean or convert_floating ) and isinstance(input_array, np.ndarray): - inferred_dtype = lib.infer_dtype(input_array) - if not convert_string and is_string_dtype(inferred_dtype): + if is_object_dtype(input_array.dtype): + inferred_dtype = lib.infer_dtype(input_array) + else: inferred_dtype = input_array.dtype + if is_string_dtype(inferred_dtype): + if not convert_string: + inferred_dtype = input_array.dtype + else: + inferred_dtype = pandas_dtype("string") + return inferred_dtype + if convert_integer: - target_int_dtype = "Int64" + target_int_dtype = pandas_dtype("Int64") if is_integer_dtype(input_array.dtype): from pandas.core.arrays.integer import INT_STR_TO_DTYPE @@ -1421,14 +1417,13 @@ def convert_dtypes( inferred_dtype = INT_STR_TO_DTYPE.get( input_array.dtype.name, target_int_dtype ) - if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( - input_array.dtype - ): - inferred_dtype = target_int_dtype - - else: - if is_integer_dtype(inferred_dtype): - inferred_dtype = input_array.dtype + elif is_numeric_dtype(input_array.dtype): + # TODO: de-dup with maybe_cast_to_integer_array? + arr = input_array[notna(input_array)] + if (arr.astype(int) == arr).all(): + inferred_dtype = target_int_dtype + else: + inferred_dtype = input_array.dtype if convert_floating: if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( @@ -1436,51 +1431,40 @@ def convert_dtypes( ): from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE - inferred_float_dtype = FLOAT_STR_TO_DTYPE.get( - input_array.dtype.name, "Float64" + inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get( + input_array.dtype.name, pandas_dtype("Float64") ) # if we could also convert to integer, check if all floats # are actually integers if convert_integer: + # TODO: de-dup with maybe_cast_to_integer_array? 
arr = input_array[notna(input_array)] if (arr.astype(int) == arr).all(): - inferred_dtype = "Int64" + inferred_dtype = pandas_dtype("Int64") else: inferred_dtype = inferred_float_dtype else: inferred_dtype = inferred_float_dtype - else: - if is_float_dtype(inferred_dtype): - inferred_dtype = input_array.dtype if convert_boolean: if is_bool_dtype(input_array.dtype): - inferred_dtype = "boolean" - else: - if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": - inferred_dtype = input_array.dtype + inferred_dtype = pandas_dtype("boolean") + elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean": + inferred_dtype = pandas_dtype("boolean") + + if isinstance(inferred_dtype, str): + # If we couldn't do anything else, then we retain the dtype + inferred_dtype = input_array.dtype else: - inferred_dtype = input_array.dtype + return input_array.dtype return inferred_dtype -def maybe_castable(dtype: np.dtype) -> bool: - # return False to force a non-fastpath - - # check datetime64[ns]/timedelta64[ns] are valid - # otherwise try to coerce - kind = dtype.kind - if kind == "M": - return is_datetime64_ns_dtype(dtype) - elif kind == "m": - return is_timedelta64_ns_dtype(dtype) - - return dtype.name not in POSSIBLY_CAST_DTYPES - - -def maybe_infer_to_datetimelike(value: np.ndarray | list): +def maybe_infer_to_datetimelike( + value: np.ndarray, +) -> np.ndarray | DatetimeArray | TimedeltaArray: """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1491,18 +1475,19 @@ def maybe_infer_to_datetimelike(value: np.ndarray | list): Parameters ---------- - value : np.ndarray or list + value : np.ndarray[object] + + Returns + ------- + np.ndarray, DatetimeArray, or TimedeltaArray """ - if not isinstance(value, (np.ndarray, list)): + if not isinstance(value, np.ndarray) or value.dtype != object: + # Caller is responsible for passing only ndarray[object] raise TypeError(type(value)) # pragma: no cover v = np.array(value, copy=False) - # we only care about object dtypes - if not is_object_dtype(v.dtype): - return value - shape = v.shape if v.ndim != 1: v = v.ravel() @@ -1542,7 +1527,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: else: return td_values.reshape(shape) - inferred_type = lib.infer_datetimelike_array(ensure_object(v)) + inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v)) if inferred_type == "datetime": # error: Incompatible types in assignment (expression has type "ExtensionArray", @@ -1571,95 +1556,130 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: # "ExtensionArray", variable has type "Union[ndarray, List[Any]]") value = try_datetime(v) # type: ignore[assignment] + if value.dtype.kind in ["m", "M"] and seen_str: + warnings.warn( + f"Inferring {value.dtype} from data containing strings is deprecated " + "and will be removed in a future version. To retain the old behavior " + "explicitly pass Series(data, dtype={value.dtype})", + FutureWarning, + stacklevel=find_stack_level(), + ) + # return v.reshape(shape) return value def maybe_cast_to_datetime( value: ExtensionArray | np.ndarray | list, dtype: DtypeObj | None -) -> ExtensionArray | np.ndarray | list: +) -> ExtensionArray | np.ndarray: """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT + + We allow a list *only* when dtype is not None. 
""" from pandas.core.arrays.datetimes import sequence_to_datetimes - from pandas.core.arrays.timedeltas import sequence_to_td64ns + from pandas.core.arrays.timedeltas import TimedeltaArray if not is_list_like(value): raise TypeError("value must be listlike") + if is_timedelta64_dtype(dtype): + # TODO: _from_sequence would raise ValueError in cases where + # ensure_nanosecond_dtype raises TypeError + dtype = cast(np.dtype, dtype) + dtype = ensure_nanosecond_dtype(dtype) + res = TimedeltaArray._from_sequence(value, dtype=dtype) + return res + if dtype is not None: is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) - is_timedelta64 = is_timedelta64_dtype(dtype) vdtype = getattr(value, "dtype", None) - if is_datetime64 or is_datetime64tz or is_timedelta64: + if is_datetime64 or is_datetime64tz: dtype = ensure_nanosecond_dtype(dtype) - if not is_sparse(value): - value = np.array(value, copy=False) - - # we have an array of datetime or timedeltas & nulls - if value.size or not is_dtype_equal(value.dtype, dtype): - _disallow_mismatched_datetimelike(value, dtype) - - try: - if is_datetime64: - dta = sequence_to_datetimes(value, allow_object=False) - # GH 25843: Remove tz information since the dtype - # didn't specify one - - if dta.tz is not None: - # equiv: dta.view(dtype) - # Note: NOT equivalent to dta.astype(dtype) - dta = dta.tz_localize(None) - value = dta - elif is_datetime64tz: - dtype = cast(DatetimeTZDtype, dtype) - # The string check can be removed once issue #13712 - # is solved. String data that is passed with a - # datetime64tz is assumed to be naive which should - # be localized to the timezone. - is_dt_string = is_string_dtype(value.dtype) - dta = sequence_to_datetimes(value, allow_object=False) - if dta.tz is not None: - value = dta.astype(dtype, copy=False) - elif is_dt_string: - # Strings here are naive, so directly localize - # equiv: dta.astype(dtype) # though deprecated - - value = dta.tz_localize(dtype.tz) - else: - # Numeric values are UTC at this point, - # so localize and convert - # equiv: Series(dta).astype(dtype) # though deprecated - - value = dta.tz_localize("UTC").tz_convert(dtype.tz) - elif is_timedelta64: - # if successful, we get a ndarray[td64ns] - value, _ = sequence_to_td64ns(value) - except OutOfBoundsDatetime: - raise - except ValueError: - # TODO(GH#40048): only catch dateutil's ParserError - # once we can reliably import it in all supported versions - if is_timedelta64: - raise - pass - - # coerce datetimelike to object - elif is_datetime64_dtype(vdtype) and not is_datetime64_dtype(dtype): - if is_object_dtype(dtype): - value = cast(np.ndarray, value) - - if value.dtype != DT64NS_DTYPE: - value = value.astype(DT64NS_DTYPE) - ints = np.asarray(value).view("i8") - return ints_to_pydatetime(ints) - - # we have a non-castable dtype that was passed - raise TypeError(f"Cannot cast datetime64 to {dtype}") + value = np.array(value, copy=False) + + # we have an array of datetime or timedeltas & nulls + if value.size or not is_dtype_equal(value.dtype, dtype): + _disallow_mismatched_datetimelike(value, dtype) + + try: + if is_datetime64: + dta = sequence_to_datetimes(value, allow_object=False) + # GH 25843: Remove tz information since the dtype + # didn't specify one + + if dta.tz is not None: + warnings.warn( + "Data is timezone-aware. Converting " + "timezone-aware data to timezone-naive by " + "passing dtype='datetime64[ns]' to " + "DataFrame or Series is deprecated and will " + "raise in a future version. 
Use " + "`pd.Series(values).dt.tz_localize(None)` " + "instead.", + FutureWarning, + stacklevel=8, + ) + # equiv: dta.view(dtype) + # Note: NOT equivalent to dta.astype(dtype) + dta = dta.tz_localize(None) + + value = dta + elif is_datetime64tz: + dtype = cast(DatetimeTZDtype, dtype) + # The string check can be removed once issue #13712 + # is solved. String data that is passed with a + # datetime64tz is assumed to be naive which should + # be localized to the timezone. + is_dt_string = is_string_dtype(value.dtype) + dta = sequence_to_datetimes(value, allow_object=False) + if dta.tz is not None: + value = dta.astype(dtype, copy=False) + elif is_dt_string: + # Strings here are naive, so directly localize + # equiv: dta.astype(dtype) # though deprecated + + value = dta.tz_localize(dtype.tz) + else: + # Numeric values are UTC at this point, + # so localize and convert + # equiv: Series(dta).astype(dtype) # though deprecated + if getattr(vdtype, "kind", None) == "M": + # GH#24559, GH#33401 deprecate behavior inconsistent + # with DatetimeArray/DatetimeIndex + warnings.warn( + "In a future version, constructing a Series " + "from datetime64[ns] data and a " + "DatetimeTZDtype will interpret the data " + "as wall-times instead of " + "UTC times, matching the behavior of " + "DatetimeIndex. To treat the data as UTC " + "times, use pd.Series(data).dt" + ".tz_localize('UTC').tz_convert(dtype.tz) " + "or pd.Series(data.view('int64'), dtype=dtype)", + FutureWarning, + stacklevel=5, + ) + + value = dta.tz_localize("UTC").tz_convert(dtype.tz) + except OutOfBoundsDatetime: + raise + except ValueError: + # TODO(GH#40048): only catch dateutil's ParserError + # once we can reliably import it in all supported versions + pass + + elif getattr(vdtype, "kind", None) in ["m", "M"]: + # we are already datetimelike and want to coerce to non-datetimelike; + # astype_nansafe will raise for anything other than object, then upcast. + # see test_datetimelike_values_with_object_dtype + # error: Argument 2 to "astype_nansafe" has incompatible type + # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" + return astype_nansafe(value, dtype) # type: ignore[arg-type] elif isinstance(value, np.ndarray): if value.dtype.kind in ["M", "m"]: @@ -1671,16 +1691,16 @@ def maybe_cast_to_datetime( value = maybe_infer_to_datetimelike(value) elif isinstance(value, list): - # only do this if we have an array and the dtype of the array is not - # setup already we are not an integer/object, so don't bother with this - # conversion + # we only get here with dtype=None, which we do not allow + raise ValueError( + "maybe_cast_to_datetime allows a list *only* if dtype is not None" + ) - value = maybe_infer_to_datetimelike(value) + # at this point we have converted or raised in all cases where we had a list + return cast(ArrayLike, value) - return value - -def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray: +def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray: """ Safely convert non-nanosecond datetime64 or timedelta64 values to nanosecond. 
""" @@ -1691,6 +1711,9 @@ def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray: elif dtype.kind == "m" and dtype != TD64NS_DTYPE: values = conversion.ensure_timedelta64ns(values) + elif copy: + values = values.copy() + return values @@ -1806,7 +1829,7 @@ def construct_2d_arraylike_from_scalar( shape = (length, width) if dtype.kind in ["m", "M"]: - value = maybe_unbox_datetimelike(value, dtype) + value = maybe_unbox_datetimelike_tz_deprecation(value, dtype, stacklevel=4) elif dtype == object: if isinstance(value, (np.timedelta64, np.datetime64)): # calling np.array below would cast to pytimedelta/pydatetime @@ -1869,7 +1892,7 @@ def construct_1d_arraylike_from_scalar( if not isna(value): value = ensure_str(value) elif dtype.kind in ["M", "m"]: - value = maybe_unbox_datetimelike(value, dtype) + value = maybe_unbox_datetimelike_tz_deprecation(value, dtype) subarr = np.empty(length, dtype=dtype) subarr.fill(value) @@ -1877,6 +1900,46 @@ def construct_1d_arraylike_from_scalar( return subarr +def maybe_unbox_datetimelike_tz_deprecation( + value: Scalar, dtype: DtypeObj, stacklevel: int = 5 +): + """ + Wrap maybe_unbox_datetimelike with a check for a timezone-aware Timestamp + along with a timezone-naive datetime64 dtype, which is deprecated. + """ + # Caller is responsible for checking dtype.kind in ["m", "M"] + + if isinstance(value, datetime): + # we dont want to box dt64, in particular datetime64("NaT") + value = maybe_box_datetimelike(value, dtype) + + try: + value = maybe_unbox_datetimelike(value, dtype) + except TypeError: + if ( + isinstance(value, Timestamp) + and value.tzinfo is not None + and isinstance(dtype, np.dtype) + and dtype.kind == "M" + ): + warnings.warn( + "Data is timezone-aware. Converting " + "timezone-aware data to timezone-naive by " + "passing dtype='datetime64[ns]' to " + "DataFrame or Series is deprecated and will " + "raise in a future version. Use " + "`pd.Series(values).dt.tz_localize(None)` " + "instead.", + FutureWarning, + stacklevel=stacklevel, + ) + new_value = value.tz_localize(None) + return maybe_unbox_datetimelike(new_value, dtype) + else: + raise + return value + + def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object @@ -1903,7 +1966,7 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: def construct_1d_ndarray_preserving_na( - values: Sequence, dtype: DtypeObj | None = None, copy: bool = False + values: Sequence, dtype: np.dtype | None = None, copy: bool = False ) -> np.ndarray: """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. 
@@ -1942,19 +2005,22 @@ def construct_1d_ndarray_preserving_na( ): # TODO(numpy#12550): special-case can be removed subarr = construct_1d_object_array_from_listlike(list(values)) + elif ( + dtype is not None + and dtype.kind in ["i", "u"] + and isinstance(values, np.ndarray) + and values.dtype.kind == "f" + ): + return astype_float_to_int_nansafe(values, dtype, copy=copy) else: - # error: Argument "dtype" to "array" has incompatible type - # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any], - # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, - # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - subarr = np.array(values, dtype=dtype, copy=copy) # type: ignore[arg-type] + subarr = np.array(values, dtype=dtype, copy=copy) return subarr def maybe_cast_to_integer_array( arr: list | np.ndarray, dtype: np.dtype, copy: bool = False -): +) -> np.ndarray: """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -2022,9 +2088,23 @@ def maybe_cast_to_integer_array( if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_float_dtype(arr) or is_object_dtype(arr): + if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype): raise ValueError("Trying to coerce float values to integers") + if casted.dtype < arr.dtype: + # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows + warnings.warn( + f"Values are too large to be losslessly cast to {dtype}. " + "In a future version this will raise OverflowError. To retain the " + f"old behavior, use pd.Series(values).astype({dtype})", + FutureWarning, + stacklevel=find_stack_level(), + ) + return casted + + # No known cases that get here, but raising explicitly to cover our bases. + raise ValueError(f"values cannot be losslessly cast to {dtype}") + def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: """ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 593e42f7ed749e..3f43681687945f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -58,21 +58,6 @@ is_sequence, ) -POSSIBLY_CAST_DTYPES = { - np.dtype(t).name - for t in [ - "O", - "int8", - "uint8", - "int16", - "uint16", - "int32", - "uint32", - "int64", - "uint64", - ] -} - DT64NS_DTYPE = conversion.DT64NS_DTYPE TD64NS_DTYPE = conversion.TD64NS_DTYPE INT64_DTYPE = np.dtype(np.int64) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index efefeb23445afd..7545ea9a0733c0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,6 +77,7 @@ Appender, Substitution, deprecate_kwarg, + deprecate_nonkeyword_arguments, doc, rewrite_axis_style_signature, ) @@ -93,7 +94,6 @@ infer_dtype_from_scalar, invalidate_string_dtypes, maybe_box_native, - maybe_convert_platform, maybe_downcast_to_dtype, validate_numeric_casting, ) @@ -259,6 +259,8 @@ _merge_doc = """ Merge DataFrame or named Series objects with a database-style join. +A named Series object is treated as a DataFrame with a single named column. + The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. 
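The `_merge_doc` addition above states that a named Series is treated as a DataFrame with a single named column. A small illustration of that sentence (hypothetical data; the resulting column list is the point, not the exact frame):

>>> import pandas as pd
>>> left = pd.DataFrame({"key": ["a", "b"], "lval": [1, 2]})
>>> right = pd.Series([3, 4], name="rval", index=pd.Index(["a", "b"], name="key"))
>>> merged = pd.merge(left, right, left_on="key", right_index=True)
>>> list(merged.columns)
['key', 'lval', 'rval']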
@@ -726,6 +728,15 @@ def __init__( if index is None or columns is None: raise ValueError("DataFrame constructor not properly called!") + # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; + # expected "Union[Union[Union[ExtensionArray, ndarray], + # Index, Series], Sequence[Any]]" + index = ensure_index(index) # type: ignore[arg-type] + # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; + # expected "Union[Union[Union[ExtensionArray, ndarray], + # Index, Series], Sequence[Any]]" + columns = ensure_index(columns) # type: ignore[arg-type] + if not dtype: dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) @@ -857,26 +868,37 @@ def _can_fast_transpose(self) -> bool: # TODO(EA2D) special case would be unnecessary with 2D EAs return not is_1d_only_ea_dtype(dtype) + # error: Return type "Union[ndarray, DatetimeArray, TimedeltaArray]" of + # "_values" incompatible with return type "ndarray" in supertype "NDFrame" @property - def _values_compat(self) -> np.ndarray | DatetimeArray | TimedeltaArray: + def _values( # type: ignore[override] + self, + ) -> np.ndarray | DatetimeArray | TimedeltaArray: """ Analogue to ._values that may return a 2D ExtensionArray. """ + self._consolidate_inplace() + mgr = self._mgr + if isinstance(mgr, ArrayManager): - return self._values + if len(mgr.arrays) == 1 and not is_1d_only_ea_obj(mgr.arrays[0]): + # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]" + # has no attribute "reshape" + return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr] + return self.values blocks = mgr.blocks if len(blocks) != 1: - return self._values + return self.values arr = blocks[0].values if arr.ndim == 1: # non-2D ExtensionArray - return self._values + return self.values # more generally, whatever we allow in NDArrayBackedExtensionBlock - arr = cast("DatetimeArray | TimedeltaArray", arr) + arr = cast("np.ndarray | DatetimeArray | TimedeltaArray", arr) return arr.T # ---------------------------------------------------------------------- @@ -1752,6 +1774,7 @@ def to_dict(self, orient: str = "dict", into=dict): "will be used in a future version. Use one of the above " "to silence this warning.", FutureWarning, + stacklevel=2, ) if orient.startswith("d"): @@ -2311,6 +2334,7 @@ def _from_arrays( dtype = pandas_dtype(dtype) manager = get_option("mode.data_manager") + columns = ensure_index(columns) mgr = arrays_to_mgr( arrays, columns, @@ -3322,7 +3346,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. 
- new_vals = self._values_compat.T + new_vals = self._values.T if copy: new_vals = new_vals.copy() @@ -4485,35 +4509,11 @@ def _sanitize_column(self, value) -> ArrayLike: # We should never get here with DataFrame value if isinstance(value, Series): - value = _reindex_for_setitem(value, self.index) + return _reindex_for_setitem(value, self.index) - elif isinstance(value, ExtensionArray): - # Explicitly copy here - value = value.copy() + if is_list_like(value): com.require_length_match(value, self.index) - - elif is_sequence(value): - com.require_length_match(value, self.index) - - # turn me into an ndarray - if not isinstance(value, (np.ndarray, Index)): - if isinstance(value, list) and len(value) > 0: - value = maybe_convert_platform(value) - else: - value = com.asarray_tuplesafe(value) - elif isinstance(value, Index): - value = value.copy(deep=True)._values - else: - value = value.copy() - - # possibly infer to datetimelike - if is_object_dtype(value.dtype): - value = sanitize_array(value, None) - - else: - value = construct_1d_arraylike_from_scalar(value, len(self), dtype=None) - - return value + return sanitize_array(value, self.index, copy=True, allow_2d=True) @property def _series(self): @@ -4713,6 +4713,7 @@ def set_axis( ) -> DataFrame | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) @Appender( """ Examples @@ -4776,6 +4777,7 @@ def reindex(self, *args, **kwargs) -> DataFrame: kwargs.pop("labels", None) return super().reindex(**kwargs) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) def drop( self, labels=None, @@ -5166,6 +5168,7 @@ def fillna( ) -> DataFrame | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) @doc(NDFrame.fillna, **_shared_doc_kwargs) def fillna( self, @@ -5339,6 +5342,7 @@ def shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"]) def set_index( self, keys, @@ -5605,6 +5609,7 @@ def reset_index( ) -> DataFrame | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) def reset_index( self, level: Hashable | Sequence[Hashable] | None = None, @@ -5842,6 +5847,7 @@ def notna(self) -> DataFrame: def notnull(self) -> DataFrame: return ~self.isna() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def dropna( self, axis: Axis = 0, @@ -5991,6 +5997,7 @@ def dropna( else: return result + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) def drop_duplicates( self, subset: Hashable | Sequence[Hashable] | None = None, @@ -6226,6 +6233,7 @@ def f(vals) -> tuple[np.ndarray, int]: # ---------------------------------------------------------------------- # Sorting # TODO: Just move the sort_values doc here. 
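The `@deprecate_nonkeyword_arguments` decorators added above (and on `sort_values`/`sort_index` just below) restrict these DataFrame methods to the whitelisted positional arguments. A hedged sketch of the intended effect, assuming the decorator warns with a FutureWarning when extra arguments are passed positionally, which is its role in `pandas.util._decorators`:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1.0, None], "b": [3.0, 4.0]})
>>> # 'axis' passed positionally: the decorator on dropna (allowed_args=["self"])
>>> # asks for keyword usage via a FutureWarning.
>>> df.dropna(0)
     a    b
0  1.0  3.0
>>> # Keyword form is unaffected.
>>> df.dropna(axis=0)
     a    b
0  1.0  3.0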
+ @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"]) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) # error: Signature of "sort_values" incompatible with supertype "NDFrame" @@ -6300,6 +6308,7 @@ def sort_values( # type: ignore[override] else: return result.__finalize__(self, method="sort_values") + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( self, axis: Axis = 0, @@ -8912,10 +8921,7 @@ def append( index = Index([other.name], name=self.index.name) idx_diff = other.index.difference(self.columns) - try: - combined_columns = self.columns.append(idx_diff) - except TypeError: - combined_columns = self.columns.astype(object).append(idx_diff) + combined_columns = self.columns.append(idx_diff) other = ( other.reindex(combined_columns, copy=False) .to_frame() @@ -9766,7 +9772,6 @@ def _reduce( **kwds, ): - min_count = kwds.get("min_count", 0) assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None @@ -9815,7 +9820,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if (numeric_only is not None or axis == 0) and min_count == 0: + if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object @@ -9840,6 +9845,21 @@ def _get_data() -> DataFrame: # Even if we are object dtype, follow numpy and return # float64, see test_apply_funcs_over_empty out = out.astype(np.float64) + + if numeric_only is None and out.shape[0] != df.shape[1]: + # columns have been dropped GH#41480 + arg_name = "numeric_only" + if name in ["all", "any"]: + arg_name = "bool_only" + warnings.warn( + "Dropping of nuisance columns in DataFrame reductions " + f"(with '{arg_name}=None') is deprecated; in a future " + "version this will raise TypeError. Select only valid " + "columns before calling the reduction.", + FutureWarning, + stacklevel=5, + ) + return out assert numeric_only is None @@ -9860,6 +9880,19 @@ def _get_data() -> DataFrame: with np.errstate(all="ignore"): result = func(values) + # columns have been dropped GH#41480 + arg_name = "numeric_only" + if name in ["all", "any"]: + arg_name = "bool_only" + warnings.warn( + "Dropping of nuisance columns in DataFrame reductions " + f"(with '{arg_name}=None') is deprecated; in a future " + "version this will raise TypeError. 
Select only valid " + "columns before calling the reduction.", + FutureWarning, + stacklevel=5, + ) + if hasattr(result, "dtype"): if filter_type == "bool" and notna(result).all(): result = result.astype(np.bool_) @@ -10621,10 +10654,92 @@ def values(self) -> np.ndarray: self._consolidate_inplace() return self._mgr.as_array(transpose=True) - @property - def _values(self) -> np.ndarray: - """internal implementation""" - return self.values + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def ffill( + self: DataFrame, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> DataFrame | None: + return super().ffill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def bfill( + self: DataFrame, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> DataFrame | None: + return super().bfill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "lower", "upper"] + ) + def clip( + self: DataFrame, + lower=None, + upper=None, + axis: Axis | None = None, + inplace: bool = False, + *args, + **kwargs, + ) -> DataFrame | None: + return super().clip(lower, upper, axis, inplace, *args, **kwargs) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) + def interpolate( + self: DataFrame, + method: str = "linear", + axis: Axis = 0, + limit: int | None = None, + inplace: bool = False, + limit_direction: str | None = None, + limit_area: str | None = None, + downcast: str | None = None, + **kwargs, + ) -> DataFrame | None: + return super().interpolate( + method, + axis, + limit, + inplace, + limit_direction, + limit_area, + downcast, + **kwargs, + ) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().where(cond, other, inplace, axis, level, errors, try_cast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().mask(cond, other, inplace, axis, level, errors, try_cast) DataFrame._add_numeric_operations() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a09cc0a6324c00..49dc71954fd8f5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -267,7 +267,7 @@ def _init_mgr( if ( isinstance(mgr, BlockManager) and len(mgr.blocks) == 1 - and mgr.blocks[0].values.dtype == dtype + and is_dtype_equal(mgr.blocks[0].values.dtype, dtype) ): pass else: @@ -481,13 +481,19 @@ def _data(self): @property def _AXIS_NUMBERS(self) -> dict[str, int]: """.. deprecated:: 1.1.0""" - warnings.warn("_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3) + level = self.ndim + 1 + warnings.warn( + "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=level + ) return {"index": 0} @property def _AXIS_NAMES(self) -> dict[int, str]: """.. 
deprecated:: 1.1.0""" - warnings.warn("_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3) + level = self.ndim + 1 + warnings.warn( + "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=level + ) return {0: "index"} @final @@ -687,8 +693,7 @@ def size(self) -> int: >>> df.size 4 """ - # error: Incompatible return value type (got "number", expected "int") - return np.prod(self.shape) # type: ignore[return-value] + return np.prod(self.shape) @overload def set_axis( @@ -6387,47 +6392,6 @@ def fillna( else: return result.__finalize__(self, method="fillna") - @overload - def ffill( - self: FrameOrSeries, - axis: None | Axis = ..., - inplace: Literal[False] = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries: - ... - - @overload - def ffill( - self: FrameOrSeries, - axis: None | Axis, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def ffill( - self: FrameOrSeries, - *, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def ffill( - self: FrameOrSeries, - axis: None | Axis = ..., - inplace: bool_t = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries | None: - ... - - @final @doc(klass=_shared_doc_kwargs["klass"]) def ffill( self: FrameOrSeries, @@ -6450,47 +6414,6 @@ def ffill( pad = ffill - @overload - def bfill( - self: FrameOrSeries, - axis: None | Axis = ..., - inplace: Literal[False] = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries: - ... - - @overload - def bfill( - self: FrameOrSeries, - axis: None | Axis, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def bfill( - self: FrameOrSeries, - *, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def bfill( - self: FrameOrSeries, - axis: None | Axis = ..., - inplace: bool_t = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries | None: - ... - - @final @doc(klass=_shared_doc_kwargs["klass"]) def bfill( self: FrameOrSeries, @@ -6696,7 +6619,6 @@ def replace( else: return result.__finalize__(self, method="replace") - @final def interpolate( self: FrameOrSeries, method: str = "linear", @@ -7361,115 +7283,6 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # GH 40420 return self.where(subset, threshold, axis=axis, inplace=inplace) - @overload - def clip( - self: FrameOrSeries, - lower=..., - upper=..., - axis: Axis | None = ..., - inplace: Literal[False] = ..., - *args, - **kwargs, - ) -> FrameOrSeries: - ... - - @overload - def clip( - self: FrameOrSeries, - lower, - *, - axis: Axis | None, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - lower, - *, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - upper, - axis: Axis | None, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - upper, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - axis: Axis | None, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - lower, - upper, - axis: Axis | None, - inplace: Literal[True], - *args, - **kwargs, - ) -> None: - ... 
- - @overload - def clip( - self: FrameOrSeries, - lower, - upper, - *, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - lower=..., - upper=..., - axis: Axis | None = ..., - inplace: bool_t = ..., - *args, - **kwargs, - ) -> FrameOrSeries | None: - ... - - @final def clip( self: FrameOrSeries, lower=None, @@ -9069,7 +8882,6 @@ def _where( result = self._constructor(new_data) return result.__finalize__(self) - @final @doc( klass=_shared_doc_kwargs["klass"], cond="True", @@ -9217,7 +9029,7 @@ def where( "try_cast keyword is deprecated and will be removed in a " "future version", FutureWarning, - stacklevel=2, + stacklevel=4, ) return self._where(cond, other, inplace, axis, level, errors=errors) @@ -9250,7 +9062,7 @@ def mask( "try_cast keyword is deprecated and will be removed in a " "future version", FutureWarning, - stacklevel=2, + stacklevel=4, ) # see gh-21891 @@ -9409,7 +9221,7 @@ def shift( else: new_ax = index.shift(periods, freq) - result = self.set_axis(new_ax, axis) + result = self.set_axis(new_ax, axis=axis) return result.__finalize__(self, method="shift") @final diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c38c51d46f83e9..69f992f840c7c9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,10 +67,7 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.base import ( - DataError, - SpecificationError, -) +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -323,7 +320,7 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: return output def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): obj = self._selected_obj @@ -331,7 +328,10 @@ def _cython_agg_general( data = obj._mgr if numeric_only and not is_numeric_dtype(obj.dtype): - raise DataError("No numeric types to aggregate") + # GH#41291 match Series behavior + raise NotImplementedError( + f"{type(self).__name__}.{how} does not implement numeric_only." 
+ ) # This is overkill because it is only called once, but is here to # mirror the array_func used in DataFrameGroupBy._cython_agg_general @@ -513,16 +513,12 @@ def _cython_transform( obj = self._selected_obj - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - raise DataError("No numeric types to aggregate") - try: result = self.grouper._cython_operation( "transform", obj._values, how, axis, **kwargs ) - except (NotImplementedError, TypeError): - raise DataError("No numeric types to aggregate") + except NotImplementedError as err: + raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err return obj._constructor(result, index=self.obj.index, name=obj.name) @@ -1056,12 +1052,11 @@ def _iterate_slices(self) -> Iterable[Series]: yield values def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ) -> DataFrame: # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy data: Manager2D = self._get_data_to_aggregate() - orig = data if numeric_only: data = data.get_numeric_data(copy=False) @@ -1084,9 +1079,15 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr) and len(orig): - # If the original Manager was already empty, no need to raise - raise DataError("No numeric types to aggregate") + if len(new_mgr) < len(data): + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.{how} " + "is deprecated. In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which should be " + "valid for the function.", + FutureWarning, + stacklevel=4, + ) return self._wrap_agged_manager(new_mgr) @@ -1283,6 +1284,16 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) res_mgr.set_axis(1, mgr.axes[1]) + if len(res_mgr) < len(mgr): + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.{how} " + "is deprecated. In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which should be " + "valid for the transforming function.", + FutureWarning, + stacklevel=4, + ) + res_df = self.obj._constructor(res_mgr) if self.axis == 1: res_df = res_df.T @@ -1420,7 +1431,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output[i] = sgb.transform(wrapper) except TypeError: # e.g. trying to call nanmean with string values - pass + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.transform " + "is deprecated. In a future version, a TypeError will be raised. " + "Before calling .transform, select only columns which should be " + "valid for the transforming function.", + FutureWarning, + stacklevel=5, + ) else: inds.append(i) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 29a161676b2db4..6deb5bb1a76f08 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -30,6 +30,7 @@ class providing the base-class of operations. 
Union, cast, ) +import warnings import numpy as np @@ -1100,6 +1101,34 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: + """ + Determine subclass-specific default value for 'numeric_only'. + + For SeriesGroupBy we want the default to be False (to match Series behavior). + For DataFrameGroupBy we want it to be True (for backwards-compat). + + Parameters + ---------- + numeric_only : bool or lib.no_default + + Returns + ------- + bool + """ + # GH#41291 + if numeric_only is lib.no_default: + # i.e. not explicitly passed by user + if self.obj.ndim == 2: + # i.e. DataFrameGroupBy + numeric_only = True + else: + numeric_only = False + + # error: Incompatible return value type (got "Union[bool, NoDefault]", + # expected "bool") + return numeric_only # type: ignore[return-value] + # ----------------------------------------------------------------- # numba @@ -1131,10 +1160,16 @@ def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) group_keys = self.grouper._get_group_keys() numba_transform_func = numba_.generate_numba_transform_func( - tuple(args), kwargs, func, engine_kwargs + kwargs, func, engine_kwargs ) result = numba_transform_func( - sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + sorted_data, + sorted_index, + starts, + ends, + len(group_keys), + len(data.columns), + *args, ) cache_key = (func, "groupby_transform") @@ -1157,11 +1192,15 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) starts, ends, sorted_index, sorted_data = self._numba_prep(func, data) group_keys = self.grouper._get_group_keys() - numba_agg_func = numba_.generate_numba_agg_func( - tuple(args), kwargs, func, engine_kwargs - ) + numba_agg_func = numba_.generate_numba_agg_func(kwargs, func, engine_kwargs) result = numba_agg_func( - sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + sorted_data, + sorted_index, + starts, + ends, + len(group_keys), + len(data.columns), + *args, ) cache_key = (func, "groupby_agg") @@ -1270,6 +1309,14 @@ def _python_agg_general(self, func, *args, **kwargs): # if this function is invalid for this dtype, we will ignore it. result = self.grouper.agg_series(obj, f) except TypeError: + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.agg " + "is deprecated. In a future version, a TypeError will be raised. 
" + "Before calling .agg, select only columns which should be " + "valid for the aggregating function.", + FutureWarning, + stacklevel=3, + ) continue key = base.OutputKey(label=name, position=idx) @@ -1289,22 +1336,15 @@ def _agg_general( alias: str, npfunc: Callable, ): + with group_selection_context(self): # try a cython aggregation if we can - result = None - try: - result = self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - ) - except DataError: - pass - - # apply a non-cython aggregation - if result is None: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) return result.__finalize__(self.obj, method="groupby") def _agg_py_fallback( @@ -1348,7 +1388,7 @@ def _agg_py_fallback( return ensure_block_shape(res_values, ndim=ndim) def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): raise AbstractMethodError(self) @@ -1568,7 +1608,7 @@ def count(self): @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def mean(self, numeric_only: bool = True): + def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute mean of groups, excluding missing values. @@ -1600,12 +1640,12 @@ def mean(self, numeric_only: bool = True): Groupby two columns and return the mean of the remaining column. >>> df.groupby(['A', 'B']).mean() - C + C A B - 1 2.0 2 - 4.0 1 - 2 3.0 1 - 5.0 2 + 1 2.0 2.0 + 4.0 1.0 + 2 3.0 1.0 + 5.0 2.0 Groupby one column and return the mean of only particular column in the group. @@ -1616,6 +1656,8 @@ def mean(self, numeric_only: bool = True): 2 4.0 Name: B, dtype: float64 """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only), @@ -1626,7 +1668,7 @@ def mean(self, numeric_only: bool = True): @final @Substitution(name="groupby") @Appender(_common_see_also) - def median(self, numeric_only=True): + def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute median of groups, excluding missing values. @@ -1643,6 +1685,8 @@ def median(self, numeric_only=True): Series or DataFrame Median of values within each group. """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "median", alt=lambda x: Series(x).median(numeric_only=numeric_only), @@ -1700,8 +1744,9 @@ def var(self, ddof: int = 1): Variance of values within each group. 
""" if ddof == 1: + numeric_only = self._resolve_numeric_only(lib.no_default) return self._cython_agg_general( - "var", alt=lambda x: Series(x).var(ddof=ddof) + "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only ) else: func = lambda x: x.var(ddof=ddof) @@ -1766,7 +1811,10 @@ def size(self) -> FrameOrSeriesUnion: @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) - def sum(self, numeric_only: bool = True, min_count: int = 0): + def sum( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in @@ -1783,7 +1831,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0): @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) - def prod(self, numeric_only: bool = True, min_count: int = 0): + def prod( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) + return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -2712,7 +2764,7 @@ def _get_cythonized_result( how: str, cython_dtype: np.dtype, aggregate: bool = False, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, needs_counts: bool = False, needs_values: bool = False, needs_2d: bool = False, @@ -2780,6 +2832,8 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ + numeric_only = self._resolve_numeric_only(numeric_only) + if result_is_index and aggregate: raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") if post_processing and not callable(post_processing): @@ -2829,6 +2883,16 @@ def _get_cythonized_result( vals, inferences = pre_processing(vals) except TypeError as err: error_msg = str(err) + howstr = how.replace("group_", "") + warnings.warn( + "Dropping invalid columns in " + f"{type(self).__name__}.{howstr} is deprecated. " + "In a future version, a TypeError will be raised. " + f"Before calling .{howstr}, select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=3, + ) continue vals = vals.astype(cython_dtype, copy=False) if needs_2d: @@ -3064,7 +3128,7 @@ def _reindex_output( # reindexing only applies to a Categorical grouper elif not any( - isinstance(ping.grouper, (Categorical, CategoricalIndex)) + isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) for ping in groupings ): return output diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e2855cbc904255..c5d5d5a3013363 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -150,8 +150,8 @@ class Grouper: >>> df.groupby(pd.Grouper(key="Animal")).mean() Speed Animal - Falcon 200 - Parrot 10 + Falcon 200.0 + Parrot 10.0 Specify a resample operation on the column 'Publish date' @@ -252,6 +252,8 @@ class Grouper: axis: int sort: bool dropna: bool + _gpr_index: Index | None + _grouper: Index | None _attributes: tuple[str, ...] 
= ("key", "level", "freq", "axis", "sort") @@ -279,6 +281,7 @@ def __init__( self.sort = sort self.grouper = None + self._gpr_index = None self.obj = None self.indexer = None self.binner = None @@ -288,8 +291,11 @@ def __init__( @final @property - def ax(self): - return self.grouper + def ax(self) -> Index: + index = self._gpr_index + if index is None: + raise ValueError("_set_grouper must be called before ax is accessed") + return index def _get_grouper(self, obj: FrameOrSeries, validate: bool = True): """ @@ -317,6 +323,7 @@ def _get_grouper(self, obj: FrameOrSeries, validate: bool = True): validate=validate, dropna=self.dropna, ) + return self.binner, self.grouper, self.obj @final @@ -338,14 +345,17 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # Keep self.grouper value before overriding if self._grouper is None: - self._grouper = self.grouper + # TODO: What are we assuming about subsequent calls? + self._grouper = self._gpr_index self._indexer = self.indexer # the key must be a valid info item if self.key is not None: key = self.key # The 'on' is already defined - if getattr(self.grouper, "name", None) == key and isinstance(obj, Series): + if getattr(self._gpr_index, "name", None) == key and isinstance( + obj, Series + ): # Sometimes self._grouper will have been resorted while # obj has not. In this case there is a mismatch when we # call self._grouper.take(obj.index) so we need to undo the sorting @@ -390,10 +400,8 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # error: Incompatible types in assignment (expression has type # "FrameOrSeries", variable has type "None") self.obj = obj # type: ignore[assignment] - # error: Incompatible types in assignment (expression has type "Index", - # variable has type "None") - self.grouper = ax # type: ignore[assignment] - return self.grouper + self._gpr_index = ax + return self._gpr_index @final @property @@ -441,6 +449,9 @@ class Grouping: _codes: np.ndarray | None = None _group_index: Index | None = None + _passed_categorical: bool + _all_grouper: Categorical | None + _index: Index def __init__( self, @@ -455,14 +466,14 @@ def __init__( ): self.level = level self._orig_grouper = grouper - self.grouper = _convert_grouper(index, grouper) - self.all_grouper = None - self.index = index - self.sort = sort + self.grouping_vector = _convert_grouper(index, grouper) + self._all_grouper = None + self._index = index + self._sort = sort self.obj = obj - self.observed = observed + self._observed = observed self.in_axis = in_axis - self.dropna = dropna + self._dropna = dropna self._passed_categorical = False @@ -471,20 +482,24 @@ def __init__( ilevel = self._ilevel if ilevel is not None: + mapper = self.grouping_vector + # In extant tests, the new self.grouping_vector matches + # `index.get_level_values(ilevel)` whenever + # mapper is None and isinstance(index, MultiIndex) ( - self.grouper, # Index + self.grouping_vector, # Index self._codes, self._group_index, - ) = index._get_grouper_for_level(self.grouper, ilevel) + ) = index._get_grouper_for_level(mapper, ilevel) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes - elif isinstance(self.grouper, Grouper): + elif isinstance(self.grouping_vector, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) - _, newgrouper, newobj = 
self.grouper._get_grouper( + _, newgrouper, newobj = self.grouping_vector._get_grouper( # error: Value of type variable "FrameOrSeries" of "_get_grouper" # of "Grouper" cannot be "Optional[FrameOrSeries]" self.obj, # type: ignore[type-var] @@ -495,44 +510,46 @@ def __init__( ng = newgrouper._get_grouper() if isinstance(newgrouper, ops.BinGrouper): # in this case we have `ng is newgrouper` - self.grouper = ng + self.grouping_vector = ng else: # ops.BaseGrouper # use Index instead of ndarray so we can recover the name - self.grouper = Index(ng, name=newgrouper.result_index.name) + self.grouping_vector = Index(ng, name=newgrouper.result_index.name) - elif is_categorical_dtype(self.grouper): + elif is_categorical_dtype(self.grouping_vector): # a passed Categorical self._passed_categorical = True - self.grouper, self.all_grouper = recode_for_groupby( - self.grouper, self.sort, observed + self.grouping_vector, self._all_grouper = recode_for_groupby( + self.grouping_vector, sort, observed ) - elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): + elif not isinstance( + self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray) + ): # no level passed - if getattr(self.grouper, "ndim", 1) != 1: - t = self.name or str(type(self.grouper)) + if getattr(self.grouping_vector, "ndim", 1) != 1: + t = self.name or str(type(self.grouping_vector)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") - self.grouper = self.index.map(self.grouper) + self.grouping_vector = index.map(self.grouping_vector) if not ( - hasattr(self.grouper, "__len__") - and len(self.grouper) == len(self.index) + hasattr(self.grouping_vector, "__len__") + and len(self.grouping_vector) == len(index) ): - grper = pprint_thing(self.grouper) + grper = pprint_thing(self.grouping_vector) errmsg = ( "Grouper result violates len(labels) == " f"len(data)\nresult: {grper}" ) - self.grouper = None # Try for sanity + self.grouping_vector = None # Try for sanity raise AssertionError(errmsg) - if isinstance(self.grouper, np.ndarray): + if isinstance(self.grouping_vector, np.ndarray): # if we have a date/time-like grouper, make sure that we have # Timestamps like - self.grouper = sanitize_to_nanoseconds(self.grouper) + self.grouping_vector = sanitize_to_nanoseconds(self.grouping_vector) def __repr__(self) -> str: return f"Grouping({self.name})" @@ -544,16 +561,16 @@ def __iter__(self): def name(self) -> Hashable: ilevel = self._ilevel if ilevel is not None: - return self.index.names[ilevel] + return self._index.names[ilevel] if isinstance(self._orig_grouper, (Index, Series)): return self._orig_grouper.name - elif isinstance(self.grouper, ops.BaseGrouper): - return self.grouper.result_index.name + elif isinstance(self.grouping_vector, ops.BaseGrouper): + return self.grouping_vector.result_index.name - elif isinstance(self.grouper, Index): - return self.grouper.name + elif isinstance(self.grouping_vector, Index): + return self.grouping_vector.name # otherwise we have ndarray or ExtensionArray -> no name return None @@ -567,7 +584,7 @@ def _ilevel(self) -> int | None: if level is None: return None if not isinstance(level, int): - index = self.index + index = self._index if level not in index.names: raise AssertionError(f"Level {level} not in index") return index.names.index(level) @@ -580,10 +597,10 @@ def ngroups(self) -> int: @cache_readonly def indices(self): # we have a list of groupers - if isinstance(self.grouper, ops.BaseGrouper): - return self.grouper.indices + if isinstance(self.grouping_vector, 
ops.BaseGrouper): + return self.grouping_vector.indices - values = Categorical(self.grouper) + values = Categorical(self.grouping_vector) return values._reverse_indexer() @property @@ -605,10 +622,10 @@ def group_arraylike(self) -> ArrayLike: @cache_readonly def result_index(self) -> Index: # TODO: what's the difference between result_index vs group_index? - if self.all_grouper is not None: + if self._all_grouper is not None: group_idx = self.group_index assert isinstance(group_idx, CategoricalIndex) - return recode_from_groupby(self.all_grouper, self.sort, group_idx) + return recode_from_groupby(self._all_grouper, self._sort, group_idx) return self.group_index @cache_readonly @@ -624,13 +641,13 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes - cat = self.grouper + cat = self.grouping_vector categories = cat.categories - if self.observed: + if self._observed: ucodes = algorithms.unique1d(cat.codes) ucodes = ucodes[ucodes != -1] - if self.sort or cat.ordered: + if self._sort or cat.ordered: ucodes = np.sort(ucodes) else: ucodes = np.arange(len(categories)) @@ -640,24 +657,24 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: ) return cat.codes, uniques - elif isinstance(self.grouper, ops.BaseGrouper): + elif isinstance(self.grouping_vector, ops.BaseGrouper): # we have a list of groupers - codes = self.grouper.codes_info - uniques = self.grouper.result_arraylike + codes = self.grouping_vector.codes_info + uniques = self.grouping_vector.result_arraylike else: # GH35667, replace dropna=False with na_sentinel=None - if not self.dropna: + if not self._dropna: na_sentinel = None else: na_sentinel = -1 codes, uniques = algorithms.factorize( - self.grouper, sort=self.sort, na_sentinel=na_sentinel + self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel ) return codes, uniques @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: - return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) + return self._index.groupby(Categorical.from_codes(self.codes, self.group_index)) def get_grouper( diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index 26070fcb5e89c2..ad78280c5d835e 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -56,11 +56,12 @@ def f(values, index, ...): def generate_numba_agg_func( - args: tuple, kwargs: dict[str, Any], func: Callable[..., Scalar], engine_kwargs: dict[str, bool] | None, -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: +) -> Callable[ + [np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int, Any], np.ndarray +]: """ Generate a numba jitted agg function specified by values from engine_kwargs. 
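The generate_numba_agg_func change beginning above (and continuing below) drops the captured args tuple and instead threads extra positional arguments into the generated kernel as *args. The sketch below mirrors that calling convention in plain NumPy; the numba.jit wrapping and prange parallel loop are deliberately left out, and the parameter names follow the signature shown rather than any public API.

import numpy as np


def generate_agg_func(user_func, kwargs):
    """Build a group-wise aggregation kernel that forwards *args to the
    user function (a pure-Python stand-in for the jitted version)."""

    def group_agg(values, index, begin, end, num_groups, num_columns, *args):
        result = np.empty((num_groups, num_columns))
        for i in range(num_groups):
            group = values[begin[i] : end[i]]
            group_index = index[begin[i] : end[i]]
            for j in range(num_columns):
                result[i, j] = user_func(group[:, j], group_index, *args, **kwargs)
        return result

    return group_agg


# Two groups of three rows each; the extra positional argument scales the mean.
values = np.arange(12, dtype=np.float64).reshape(6, 2)
index = np.arange(6)
begin, end = np.array([0, 3]), np.array([3, 6])
agg = generate_agg_func(lambda col, idx, scale: col.mean() * scale, kwargs={})
print(agg(values, index, begin, end, 2, 2, 2.0))
# [[ 4.  6.]
#  [16. 18.]]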
@@ -72,8 +73,6 @@ def generate_numba_agg_func( Parameters ---------- - args : tuple - *args to be passed into the function kwargs : dict **kwargs to be passed into the function func : function @@ -103,6 +102,7 @@ def group_agg( end: np.ndarray, num_groups: int, num_columns: int, + *args: Any, ) -> np.ndarray: result = np.empty((num_groups, num_columns)) for i in numba.prange(num_groups): @@ -116,11 +116,12 @@ def group_agg( def generate_numba_transform_func( - args: tuple, kwargs: dict[str, Any], func: Callable[..., np.ndarray], engine_kwargs: dict[str, bool] | None, -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: +) -> Callable[ + [np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int, Any], np.ndarray +]: """ Generate a numba jitted transform function specified by values from engine_kwargs. @@ -132,8 +133,6 @@ def generate_numba_transform_func( Parameters ---------- - args : tuple - *args to be passed into the function kwargs : dict **kwargs to be passed into the function func : function @@ -163,6 +162,7 @@ def group_transform( end: np.ndarray, num_groups: int, num_columns: int, + *args: Any, ) -> np.ndarray: result = np.empty((len(values), num_columns)) for i in numba.prange(num_groups): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 746c6e00560641..6903c8e99e489c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -49,6 +49,7 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, @@ -304,10 +305,13 @@ def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: return np.dtype(np.int64) elif isinstance(dtype, (BooleanDtype, _IntegerDtype)): return Int64Dtype() - elif how in ["mean", "median", "var"] and isinstance( - dtype, (BooleanDtype, _IntegerDtype) - ): - return Float64Dtype() + elif how in ["mean", "median", "var"]: + if isinstance(dtype, (BooleanDtype, _IntegerDtype)): + return Float64Dtype() + elif is_float_dtype(dtype): + return dtype + elif is_numeric_dtype(dtype): + return np.dtype(np.float64) return dtype def uses_mask(self) -> bool: @@ -678,7 +682,7 @@ def __init__( self.axis = axis self._groupings: list[grouper.Grouping] = list(groupings) - self.sort = sort + self._sort = sort self.group_keys = group_keys self.mutated = mutated self.indexer = indexer @@ -734,7 +738,7 @@ def _get_grouper(self): We have a specific method of grouping, so cannot convert to a Index for our grouper. 
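The _get_result_dtype hunk above makes mean/median/var produce a floating result dtype for any numeric input: existing float dtypes are preserved, while integer and boolean inputs are upcast to float64. A toy version of that elif chain, ignoring the nullable Boolean/Integer dtypes the real code also handles:

import numpy as np


def result_dtype_for(how: str, dtype: np.dtype) -> np.dtype:
    """Pick the output dtype for a cython-backed groupby reduction."""
    if how in ("mean", "median", "var"):
        if np.issubdtype(dtype, np.floating):
            return dtype  # keep float32/float64 as-is
        if dtype == np.bool_ or np.issubdtype(dtype, np.number):
            return np.dtype(np.float64)
    return dtype


print(result_dtype_for("mean", np.dtype(np.int64)))    # float64
print(result_dtype_for("mean", np.dtype(np.float32)))  # float32
print(result_dtype_for("sum", np.dtype(np.int64)))     # int64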
""" - return self.groupings[0].grouper + return self.groupings[0].grouping_vector @final def _get_group_keys(self): @@ -858,7 +862,7 @@ def groups(self) -> dict[Hashable, np.ndarray]: if len(self.groupings) == 1: return self.groupings[0].groups else: - to_groupby = zip(*(ping.grouper for ping in self.groupings)) + to_groupby = zip(*(ping.grouping_vector for ping in self.groupings)) index = Index(to_groupby) return self.axis.groupby(index) @@ -891,7 +895,7 @@ def codes_info(self) -> np.ndarray: def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]: if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) - return compress_group_index(group_index, sort=self.sort) + return compress_group_index(group_index, sort=self._sort) ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index)) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 4f3f536cd32905..ed4b1a3fbb39cc 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -166,6 +166,8 @@ def check_setitem_lengths(indexer, value, values) -> bool: if is_list_like(value): if len(indexer) != len(value) and values.ndim == 1: # boolean with truth values == len of the value is ok too + if isinstance(indexer, list): + indexer = np.array(indexer) if not ( isinstance(indexer, np.ndarray) and indexer.dtype == np.bool_ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9f0a80ba0f5c7e..124903446220da 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -54,6 +54,7 @@ from pandas.util._decorators import ( Appender, cache_readonly, + deprecate_nonkeyword_arguments, doc, ) @@ -76,7 +77,6 @@ is_float_dtype, is_hashable, is_integer, - is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, @@ -775,6 +775,7 @@ def _engine(self) -> libindex.IndexEngine: target_values = self._get_engine_target() return self._engine_type(lambda: target_values, len(self)) + @final @cache_readonly def _dir_additions_for_owner(self) -> set[str_t]: """ @@ -813,6 +814,7 @@ def __array_wrap__(self, result, context=None): return result attrs = self._get_attributes_dict() + attrs.pop("freq", None) # For DatetimeIndex/TimedeltaIndex return Index(result, **attrs) @cache_readonly @@ -904,13 +906,10 @@ def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype): return self.copy() if copy else self - elif is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex - - return CategoricalIndex(self, name=self.name, dtype=dtype, copy=copy) - - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + new_values = cls._from_sequence(self, dtype=dtype, copy=False) + return Index(new_values, dtype=dtype, copy=copy, name=self.name) try: casted = self._values.astype(dtype, copy=copy) @@ -929,19 +928,20 @@ def astype(self, dtype, copy=True): Parameters ---------- - indices : list + indices : array-like Indices to be taken. axis : int, optional The axis over which to select values, always 0. allow_fill : bool, default True - fill_value : bool, default None + fill_value : scalar, default None If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError. + -1 are regarded as NA. If Index doesn't hold NA, raise ValueError. Returns ------- - numpy.ndarray - Elements of given indices. 
+ Index + An index formed of elements at the given indices. Will be the same + type as self, except for RangeIndex. See Also -------- @@ -950,7 +950,9 @@ def astype(self, dtype, copy=True): """ @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + def take( + self, indices, axis: int = 0, allow_fill: bool = True, fill_value=None, **kwargs + ): if kwargs: nv.validate_take((), kwargs) indices = ensure_platform_int(indices) @@ -1155,18 +1157,25 @@ def _format_data(self, name=None) -> str_t: is_justify = False return format_object_summary( - self, self._formatter_func, is_justify=is_justify, name=name + self, + self._formatter_func, + is_justify=is_justify, + name=name, + line_break_each_value=self._is_multi, ) - def _format_attrs(self): + def _format_attrs(self) -> list[tuple[str_t, str_t | int]]: """ Return a list of tuples of the (attr,formatted_value). """ - return format_object_attrs(self) + return format_object_attrs(self, include_dtype=not self._is_multi) - def _mpl_repr(self): + @final + def _mpl_repr(self) -> np.ndarray: # how to represent ourselves to matplotlib - return self.values + if isinstance(self.dtype, np.dtype) and self.dtype.kind != "M": + return cast(np.ndarray, self.values) + return self.astype(object, copy=False)._values def format( self, @@ -1526,7 +1535,7 @@ def _set_names(self, values, level=None) -> None: names = property(fset=_set_names, fget=_get_names) - @final + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "names"]) def set_names(self, names, level=None, inplace: bool = False): """ Set Index or MultiIndex name. @@ -2404,6 +2413,13 @@ def is_all_dates(self) -> bool: ) return self._is_all_dates + @cache_readonly + def _is_multi(self) -> bool: + """ + Cached check equivalent to isinstance(self, MultiIndex) + """ + return isinstance(self, ABCMultiIndex) + # -------------------------------------------------------------------- # Pickle Methods @@ -2633,7 +2649,7 @@ def unique(self: _IndexT, level: Hashable | None = None) -> _IndexT: result = super().unique() return self._shallow_copy(result) - @final + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def drop_duplicates(self: _IndexT, keep: str_t | bool = "first") -> _IndexT: """ Return Index with duplicate values removed. @@ -2946,20 +2962,7 @@ def union(self, other, sort=None): stacklevel=2, ) - dtype = find_common_type([self.dtype, other.dtype]) - if self._is_numeric_dtype and other._is_numeric_dtype: - # Right now, we treat union(int, float) a bit special. - # See https://github.com/pandas-dev/pandas/issues/26778 for discussion - # We may change union(int, float) to go to object. - # float | [u]int -> float (the special case) - # | -> T - # | -> object - if not (is_integer_dtype(self.dtype) and is_integer_dtype(other.dtype)): - dtype = np.dtype("float64") - else: - # one is int64 other is uint64 - dtype = np.dtype("object") - + dtype = self._find_common_type_compat(other) left = self.astype(dtype, copy=False) right = other.astype(dtype, copy=False) return left.union(right, sort=sort) @@ -3691,7 +3694,7 @@ def is_int(v): "and will raise TypeError in a future version. 
" "Use .loc with labels or .iloc with positions instead.", FutureWarning, - stacklevel=6, + stacklevel=5, ) indexer = key else: @@ -5393,6 +5396,19 @@ def _find_common_type_compat(self, target) -> DtypeObj: return IntervalDtype(np.float64, closed=self.closed) target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) + + # special case: if one dtype is uint64 and the other a signed int, return object + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # Now it's: + # * float | [u]int -> float + # * uint64 | signed int -> object + # We may change union(float | [u]int) to go to object. + if self.dtype == "uint64" or target_dtype == "uint64": + if is_signed_integer_dtype(self.dtype) or is_signed_integer_dtype( + target_dtype + ): + return np.dtype("object") + dtype = find_common_type([self.dtype, target_dtype]) if dtype.kind in ["i", "u"]: # TODO: what about reversed with self being categorical? @@ -6193,6 +6209,7 @@ def shape(self) -> Shape: # See GH#27775, GH#27384 for history/reasoning in how this is defined. return (len(self),) + @final def _deprecated_arg(self, value, name: str_t, methodname: str_t) -> None: """ Issue a FutureWarning if the arg/kwarg is not no_default. @@ -6281,27 +6298,18 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind if copy: index_like = index_like.copy() return index_like - if hasattr(index_like, "name"): - # https://github.com/python/mypy/issues/1424 - # error: Item "ExtensionArray" of "Union[ExtensionArray, - # Sequence[Any]]" has no attribute "name" - # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" - # has no attribute "name" - # error: "Sequence[Any]" has no attribute "name" - # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no - # attribute "name" - # error: Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no - # attribute "name" - name = index_like.name # type: ignore[union-attr, attr-defined] + + if isinstance(index_like, ABCSeries): + name = index_like.name return Index(index_like, name=name, copy=copy) if is_iterator(index_like): index_like = list(index_like) - # must check for exactly list here because of strict type - # check in clean_index_list if isinstance(index_like, list): - if type(index_like) != list: + if type(index_like) is not list: + # must check for exactly list here because of strict type + # check in clean_index_list index_like = list(index_like) converted, all_arrays = lib.clean_index_list(index_like) @@ -6311,13 +6319,6 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind return MultiIndex.from_arrays(converted) else: - if isinstance(converted, np.ndarray) and converted.dtype == np.int64: - # Check for overflows if we should actually be uint64 - # xref GH#35481 - alt = np.asarray(index_like) - if alt.dtype == np.uint64: - converted = alt - index_like = converted else: # clean_index_list does the equivalent of copying @@ -6386,19 +6387,18 @@ def maybe_extract_name(name, obj, cls) -> Hashable: return name -def _maybe_cast_data_without_dtype(subarr): +def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: """ If we have an arraylike input but no passed dtype, try to infer a supported dtype. 
Parameters ---------- - subarr : np.ndarray, Index, or Series + subarr : np.ndarray[object] Returns ------- - converted : np.ndarray or ExtensionArray - dtype : np.dtype or ExtensionDtype + np.ndarray or ExtensionArray """ # Runtime import needed bc IntervalArray imports Index from pandas.core.arrays import ( @@ -6413,11 +6413,7 @@ def _maybe_cast_data_without_dtype(subarr): if inferred == "integer": try: - # error: Argument 3 to "_try_convert_to_int_array" has incompatible type - # "None"; expected "dtype[Any]" - data = _try_convert_to_int_array( - subarr, False, None # type: ignore[arg-type] - ) + data = _try_convert_to_int_array(subarr) return data except ValueError: pass @@ -6430,12 +6426,8 @@ def _maybe_cast_data_without_dtype(subarr): return data elif inferred == "interval": - try: - ia_data = IntervalArray._from_sequence(subarr, copy=False) - return ia_data - except (ValueError, TypeError): - # GH27172: mixed closed Intervals --> object dtype - pass + ia_data = IntervalArray._from_sequence(subarr, copy=False) + return ia_data elif inferred == "boolean": # don't support boolean explicitly ATM pass @@ -6454,27 +6446,19 @@ def _maybe_cast_data_without_dtype(subarr): tda = TimedeltaArray._from_sequence(subarr, copy=False) return tda elif inferred == "period": - try: - parr = PeriodArray._from_sequence(subarr) - return parr - except IncompatibleFrequency: - pass + parr = PeriodArray._from_sequence(subarr) + return parr return subarr -def _try_convert_to_int_array( - data: np.ndarray, copy: bool, dtype: np.dtype -) -> np.ndarray: +def _try_convert_to_int_array(data: np.ndarray) -> np.ndarray: """ Attempt to convert an array of data into an integer array. Parameters ---------- - data : The data to convert. - copy : bool - Whether to copy the data or not. - dtype : np.dtype + data : np.ndarray[object] Returns ------- @@ -6484,22 +6468,19 @@ def _try_convert_to_int_array( ------ ValueError if the conversion was not successful. """ - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desired - try: - res = data.astype("i8", copy=False) - if (res == data).all(): - return res # TODO: might still need to copy - except (OverflowError, TypeError, ValueError): - pass + try: + res = data.astype("i8", copy=False) + if (res == data).all(): + return res + except (OverflowError, TypeError, ValueError): + pass - # Conversion to int64 failed (possibly due to overflow) or was skipped, + # Conversion to int64 failed (possibly due to overflow), # so let's try now with uint64. try: res = data.astype("u8", copy=False) if (res == data).all(): - return res # TODO: might still need to copy + return res except (OverflowError, TypeError, ValueError): pass diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e835990eb8d89f..1541885887dab3 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -222,6 +222,17 @@ def __new__( name = maybe_extract_name(name, data, cls) + if data is None: + # GH#38944 + warnings.warn( + "Constructing a CategoricalIndex without passing data is " + "deprecated and will raise in a future version. " + "Use CategoricalIndex([], ...) 
instead", + FutureWarning, + stacklevel=2, + ) + data = [] + if is_scalar(data): raise cls._scalar_data_error(data) @@ -324,13 +335,8 @@ def _format_attrs(self): # error: "CategoricalIndex" has no attribute "ordered" ("ordered", self.ordered), # type: ignore[attr-defined] ] - if self.name is not None: - attrs.append(("name", ibase.default_pprint(self.name))) - attrs.append(("dtype", f"'{self.dtype.name}'")) - max_seq_items = get_option("display.max_seq_items") or len(self) - if len(self) > max_seq_items: - attrs.append(("length", len(self))) - return attrs + extra = super()._format_attrs() + return attrs + extra def _format_with_header(self, header: list[str], na_rep: str = "NaN") -> list[str]: from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b2377f5b27966f..5f24eb0cfaad61 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -35,7 +35,6 @@ ) from pandas.core.dtypes.common import ( - is_bool_dtype, is_categorical_dtype, is_dtype_equal, is_integer, @@ -83,6 +82,7 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): Common ops mixin to support a unified interface datetimelike Index. """ + _is_numeric_dtype = False _can_hold_strings = False _data: DatetimeArray | TimedeltaArray | PeriodArray freq: BaseOffset | None @@ -113,15 +113,10 @@ def __array_wrap__(self, result, context=None): """ Gets called after a ufunc and other functions. """ - result = lib.item_from_zerodim(result) - if is_bool_dtype(result) or lib.is_scalar(result): - return result - - attrs = self._get_attributes_dict() - if not is_period_dtype(self.dtype) and attrs["freq"]: - # no need to infer if freq is None - attrs["freq"] = "infer" - return type(self)(result, **attrs) + out = super().__array_wrap__(result, context=context) + if isinstance(out, DatetimeTimedeltaMixin) and self.freq is not None: + out = out._with_freq("infer") + return out # ------------------------------------------------------------------------ @@ -361,7 +356,9 @@ def _format_attrs(self): freq = self.freqstr if freq is not None: freq = repr(freq) - attrs.append(("freq", freq)) + # Argument 1 to "append" of "list" has incompatible type + # "Tuple[str, Optional[str]]"; expected "Tuple[str, Union[str, int]]" + attrs.append(("freq", freq)) # type: ignore[arg-type] return attrs def _summary(self, name=None) -> str: @@ -612,6 +609,8 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin): """ _data: DatetimeArray | TimedeltaArray + _comparables = ["name", "freq"] + _attributes = ["name", "freq"] # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e8b21f3cec668e..c4329393bb8953 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -25,7 +25,6 @@ ) from pandas._libs.tslibs import ( Resolution, - ints_to_pydatetime, parsing, timezones, to_offset, @@ -40,6 +39,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -256,11 +256,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True - _comparables = ["name", "freqstr", "tz"] - _attributes = ["name", "tz", "freq"] - - _is_numeric_dtype = False - _data: DatetimeArray inferred_freq: str | None tz: tzinfo | None @@ -391,10 +386,6 @@ def 
_is_comparable_dtype(self, dtype: DtypeObj) -> bool: # -------------------------------------------------------------------- # Rendering Methods - def _mpl_repr(self) -> np.ndarray: - # how to represent ourselves to matplotlib - return ints_to_pydatetime(self.asi8, self.tz) - @property def _formatter_func(self): from pandas.io.formats.format import get_format_datetime64 @@ -660,7 +651,7 @@ def _deprecate_mismatched_indexing(self, key) -> None: "raise KeyError in a future version. " "Use a timezone-aware object instead." ) - warnings.warn(msg, FutureWarning, stacklevel=5) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) def get_loc(self, key, method=None, tolerance=None): """ diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 83998a2792a8ab..066fa1f5473282 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -18,6 +18,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( find_common_type, @@ -365,11 +366,17 @@ def astype(self, dtype, copy: bool = True) -> Index: return self return self.copy() - if isinstance(dtype, np.dtype) and dtype.kind == "M" and dtype != "M8[ns]": + if ( + isinstance(self.dtype, np.dtype) + and isinstance(dtype, np.dtype) + and dtype.kind == "M" + and dtype != "M8[ns]" + ): # For now Datetime supports this by unwrapping ndarray, but DTI doesn't - raise TypeError(f"Cannot cast {type(self._data).__name__} to dtype") + raise TypeError(f"Cannot cast {type(self).__name__} to dtype") - new_values = self._data.astype(dtype, copy=copy) + with rewrite_exception(type(self._data).__name__, type(self).__name__): + new_values = self._data.astype(dtype, copy=copy) # pass copy=False because any copying will be done in the # _data.astype call above diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index fc92a1b3afe538..6dcb2a44e7d3d8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -16,8 +16,6 @@ import numpy as np -from pandas._config import get_option - from pandas._libs import lib from pandas._libs.interval import ( Interval, @@ -80,7 +78,6 @@ from pandas.core.indexes.base import ( Index, _index_shared_docs, - default_pprint, ensure_index, maybe_extract_name, ) @@ -255,8 +252,6 @@ def func(self, other, sort=None): @inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) class IntervalIndex(ExtensionIndex): _typ = "intervalindex" - _comparables = ["name"] - _attributes = ["name", "closed"] # annotate properties pinned via inherit_names closed: str @@ -422,21 +417,11 @@ def __contains__(self, key: Any) -> bool: def _multiindex(self) -> MultiIndex: return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) - def __array_wrap__(self, result, context=None): - # we don't want the superclass implementation - return result - def __reduce__(self): - d = {"left": self.left, "right": self.right} + d = {"left": self.left, "right": self.right, "closed": self.closed} d.update(self._get_attributes_dict()) return _new_IntervalIndex, (type(self), d), None - @Appender(Index.astype.__doc__) - def astype(self, dtype, copy: bool = True): - with rewrite_exception("IntervalArray", type(self).__name__): - new_values = self._values.astype(dtype, copy=copy) - return Index(new_values, dtype=new_values.dtype, name=self.name) - @property def inferred_type(self) -> str: """Return a string of the type inferred from the values""" @@ -789,9 +774,11 
@@ def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray] except KeyError: missing.append(i) locs = np.array([-1]) - except InvalidIndexError as err: - # i.e. non-scalar key - raise TypeError(key) from err + except InvalidIndexError: + # i.e. non-scalar key e.g. a tuple. + # see test_append_different_columns_types_raises + missing.append(i) + locs = np.array([-1]) indexer.append(locs) @@ -919,49 +906,9 @@ def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _format_data(self, name=None) -> str: - # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical - n = len(self) - max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) - - formatter = str - - if n == 0: - summary = "[]" - elif n == 1: - first = formatter(self[0]) - summary = f"[{first}]" - elif n == 2: - first = formatter(self[0]) - last = formatter(self[-1]) - summary = f"[{first}, {last}]" - else: - - if n > max_seq_items: - n = min(max_seq_items // 2, 10) - head = [formatter(x) for x in self[:n]] - tail = [formatter(x) for x in self[-n:]] - head_joined = ", ".join(head) - tail_joined = ", ".join(tail) - summary = f"[{head_joined} ... {tail_joined}]" - else: - tail = [formatter(x) for x in self] - joined = ", ".join(tail) - summary = f"[{joined}]" - - return summary + "," + self._format_space() - - def _format_attrs(self): - attrs = [] - if self.name is not None: - attrs.append(("name", default_pprint(self.name))) - attrs.append(("dtype", f"'{self.dtype}'")) - return attrs - - def _format_space(self) -> str: - space = " " * (len(type(self).__name__) + 1) - return f"\n{space}" + return self._data._format_data() + "," + self._format_space() # -------------------------------------------------------------------- # Set Operations @@ -1214,6 +1161,8 @@ def interval_range( if periods is not None: periods += 1 + breaks: np.ndarray | TimedeltaIndex | DatetimeIndex + if is_number(endpoint): # force consistency between start/end/freq (lower end if freq skips it) if com.all_not_none(start, end, freq): @@ -1239,16 +1188,8 @@ def interval_range( else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): - # error: Incompatible types in assignment (expression has type - # "DatetimeIndex", variable has type "ndarray") - breaks = date_range( # type: ignore[assignment] - start=start, end=end, periods=periods, freq=freq - ) + breaks = date_range(start=start, end=end, periods=periods, freq=freq) else: - # error: Incompatible types in assignment (expression has type - # "TimedeltaIndex", variable has type "ndarray") - breaks = timedelta_range( # type: ignore[assignment] - start=start, end=end, periods=periods, freq=freq - ) + breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq) return IntervalIndex.from_breaks(breaks, name=name, closed=closed) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1a3719233a1da2..805420a83108aa 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -41,6 +41,7 @@ from pandas.util._decorators import ( Appender, cache_readonly, + deprecate_nonkeyword_arguments, doc, ) @@ -89,11 +90,7 @@ lexsort_indexer, ) -from pandas.io.formats.printing import ( - format_object_attrs, - format_object_summary, - pprint_thing, -) +from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: from pandas import ( @@ -295,7 
+292,6 @@ class MultiIndex(Index): _levels = FrozenList() _codes = FrozenList() _comparables = ["names"] - rename = Index.set_names sortorder: int | None @@ -807,6 +803,7 @@ def _set_levels( self._reset_cache() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "levels"]) def set_levels( self, levels, level=None, inplace=None, verify_integrity: bool = True ): @@ -898,7 +895,7 @@ def set_levels( warnings.warn( "inplace is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=3, ) else: inplace = False @@ -994,6 +991,7 @@ def _set_codes( self._reset_cache() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "codes"]) def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = True): """ Set new codes on MultiIndex. Defaults to returning new index. @@ -1061,7 +1059,7 @@ def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = Tr warnings.warn( "inplace is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=3, ) else: inplace = False @@ -1287,20 +1285,6 @@ def _formatter_func(self, tup): formatter_funcs = [level._formatter_func for level in self.levels] return tuple(func(val) for func, val in zip(formatter_funcs, tup)) - def _format_data(self, name=None) -> str: - """ - Return the formatted data as a unicode string - """ - return format_object_summary( - self, self._formatter_func, name=name, line_break_each_value=True - ) - - def _format_attrs(self): - """ - Return a list of tuples of the (attr,formatted_value). - """ - return format_object_attrs(self, include_dtype=False) - def _format_native_types(self, na_rep="nan", **kwargs): new_levels = [] new_codes = [] @@ -2557,9 +2541,11 @@ def reindex( elif (indexer >= 0).all(): target = self.take(indexer) else: - # hopefully? 
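set_levels and set_codes above gain a deprecate_nonkeyword_arguments decorator (several Index and Series methods elsewhere in this diff get the same treatment), so arguments beyond the allowed list will eventually be keyword-only. A minimal stand-in for such a decorator, assuming nothing about the pandas implementation beyond the allowed_args idea:

import functools
import warnings


def deprecate_nonkeyword_arguments(allowed_args):
    """Warn when more positional arguments are passed than allowed_args permits."""

    def decorate(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if len(args) > len(allowed_args):
                warnings.warn(
                    f"In a future version, all arguments of {func.__name__} "
                    f"except {allowed_args!r} will be keyword-only.",
                    FutureWarning,
                    stacklevel=2,
                )
            return func(*args, **kwargs)

        return wrapper

    return decorate


class Levels:
    @deprecate_nonkeyword_arguments(allowed_args=["self", "levels"])
    def set_levels(self, levels, level=None, verify_integrity=True):
        return levels, level, verify_integrity


Levels().set_levels(["a", "b"], None)        # warns: `level` passed positionally
Levels().set_levels(["a", "b"], level=None)  # no warning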
- target = MultiIndex.from_tuples(target) - + try: + target = MultiIndex.from_tuples(target) + except TypeError: + # not all tuples, see test_constructor_dict_multiindex_reindex_flat + return target, indexer if ( preserve_names and target.nlevels == self.nlevels @@ -3446,6 +3432,7 @@ def _reorder_indexer( new_order = np.arange(n)[indexer] elif is_list_like(k): # Generate a map with all level codes as sorted initially + k = algos.unique(k) key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( self.levels[i] ) @@ -3574,14 +3561,20 @@ def equal_levels(self, other: MultiIndex) -> bool: def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) + if ( + any(-1 in code for code in self.codes) + and any(-1 in code for code in self.codes) + or self.has_duplicates + or other.has_duplicates + ): + # This is only necessary if both sides have nans or one has dups, + # fast_unique_multiple is faster + result = super()._union(other, sort) + else: + rvals = other._values.astype(object, copy=False) + result = lib.fast_unique_multiple([self._values, rvals], sort=sort) - # We could get here with CategoricalIndex other - rvals = other._values.astype(object, copy=False) - uniq_tuples = lib.fast_unique_multiple([self._values, rvals], sort=sort) - - return MultiIndex.from_arrays( - zip(*uniq_tuples), sortorder=0, names=result_names - ) + return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return is_object_dtype(dtype) @@ -3594,7 +3587,9 @@ def _get_reconciled_name_object(self, other) -> MultiIndex: """ names = self._maybe_match_names(other) if self.names != names: - return self.rename(names) + # Incompatible return value type (got "Optional[MultiIndex]", expected + # "MultiIndex") + return self.rename(names) # type: ignore[return-value] return self def _maybe_match_names(self, other): @@ -3793,6 +3788,16 @@ def isin(self, values, level=None) -> np.ndarray: return np.zeros(len(levs), dtype=np.bool_) return levs.isin(values) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "names"]) + def set_names(self, names, level=None, inplace: bool = False) -> MultiIndex | None: + return super().set_names(names=names, level=level, inplace=inplace) + + rename = set_names + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def drop_duplicates(self, keep: str | bool = "first") -> MultiIndex: + return super().drop_duplicates(keep=keep) + # --------------------------------------------------------------- # Arithmetic/Numeric Methods - Disabled diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 136843938b6839..c1104b80a0a7a9 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -4,10 +4,7 @@ datetime, timedelta, ) -from typing import ( - Any, - Hashable, -) +from typing import Hashable import warnings import numpy as np @@ -34,7 +31,6 @@ from pandas.util._decorators import doc from pandas.core.dtypes.common import ( - is_bool_dtype, is_datetime64_any_dtype, is_float, is_integer, @@ -153,14 +149,11 @@ class PeriodIndex(DatetimeIndexOpsMixin): -------- >>> idx = pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3]) >>> idx - PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]', freq='Q-DEC') + PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') """ _typ = "periodindex" - _attributes = ["name", "freq"] - - # define my properties & methods for delegation - _is_numeric_dtype = False + _attributes 
= ["name"] _data: PeriodArray freq: BaseOffset @@ -322,70 +315,9 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return False return dtype.freq == self.freq - # ------------------------------------------------------------------------ - # Rendering Methods - - def _mpl_repr(self) -> np.ndarray: - # how to represent ourselves to matplotlib - return self.astype(object)._values - - # ------------------------------------------------------------------------ - # Indexing - - @doc(Index.__contains__) - def __contains__(self, key: Any) -> bool: - if isinstance(key, Period): - if key.freq != self.freq: - return False - else: - return key.ordinal in self._engine - else: - hash(key) - try: - self.get_loc(key) - return True - except KeyError: - return False - # ------------------------------------------------------------------------ # Index Methods - def __array_wrap__(self, result, context=None): - """ - Gets called after a ufunc and other functions. - - Needs additional handling as PeriodIndex stores internal data as int - dtype - - Replace this to __numpy_ufunc__ in future version and implement - __array_function__ for Indexes - """ - if isinstance(context, tuple) and len(context) > 0: - func = context[0] - if func is np.add: - pass - elif func is np.subtract: - name = self.name - left = context[1][0] - right = context[1][1] - if isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex): - name = left.name if left.name == right.name else None - return Index(result, name=name) - elif isinstance(left, Period) or isinstance(right, Period): - return Index(result, name=name) - elif isinstance(func, np.ufunc): - if "M->M" not in func.types: - msg = f"ufunc '{func.__name__}' not supported for the PeriodIndex" - # This should be TypeError, but TypeError cannot be raised - # from here because numpy catches. - raise ValueError(msg) - - if is_bool_dtype(result): - return result - # the result is object dtype array of Period - # cannot pass _simple_new as it is - return type(self)(result, freq=self.freq, name=self.name) - def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: """ where : array of timestamps @@ -519,6 +451,8 @@ def get_loc(self, key, method=None, tolerance=None): elif is_integer(key): # Period constructor will cast to string, which we dont want raise KeyError(key) + elif isinstance(key, Period) and key.freq != self.freq: + raise KeyError(key) try: key = Period(key, freq=self.freq) @@ -636,7 +570,7 @@ def period_range( PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01'], - dtype='period[M]', freq='M') + dtype='period[M]') If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor endpoints for a ``PeriodIndex`` with frequency matching that of the @@ -645,7 +579,7 @@ def period_range( >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'), ... 
end=pd.Period('2017Q2', freq='Q'), freq='M') PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], - dtype='period[M]', freq='M') + dtype='period[M]') """ if com.count_not_none(start, end, periods) != 2: raise ValueError( diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0e6fb77e8b51bf..ead1a2a4a544be 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -604,11 +604,6 @@ def _min_fitting_element(self, lower_limit: int) -> int: no_steps = -(-(lower_limit - self.start) // abs(self.step)) return self.start + abs(self.step) * no_steps - def _max_fitting_element(self, upper_limit: int) -> int: - """Returns the largest element smaller than or equal to the limit""" - no_steps = (upper_limit - self.start) // abs(self.step) - return self.start + abs(self.step) * no_steps - def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ec97fa1e058514..cb83a0bccc7482 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -112,10 +112,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): _data_cls = TimedeltaArray _engine_type = libindex.TimedeltaEngine - _comparables = ["name", "freq"] - _attributes = ["name", "freq"] - _is_numeric_dtype = False - _data: TimedeltaArray # ------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0a06dff790cbf3..d5555561088eb2 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -886,26 +886,22 @@ def _getitem_nested_tuple(self, tup: tuple): # handle the multi-axis by taking sections and reducing # this is iterative obj = self.obj - axis = 0 - for key in tup: + # GH#41369 Loop in reverse order ensures indexing along columns before rows + # which selects only necessary blocks which avoids dtype conversion if possible + axis = len(tup) - 1 + for key in tup[::-1]: if com.is_null_slice(key): - axis += 1 + axis -= 1 continue - current_ndim = obj.ndim obj = getattr(obj, self.name)._getitem_axis(key, axis=axis) - axis += 1 + axis -= 1 # if we have a scalar, we are done if is_scalar(obj) or not hasattr(obj, "ndim"): break - # has the dim of the obj changed? - # GH 7199 - if obj.ndim < current_ndim: - axis -= 1 - return obj def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): @@ -1938,7 +1934,9 @@ def _setitem_with_indexer_missing(self, indexer, value): # e.g. 
0.0 -> 0 # GH#12246 if index.is_unique: - new_indexer = index.get_indexer([new_index[-1]]) + # pass new_index[-1:] instead if [new_index[-1]] + # so that we retain dtype + new_indexer = index.get_indexer(new_index[-1:]) if (new_indexer != -1).any(): # We get only here with loc, so can hard code return self._setitem_with_indexer(new_indexer, value, "loc") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4f1b16e7473940..c7769046c70b28 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -6,6 +6,8 @@ TYPE_CHECKING, Any, Callable, + Iterable, + Sequence, cast, ) import warnings @@ -393,7 +395,7 @@ def reduce(self, func, ignore_failures: bool = False) -> list[Block]: return [] raise - if np.ndim(result) == 0: + if self.values.ndim == 1: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) else: @@ -763,8 +765,8 @@ def _replace_regex( @final def _replace_list( self, - src_list: list[Any], - dest_list: list[Any], + src_list: Iterable[Any], + dest_list: Sequence[Any], inplace: bool = False, regex: bool = False, ) -> list[Block]: @@ -779,6 +781,14 @@ def _replace_list( # so un-tile here return self.replace(src_list, dest_list[0], inplace, regex) + # https://github.com/pandas-dev/pandas/issues/40371 + # the following pairs check code caused a regression so we catch that case here + # until the issue is fixed properly in can_hold_element + + # error: "Iterable[Any]" has no attribute "tolist" + if hasattr(src_list, "tolist"): + src_list = src_list.tolist() # type: ignore[attr-defined] + # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -1316,7 +1326,6 @@ def quantile( assert is_list_like(qs) # caller is responsible for this result = quantile_compat(self.values, np.asarray(qs._values), interpolation) - return new_block(result, placement=self._mgr_locs, ndim=2) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f33cb104cef44c..270eddf2bd3a5b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -11,6 +11,7 @@ Hashable, Sequence, ) +import warnings import numpy as np import numpy.ma as ma @@ -21,11 +22,11 @@ DtypeObj, Manager, ) +from pandas.errors import IntCastingNaNError from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, - dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -45,10 +46,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, - ABCIndex, ABCSeries, - ABCTimedeltaIndex, ) from pandas.core import ( @@ -62,8 +60,10 @@ TimedeltaArray, ) from pandas.core.construction import ( + create_series_with_explicit_dtype, ensure_wrapped_if_datetimelike, extract_array, + range_to_ndarray, sanitize_array, ) from pandas.core.indexes import base as ibase @@ -98,7 +98,7 @@ def arrays_to_mgr( arrays, - arr_names, + arr_names: Index, index, columns, *, @@ -112,8 +112,6 @@ def arrays_to_mgr( Needs to handle a lot of exceptional cases. 
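The _replace_list hunk above first normalises array-like src_list inputs with tolist() (the GH 40371 workaround) and then drops every (src, dest) pair whose source value the block could never contain, so later per-pair work is skipped entirely. The filtering idea with the Block machinery stripped away; can_hold below is a stand-in for Block._can_hold_element:

import numpy as np


def filter_replace_pairs(src_list, dest_list, can_hold):
    """Keep only the (src, dest) pairs whose source value could occur."""
    if hasattr(src_list, "tolist"):
        # ndarray / Index inputs are converted to a plain Python list first
        src_list = src_list.tolist()
    return [(x, y) for x, y in zip(src_list, dest_list) if can_hold(x)]


# An integer-only "block" can never contain the string 'a', so that pair
# is dropped up front:
print(filter_replace_pairs([1, "a"], [10, "b"], can_hold=lambda v: isinstance(v, int)))
# [(1, 10)]
print(filter_replace_pairs(np.array([1, 2]), [10, 20], can_hold=lambda v: True))
# [(1, 10), (2, 20)]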
""" - arr_names = ensure_index(arr_names) - if verify_integrity: # figure out the index, if necessary if index is None: @@ -283,10 +281,12 @@ def ndarray_to_mgr( if columns is None: columns = Index(range(len(values))) + else: + columns = ensure_index(columns) return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) - if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): + elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: @@ -315,10 +315,11 @@ def ndarray_to_mgr( values = construct_1d_ndarray_preserving_na( flat, dtype=dtype, copy=False ) - except Exception as err: - # e.g. ValueError when trying to cast object dtype to float64 - msg = f"failed to cast to '{dtype}' (Exception was: {err})" - raise ValueError(msg) from err + except IntCastingNaNError: + # following Series, we ignore the dtype and retain floating + # values instead of casting nans to meaningless ints + pass + values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point @@ -357,8 +358,8 @@ def ndarray_to_mgr( if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks - dvals_list = [maybe_infer_to_datetimelike(row) for row in values] - dvals_list = [ensure_block_shape(dval, 2) for dval in dvals_list] + dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values] + dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals] # TODO: What about re-joining object columns? block_values = [ @@ -450,7 +451,7 @@ def dict_to_mgr( arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies - arrays = [arr if not isinstance(arr, ABCIndex) else arr._data for arr in arrays] + arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays] arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] @@ -476,7 +477,7 @@ def nested_data_to_arrays( columns: Index | None, index: Index | None, dtype: DtypeObj | None, -): +) -> tuple[list[ArrayLike], Index, Index]: """ Convert a single sequence of arrays to multiple arrays. """ @@ -527,15 +528,12 @@ def _prep_ndarray(values, copy: bool = True) -> np.ndarray: if len(values) == 0: return np.empty((0, 0), dtype=object) elif isinstance(values, range): - arr = np.arange(values.start, values.stop, values.step, dtype="int64") + arr = range_to_ndarray(values) return arr[..., np.newaxis] def convert(v): if not is_list_like(v) or isinstance(v, ABCDataFrame): return v - elif not hasattr(v, "dtype") and not isinstance(v, (list, tuple, range)): - # TODO: should we cast these to list? 
- return v v = extract_array(v, extract_numpy=True) res = maybe_convert_platform(v) @@ -547,17 +545,15 @@ def convert(v): if is_list_like(values[0]): values = np.array([convert(v) for v in values]) elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: - # GH#21861 + # GH#21861 see test_constructor_list_of_lists values = np.array([convert(v) for v in values]) else: values = convert(values) else: - # drop subclass info, do not copy data - values = np.asarray(values) - if copy: - values = values.copy() + # drop subclass info + values = np.array(values, copy=copy) if values.ndim == 1: values = values.reshape((values.shape[0], 1)) @@ -567,33 +563,25 @@ def convert(v): return values -def _homogenize(data, index: Index, dtype: DtypeObj | None): - oindex = None +def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: homogenized = [] for val in data: if isinstance(val, ABCSeries): if dtype is not None: - val = val.astype(dtype) + val = val.astype(dtype, copy=False) if val.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) - # TODO extract_array should be preferred, but that gives failures for - # `extension/test_numpy.py` (extract_array will convert numpy arrays - # to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021 - # val = extract_array(val, extract_numpy=True) + val = val._values else: if isinstance(val, dict): - if oindex is None: - oindex = index.astype("O") - - if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): - val = dict_compat(val) - else: - val = dict(val) - val = lib.fast_multiget(val, oindex._values, default=np.nan) + # see test_constructor_subclass_dict + # test_constructor_dict_datetime64_index + val = create_series_with_explicit_dtype(val, index=index)._values + val = sanitize_array( val, index, dtype=dtype, copy=False, raise_cast_failure=False ) @@ -750,6 +738,7 @@ def to_arrays( Return list of arrays, columns. """ if isinstance(data, ABCDataFrame): + # see test_from_records_with_index_data, test_from_records_bad_index_column if columns is not None: arrays = [ data._ixs(i, axis=1).values @@ -772,6 +761,16 @@ def to_arrays( return [], ensure_index([]) elif isinstance(data[0], Categorical): + # GH#38845 deprecate special case + warnings.warn( + "The behavior of DataFrame([categorical, ...]) is deprecated and " + "in a future version will be changed to match the behavior of " + "DataFrame([any_listlike, ...]). " + "To retain the old behavior, pass as a dictionary " + "DataFrame({col: categorical, ..})", + FutureWarning, + stacklevel=4, + ) if columns is None: columns = ibase.default_index(len(data)) return data, columns @@ -875,7 +874,7 @@ def _list_of_dict_to_arrays( # assure that they are of the base dict class and not of derived # classes - data = [(type(d) is dict) and d or dict(d) for d in data] + data = [d if type(d) is dict else dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) return content, columns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ea31f9663cffee..323aa45874d968 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -759,7 +759,8 @@ def _slice_take_blocks_ax0( blk = self.blocks[blkno] # Otherwise, slicing along items axis is necessary. - if not blk._can_consolidate: + if not blk._can_consolidate and not blk._validate_ndim: + # i.e. 
we dont go through here for DatetimeTZBlock # A non-consolidatable block, it's easy, because there's # only one item and each mgr loc is a copy of that single # item. diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 2c31e45d0b4e17..673c482bced18e 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -245,8 +245,7 @@ def _maybe_get_mask( """ if mask is None: if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype): - # Boolean data cannot contain nulls, so signal via mask being None - return None + return np.broadcast_to(False, values.shape) if skipna or needs_i8_conversion(values.dtype): mask = isna(values) @@ -588,17 +587,9 @@ def nansum( dtype_sum = np.float64 # type: ignore[assignment] the_sum = values.sum(axis, dtype=dtype_sum) - # error: Incompatible types in assignment (expression has type "float", variable has - # type "Union[number, ndarray]") - # error: Argument 1 to "_maybe_null_out" has incompatible type "Union[number, - # ndarray]"; expected "ndarray" - the_sum = _maybe_null_out( # type: ignore[assignment] - the_sum, axis, mask, values.shape, min_count=min_count # type: ignore[arg-type] - ) + the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) - # error: Incompatible return value type (got "Union[number, ndarray]", expected - # "float") - return the_sum # type: ignore[return-value] + return the_sum def _mask_datetimelike_result( @@ -1343,12 +1334,10 @@ def nanprod( values = values.copy() values[mask] = 1 result = values.prod(axis) - # error: Argument 1 to "_maybe_null_out" has incompatible type "Union[number, - # ndarray]"; expected "ndarray" # error: Incompatible return value type (got "Union[ndarray, float]", expected # "float") return _maybe_null_out( # type: ignore[return-value] - result, axis, mask, values.shape, min_count=min_count # type: ignore[arg-type] + result, axis, mask, values.shape, min_count=min_count ) @@ -1424,13 +1413,7 @@ def _get_counts( # expected "Union[int, float, ndarray]") return dtype.type(count) # type: ignore[return-value] try: - # error: Incompatible return value type (got "Union[ndarray, generic]", expected - # "Union[int, float, ndarray]") - # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type - # "Union[ExtensionDtype, dtype]"; expected "Union[dtype, None, type, - # _SupportsDtype, str, Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], - # List[Any], _DtypeDict, Tuple[Any, Any]]" - return count.astype(dtype) # type: ignore[return-value,arg-type] + return count.astype(dtype) except AttributeError: # error: Argument "dtype" to "array" has incompatible type # "Union[ExtensionDtype, dtype]"; expected "Union[dtype, None, type, diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1ab2b90d6564ae..8195c18768eecb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -198,6 +198,8 @@ def obj(self) -> FrameOrSeries: # type: ignore[override] @property def ax(self): + # we can infer that this is a PeriodIndex/DatetimeIndex/TimedeltaIndex, + # but skipping annotating bc the overrides overwhelming return self.groupby.ax @property diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f8085b2bab1ed8..c05130278f75b7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -27,7 +27,6 @@ ArrayLike, DtypeObj, FrameOrSeries, - FrameOrSeriesUnion, IndexLabel, Suffixes, ) @@ -81,15 +80,18 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import 
DataFrame + from pandas import ( + DataFrame, + Series, + ) from pandas.core.arrays import DatetimeArray -@Substitution("\nleft : DataFrame") +@Substitution("\nleft : DataFrame or named Series") @Appender(_merge_doc, indents=0) def merge( - left: FrameOrSeriesUnion, - right: FrameOrSeriesUnion, + left: DataFrame | Series, + right: DataFrame | Series, how: str = "inner", on: IndexLabel | None = None, left_on: IndexLabel | None = None, @@ -322,8 +324,8 @@ def _merger(x, y) -> DataFrame: def merge_asof( - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -362,8 +364,8 @@ def merge_asof( Parameters ---------- - left : DataFrame - right : DataFrame + left : DataFrame or named Series + right : DataFrame or named Series on : label Field name to join on. Must be found in both DataFrames. The data MUST be ordered. Furthermore this must be a numeric column, @@ -608,8 +610,8 @@ class _MergeOperation: def __init__( self, - left: FrameOrSeriesUnion, - right: FrameOrSeriesUnion, + left: DataFrame | Series, + right: DataFrame | Series, how: str = "inner", on: IndexLabel | None = None, left_on: IndexLabel | None = None, @@ -673,6 +675,8 @@ def __init__( f"in a future version. ({left.columns.nlevels} levels on the left," f"{right.columns.nlevels} on the right)" ) + # stacklevel chosen to be correct when this is reached via pd.merge + # (and not DataFrame.join) warnings.warn(msg, FutureWarning, stacklevel=3) self._validate_specification() @@ -1471,7 +1475,7 @@ def get_join_indexers( for n in range(len(left_keys)) ) zipped = zip(*mapped) - llab, rlab, shape = [list(x) for x in zipped] + llab, rlab, shape = (list(x) for x in zipped) # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -1597,8 +1601,8 @@ class _OrderedMerge(_MergeOperation): def __init__( self, - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -1702,8 +1706,8 @@ class _AsOfMerge(_OrderedMerge): def __init__( self, - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -1981,7 +1985,7 @@ def _get_multiindex_indexer( for n in range(index.nlevels) ) zipped = zip(*mapped) - rcodes, lcodes, shape = [list(x) for x in zipped] + rcodes, lcodes, shape = (list(x) for x in zipped) if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 037fe5366255ae..93859eb11dd441 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -176,9 +176,7 @@ def _make_selectors(self): self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift - # error: Argument 1 to "zeros" has incompatible type "number"; expected - # "Union[int, Sequence[int]]" - mask = np.zeros(np.prod(self.full_shape), dtype=bool) # type: ignore[arg-type] + mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) if mask.sum() < len(self.index): diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7b9c3883d74e30..64daf2542e15a4 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -250,7 +250,7 @@ def 
cut( raise ValueError("Cannot cut empty array") rng = (nanops.nanmin(x), nanops.nanmax(x)) - mn, mx = [mi + 0.0 for mi in rng] + mn, mx = (mi + 0.0 for mi in rng) if np.isinf(mn) or np.isinf(mx): # GH 24314 diff --git a/pandas/core/series.py b/pandas/core/series.py index d0ff50cca53550..2f45a2adbdec73 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -51,6 +51,7 @@ from pandas.util._decorators import ( Appender, Substitution, + deprecate_nonkeyword_arguments, doc, ) from pandas.util._validators import ( @@ -223,7 +224,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): name : str, optional The name to give to the Series. copy : bool, default False - Copy input data. + Copy input data. Only affects Series or 1d ndarray input. See examples. Examples -------- @@ -251,6 +252,38 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Note that the Index is first build with the keys from the dictionary. After this the Series is reindexed with the given Index values, hence we get all NaN as a result. + + Constructing Series from a list with `copy=False`. + + >>> r = [1, 2] + >>> ser = pd.Series(r, copy=False) + >>> ser.iloc[0] = 999 + >>> r + [1, 2] + >>> ser + 0 999 + 1 2 + dtype: int64 + + Due to input data type the Series has a `copy` of + the original data even though `copy=False`, so + the data is unchanged. + + Constructing Series from a 1d ndarray with `copy=False`. + + >>> r = np.array([1, 2]) + >>> ser = pd.Series(r, copy=False) + >>> ser.iloc[0] = 999 + >>> r + array([999, 2]) + >>> ser + 0 999 + 1 2 + dtype: int64 + + Due to input data type the Series has a `view` on + the original data, so + the data is changed as well. """ _typ = "series" @@ -1275,6 +1308,7 @@ def repeat(self, repeats, axis=None) -> Series: self, method="repeat" ) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) def reset_index(self, level=None, drop=False, name=None, inplace=False): """ Generate a new DataFrame or Series with the index reset. @@ -2024,6 +2058,7 @@ def drop_duplicates(self, *, inplace: Literal[True]) -> None: def drop_duplicates(self, keep=..., inplace: bool = ...) -> Series | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def drop_duplicates(self, keep="first", inplace=False) -> Series | None: """ Return Series with duplicate values removed. @@ -3224,6 +3259,7 @@ def update(self, other) -> None: # ---------------------------------------------------------------------- # Reindexing, sorting + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_values( self, axis=0, @@ -3434,6 +3470,7 @@ def sort_values( else: return result.__finalize__(self, method="sort_values") + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( self, axis=0, @@ -4446,6 +4483,7 @@ def set_axis(self, labels, *, inplace: Literal[True]) -> None: def set_axis(self, labels, axis: Axis = ..., inplace: bool = ...) -> Series | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) @Appender( """ Examples @@ -4485,6 +4523,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): def reindex(self, index=None, **kwargs): return super().reindex(index=index, **kwargs) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) def drop( self, labels=None, @@ -4707,6 +4746,7 @@ def fillna( ... 
# error: Cannot determine type of 'fillna' + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "value"]) @doc(NDFrame.fillna, **_shared_doc_kwargs) # type: ignore[has-type] def fillna( self, @@ -5031,10 +5071,7 @@ def _convert_dtypes( convert_boolean, convert_floating, ) - try: - result = input_series.astype(inferred_dtype) - except TypeError: - result = input_series.copy() + result = input_series.astype(inferred_dtype) else: result = input_series.copy() return result @@ -5059,6 +5096,7 @@ def notna(self) -> Series: def notnull(self) -> Series: return super().notnull() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def dropna(self, axis=0, inplace=False, how=None): """ Return a new Series with missing values removed. @@ -5256,6 +5294,93 @@ def to_period(self, freq=None, copy=True) -> Series: self, method="to_period" ) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def ffill( + self: Series, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> Series | None: + return super().ffill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def bfill( + self: Series, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> Series | None: + return super().bfill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "lower", "upper"] + ) + def clip( + self: Series, + lower=None, + upper=None, + axis: Axis | None = None, + inplace: bool = False, + *args, + **kwargs, + ) -> Series | None: + return super().clip(lower, upper, axis, inplace, *args, **kwargs) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) + def interpolate( + self: Series, + method: str = "linear", + axis: Axis = 0, + limit: int | None = None, + inplace: bool = False, + limit_direction: str | None = None, + limit_area: str | None = None, + downcast: str | None = None, + **kwargs, + ) -> Series | None: + return super().interpolate( + method, + axis, + limit, + inplace, + limit_direction, + limit_area, + downcast, + **kwargs, + ) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().where(cond, other, inplace, axis, level, errors, try_cast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().mask(cond, other, inplace, axis, level, errors, try_cast) + # ---------------------------------------------------------------------- # Add index _AXIS_ORDERS = ["index"] diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index f6c1afbde0bd9b..8531f93fba3217 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -630,22 +630,15 @@ def get_group_index_sorter( np.ndarray[np.intp] """ if ngroups is None: - # error: Incompatible types in assignment (expression has type "number[Any]", - # variable has type "Optional[int]") - ngroups = 1 + group_index.max() # type: ignore[assignment] + ngroups = 1 + group_index.max() count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these 
parameters - # error: Unsupported operand types for * ("float" and "None") - do_groupsort = count > 0 and ( - (alpha + beta * ngroups) < (count * np.log(count)) # type: ignore[operator] - ) + do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count))) if do_groupsort: - # Argument 2 to "groupsort_indexer" has incompatible type - # "Optional[int]"; expected "int" sorter, _ = algos.groupsort_indexer( ensure_platform_int(group_index), - ngroups, # type: ignore[arg-type] + ngroups, ) # sorter _should_ already be intp, but mypy is not yet able to verify else: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 43df34a7ecbb26..7643019ff8c555 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1,24 +1,19 @@ +from __future__ import annotations + import codecs +from collections.abc import Callable # noqa: PDF001 from functools import wraps import re from typing import ( TYPE_CHECKING, - Dict, Hashable, - List, - Optional, - Pattern, - Union, ) import warnings import numpy as np import pandas._libs.lib as lib -from pandas._typing import ( - ArrayLike, - FrameOrSeriesUnion, -) +from pandas._typing import FrameOrSeriesUnion from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -43,7 +38,7 @@ if TYPE_CHECKING: from pandas import Index -_shared_docs: Dict[str, str] = {} +_shared_docs: dict[str, str] = {} _cpython_optimized_encoders = ( "utf-8", "utf8", @@ -162,7 +157,6 @@ class StringMethods(NoNewAttributesMixin): # TODO: Dispatch all the methods # Currently the following are not dispatched to the array # * cat - # * extract # * extractall def __init__(self, data): @@ -245,7 +239,7 @@ def _wrap_result( self, result, name=None, - expand=None, + expand: bool | None = None, fill_value=np.nan, returns_string=True, ): @@ -284,7 +278,7 @@ def cons_row(x): return [x] result = [cons_row(x) for x in result] - if result: + if result and not self._is_string: # propagate nan values to match longest sequence (GH 18450) max_len = max(len(x) for x in result) result = [ @@ -325,7 +319,7 @@ def cons_row(x): else: index = self._orig.index # This is a mess. - dtype: Optional[str] + dtype: str | None if self._is_string and returns_string: dtype = self._orig.dtype else: @@ -391,7 +385,7 @@ def _get_series_list(self, others): or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): - los: List[Series] = [] + los: list[Series] = [] while others: # iterate through list and append each element los = los + self._get_series_list(others.pop(0)) return los @@ -1219,7 +1213,15 @@ def fullmatch(self, pat, case=True, flags=0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): + def replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool | None = None, + flags: int = 0, + regex: bool | None = None, + ): r""" Replace each occurrence of pattern/regex in the Series/Index. @@ -1348,26 +1350,21 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): ) if len(pat) == 1: msg += ( - " In addition, single character regular expressions will" + " In addition, single character regular expressions will " "*not* be treated as literal strings when regex=True." 
) warnings.warn(msg, FutureWarning, stacklevel=3) - regex = True # Check whether repl is valid (GH 13438, GH 15055) if not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - elif case is None: - # not a compiled regex, set default case - case = True + if regex or regex is None: + if is_compiled_re and (case is not None or flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) elif is_compiled_re: raise ValueError( @@ -1376,6 +1373,17 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): elif callable(repl): raise ValueError("Cannot use a callable replacement when regex=False") + # The current behavior is to treat single character patterns as literal strings, + # even when ``regex`` is set to ``True``. + if isinstance(pat, str) and len(pat) == 1: + regex = False + + if regex is None: + regex = True + + if case is None: + case = True + result = self._data.array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) @@ -2292,7 +2300,7 @@ def findall(self, pat, flags=0): @forbid_nonstring_types(["bytes"]) def extract( self, pat: str, flags: int = 0, expand: bool = True - ) -> Union[FrameOrSeriesUnion, "Index"]: + ) -> FrameOrSeriesUnion | Index: r""" Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -2373,6 +2381,8 @@ def extract( 2 NaN dtype: object """ + from pandas import DataFrame + if not isinstance(expand, bool): raise ValueError("expand must be True or False") @@ -2383,8 +2393,37 @@ def extract( if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex): raise ValueError("only one regex group is supported with Index") - # TODO: dispatch - return str_extract(self, pat, flags, expand=expand) + obj = self._data + result_dtype = _result_dtype(obj) + + returns_df = regex.groups > 1 or expand + + if returns_df: + name = None + columns = _get_group_names(regex) + + if obj.array.size == 0: + result = DataFrame(columns=columns, dtype=result_dtype) + + else: + result_list = self._data.array._str_extract( + pat, flags=flags, expand=returns_df + ) + + result_index: Index | None + if isinstance(obj, ABCSeries): + result_index = obj.index + else: + result_index = None + + result = DataFrame( + result_list, columns=columns, index=result_index, dtype=result_dtype + ) + + else: + name = _get_single_group_name(regex) + result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) + return self._wrap_result(result, name=name) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags=0): @@ -2733,7 +2772,7 @@ def len(self): # boolean: # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} + _doc_args: dict[str, dict[str, str]] = {} _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""} _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""} @@ -2971,7 +3010,7 @@ def casefold(self): ) -def cat_safe(list_of_columns: List, sep: str): +def cat_safe(list_of_columns: list, sep: str): """ Auxiliary function for :meth:`str.cat`. 
@@ -3007,7 +3046,7 @@ def cat_safe(list_of_columns: List, sep: str): return result -def cat_core(list_of_columns: List, sep: str): +def cat_core(list_of_columns: list, sep: str): """ Auxiliary function for :meth:`str.cat` @@ -3046,14 +3085,14 @@ def _result_dtype(arr): return object -def _get_single_group_name(regex: Pattern) -> Hashable: +def _get_single_group_name(regex: re.Pattern) -> Hashable: if regex.groupindex: return next(iter(regex.groupindex)) else: return None -def _get_group_names(regex: Pattern) -> List[Hashable]: +def _get_group_names(regex: re.Pattern) -> list[Hashable]: """ Get named groups from compiled regex. @@ -3071,72 +3110,6 @@ def _get_group_names(regex: Pattern) -> List[Hashable]: return [names.get(1 + i, i) for i in range(regex.groups)] -def _str_extract(arr: ArrayLike, pat: str, flags=0, expand: bool = True): - """ - Find groups in each string in the array using passed regular expression. - - Returns - ------- - np.ndarray or list of lists is expand is True - """ - regex = re.compile(pat, flags=flags) - - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - if expand: - return [f(val) for val in np.asarray(arr)] - - return np.array([f(val)[0] for val in np.asarray(arr)], dtype=object) - - -def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool = True): - from pandas import ( - DataFrame, - array as pd_array, - ) - - obj = accessor._data - result_dtype = _result_dtype(obj) - regex = re.compile(pat, flags=flags) - returns_df = regex.groups > 1 or expand - - if returns_df: - name = None - columns = _get_group_names(regex) - - if obj.array.size == 0: - result = DataFrame(columns=columns, dtype=result_dtype) - - else: - result_list = _str_extract(obj.array, pat, flags=flags, expand=returns_df) - - result_index: Optional["Index"] - if isinstance(obj, ABCSeries): - result_index = obj.index - else: - result_index = None - - result = DataFrame( - result_list, columns=columns, index=result_index, dtype=result_dtype - ) - - else: - name = _get_single_group_name(regex) - result_arr = _str_extract(obj.array, pat, flags=flags, expand=returns_df) - # not dispatching, so we have to reconstruct here. - result = pd_array(result_arr, dtype=result_dtype) - return accessor._wrap_result(result, name=name) - - def str_extractall(arr, pat, flags=0): regex = re.compile(pat, flags=flags) # the regex must contain capture groups. 
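For reference, a minimal usage sketch of the ``Series.str.replace`` behavior the accessor.py hunks above deprecate and preserve, assuming a pandas build with this patch applied; the example Series and its values are illustrative only and not part of the change:

    import pandas as pd

    ser = pd.Series(["a.c", "abc"])

    # regex is left unspecified and "." is a regex metacharacter, so this should
    # emit the FutureWarning added above while still replacing only the literal dot,
    # because single-character patterns are currently treated as literal strings.
    ser.str.replace(".", "-")

    # Passing regex explicitly states the intent and avoids the warning:
    ser.str.replace(".", "-", regex=False)   # literal replacement: ['a-c', 'abc']
    ser.str.replace(r"\.", "-", regex=True)  # escaped regex:       ['a-c', 'abc']
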
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index a77f8861a7c02d..cd71844d3b5271 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,8 +1,8 @@ +from __future__ import annotations + import abc -from typing import ( - Pattern, - Union, -) +from collections.abc import Callable # noqa: PDF001 +import re import numpy as np @@ -52,7 +52,15 @@ def _str_endswith(self, pat, na=None): pass @abc.abstractmethod - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): pass @abc.abstractmethod @@ -68,7 +76,7 @@ def _str_match( @abc.abstractmethod def _str_fullmatch( self, - pat: Union[str, Pattern], + pat: str | re.Pattern, case: bool = True, flags: int = 0, na: Scalar = np.nan, @@ -222,3 +230,7 @@ def _str_split(self, pat=None, n=-1, expand=False): @abc.abstractmethod def _str_rsplit(self, pat=None, n=-1): pass + + @abc.abstractmethod + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): + pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 869eabc76b5557..7ce4abe904f3bb 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,11 +1,8 @@ +from __future__ import annotations + +from collections.abc import Callable # noqa: PDF001 import re import textwrap -from typing import ( - Optional, - Pattern, - Set, - Union, -) import unicodedata import numpy as np @@ -18,10 +15,7 @@ Scalar, ) -from pandas.core.dtypes.common import ( - is_re, - is_scalar, -) +from pandas.core.dtypes.common import is_scalar from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods @@ -38,7 +32,9 @@ def __len__(self): # For typing, _str_map relies on the object being sized. raise NotImplementedError - def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): """ Map a callable over valid element of the array. @@ -53,6 +49,8 @@ def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): for object-dtype and Categorical and ``pd.NA`` for StringArray. dtype : Dtype, optional The dtype of the result array. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray """ if dtype is None: dtype = np.dtype("object") @@ -66,9 +64,9 @@ def _str_map(self, f, na_value=None, dtype: Optional[Dtype] = None): arr = np.asarray(self, dtype=object) mask = isna(arr) - convert = not np.all(mask) + map_convert = convert and not np.all(mask) try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) except (TypeError, AttributeError) as e: # Reraise the exception if callable `f` got wrong number of args. 
# The user may want to be warned by this, instead of getting NaN @@ -94,7 +92,7 @@ def g(x): return result if na_value is not np.nan: np.putmask(result, mask, na_value) - if result.dtype == object: + if convert and result.dtype == object: result = lib.maybe_convert_objects(result) return result @@ -138,15 +136,23 @@ def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_replace(self, pat, repl, n=-1, case: bool = True, flags=0, regex=True): - is_compiled_re = is_re(pat) - + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): if case is False: # add case flag, if provided flags |= re.IGNORECASE - if regex and (is_compiled_re or len(pat) > 1 or flags or callable(repl)): - if not is_compiled_re: + if regex or flags or callable(repl): + if not isinstance(pat, re.Pattern): + if regex is False: + pat = re.escape(pat) pat = re.compile(pat, flags=flags) n = n if n >= 0 else 0 @@ -198,7 +204,7 @@ def _str_match( def _str_fullmatch( self, - pat: Union[str, Pattern], + pat: str | re.Pattern, case: bool = True, flags: int = 0, na: Scalar = None, @@ -339,7 +345,7 @@ def _str_get_dummies(self, sep="|"): except TypeError: arr = sep + arr.astype(str) + sep - tags: Set[str] = set() + tags: set[str] = set() for ts in Series(arr).str.split(sep): tags.update(ts) tags2 = sorted(tags - {""}) @@ -408,3 +414,28 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) + + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): + regex = re.compile(pat, flags=flags) + na_value = self._str_na_value + + if not expand: + + def g(x): + m = regex.search(x) + return m.groups()[0] if m else na_value + + return self._str_map(g, convert=False) + + empty_row = [na_value] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [na_value if item is None else item for item in m.groups()] + else: + return empty_row + + return [f(val) for val in np.asarray(self)] diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index d85aa20de5ab4d..e0720c5d86df1c 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,7 +1,6 @@ """Common utility functions for rolling operations""" from collections import defaultdict from typing import cast -import warnings import numpy as np @@ -15,17 +14,7 @@ def flex_binary_moment(arg1, arg2, f, pairwise=False): - if not ( - isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) - and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) - ): - raise TypeError( - "arguments to moment function must be of type np.ndarray/Series/DataFrame" - ) - - if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( - arg2, (np.ndarray, ABCSeries) - ): + if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries): X, Y = prep_binary(arg1, arg2) return f(X, Y) @@ -43,7 +32,7 @@ def dataframe_from_int_dict(data, frame_template): if pairwise is False: if arg1 is arg2: # special case in order to handle duplicate column names - for i, col in enumerate(arg1.columns): + for i in range(len(arg1.columns)): results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) return dataframe_from_int_dict(results, arg1) else: @@ -51,23 +40,17 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'arg1' columns are not unique") if not 
arg2.columns.is_unique: raise ValueError("'arg2' columns are not unique") - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - X, Y = arg1.align(arg2, join="outer") - X = X + 0 * Y - Y = Y + 0 * X - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - res_columns = arg1.columns.union(arg2.columns) + X, Y = arg1.align(arg2, join="outer") + X, Y = prep_binary(X, Y) + res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) - for i, k1 in enumerate(arg1.columns): - for j, k2 in enumerate(arg2.columns): + for i in range(len(arg1.columns)): + for j in range(len(arg2.columns)): if j < i and arg2 is arg1: # Symmetric case results[i][j] = results[j][i] @@ -85,10 +68,10 @@ def dataframe_from_int_dict(data, frame_template): result = concat( [ concat( - [results[i][j] for j, c in enumerate(arg2.columns)], + [results[i][j] for j in range(len(arg2.columns))], ignore_index=True, ) - for i, c in enumerate(arg1.columns) + for i in range(len(arg1.columns)) ], ignore_index=True, axis=1, @@ -135,13 +118,10 @@ def dataframe_from_int_dict(data, frame_template): ) return result - - else: - raise ValueError("'pairwise' is not True/False") else: results = { i: f(*prep_binary(arg1.iloc[:, i], arg2)) - for i, col in enumerate(arg1.columns) + for i in range(len(arg1.columns)) } return dataframe_from_int_dict(results, arg1) @@ -165,11 +145,7 @@ def zsqrt(x): def prep_binary(arg1, arg2): - if not isinstance(arg2, type(arg1)): - raise Exception("Input arrays must be of the same type!") - # mask out values, this also makes a common index... X = arg1 + 0 * arg2 Y = arg2 + 0 * arg1 - return X, Y diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0ef0896df8d446..2d5f148a6437ac 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -291,6 +291,7 @@ def __repr__(self) -> str: def __iter__(self): obj = self._create_data(self._selected_obj) + obj = obj.set_axis(self._on) indexer = self._get_window_indexer() start, end = indexer.get_window_bounds( @@ -471,6 +472,8 @@ def _apply_pairwise( other = target # only default unset pairwise = True if pairwise is None else pairwise + elif not isinstance(other, (ABCDataFrame, ABCSeries)): + raise ValueError("other must be a DataFrame or Series") return flex_binary_moment(target, other, func, pairwise=bool(pairwise)) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index a0f6ddfd84d7ba..92516a1609f10a 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -12,6 +12,15 @@ ) +class IntCastingNaNError(ValueError): + """ + raised when attempting an astype operation on an array with NaN to an integer + dtype. 
+ """ + + pass + + class NullFrequencyError(ValueError): """ Error raised when a null `freq` attribute is used in an operation diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 00a99eb8a44800..a6940c08198b00 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -58,9 +58,14 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = {x.lstrip().count("\t") for x in lines} + counts = {x.lstrip(" ").count("\t") for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: sep = "\t" + # check the number of leading tabs in the first line + # to account for index columns + index_length = len(lines[0]) - len(lines[0].lstrip(" \t")) + if index_length != 0: + kwargs.setdefault("index_col", list(range(index_length))) # Edge case where sep is specified to be None, return to default if sep is None and kwargs.get("delim_whitespace") is None: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index cf2246f917bbec..42ca68376452db 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -2,7 +2,6 @@ import abc import datetime -import inspect from io import BytesIO import os from textwrap import fill @@ -33,6 +32,7 @@ deprecate_nonkeyword_arguments, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool, @@ -245,6 +245,10 @@ Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally. + + .. deprecated:: 1.3.0 + convert_float will be removed in a future version + mangle_dupe_cols : bool, default True Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there @@ -355,7 +359,7 @@ def read_excel( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, storage_options: StorageOptions = None, ): @@ -489,11 +493,21 @@ def parse( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, **kwds, ): + if convert_float is None: + convert_float = True + else: + stacklevel = find_stack_level() + warnings.warn( + "convert_float is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=stacklevel, + ) + validate_header_arg(header) ret_dict = False @@ -1014,16 +1028,21 @@ def close(self): return content -XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" +XLS_SIGNATURES = ( + b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2 + b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3 + b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4 + b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary +) ZIP_SIGNATURE = b"PK\x03\x04" -PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) +PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,))) @doc(storage_options=_shared_docs["storage_options"]) def inspect_excel_format( content_or_path: FilePathOrBuffer, storage_options: StorageOptions = None, -) -> str: +) -> str | None: """ Inspect the path or content of an excel file and get its format. @@ -1037,8 +1056,8 @@ def inspect_excel_format( Returns ------- - str - Format of file. + str or None + Format of file if it can be determined. 
Raises ------ @@ -1063,10 +1082,10 @@ def inspect_excel_format( peek = buf stream.seek(0) - if peek.startswith(XLS_SIGNATURE): + if any(peek.startswith(sig) for sig in XLS_SIGNATURES): return "xls" elif not peek.startswith(ZIP_SIGNATURE): - raise ValueError("File is not a recognized excel file") + return None # ZipFile typing is overly-strict # https://github.com/python/typeshed/issues/4212 @@ -1174,8 +1193,12 @@ def __init__( ext = inspect_excel_format( content_or_path=path_or_buffer, storage_options=storage_options ) + if ext is None: + raise ValueError( + "Excel file format cannot be determined, you must specify " + "an engine manually." + ) - # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") @@ -1190,22 +1213,14 @@ def __init__( path_or_buffer, storage_options=storage_options ) - if ext != "xls" and xlrd_version >= Version("2"): + # Pass through if ext is None, otherwise check if ext valid for xlrd + if ext and ext != "xls" and xlrd_version >= Version("2"): raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) - elif ext != "xls": - caller = inspect.stack()[1] - if ( - caller.filename.endswith( - os.path.join("pandas", "io", "excel", "_base.py") - ) - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 + elif ext and ext != "xls": + stacklevel = find_stack_level() warnings.warn( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install " @@ -1241,7 +1256,7 @@ def parse( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, **kwds, ): diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 648df0ff2b6d9c..c6ff4e21808939 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -773,7 +773,7 @@ def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: series = self.df.iloc[:, colidx] for i, val in enumerate(series): if styles is not None: - css = ";".join([a + ":" + str(v) for (a, v) in styles[i, colidx]]) + css = ";".join(a + ":" + str(v) for (a, v) in styles[i, colidx]) xlstyle = self.style_converter(css) yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9d653c9a5f97c6..485610af747f61 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1664,19 +1664,9 @@ def format_percentiles( ).astype(int) prec = max(1, prec) out = np.empty_like(percentiles, dtype=object) - # error: No overload variant of "__getitem__" of "list" matches argument type - # "Union[bool_, ndarray]" - out[int_idx] = ( - percentiles[int_idx].astype(int).astype(str) # type: ignore[call-overload] - ) + out[int_idx] = percentiles[int_idx].astype(int).astype(str) - # error: Item "float" of "Union[Any, float, str]" has no attribute "round" - # error: Item "str" of "Union[Any, float, str]" has no attribute "round" - # error: Invalid index type "Union[bool_, Any]" for "Union[ndarray, List[Union[int, - # float]], List[float], List[Union[str, float]]]"; expected type "int" - out[~int_idx] = ( - percentiles[~int_idx].round(prec).astype(str) # type: ignore[union-attr,index] - ) + out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) return [i + "%" for i in out] diff --git 
a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index fce0814e979a4c..476a3647207d60 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -361,7 +361,7 @@ def get_result(self) -> str: self.bottom_separator, self.env_end, ] - result = "\n".join([item for item in elements if item]) + result = "\n".join(item for item in elements if item) trailing_newline = "\n" result += trailing_newline return result @@ -530,13 +530,13 @@ def env_begin(self) -> str: f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" ) elements = [first_row, f"{self._caption_and_label()}"] - return "\n".join([item for item in elements if item]) + return "\n".join(item for item in elements if item) def _caption_and_label(self) -> str: if self.caption or self.label: double_backslash = "\\\\" elements = [f"{self._caption_macro}", f"{self._label_macro}"] - caption_and_label = "\n".join([item for item in elements if item]) + caption_and_label = "\n".join(item for item in elements if item) caption_and_label += double_backslash return caption_and_label else: @@ -614,7 +614,7 @@ def env_begin(self) -> str: f"{self._label_macro}", f"\\begin{{tabular}}{{{self.column_format}}}", ] - return "\n".join([item for item in elements if item]) + return "\n".join(item for item in elements if item) @property def bottom_separator(self) -> str: diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index de53646b5f95f6..20fc84a4df303c 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -74,11 +74,14 @@ def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: return strcols + @property + def _adjusted_tr_col_num(self) -> int: + return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num + def _insert_dot_separator_horizontal( self, strcols: List[List[str]], index_length: int ) -> List[List[str]]: - tr_col_num = self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num - strcols.insert(tr_col_num, [" ..."] * index_length) + strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length) return strcols def _insert_dot_separator_vertical( @@ -90,7 +93,7 @@ def _insert_dot_separator_vertical( cwidth = self.adj.len(col[row_num]) if self.fmt.is_truncated_horizontally: - is_dot_col = ix == self.fmt.tr_col_num + 1 + is_dot_col = ix == self._adjusted_tr_col_num else: is_dot_col = False @@ -99,7 +102,7 @@ def _insert_dot_separator_vertical( else: dots = ".." - if ix == 0: + if ix == 0 and self.fmt.index: dot_mode = "left" elif is_dot_col: cwidth = 4 diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index a96196a698f438..73924631aea5c1 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -17,8 +17,11 @@ import numpy as np +from pandas._config import get_option + from pandas._typing import ( Axis, + FilePathOrBuffer, FrameOrSeries, FrameOrSeriesUnion, IndexLabel, @@ -28,6 +31,7 @@ from pandas.util._decorators import doc import pandas as pd +from pandas import RangeIndex from pandas.api.types import is_list_like from pandas.core import generic import pandas.core.common as com @@ -37,6 +41,8 @@ ) from pandas.core.generic import NDFrame +from pandas.io.formats.format import save_to_buffer + jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") from pandas.io.formats.style_render import ( @@ -68,7 +74,7 @@ def _mpl(func: Callable): class Styler(StylerRenderer): - """ + r""" Helps style a DataFrame or Series according to the data with HTML and CSS. 
Parameters @@ -113,9 +119,12 @@ class Styler(StylerRenderer): .. versionadded:: 1.3.0 - escape : bool, default False - Replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` in cell display - strings with HTML-safe sequences. + escape : str, optional + Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` + in cell display string with HTML-safe sequences. + Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, + ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with + LaTeX-safe sequences. ... versionadded:: 1.3.0 @@ -173,7 +182,7 @@ def __init__( uuid_len: int = 5, decimal: str = ".", thousands: str | None = None, - escape: bool = False, + escape: str | None = None, ): super().__init__( data=data, @@ -201,14 +210,27 @@ def _repr_html_(self) -> str: """ Hooks into Jupyter notebook rich display system. """ - return self._render_html() + return self.render() - def render(self, **kwargs) -> str: + def render( + self, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + **kwargs, + ) -> str: """ Render the ``Styler`` including all applied styles to HTML. Parameters ---------- + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index`` value. + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.columns`` value. **kwargs Any additional keyword arguments are passed through to ``self.template.render``. @@ -240,7 +262,11 @@ def render(self, **kwargs) -> str: * caption * table_attributes """ - return self._render_html(**kwargs) + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + return self._render_html(sparse_index, sparse_columns, **kwargs) def set_tooltips( self, @@ -384,6 +410,406 @@ def to_excel( engine=engine, ) + def to_latex( + self, + buf: FilePathOrBuffer[str] | None = None, + *, + column_format: str | None = None, + position: str | None = None, + position_float: str | None = None, + hrules: bool = False, + label: str | None = None, + caption: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + multirow_align: str = "c", + multicol_align: str = "r", + siunitx: bool = False, + encoding: str | None = None, + ): + r""" + Write Styler to a file, buffer or string in LaTeX format. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + buf : str, Path, or StringIO-like, optional, default None + Buffer to write to. If ``None``, the output is returned as a string. + column_format : str, optional + The LaTeX column specification placed in location: + + \\begin{tabular}{} + + Defaults to 'l' for index and + non-numeric data columns, and, for numeric data columns, + to 'r' by default, or 'S' if ``siunitx`` is ``True``. + position : str, optional + The LaTeX positional argument (e.g. 'h!') for tables, placed in location: + + \\begin{table}[] + position_float : {"centering", "raggedleft", "raggedright"}, optional + The LaTeX float command placed in location: + + \\begin{table}[] + + \\ + hrules : bool, default False + Set to `True` to add \\toprule, \\midrule and \\bottomrule from the + {booktabs} LaTeX package. 
+ label : str, optional + The LaTeX label included as: \\label{