diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index 55dd733d25b50..425abef850184 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -77,7 +77,7 @@ jobs: echo 'EOF' >> $GITHUB_ENV echo "REGEX=$REGEX" >> $GITHUB_ENV - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} REGEX: ${{env.REGEX}} diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml index b3f9bcd840c68..ec71daf6f84ab 100644 --- a/.github/workflows/deprecation-tracking-bot.yml +++ b/.github/workflows/deprecation-tracking-bot.yml @@ -21,7 +21,7 @@ jobs: env: DEPRECATION_TRACKER_ISSUE: 50578 steps: - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 id: update-deprecation-issue with: script: | diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 9785b81ae9e0b..33b6e7a8c2340 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -23,7 +23,7 @@ defaults: jobs: ubuntu: runs-on: ubuntu-22.04 - timeout-minutes: 180 + timeout-minutes: 90 strategy: matrix: env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml] @@ -88,7 +88,6 @@ jobs: name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: PATTERN: ${{ matrix.pattern }} - EXTRA_APT: ${{ matrix.extra_apt || '' }} LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} @@ -96,6 +95,8 @@ jobs: TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: 'auto' PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + # Clipboard tests + QT_QPA_PLATFORM: offscreen concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }} @@ -145,8 +146,8 @@ jobs: fetch-depth: 0 - name: Extra installs - # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }} + if: ${{ matrix.extra_apt }} - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests @@ -177,7 +178,7 @@ jobs: if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} macos-windows: - timeout-minutes: 180 + timeout-minutes: 90 strategy: matrix: os: [macos-latest, windows-latest] @@ -322,7 +323,7 @@ jobs: matrix: os: [ubuntu-22.04, macOS-latest, windows-latest] - timeout-minutes: 180 + timeout-minutes: 90 concurrency: #https://github.community/t/concurrecy-not-work-for-push/183068/7 diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 810764754b7e1..9b0dc14fe6747 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -41,7 +41,7 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). 
"matrix": { - "Cython": ["0.29.33"], + "Cython": ["3.0.5"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5e3eb7c7d4dac..202ba6e981b70 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -802,6 +802,51 @@ def time_groupby_extra_cat_nosort(self, observed): self.df_extra_cat.groupby("a", observed=observed, sort=False)["b"].count() +class MultipleCategories: + def setup(self): + N = 10**3 + arr = np.random.random(N) + data = { + "a1": Categorical(np.random.randint(10000, size=N)), + "a2": Categorical(np.random.randint(10000, size=N)), + "b": arr, + } + self.df = DataFrame(data) + data = { + "a1": Categorical(np.random.randint(10000, size=N), ordered=True), + "a2": Categorical(np.random.randint(10000, size=N), ordered=True), + "b": arr, + } + self.df_ordered = DataFrame(data) + data = { + "a1": Categorical(np.random.randint(100, size=N), categories=np.arange(N)), + "a2": Categorical(np.random.randint(100, size=N), categories=np.arange(N)), + "b": arr, + } + self.df_extra_cat = DataFrame(data) + + def time_groupby_sort(self): + self.df.groupby(["a1", "a2"], observed=False)["b"].count() + + def time_groupby_nosort(self): + self.df.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() + + def time_groupby_ordered_sort(self): + self.df_ordered.groupby(["a1", "a2"], observed=False)["b"].count() + + def time_groupby_ordered_nosort(self): + self.df_ordered.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() + + def time_groupby_extra_cat_sort(self): + self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].count() + + def time_groupby_extra_cat_nosort(self): + self.df_extra_cat.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() + + def time_groupby_transform(self): + self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].cumsum() + + class Datelike: # GH 14338 params = ["period_range", "date_range", "date_range_tz"] diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 8058b347383a7..4961722c0e9cd 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -306,6 +306,10 @@ def time_loc_null_slice_plus_slice(self, unique_levels): target = (self.tgt_null_slice, self.tgt_slice) self.df.loc[target, :] + def time_loc_multiindex(self, unique_levels): + target = self.df.index[::10] + self.df.loc[target] + def time_xs_level_0(self, unique_levels): target = self.tgt_scalar self.df.xs(target, level=0) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 1826291034dee..a45315f63d62e 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -621,4 +621,15 @@ def time_read_csv_index_col(self): ) +class ReadCSVCParserLowMemory: + # GH 16798 + def setup(self): + self.csv = StringIO( + "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]) + ) + + def peakmem_over_2gb_input(self): + read_csv(self.csv, engine="c", low_memory=False) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d70d9d0aa5227..6682a60f42997 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -245,7 +245,8 @@ def time_extract_single_group(self, dtype, expand): class Dummies(Dtypes): def setup(self, dtype): super().setup(dtype) - self.s = self.s.str.join("|") + N = len(self.s) // 5 + self.s = self.s[:N].str.join("|") def 
time_get_dummies(self, dtype): self.s.str.get_dummies("|") diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 67f3b7736018d..af3bfac6d3d01 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -151,7 +151,11 @@ def setup(self, size, freq, tz): # tzlocal is cumbersomely slow, so skip to keep runtime in check raise NotImplementedError - arr = np.arange(10, dtype="i8").repeat(size // 10) + # we pick 2**55 because smaller values end up returning + # -1 from npy_datetimestruct_to_datetime with NPY_FR_Y frequency + # this artificially slows down functions since -1 is also the + # error sentinel + arr = np.arange(2**55, 2**55 + 10, dtype="i8").repeat(size // 10) self.i8values = arr def time_dt64arr_to_periodarr(self, size, freq, tz): diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index b22f8cb34c814..7d08f2df5a8ca 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -33,7 +34,7 @@ dependencies: - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 - - matplotlib>=3.6.3, <3.8 + - matplotlib>=3.6.3 - numba>=0.56.4 - numexpr>=2.8.4 - odfpy>=1.4.1 @@ -42,6 +43,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -56,5 +58,6 @@ dependencies: - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.8 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index ceea734352fca..7a2bbe442cde7 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -16,6 +16,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -34,7 +35,7 @@ dependencies: - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 - - matplotlib>=3.6.3, <3.8 + - matplotlib>=3.6.3 - numba>=0.56.4 - numexpr>=2.8.4 - odfpy>=1.4.1 @@ -43,6 +44,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -70,6 +72,7 @@ dependencies: - pyyaml - py - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - - pyqt5>=5.15.8 - tzdata>=2022.7 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 1c9f349c2eb28..681345a34513d 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -33,11 +34,12 @@ dependencies: - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 - - matplotlib>=3.6.3, <3.8 + - matplotlib>=3.6.3 - numba>=0.56.4 - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 + - pyqt>=5.15.9 - openpyxl>=3.1.0 - psycopg2>=2.9.6 - pyarrow>=10.0.1 @@ -56,5 +58,6 @@ dependencies: - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.8 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index aa8597978ecf7..2e3b03f63eb99 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -17,6 +17,7 @@ 
dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -44,6 +45,7 @@ dependencies: - psycopg2=2.9.6 - pyarrow=10.0.1 - pymysql=1.0.2 + - pyqt=5.15.9 - pyreadstat=1.2.0 - pytables=3.8.0 - python-calamine=0.1.6 @@ -58,6 +60,7 @@ dependencies: - zstandard=0.19.0 - pip: + - adbc-driver-postgresql==0.8.0 + - adbc-driver-sqlite==0.8.0 - dataframe-api-compat==0.1.7 - - pyqt5==5.15.8 - tzdata==2022.7 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 92b88a34094d2..d3e545b636de9 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -33,7 +34,7 @@ dependencies: - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 - - matplotlib>=3.6.3, <3.8 + - matplotlib>=3.6.3 - numba>=0.56.4 - numexpr>=2.8.4 - odfpy>=1.4.1 @@ -42,6 +43,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -56,5 +58,6 @@ dependencies: - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.8 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index f81b91fcbae3b..0098406037876 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -33,7 +34,7 @@ dependencies: - gcsfs>=2022.11.0 - jinja2>=3.1.2 - lxml>=4.9.2 - - matplotlib>=3.6.3, <3.8 + - matplotlib>=3.6.3 - numba>=0.56.4 - numexpr>=2.8.4 - odfpy>=1.4.1 @@ -42,6 +43,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -54,3 +56,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 + - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 2af66da6b8baf..c048c71613b57 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -335,7 +335,7 @@ lxml 4.9.2 xml XML parser for read SQL databases ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. +Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"`` ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes @@ -345,6 +345,8 @@ SQLAlchemy 2.0.0 postgresql, SQL support for dat sql-other psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy pymysql 1.0.2 mysql MySQL engine for sqlalchemy +adbc-driver-postgresql 0.8.0 postgresql ADBC Driver for PostgreSQL +adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite ========================= ================== =============== ============================================================= Other data sources @@ -395,7 +397,7 @@ Installable with ``pip install "pandas[clipboard]"``. 
========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -PyQt4/PyQt5 5.15.8 clipboard Clipboard I/O +PyQt4/PyQt5 5.15.9 clipboard Clipboard I/O qtpy 2.3.0 clipboard Clipboard I/O ========================= ================== =============== ============================================================= diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index 25e5b3b46b4f3..fa6105761df0a 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -489,3 +489,5 @@ Methods PeriodIndex.asfreq PeriodIndex.strftime PeriodIndex.to_timestamp + PeriodIndex.from_fields + PeriodIndex.from_ordinals diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst index d0c57b56585db..fc6f62ec2a4bb 100644 --- a/doc/source/user_guide/copy_on_write.rst +++ b/doc/source/user_guide/copy_on_write.rst @@ -6,6 +6,12 @@ Copy-on-Write (CoW) ******************* +.. note:: + + Copy-on-Write will become the default in pandas 3.0. We recommend + :ref:`turning it on now ` + to benefit from all improvements. + Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the optimizations that become possible through CoW are implemented and supported. All possible optimizations are supported starting from pandas 2.1. @@ -123,6 +129,8 @@ CoW triggers a copy when ``df`` is changed to avoid mutating ``view`` as well: df view +.. _copy_on_write_chained_assignment: + Chained Assignment ------------------ @@ -238,6 +246,8 @@ and :meth:`DataFrame.rename`. These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution. +.. _copy_on_write_enabling: + How to enable CoW ----------------- diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 7b839d62ddde9..4954ee1538697 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1727,6 +1727,22 @@ You can assign a custom index to the ``index`` attribute: Returning a view versus a copy ------------------------------ +.. warning:: + + :ref:`Copy-on-Write ` + will become the new default in pandas 3.0. This means that chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section ` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + .. code-block:: python + + pd.options.mode.copy_on_write = True + + even before pandas 3.0 is available. + When setting values in a pandas object, care must be taken to avoid what is called ``chained indexing``. Here is an example. @@ -1765,6 +1781,22 @@ faster, and allows one to index *both* axes if so desired. Why does assignment fail when using chained indexing? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + :ref:`Copy-on-Write ` + will become the new default in pandas 3.0. This means that chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section ` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + .. code-block:: python + + pd.options.mode.copy_on_write = True + + even before pandas 3.0 is available.
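As a quick illustration of the behaviour the warning above describes (a minimal sketch, assuming a pandas 2.x environment where ``pd.options.mode.copy_on_write`` is available; this snippet is illustrative and not taken from the patch itself):

.. code-block:: python

    import pandas as pd

    pd.options.mode.copy_on_write = True

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # Chained indexing assigns into a temporary intermediate object, so with
    # Copy-on-Write enabled it never updates ``df``.
    df[df["a"] > 1]["b"] = 0

    # A single .loc call operates on ``df`` directly and works as intended.
    df.loc[df["a"] > 1, "b"] = 0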
+ The problem in the previous section is just a performance issue. What's up with the ``SettingWithCopy`` warning? We don't **usually** throw warnings around when you do something that might cost a few extra milliseconds! @@ -1821,6 +1853,22 @@ Yikes! Evaluation order matters ~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + :ref:`Copy-on-Write ` + will become the new default in pandas 3.0. This means that chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section ` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + .. code-block:: python + + pd.options.mode.copy_on_write = True + + even before pandas 3.0 is available. + When you use chained indexing, the order and type of the indexing operation partially determine whether the result is a slice into the original object, or a copy of the slice. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 73ec09cdd12f1..21e07e8d00ad6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5565,9 +5565,23 @@ SQL queries ----------- The :mod:`pandas.io.sql` module provides a collection of query wrappers to both -facilitate data retrieval and to reduce dependency on DB-specific API. Database abstraction -is provided by SQLAlchemy if installed. In addition you will need a driver library for -your database. Examples of such drivers are `psycopg2 `__ +facilitate data retrieval and to reduce dependency on DB-specific API. + +Where available, users may first want to opt for `Apache Arrow ADBC +`_ drivers. These drivers +should provide the best performance, null handling, and type detection. + + .. versionadded:: 2.2.0 + + Added native support for ADBC drivers + +For a full list of ADBC drivers and their development status, see the `ADBC Driver +Implementation Status `_ +documentation. + +Where an ADBC driver is not available or may be missing functionality, +users should opt for installing SQLAlchemy alongside their database driver library. +Examples of such drivers are `psycopg2 `__ for PostgreSQL or `pymysql `__ for MySQL. For `SQLite `__ this is included in Python's standard library by default. @@ -5600,6 +5614,18 @@ In the following example, we use the `SQlite engine. You can use a temporary SQLite database where data are stored in "memory". +To connect using an ADBC driver you will want to install the ``adbc_driver_sqlite`` using your +package manager. Once installed, you can use the DBAPI interface provided by the ADBC driver +to connect to your database. + +.. code-block:: python + + import adbc_driver_sqlite.dbapi as sqlite_dbapi + + # Create the connection + with sqlite_dbapi.connect("sqlite:///:memory:") as conn: + df = pd.read_sql_table("data", conn) + To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine object from database URI. You only need to create the engine once per database you are connecting to. @@ -5675,9 +5701,74 @@ writes ``data`` to the database in batches of 1000 rows at a time: SQL data types ++++++++++++++ -:func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate -SQL data type based on the dtype of the data. When you have columns of dtype -``object``, pandas will try to infer the data type. +Ensuring consistent data type management across SQL databases is challenging.
+Not every SQL database offers the same types, and even when they do the implementation +of a given type can vary in ways that have subtle effects on how types can be +preserved. + +For the best odds at preserving database types users are advised to use +ADBC drivers when available. The Arrow type system offers a wider array of +types that more closely match database types than the historical pandas/NumPy +type system. To illustrate, note this (non-exhaustive) listing of types +available in different databases and pandas backends: + ++-----------------+-----------------------+----------------+---------+ +|numpy/pandas |arrow |postgres |sqlite | ++=================+=======================+================+=========+ +|int16/Int16 |int16 |SMALLINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int32/Int32 |int32 |INTEGER |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int64/Int64 |int64 |BIGINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|float32 |float32 |REAL |REAL | ++-----------------+-----------------------+----------------+---------+ +|float64 |float64 |DOUBLE PRECISION|REAL | ++-----------------+-----------------------+----------------+---------+ +|object |string |TEXT |TEXT | ++-----------------+-----------------------+----------------+---------+ +|bool |``bool_`` |BOOLEAN | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns] |timestamp(us) |TIMESTAMP | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns,tz]|timestamp(us,tz) |TIMESTAMPTZ | | ++-----------------+-----------------------+----------------+---------+ +| |date32 |DATE | | ++-----------------+-----------------------+----------------+---------+ +| |month_day_nano_interval|INTERVAL | | ++-----------------+-----------------------+----------------+---------+ +| |binary |BINARY |BLOB | ++-----------------+-----------------------+----------------+---------+ +| |decimal128 |DECIMAL [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |list |ARRAY [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |struct |COMPOSITE TYPE | | +| | | [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ + +.. rubric:: Footnotes + +.. [#f1] Not implemented as of writing, but theoretically possible + +If you are interested in preserving database types as best as possible +throughout the lifecycle of your DataFrame, users are encouraged to +leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` + +.. code-block:: ipython + + # for roundtripping + with pg_dbapi.connect(uri) as conn: + df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") + +This will prevent your data from being converted to the traditional pandas/NumPy +type system, which often converts SQL types in ways that make them impossible to +round-trip. + +In case an ADBC driver is not available, :func:`~pandas.DataFrame.to_sql` +will try to map your data to an appropriate SQL data type based on the dtype of +the data. When you have columns of dtype ``object``, pandas will try to infer +the data type. You can always override the default type by specifying the desired SQL type of any of the columns by using the ``dtype`` argument. 
This argument needs a @@ -5696,7 +5787,9 @@ default ``Text`` type for string columns: Due to the limited support for timedelta's in the different database flavors, columns with type ``timedelta64`` will be written as integer - values as nanoseconds to the database and a warning will be raised. + values as nanoseconds to the database and a warning will be raised. The only + exception to this is when using the ADBC PostgreSQL driver in which case a + timedelta will be written to the database as an ``INTERVAL`` .. note:: @@ -5711,7 +5804,7 @@ default ``Text`` type for string columns: Datetime data types ''''''''''''''''''' -Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing +Using ADBC or SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing datetime data that is timezone naive or timezone aware. However, the resulting data stored in the database ultimately depends on the supported data type for datetime data of the database system being used. @@ -5802,7 +5895,7 @@ table name and optionally a subset of columns to read. .. note:: In order to use :func:`~pandas.read_sql_table`, you **must** have the - SQLAlchemy optional dependency installed. + ADBC driver or SQLAlchemy optional dependency installed. .. ipython:: python @@ -5810,7 +5903,8 @@ table name and optionally a subset of columns to read. .. note:: - Note that pandas infers column dtypes from query outputs, and not by looking + ADBC drivers will map database types directly back to arrow types. For other drivers + note that pandas infers column dtypes from query outputs, and not by looking up data types in the physical database schema. For example, assume ``userid`` is an integer column in a table. Then, intuitively, ``select userid ...`` will return integer-valued series, while ``select cast(userid as text) ...`` will diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 6731b70fc3860..105d11b67c9bf 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -882,16 +882,16 @@ into ``freq`` keyword arguments. 
The available date offsets and associated frequ :class:`~pandas.tseries.offsets.BMonthBegin` or :class:`~pandas.tseries.offsets.BusinessMonthBegin`, ``'BMS'``, "business month begin" :class:`~pandas.tseries.offsets.CBMonthEnd` or :class:`~pandas.tseries.offsets.CustomBusinessMonthEnd`, ``'CBME'``, "custom business month end" :class:`~pandas.tseries.offsets.CBMonthBegin` or :class:`~pandas.tseries.offsets.CustomBusinessMonthBegin`, ``'CBMS'``, "custom business month begin" - :class:`~pandas.tseries.offsets.SemiMonthEnd`, ``'SM'``, "15th (or other day_of_month) and calendar month end" + :class:`~pandas.tseries.offsets.SemiMonthEnd`, ``'SME'``, "15th (or other day_of_month) and calendar month end" :class:`~pandas.tseries.offsets.SemiMonthBegin`, ``'SMS'``, "15th (or other day_of_month) and calendar month begin" :class:`~pandas.tseries.offsets.QuarterEnd`, ``'QE'``, "calendar quarter end" :class:`~pandas.tseries.offsets.QuarterBegin`, ``'QS'``, "calendar quarter begin" - :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQ``, "business quarter end" + :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQE``, "business quarter end" :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin" :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter" :class:`~pandas.tseries.offsets.YearEnd`, ``'YE'``, "calendar year end" :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'`` or ``'BYS'``,"calendar year begin" - :class:`~pandas.tseries.offsets.BYearEnd`, ``'BY'``, "business year end" + :class:`~pandas.tseries.offsets.BYearEnd`, ``'BYE'``, "business year end" :class:`~pandas.tseries.offsets.BYearBegin`, ``'BYS'``, "business year begin" :class:`~pandas.tseries.offsets.FY5253`, ``'RE'``, "retail (aka 52-53 week) year" :class:`~pandas.tseries.offsets.Easter`, None, "Easter holiday" @@ -1241,7 +1241,7 @@ frequencies. We will refer to these aliases as *offset aliases*. "D", "calendar day frequency" "W", "weekly frequency" "ME", "month end frequency" - "SM", "semi-month end frequency (15th and end of month)" + "SME", "semi-month end frequency (15th and end of month)" "BME", "business month end frequency" "CBME", "custom business month end frequency" "MS", "month start frequency" @@ -1249,11 +1249,11 @@ frequencies. We will refer to these aliases as *offset aliases*. "BMS", "business month start frequency" "CBMS", "custom business month start frequency" "QE", "quarter end frequency" - "BQ", "business quarter end frequency" + "BQE", "business quarter end frequency" "QS", "quarter start frequency" "BQS", "business quarter start frequency" "YE", "year end frequency" - "BY", "business year end frequency" + "BYE", "business year end frequency" "YS", "year start frequency" "BYS", "business year start frequency" "h", "hourly frequency" @@ -1686,7 +1686,7 @@ the end of the interval. .. warning:: The default values for ``label`` and ``closed`` is '**left**' for all - frequency offsets except for 'ME', 'YE', 'QE', 'BME', 'BY', 'BQ', and 'W' + frequency offsets except for 'ME', 'YE', 'QE', 'BME', 'BYE', 'BQE', and 'W' which all have a default of 'right'. This might unintendedly lead to looking ahead, where the value for a later diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 32b3ced4f9d39..1ae711113773f 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -329,11 +329,13 @@ These provide date offsets anchored (by default) to the 15th and end of month, a **SemiMonthEnd**: -.. ipython:: python +.. 
code-block:: python - pd.Timestamp("2016-01-01") + SemiMonthEnd() + In [46]: pd.Timestamp("2016-01-01") + SemiMonthEnd() + Out[46]: Timestamp('2016-01-15 00:00:00') - pd.date_range("2015-01-01", freq="SM", periods=4) + In [47]: pd.date_range("2015-01-01", freq="SM", periods=4) + Out[47]: DatetimeIndex(['2015-01-15', '2015-01-31', '2015-02-15', '2015-02-28'], dtype='datetime64[ns]', freq='SM-15') **SemiMonthBegin**: @@ -345,11 +347,13 @@ These provide date offsets anchored (by default) to the 15th and end of month, a Using the anchoring suffix, you can also specify the day of month to use instead of the 15th. -.. ipython:: python +.. code-block:: python - pd.date_range("2015-01-01", freq="SMS-16", periods=4) + In [50]: pd.date_range("2015-01-01", freq="SMS-16", periods=4) + Out[50]: DatetimeIndex(['2015-01-01', '2015-01-16', '2015-02-01', '2015-02-16'], dtype='datetime64[ns]', freq='SMS-16') - pd.date_range("2015-01-01", freq="SM-14", periods=4) + In [51]: pd.date_range("2015-01-01", freq="SM-14", periods=4) + Out[51]: DatetimeIndex(['2015-01-14', '2015-01-31', '2015-02-14', '2015-02-28'], dtype='datetime64[ns]', freq='SM-14') .. _whatsnew_0190.enhancements.index: diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 04bbb0f806cbd..543a9864ced26 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -21,7 +21,10 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) +- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9040dba238c88..8893fe0ecd398 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -89,6 +89,97 @@ a Series. (:issue:`55323`) ) series.list[0] +.. _whatsnew_220.enhancements.adbc_support: + +ADBC Driver support in to_sql and read_sql +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_sql` and :meth:`~DataFrame.to_sql` now work with `Apache Arrow ADBC +`_ drivers. Compared to +traditional drivers used via SQLAlchemy, ADBC drivers should provide +significant performance improvements, better type support and cleaner +nullability handling. + +.. code-block:: ipython + + import adbc_driver_postgresql.dbapi as pg_dbapi + + df = pd.DataFrame( + [ + [1, 2, 3], + [4, 5, 6], + ], + columns=['a', 'b', 'c'] + ) + uri = "postgresql://postgres:postgres@localhost/postgres" + with pg_dbapi.connect(uri) as conn: + df.to_sql("pandas_table", conn, index=False) + + # for roundtripping + with pg_dbapi.connect(uri) as conn: + df2 = pd.read_sql("pandas_table", conn) + +The Arrow type system offers a wider array of types that can more closely match +what databases like PostgreSQL can offer. 
To illustrate, note this (non-exhaustive) +listing of types available in different databases and pandas backends: + ++-----------------+-----------------------+----------------+---------+ +|numpy/pandas |arrow |postgres |sqlite | ++=================+=======================+================+=========+ +|int16/Int16 |int16 |SMALLINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int32/Int32 |int32 |INTEGER |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int64/Int64 |int64 |BIGINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|float32 |float32 |REAL |REAL | ++-----------------+-----------------------+----------------+---------+ +|float64 |float64 |DOUBLE PRECISION|REAL | ++-----------------+-----------------------+----------------+---------+ +|object |string |TEXT |TEXT | ++-----------------+-----------------------+----------------+---------+ +|bool |``bool_`` |BOOLEAN | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns] |timestamp(us) |TIMESTAMP | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns,tz]|timestamp(us,tz) |TIMESTAMPTZ | | ++-----------------+-----------------------+----------------+---------+ +| |date32 |DATE | | ++-----------------+-----------------------+----------------+---------+ +| |month_day_nano_interval|INTERVAL | | ++-----------------+-----------------------+----------------+---------+ +| |binary |BINARY |BLOB | ++-----------------+-----------------------+----------------+---------+ +| |decimal128 |DECIMAL [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |list |ARRAY [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |struct |COMPOSITE TYPE | | +| | | [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ + +.. rubric:: Footnotes + +.. [#f1] Not implemented as of writing, but theoretically possible + +If you are interested in preserving database types as best as possible +throughout the lifecycle of your DataFrame, users are encouraged to +leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` + +.. code-block:: ipython + + # for roundtripping + with pg_dbapi.connect(uri) as conn: + df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") + +This will prevent your data from being converted to the traditional pandas/NumPy +type system, which often converts SQL types in ways that make them impossible to +round-trip. + +For a full list of ADBC drivers and their development status, see the `ADBC Driver +Implementation Status `_ +documentation. + .. 
_whatsnew_220.enhancements.other: Other enhancements @@ -233,14 +324,19 @@ Other API changes Deprecations ~~~~~~~~~~~~ -Deprecate aliases ``M``, ``Q``, and ``Y`` in favour of ``ME``, ``QE``, and ``YE`` for offsets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Deprecate aliases ``M``, ``SM``, ``BM``, ``CBM``, ``Q``, ``BQ``, ``Y``, and ``BY`` in favour of ``ME``, ``SME``, ``BME``, ``CBME``, ``QE``, ``BQE``, ``YE``, and ``BYE`` for offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Deprecated the following frequency aliases (:issue:`9586`): - ``M`` (month end) has been renamed ``ME`` for offsets +- ``SM`` (semi month end) has been renamed ``SME`` for offsets +- ``BM`` (business month end) has been renamed ``BME`` for offsets +- ``CBM`` (custom business month end) has been renamed ``CBME`` for offsets - ``Q`` (quarter end) has been renamed ``QE`` for offsets +- ``BQ`` (business quarter end) has been renamed ``BQE`` for offsets - ``Y`` (year end) has been renamed ``YE`` for offsets +- ``BY`` (business year end) has been renamed ``BYE`` for offsets For example: @@ -262,8 +358,13 @@ For example: Other Deprecations ^^^^^^^^^^^^^^^^^^ - Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) - Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) +- Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) - Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) +- Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) +- Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) +- Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) @@ -282,18 +383,19 @@ Other Deprecations - Deprecated automatic downcasting of object-dtype results in :meth:`Series.replace` and :meth:`DataFrame.replace`, explicitly call ``result = result.infer_objects(copy=False)`` instead. 
To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54710`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) +- Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) - Deprecated string ``AS`` denoting frequency in :class:`YearBegin` and strings ``AS-DEC``, ``AS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`) - Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`) - Deprecated string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`) - Deprecated string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`) -- Deprecated strings ``BM``, and ``CBM`` denoting frequencies in :class:`BusinessMonthEnd`, :class:`CustomBusinessMonthEnd` (:issue:`52064`) - Deprecated strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`52536`) - Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`) - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) +- Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) @@ -305,17 +407,22 @@ Other Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvement in :func:`.testing.assert_frame_equal` and 
:func:`.testing.assert_series_equal` for objects indexed by a :class:`MultiIndex` (:issue:`55949`) +- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` (:issue:`55949`, :issue:`55971`) - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) +- Performance improvement in :func:`get_dummies` (:issue:`56089`) - Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`) - Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) +- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` when indexing with a :class:`MultiIndex` (:issue:`56062`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement in :meth:`Index.sort_values` when index is already sorted (:issue:`56128`) - Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`) - Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) +- Performance improvement in :meth:`Series.str.get_dummies` when dtype is ``"string[pyarrow]"`` or ``"string[pyarrow_numpy]"`` (:issue:`56110`) - Performance improvement in :meth:`Series.str` methods (:issue:`55736`) +- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`) - Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) - Performance improvement when indexing into a non-unique index (:issue:`55816`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) @@ -341,16 +448,22 @@ Datetimelike - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) +- Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`) - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) - Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) - Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) +- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) +- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) +- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) +- Bug in the results of :func:`pd.to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) - Timedelta @@ -362,7 +475,7 @@ Timezones ^^^^^^^^^ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`Timestamp` construction with an ambiguous value and a ``pytz`` timezone failing to raise ``pytz.AmbiguousTimeError`` (:issue:`55657`) -- +- Bug in :meth:`Timestamp.tz_localize` with ``nonexistent="shift_forward"`` around UTC+0 during DST (:issue:`51501`) Numeric ^^^^^^^ @@ -385,6 +498,7 @@ Strings Interval ^^^^^^^^ - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown.
(:issue:`55015`) +- Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) - Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) @@ -440,6 +554,9 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) +- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) +- Bug in :meth:`DataFrameGroupBy.value_counts` and :meth:`SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) Reshaping ^^^^^^^^^ @@ -448,10 +565,11 @@ Reshaping - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`pandas.DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) +- Bug in :meth:`pandas.DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) Sparse ^^^^^^ -- +- Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) - ExtensionArray @@ -473,6 +591,7 @@ Other - Bug in :meth:`Dataframe.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`) - Bug in rendering ``inf`` values inside a a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) +- Bug in the error message when assigning an empty dataframe to a column (:issue:`55956`) ..
***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index e41389c7f262a..4aa8073e93f6d 100644 --- a/environment.yml +++ b/environment.yml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython=0.29.33 + - cython=3.0.5 - meson[ninja]=1.2.1 - meson-python=0.13.1 @@ -16,6 +16,8 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 + - pytest-qt>=4.2.0 + - pyqt>=5.15.9 - coverage # required dependencies @@ -35,7 +37,7 @@ dependencies: - ipython - jinja2>=3.1.2 - lxml>=4.9.2 - - matplotlib>=3.6.3, <3.8 + - matplotlib>=3.6.3 - numba>=0.56.4 - numexpr>=2.8.4 - openpyxl>=3.1.0 @@ -113,6 +115,8 @@ dependencies: - pygments # Code highlighting - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2 - typing_extensions; python_version<"3.11" diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b330b37aebd8d..e70ac26a2c28e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -998,8 +998,7 @@ def rank_1d( N = len(values) if labels is not None: - # TODO(cython3): cast won't be necessary (#2992) - assert len(labels) == N + assert len(labels) == N out = np.empty(N) grp_sizes = np.ones(N, dtype=np.int64) @@ -1145,107 +1144,7 @@ cdef void rank_sorted_1d( # that sorted value for retrieval back from the original # values / masked_vals arrays # TODO(cython3): de-duplicate once cython supports conditional nogil - if numeric_object_t is object: - with gil: - for i in range(N): - at_end = i == N - 1 - - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change. Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], - masked_vals[sort_indexer[i+1]]) - - # We'll need this check later anyway to determine group size, so just - # compute it here since shortcircuiting won't help - group_changed = at_end or (check_labels and - (labels[sort_indexer[i]] - != labels[sort_indexer[i+1]])) - - # Update out only when there is a transition of values or labels. - # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if (next_val_diff or group_changed or (check_mask and - (mask[sort_indexer[i]] - ^ mask[sort_indexer[i+1]]))): - - # If keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and check_mask and mask[sort_indexer[i]]: - grp_na_count = dups - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = NaN - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = i - grp_start + 1 - - # With n as the previous rank in the group and m as the number - # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, - # then rankings should be n + 1, n + 2 ... 
n + m - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = j + 1 - grp_start - - # If TIEBREAK_FIRST and descending, the ranking should be - # n + m, n + (m - 1) ... n + 1. This is equivalent to - # (i - dups + 1) + (i - j + 1) - grp_start - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = grp_vals_seen - - # Look forward to the next value (using the sorting in - # lexsort_indexer). If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. The conditional also needs - # to handle nan equality and the end of iteration. If group - # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or (check_mask and - (mask[sort_indexer[i]] - ^ mask[sort_indexer[i+1]]))): - dups = sum_ranks = 0 - grp_vals_seen += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. Fill in the size of each - # group encountered (used by pct calculations later). Also be - # sure to reset any of the items helping to calculate dups - if group_changed: - - # If not dense tiebreak, group size used to compute - # percentile will be # of non-null elements in group - if tiebreak != TIEBREAK_DENSE: - grp_size = i - grp_start + 1 - grp_na_count - - # Otherwise, it will be the number of distinct values - # in the group, subtracting 1 if NaNs are present - # since that is a distinct value we shouldn't count - else: - grp_size = grp_vals_seen - (grp_na_count > 0) - - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size - - dups = sum_ranks = 0 - grp_na_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - else: + with gil(numeric_object_t is object): for i in range(N): at_end = i == N - 1 @@ -1255,8 +1154,12 @@ cdef void rank_sorted_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[sort_indexer[i]] - != masked_vals[sort_indexer[i+1]]) + if numeric_object_t is object: + next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], + masked_vals[sort_indexer[i+1]]) + else: + next_val_diff = at_end or (masked_vals[sort_indexer[i]] + != masked_vals[sort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1269,10 +1172,9 @@ cdef void rank_sorted_1d( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if (next_val_diff or group_changed - or (check_mask and - (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))): - + if (next_val_diff or group_changed or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): # If keep_na, check for missing values and assign back # to the result where appropriate if keep_na and check_mask and mask[sort_indexer[i]]: @@ -1483,7 +1385,7 @@ def diff_2d( cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.flags.f_contiguous - # bint f_contig = arr.is_f_contig() # TODO(cython3) + # bint f_contig = arr.is_f_contig() # TODO(cython3) once arr is memoryview diff_t left, right # Disable for unsupported dtype 
combinations, diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 78fee8f01319c..86f69c3cdfc75 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -26,7 +26,7 @@ class NDArrayBacked: def size(self) -> int: ... @property def nbytes(self) -> int: ... - def copy(self): ... + def copy(self, order=...): ... def delete(self, loc, axis=...): ... def swapaxes(self, axis1, axis2): ... def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 718fb358e26bc..9889436a542c1 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -126,8 +126,7 @@ cdef class NDArrayBacked: @property def size(self) -> int: - # TODO(cython3): use self._ndarray.size - return cnp.PyArray_SIZE(self._ndarray) + return self._ndarray.size @property def nbytes(self) -> int: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 609663c721950..135828a23648a 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -44,7 +44,6 @@ def group_fillna_indexer( labels: np.ndarray, # ndarray[int64_t] sorted_labels: npt.NDArray[np.intp], mask: npt.NDArray[np.uint8], - direction: Literal["ffill", "bfill"], limit: int, # int64_t dropna: bool, ) -> None: ... @@ -55,7 +54,7 @@ def group_any_all( mask: np.ndarray, # const uint8_t[::1] val_test: Literal["any", "all"], skipna: bool, - nullable: bool, + result_mask: np.ndarray | None, ) -> None: ... def group_sum( out: np.ndarray, # complexfloatingintuint_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 6f9b33b042959..19d71b0a6fde3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -695,54 +695,37 @@ def group_sum( N, K = (values).shape - if sum_t is object: - # NB: this does not use 'compensation' like the non-object track does. + with nogil(sum_t is not object): for i in range(N): lab = labels[i] if lab < 0: continue counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if not checknull(val): - nobs[lab, j] += 1 - - if nobs[lab, j] == 1: - # i.e. we haven't added anything yet; avoid TypeError - # if e.g. val is a str and sumx[lab, j] is 0 - t = val - else: - t = sumx[lab, j] + val - sumx[lab, j] = t - for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = None + val = values[i, j] + if uses_mask: + isna_entry = mask[i, j] else: - out[i, j] = sumx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + isna_entry = _treat_as_na(val, is_datetimelike) - counts[lab] += 1 - for j in range(K): - val = values[i, j] + if not isna_entry: + nobs[lab, j] += 1 - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if sum_t is object: + # NB: this does not use 'compensation' like the non-object + # track does. + if nobs[lab, j] == 1: + # i.e. we haven't added anything yet; avoid TypeError + # if e.g. 
val is a str and sumx[lab, j] is 0 + t = val + else: + t = sumx[lab, j] + val + sumx[lab, j] = t - if not isna_entry: - nobs[lab, j] += 1 + else: y = val - compensation[lab, j] t = sumx[lab, j] + y compensation[lab, j] = t - sumx[lab, j] - y @@ -755,9 +738,9 @@ def group_sum( compensation[lab, j] = 0 sumx[lab, j] = t - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx + ) @cython.wraparound(False) @@ -809,9 +792,9 @@ def group_prod( nobs[lab, j] += 1 prodx[lab, j] *= val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx + ) @cython.wraparound(False) @@ -1320,9 +1303,8 @@ ctypedef fused numeric_object_complex_t: cdef bint _treat_as_na(numeric_object_complex_t val, bint is_datetimelike) noexcept nogil: if numeric_object_complex_t is object: - # Should never be used, but we need to avoid the `val != val` below - # or else cython will raise about gil acquisition. - raise NotImplementedError + with gil: + return checknull(val) elif numeric_object_complex_t is int64_t: return is_datetimelike and val == NPY_NAT @@ -1369,7 +1351,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike): ctypedef fused mincount_t: - numeric_t + numeric_object_t complex64_t complex128_t @@ -1385,7 +1367,7 @@ cdef inline void _check_below_mincount( int64_t[:, ::1] nobs, int64_t min_count, mincount_t[:, ::1] resx, -) noexcept nogil: +) noexcept: """ Check if the number of observations for a group is below min_count, and if so set the result for that group to the appropriate NA-like value. @@ -1393,38 +1375,40 @@ cdef inline void _check_below_mincount( cdef: Py_ssize_t i, j - for i in range(ncounts): - for j in range(K): - - if nobs[i, j] < min_count: - # if we are integer dtype, not is_datetimelike, and - # not uses_mask, then getting here implies that - # counts[i] < min_count, which means we will - # be cast to float64 and masked at the end - # of WrappedCythonOp._call_cython_op. So we can safely - # set a placeholder value in out[i, j]. - if uses_mask: - result_mask[i, j] = True - # set out[i, j] to 0 to be deterministic, as - # it was initialized with np.empty. Also ensures - # we can downcast out if appropriate. - out[i, j] = 0 - elif ( - mincount_t is float32_t - or mincount_t is float64_t - or mincount_t is complex64_t - or mincount_t is complex128_t - ): - out[i, j] = NAN - elif mincount_t is int64_t: - # Per above, this is a placeholder in - # non-is_datetimelike cases. - out[i, j] = NPY_NAT + with nogil(mincount_t is not object): + for i in range(ncounts): + for j in range(K): + if nobs[i, j] >= min_count: + out[i, j] = resx[i, j] else: - # placeholder, see above - out[i, j] = 0 - else: - out[i, j] = resx[i, j] + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. + if uses_mask: + result_mask[i, j] = True + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. 
+ out[i, j] = 0 + elif ( + mincount_t is float32_t + or mincount_t is float64_t + or mincount_t is complex64_t + or mincount_t is complex128_t + ): + out[i, j] = NAN + elif mincount_t is int64_t: + # Per above, this is a placeholder in + # non-is_datetimelike cases. + out[i, j] = NPY_NAT + elif mincount_t is object: + out[i, j] = None + else: + # placeholder, see above + out[i, j] = 0 # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can @@ -1452,9 +1436,7 @@ def group_last( bint uses_mask = mask is not None bint isna_entry - # TODO(cython3): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1466,8 +1448,7 @@ def group_last( N, K = (values).shape - if numeric_object_t is object: - # TODO(cython3): De-duplicate once conditional-nogil is available + with nogil(numeric_object_t is not object): for i in range(N): lab = labels[i] if lab < 0: @@ -1480,43 +1461,15 @@ def group_last( if uses_mask: isna_entry = mask[i, j] else: - isna_entry = checknull(val) + isna_entry = _treat_as_na(val, is_datetimelike) if not isna_entry: - # TODO(cython3): use _treat_as_na here once - # conditional-nogil is available. nobs[lab, j] += 1 resx[lab, j] = val - for i in range(ncounts): - for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = None - else: - out[i, j] = resx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) - - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val - - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can @@ -1545,9 +1498,7 @@ def group_nth( bint uses_mask = mask is not None bint isna_entry - # TODO(cython3): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1559,8 +1510,7 @@ def group_nth( N, K = (values).shape - if numeric_object_t is object: - # TODO(cython3): De-duplicate once conditional-nogil is available + with nogil(numeric_object_t is not object): for i in range(N): lab = labels[i] if lab < 0: @@ -1573,46 +1523,16 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] else: - isna_entry = checknull(val) + isna_entry = _treat_as_na(val, is_datetimelike) if not isna_entry: - # TODO(cython3): use _treat_as_na here once - # conditional-nogil is available. 
nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val - for i in range(ncounts): - for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = None - else: - out[i, j] = resx[i, j] - - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) - - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) @cython.boundscheck(False) @@ -1752,9 +1672,7 @@ cdef group_min_max( bint uses_mask = mask is not None bint isna_entry - # TODO(cython3): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1789,9 +1707,9 @@ cdef group_min_max( if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max - ) + _check_below_mincount( + out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max + ) @cython.wraparound(False) @@ -1855,9 +1773,7 @@ def group_idxmin_idxmax( assert name == "idxmin" or name == "idxmax" - # TODO(cython3): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") N, K = (values).shape @@ -1876,8 +1792,7 @@ def group_idxmin_idxmax( # a category is not observed; these values will be dropped out[:] = 0 - # TODO(cython3): De-duplicate once conditional-nogil is available - if numeric_object_t is object: + with nogil(numeric_object_t is not object): for i in range(N): lab = labels[i] if lab < 0: @@ -1892,8 +1807,7 @@ def group_idxmin_idxmax( if uses_mask: isna_entry = mask[i, j] else: - # TODO(cython3): use _treat_as_na here - isna_entry = checknull(val) + isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: if not skipna: @@ -1907,36 +1821,6 @@ def group_idxmin_idxmax( if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - for j in range(K): - if not skipna and out[lab, j] == -1: - # Once we've hit NA there is no going back - continue - val = values[i, j] - - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) - - if isna_entry: - if not skipna: - out[lab, j] = -1 - else: - if compute_max: - if val > group_min_or_max[lab, j]: - group_min_or_max[lab, j] = val - out[lab, j] = i - else: - if val < group_min_or_max[lab, j]: - group_min_or_max[lab, j] = val - out[lab, j] = i @cython.wraparound(False) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 0ac914e86f699..555ec73acd9b2 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -20,7 +20,6 @@ class Factorizer: def factorize( self, values: np.ndarray, - sort: bool = ..., na_sentinel=..., na_value=..., mask=..., @@ -157,9 +156,9 @@ class HashTable: def __contains__(self, key: Hashable) -> bool: ... def sizeof(self, deep: bool = ...) -> int: ... def get_state(self) -> dict[str, int]: ... 
- # TODO: `item` type is subclass-specific - def get_item(self, item): ... # TODO: return type? - def set_item(self, item, val) -> None: ... + # TODO: `val/key` type is subclass-specific + def get_item(self, val): ... # TODO: return type? + def set_item(self, key, val) -> None: ... def get_na(self): ... # TODO: return type? def set_na(self, val) -> None: ... def map_locations( @@ -185,6 +184,7 @@ class HashTable: self, values: np.ndarray, # np.ndarray[subclass-specific] return_inverse: bool = ..., + mask=..., ) -> ( tuple[ np.ndarray, # np.ndarray[subclass-specific] @@ -198,6 +198,7 @@ class HashTable: na_sentinel: int = ..., na_value: object = ..., mask=..., + ignore_na: bool = True, ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] class Complex128HashTable(HashTable): ... diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cf5d734705af..c0723392496c1 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1239,9 +1239,10 @@ cdef class StringHashTable(HashTable): na_value=na_value, ignore_na=ignore_na, return_inverse=True) + # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, @@ -1496,9 +1497,10 @@ cdef class PyObjectHashTable(HashTable): na_value=na_value, ignore_na=ignore_na, return_inverse=True) + # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 19acd4acbdee7..336af306d410f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -404,12 +404,13 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): cdef: ndarray[htfunc_t] keys ndarray[htfunc_t] modes + ndarray[uint8_t] res_mask = None int64_t[::1] counts int64_t count, _, max_count = -1 - Py_ssize_t nkeys, k, j = 0 + Py_ssize_t nkeys, k, na_counter, j = 0 - keys, counts, _ = value_count(values, dropna, mask=mask) + keys, counts, na_counter = value_count(values, dropna, mask=mask) nkeys = len(keys) modes = np.empty(nkeys, dtype=values.dtype) @@ -440,7 +441,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): modes[j] = keys[k] - return modes[:j + 1] + if na_counter > 0: + res_mask = np.zeros(j+1, dtype=np.bool_) + res_mask[j] = True + return modes[:j + 1], res_mask {{py: diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index fdfb8e1c99f6e..05c4e7bd5e9dc 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -2,14 +2,10 @@ from collections import defaultdict import weakref cimport cython +from cpython.pyport cimport PY_SSIZE_T_MAX from cpython.slice cimport PySlice_GetIndicesEx from cython cimport Py_ssize_t - -cdef extern from "Python.h": - # TODO(cython3): from cpython.pyport cimport 
PY_SSIZE_T_MAX - Py_ssize_t PY_SSIZE_T_MAX - import numpy as np cimport numpy as cnp diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 82f69c1dedd53..a37ab45dd57ed 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -39,7 +39,6 @@ from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs.util cimport ( is_float_object, is_integer_object, - is_timedelta64_object, ) VALID_CLOSED = frozenset(["left", "right", "both", "neither"]) @@ -493,27 +492,16 @@ cdef class Interval(IntervalMixin): if ( isinstance(y, numbers.Number) or PyDelta_Check(y) - or is_timedelta64_object(y) + or cnp.is_timedelta64_object(y) ): return Interval(self.left + y, self.right + y, closed=self.closed) - elif ( - # __radd__ pattern - # TODO(cython3): remove this - isinstance(y, Interval) - and ( - isinstance(self, numbers.Number) - or PyDelta_Check(self) - or is_timedelta64_object(self) - ) - ): - return Interval(y.left + self, y.right + self, closed=y.closed) return NotImplemented def __radd__(self, other): if ( isinstance(other, numbers.Number) or PyDelta_Check(other) - or is_timedelta64_object(other) + or cnp.is_timedelta64_object(other) ): return Interval(self.left + other, self.right + other, closed=self.closed) return NotImplemented @@ -522,7 +510,7 @@ cdef class Interval(IntervalMixin): if ( isinstance(y, numbers.Number) or PyDelta_Check(y) - or is_timedelta64_object(y) + or cnp.is_timedelta64_object(y) ): return Interval(self.left - y, self.right - y, closed=self.closed) return NotImplemented @@ -530,10 +518,6 @@ cdef class Interval(IntervalMixin): def __mul__(self, y): if isinstance(y, numbers.Number): return Interval(self.left * y, self.right * y, closed=self.closed) - elif isinstance(y, Interval) and isinstance(self, numbers.Number): - # __radd__ semantics - # TODO(cython3): remove this - return Interval(y.left * self, y.right * self, closed=y.closed) return NotImplemented def __rmul__(self, other): diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 33f6f16381639..b9fd970e68f5b 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -45,22 +45,24 @@ def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_pyarrow_array(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... -def is_interval(val: object) -> TypeGuard[Interval]: ... -def is_decimal(val: object) -> TypeGuard[Decimal]: ... -def is_complex(val: object) -> TypeGuard[complex]: ... -def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ... -def is_integer(val: object) -> TypeGuard[int | np.integer]: ... +def is_interval(obj: object) -> TypeGuard[Interval]: ... +def is_decimal(obj: object) -> TypeGuard[Decimal]: ... +def is_complex(obj: object) -> TypeGuard[complex]: ... +def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ... +def is_integer(obj: object) -> TypeGuard[int | np.integer]: ... def is_int_or_none(obj) -> bool: ... -def is_float(val: object) -> TypeGuard[float]: ... +def is_float(obj: object) -> TypeGuard[float]: ... def is_interval_array(values: np.ndarray) -> bool: ... -def is_datetime64_array(values: np.ndarray) -> bool: ... -def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... +def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ... +def is_timedelta_or_timedelta64_array( + values: np.ndarray, skipna: bool = True +) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... 
def is_time_array(values: np.ndarray, skipna: bool = ...): ... def is_date_array(values: np.ndarray, skipna: bool = ...): ... def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... def is_string_array(values: np.ndarray, skipna: bool = ...): ... -def is_float_array(values: np.ndarray, skipna: bool = ...): ... +def is_float_array(values: np.ndarray): ... def is_integer_array(values: np.ndarray, skipna: bool = ...): ... def is_bool_array(values: np.ndarray, skipna: bool = ...): ... def fast_multiget( @@ -185,7 +187,7 @@ def count_level_2d( max_bin: int, ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2] def get_level_sorter( - label: np.ndarray, # const int64_t[:] + codes: np.ndarray, # const int64_t[:] starts: np.ndarray, # const intp_t[:] ) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] def generate_bins_dt64( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e8a69891c6093..8493f8bd066e0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -25,7 +25,6 @@ from cpython.object cimport ( Py_EQ, PyObject, PyObject_RichCompareBool, - PyTypeObject, ) from cpython.ref cimport Py_INCREF from cpython.sequence cimport PySequence_Check @@ -66,34 +65,8 @@ from numpy cimport ( ) cnp.import_array() +from pandas._libs.interval import Interval -cdef extern from "Python.h": - # Note: importing extern-style allows us to declare these as nogil - # functions, whereas `from cpython cimport` does not. - bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil - -cdef extern from "numpy/arrayobject.h": - # cython's numpy.dtype specification is incorrect, which leads to - # errors in issubclass(self.dtype.type, np.bool_), so we directly - # include the correct version - # https://github.com/cython/cython/issues/2022 - - ctypedef class numpy.dtype [object PyArray_Descr]: - # Use PyDataType_* macros when possible, however there are no macros - # for accessing some of the fields, so some are defined. Please - # ask on cython-dev if you need more. 
- cdef: - int type_num - int itemsize "elsize" - char byteorder - object fields - tuple names - - PyTypeObject PySignedIntegerArrType_Type - PyTypeObject PyUnsignedIntegerArrType_Type - -cdef extern from "numpy/ndarrayobject.h": - bint PyArray_CheckScalar(obj) nogil cdef extern from "pandas/parser/pd_parser.h": int floatify(object, float64_t *result, int *maybe_int) except -1 @@ -256,7 +229,7 @@ def is_scalar(val: object) -> bool: # Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number return (PyNumber_Check(val) or is_period_object(val) - or is_interval(val) + or isinstance(val, Interval) or is_offset_object(val)) @@ -272,7 +245,7 @@ cdef int64_t get_itemsize(object val): ------- is_ndarray : bool """ - if PyArray_CheckScalar(val): + if cnp.PyArray_CheckScalar(val): return cnp.PyArray_DescrFromScalar(val).itemsize else: return -1 @@ -513,8 +486,7 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: @cython.wraparound(False) @cython.boundscheck(False) -# TODO(cython3): Can add const once cython#1772 is resolved -def has_infs(floating[:] arr) -> bool: +def has_infs(const floating[:] arr) -> bool: cdef: Py_ssize_t i, n = len(arr) floating inf, neginf, val @@ -865,10 +837,16 @@ def is_all_arraylike(obj: list) -> bool: object val bint all_arrays = True + from pandas.core.dtypes.generic import ( + ABCIndex, + ABCMultiIndex, + ABCSeries, + ) + for i in range(n): val = obj[i] - if not (isinstance(val, list) or - util.is_array(val) or hasattr(val, "_data")): + if (not (isinstance(val, (list, ABCSeries, ABCIndex)) or util.is_array(val)) + or isinstance(val, ABCMultiIndex)): # TODO: EA? # exclude tuples, frozensets as they may be contained in an Index all_arrays = False @@ -1184,6 +1162,17 @@ cpdef bint is_decimal(object obj): cpdef bint is_interval(object obj): + import warnings + + from pandas.util._exceptions import find_stack_level + + warnings.warn( + # GH#55264 + "is_interval is deprecated and will be removed in a future version. " + "Use isinstance(obj, pd.Interval) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return getattr(obj, "_typ", "_typ") == "interval" @@ -1195,6 +1184,17 @@ def is_period(val: object) -> bool: ------- bool """ + import warnings + + from pandas.util._exceptions import find_stack_level + + warnings.warn( + # GH#55264 + "is_period is deprecated and will be removed in a future version. 
" + "Use isinstance(obj, pd.Period) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return is_period_object(val) @@ -1434,14 +1434,12 @@ cdef class Seen: self.sint_ = ( self.sint_ or (oINT64_MIN <= val < 0) - # Cython equivalent of `isinstance(val, np.signedinteger)` - or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type) + or isinstance(val, cnp.signedinteger) ) self.uint_ = ( self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) - # Cython equivalent of `isinstance(val, np.unsignedinteger)` - or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type) + or isinstance(val, cnp.unsignedinteger) ) @property @@ -1719,7 +1717,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if is_period_array(values, skipna=skipna): return "period" - elif is_interval(val): + elif isinstance(val, Interval): if is_interval_array(values): return "interval" @@ -1743,10 +1741,10 @@ cdef class Validator: cdef: Py_ssize_t n - dtype dtype + cnp.dtype dtype bint skipna - def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), + def __cinit__(self, Py_ssize_t n, cnp.dtype dtype=np.dtype(np.object_), bint skipna=False): self.n = n self.dtype = dtype @@ -1827,7 +1825,7 @@ cdef class BoolValidator(Validator): return util.is_bool_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.bool_) + return cnp.PyDataType_ISBOOL(self.dtype) cpdef bint is_bool_array(ndarray values, bint skipna=False): @@ -1844,7 +1842,7 @@ cdef class IntegerValidator(Validator): return util.is_integer_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.integer) + return cnp.PyDataType_ISINTEGER(self.dtype) # Note: only python-exposed for tests @@ -1876,7 +1874,7 @@ cdef class IntegerFloatValidator(Validator): return util.is_integer_object(value) or util.is_float_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.integer) + return cnp.PyDataType_ISINTEGER(self.dtype) cdef bint is_integer_float_array(ndarray values, bint skipna=True): @@ -1893,7 +1891,7 @@ cdef class FloatValidator(Validator): return util.is_float_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.floating) + return cnp.PyDataType_ISFLOAT(self.dtype) # Note: only python-exposed for tests @@ -1912,7 +1910,7 @@ cdef class ComplexValidator(Validator): ) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.complexfloating) + return cnp.PyDataType_ISCOMPLEX(self.dtype) cdef bint is_complex_array(ndarray values): @@ -1941,7 +1939,7 @@ cdef class StringValidator(Validator): return isinstance(value, str) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.str_) + return self.dtype.type_num == cnp.NPY_UNICODE cpdef bint is_string_array(ndarray values, bint skipna=False): @@ -1958,7 +1956,7 @@ cdef class BytesValidator(Validator): return isinstance(value, bytes) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.bytes_) + return self.dtype.type_num == cnp.NPY_STRING cdef bint is_bytes_array(ndarray values, bint skipna=False): @@ -1973,7 +1971,7 @@ cdef class TemporalValidator(Validator): cdef: bint all_generic_na - def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), + def __cinit__(self, Py_ssize_t n, cnp.dtype dtype=np.dtype(np.object_), bint skipna=False): self.n = n self.dtype = dtype @@ -2194,7 +2192,7 @@ cpdef bint is_interval_array(ndarray values): for i in range(n): val 
= values[i] - if is_interval(val): + if isinstance(val, Interval): if closed is None: closed = val.closed numeric = ( @@ -2633,6 +2631,7 @@ def maybe_convert_objects(ndarray[object] objects, tsobj = convert_to_tsobject(val, None, None, 0, 0) tsobj.ensure_reso(NPY_FR_ns) except OutOfBoundsDatetime: + # e.g. test_out_of_s_bounds_datetime64 seen.object_ = True break else: @@ -2654,7 +2653,7 @@ def maybe_convert_objects(ndarray[object] objects, except (ValueError, TypeError): seen.object_ = True break - elif is_interval(val): + elif isinstance(val, Interval): if convert_non_numeric: seen.interval_ = True break diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index 515f7aa53ba15..6738a1dff4a9e 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -37,8 +37,8 @@ def vec_binop( @overload def maybe_convert_bool( arr: npt.NDArray[np.object_], - true_values: Iterable = ..., - false_values: Iterable = ..., + true_values: Iterable | None = None, + false_values: Iterable | None = None, convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... @overload diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 823c267cf92b1..ab28b34be58f2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -34,6 +34,7 @@ from cpython.unicode cimport ( PyUnicode_AsUTF8String, PyUnicode_Decode, PyUnicode_DecodeUTF8, + PyUnicode_FromString, ) from cython cimport Py_ssize_t from libc.stdlib cimport free @@ -44,11 +45,6 @@ from libc.string cimport ( ) -cdef extern from "Python.h": - # TODO(cython3): get this from cpython.unicode - object PyUnicode_FromString(char *v) - - import numpy as np cimport numpy as cnp diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi index 9e5cecc61e5ca..536265b25425e 100644 --- a/pandas/_libs/sparse.pyi +++ b/pandas/_libs/sparse.pyi @@ -39,6 +39,10 @@ class BlockIndex(SparseIndex): self, length: int, blocs: np.ndarray, blengths: np.ndarray ) -> None: ... + # Override to have correct parameters + def intersect(self, other: SparseIndex) -> Self: ... + def make_union(self, y: SparseIndex) -> Self: ... + def make_mask_object_ndarray( arr: npt.NDArray[np.object_], fill_value ) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index db1c735bd6094..f6efae77c5c01 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -29,6 +29,46 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #include #include +#if defined(_WIN32) +#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS +#define ENABLE_INTSAFE_SIGNED_FUNCTIONS +#endif +#include +#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) +#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) +#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) +#else +#if defined __has_builtin +#if __has_builtin(__builtin_add_overflow) +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, + "Overflow checking not detected; please try a newer compiler"); +#endif +// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment +// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that +#elif __GNUC__ > 7 +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); +#endif +#endif + +#define PD_CHECK_OVERFLOW(FUNC) \ + do { \ + if ((FUNC) != 0) { \ + PyGILState_STATE gstate = PyGILState_Ensure(); \ + PyErr_SetString(PyExc_OverflowError, \ + "Overflow occurred in npy_datetimestruct_to_datetime"); \ + PyGILState_Release(gstate); \ + return -1; \ + } \ + } while (0) + const int days_per_month_table[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; @@ -299,96 +339,189 @@ PyObject *extract_utc_offset(PyObject *obj) { return tmp; } +static inline int scaleYearToEpoch(int64_t year, int64_t *result) { + return checked_int64_sub(year, 1970, result); +} + +static inline int scaleYearsToMonths(int64_t years, int64_t *result) { + return checked_int64_mul(years, 12, result); +} + +static inline int scaleDaysToWeeks(int64_t days, int64_t *result) { + if (days >= 0) { + *result = days / 7; + return 0; + } else { + int res; + int64_t checked_days; + if ((res = checked_int64_sub(days, 6, &checked_days))) { + return res; + } + + *result = checked_days / 7; + return 0; + } +} + +static inline int scaleDaysToHours(int64_t days, int64_t *result) { + return checked_int64_mul(days, 24, result); +} + +static inline int scaleHoursToMinutes(int64_t hours, int64_t *result) { + return checked_int64_mul(hours, 60, result); +} + +static inline int scaleMinutesToSeconds(int64_t minutes, int64_t *result) { + return checked_int64_mul(minutes, 60, result); +} + +static inline int scaleSecondsToMilliseconds(int64_t seconds, int64_t *result) { + return checked_int64_mul(seconds, 1000, result); +} + +static inline int scaleSecondsToMicroseconds(int64_t seconds, int64_t *result) { + return checked_int64_mul(seconds, 1000000, result); +} + +static inline int scaleMicrosecondsToNanoseconds(int64_t microseconds, + int64_t *result) { + return checked_int64_mul(microseconds, 1000, result); +} + +static inline int scaleMicrosecondsToPicoseconds(int64_t microseconds, + int64_t *result) { + return checked_int64_mul(microseconds, 1000000, result); +} + +static inline int64_t scalePicosecondsToFemtoseconds(int64_t picoseconds, + int64_t *result) { + return checked_int64_mul(picoseconds, 1000, result); +} + +static inline int64_t scalePicosecondsToAttoseconds(int64_t picoseconds, + int64_t *result) { + return checked_int64_mul(picoseconds, 
1000000, result); +} + /* * Converts a datetime from a datetimestruct to a datetime based - * on a metadata unit. The date is assumed to be valid. + * on a metadata unit. Returns -1 on and sets PyErr on error. */ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, const npy_datetimestruct *dts) { - npy_datetime ret; - - if (base == NPY_FR_Y) { - /* Truncate to the year */ - ret = dts->year - 1970; - } else if (base == NPY_FR_M) { - /* Truncate to the month */ - ret = 12 * (dts->year - 1970) + (dts->month - 1); - } else { - /* Otherwise calculate the number of days to start */ - npy_int64 days = get_datetimestruct_days(dts); - - switch (base) { - case NPY_FR_W: - /* Truncate to weeks */ - if (days >= 0) { - ret = days / 7; - } else { - ret = (days - 6) / 7; - } - break; - case NPY_FR_D: - ret = days; - break; - case NPY_FR_h: - ret = days * 24 + dts->hour; - break; - case NPY_FR_m: - ret = (days * 24 + dts->hour) * 60 + dts->min; - break; - case NPY_FR_s: - ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec; - break; - case NPY_FR_ms: - ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * 1000 + - dts->us / 1000; - break; - case NPY_FR_us: - ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * - 1000000 + - dts->us; - break; - case NPY_FR_ns: - ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * - 1000000 + - dts->us) * - 1000 + - dts->ps / 1000; - break; - case NPY_FR_ps: - ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps; - break; - case NPY_FR_fs: - /* only 2.6 hours */ - ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps) * - 1000 + - dts->as / 1000; - break; - case NPY_FR_as: - /* only 9.2 secs */ - ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps) * - 1000000 + - dts->as; - break; - default: - /* Something got corrupted */ - PyErr_SetString(PyExc_ValueError, - "NumPy datetime metadata with corrupt unit value"); - return -1; - } - } - return ret; + if ((base == NPY_FR_Y) || (base == NPY_FR_M)) { + int64_t years; + PD_CHECK_OVERFLOW(scaleYearToEpoch(dts->year, &years)); + + if (base == NPY_FR_Y) { + return years; + } + + int64_t months; + PD_CHECK_OVERFLOW(scaleYearsToMonths(years, &months)); + + int64_t months_adder; + PD_CHECK_OVERFLOW(checked_int64_sub(dts->month, 1, &months_adder)); + PD_CHECK_OVERFLOW(checked_int64_add(months, months_adder, &months)); + + if (base == NPY_FR_M) { + return months; + } + } + + const int64_t days = get_datetimestruct_days(dts); + if (base == NPY_FR_D) { + return days; + } + + if (base == NPY_FR_W) { + int64_t weeks; + PD_CHECK_OVERFLOW(scaleDaysToWeeks(days, &weeks)); + return weeks; + } + + int64_t hours; + PD_CHECK_OVERFLOW(scaleDaysToHours(days, &hours)); + PD_CHECK_OVERFLOW(checked_int64_add(hours, dts->hour, &hours)); + + if (base == NPY_FR_h) { + return hours; + } + + int64_t minutes; + PD_CHECK_OVERFLOW(scaleHoursToMinutes(hours, &minutes)); + PD_CHECK_OVERFLOW(checked_int64_add(minutes, dts->min, &minutes)); + + if (base == NPY_FR_m) { + return minutes; + } + + int64_t seconds; + PD_CHECK_OVERFLOW(scaleMinutesToSeconds(minutes, &seconds)); + PD_CHECK_OVERFLOW(checked_int64_add(seconds, dts->sec, &seconds)); + + if (base == NPY_FR_s) { + return seconds; + } + + if (base == NPY_FR_ms) { + int64_t milliseconds; + PD_CHECK_OVERFLOW(scaleSecondsToMilliseconds(seconds, 
&milliseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(milliseconds, dts->us / 1000, &milliseconds)); + + return milliseconds; + } + + int64_t microseconds; + PD_CHECK_OVERFLOW(scaleSecondsToMicroseconds(seconds, &microseconds)); + PD_CHECK_OVERFLOW(checked_int64_add(microseconds, dts->us, &microseconds)); + + if (base == NPY_FR_us) { + return microseconds; + } + + if (base == NPY_FR_ns) { + int64_t nanoseconds; + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + + return nanoseconds; + } + + int64_t picoseconds; + PD_CHECK_OVERFLOW(scaleMicrosecondsToPicoseconds(microseconds, &picoseconds)); + PD_CHECK_OVERFLOW(checked_int64_add(picoseconds, dts->ps, &picoseconds)); + + if (base == NPY_FR_ps) { + return picoseconds; + } + + if (base == NPY_FR_fs) { + int64_t femtoseconds; + PD_CHECK_OVERFLOW( + scalePicosecondsToFemtoseconds(picoseconds, &femtoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(femtoseconds, dts->as / 1000, &femtoseconds)); + return femtoseconds; + } + + if (base == NPY_FR_as) { + int64_t attoseconds; + PD_CHECK_OVERFLOW(scalePicosecondsToAttoseconds(picoseconds, &attoseconds)); + PD_CHECK_OVERFLOW(checked_int64_add(attoseconds, dts->as, &attoseconds)); + return attoseconds; + } + + /* Something got corrupted */ + PyGILState_STATE gstate = PyGILState_Ensure(); + PyErr_SetString(PyExc_ValueError, + "NumPy datetime metadata with corrupt unit value"); + PyGILState_Release(gstate); + + return -1; } /* diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 72faf1757a7b0..da4eeaeb3b692 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -23,7 +23,6 @@ import_datetime() cimport numpy as cnp from numpy cimport ( int64_t, - is_datetime64_object, ndarray, ) @@ -31,10 +30,14 @@ import numpy as np cnp.import_array() +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + npy_unit_to_abbrev, +) from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, - check_dts_bounds, + get_datetime64_unit, import_pandas_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, @@ -99,8 +102,10 @@ def _test_parse_iso8601(ts: str): obj = _TSObject() string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True) - obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) - check_dts_bounds(&obj.dts) + try: + obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) + except OverflowError as err: + raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {ts}") from err if out_local == 1: obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo) @@ -440,16 +445,17 @@ cpdef array_to_datetime( utc : bool, default False indicator whether the dates should be UTC creso : NPY_DATETIMEUNIT, default NPY_FR_ns + Set to NPY_FR_GENERIC to infer a resolution.
Returns ------- np.ndarray - May be datetime64[ns] or object dtype + May be datetime64[creso_unit] or object dtype tzinfo or None """ cdef: Py_ssize_t i, n = values.size - object val, tz + object val ndarray[int64_t] iresult npy_datetimestruct dts bint utc_convert = bool(utc) @@ -461,16 +467,21 @@ cpdef array_to_datetime( _TSObject _ts float tz_offset set out_tzoffset_vals = set() - tzinfo tz_out = None + tzinfo tz, tz_out = None cnp.flatiter it = cnp.PyArray_IterNew(values) - DatetimeParseState state = DatetimeParseState() - str reso_str + NPY_DATETIMEUNIT item_reso + bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC + DatetimeParseState state = DatetimeParseState(creso) + str abbrev # specify error conditions assert is_raise or is_ignore or is_coerce - reso_str = npy_unit_to_abbrev(creso) - result = np.empty((values).shape, dtype=f"M8[{reso_str}]") + if infer_reso: + abbrev = "ns" + else: + abbrev = npy_unit_to_abbrev(creso) + result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() for i in range(n): @@ -483,15 +494,29 @@ cpdef array_to_datetime( iresult[i] = NPY_NAT elif PyDateTime_Check(val): + if isinstance(val, _Timestamp): + item_reso = val._creso + else: + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso tz_out = state.process_datetime(val, tz_out, utc_convert) iresult[i] = parse_pydatetime(val, &dts, creso=creso) elif PyDate_Check(val): + item_reso = NPY_DATETIMEUNIT.NPY_FR_s + state.update_creso(item_reso) + if infer_reso: + creso = state.creso iresult[i] = pydate_to_dt64(val, &dts, reso=creso) - check_dts_bounds(&dts, creso) state.found_other = True - elif is_datetime64_object(val): + elif cnp.is_datetime64_object(val): + item_reso = get_supported_reso(get_datetime64_unit(val)) + state.update_creso(item_reso) + if infer_reso: + creso = state.creso iresult[i] = get_datetime64_nanos(val, creso) state.found_other = True @@ -501,8 +526,13 @@ cpdef array_to_datetime( if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: - # we now need to parse this as if unit='ns' - iresult[i] = cast_from_unit(val, "ns", out_reso=creso) + item_reso = NPY_FR_ns + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + + # we now need to parse this as if unit=abbrev + iresult[i] = cast_from_unit(val, abbrev, out_reso=creso) state.found_other = True elif isinstance(val, str): @@ -514,22 +544,36 @@ cpdef array_to_datetime( if parse_today_now(val, &iresult[i], utc, creso): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso continue _ts = convert_str_to_tsobject( - val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst + val, None, dayfirst=dayfirst, yearfirst=yearfirst ) - _ts.ensure_reso(creso, val) - iresult[i] = _ts.value - - tz = _ts.tzinfo if _ts.value == NPY_NAT: # e.g. "NaT" string or empty string, we do not consider # this as either tzaware or tznaive. 
See # test_to_datetime_with_empty_str_utc_false_format_mixed - pass - elif tz is not None: + # We also do not update resolution inference based on this, + # see test_infer_with_nat_int_float_str + iresult[i] = _ts.value + continue + + item_reso = _ts.creso + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + + _ts.ensure_reso(creso, val) + + iresult[i] = _ts.value + + tz = _ts.tzinfo + if tz is not None: # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() @@ -586,6 +630,28 @@ cpdef array_to_datetime( # e.g. test_to_datetime_mixed_awareness_mixed_types raise ValueError("Cannot mix tz-aware with tz-naive values") + if infer_reso: + if state.creso_ever_changed: + # We encountered mismatched resolutions, need to re-parse with + # the correct one. + return array_to_datetime( + values, + errors=errors, + yearfirst=yearfirst, + dayfirst=dayfirst, + utc=utc, + creso=state.creso, + ) + elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = iresult.view("M8[s]").reshape(result.shape) + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. + abbrev = npy_unit_to_abbrev(state.creso) + result = iresult.view(f"M8[{abbrev}]").reshape(result.shape) return result, tz_out @@ -660,7 +726,7 @@ cdef _array_to_datetime_object( try: tsobj = convert_str_to_tsobject( - val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst + val, None, dayfirst=dayfirst, yearfirst=yearfirst ) tsobj.ensure_reso(NPY_FR_ns, val) @@ -720,6 +786,13 @@ def array_to_datetime_with_tz( _TSObject tsobj bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC DatetimeParseState state = DatetimeParseState(creso) + str abbrev + + if infer_reso: + # We treat ints/floats as nanoseconds + abbrev = "ns" + else: + abbrev = npy_unit_to_abbrev(creso) for i in range(n): # Analogous to `item = values[i]` @@ -731,7 +804,12 @@ def array_to_datetime_with_tz( else: tsobj = convert_to_tsobject( - item, tz=tz, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst, nanos=0 + item, + tz=tz, + unit=abbrev, + dayfirst=dayfirst, + yearfirst=yearfirst, + nanos=0, ) if tsobj.value != NPY_NAT: state.update_creso(tsobj.creso) @@ -750,14 +828,16 @@ def array_to_datetime_with_tz( # We encountered mismatched resolutions, need to re-parse with # the correct one. return array_to_datetime_with_tz(values, tz=tz, creso=creso) - - # Otherwise we can use the single reso that we encountered and avoid - # a second pass. - abbrev = npy_unit_to_abbrev(creso) - result = result.view(f"M8[{abbrev}]") - elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: - # We didn't find any non-NaT to infer from, default to "ns" - result = result.view("M8[ns]") + elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = result.view("M8[s]") + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. 
+ abbrev = npy_unit_to_abbrev(creso) + result = result.view(f"M8[{abbrev}]") else: abbrev = npy_unit_to_abbrev(creso) result = result.view(f"M8[{abbrev}]") diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index ca9e01ffa3dbe..3c0f7ea824396 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -15,6 +15,6 @@ cpdef int32_t get_day_of_year(int year, int month, int day) noexcept nogil cpdef int get_lastbday(int year, int month) noexcept nogil cpdef int get_firstbday(int year, int month) noexcept nogil -cdef dict c_MONTH_NUMBERS +cdef dict c_MONTH_NUMBERS, MONTH_TO_CAL_NUM cdef int32_t* month_offset diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 1db355fa91a20..536fa19f4c5d7 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -38,7 +38,7 @@ MONTHS_FULL = ["", "January", "February", "March", "April", "May", "June", MONTH_NUMBERS = {name: num for num, name in enumerate(MONTHS)} cdef dict c_MONTH_NUMBERS = MONTH_NUMBERS MONTH_ALIASES = {(num + 1): name for num, name in enumerate(MONTHS)} -MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)} +cdef dict MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)} DAYS = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] DAYS_FULL = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index e97f4900d20c8..6b9f41b1bb06f 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -37,7 +37,7 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz, int32_t nanos=*, NPY_DATETIMEUNIT reso=*) -cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, +cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, bint dayfirst=*, bint yearfirst=*) @@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1 -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=* ) diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index be75c2f2f68ea..cfe39fe2964cb 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -8,7 +8,5 @@ import numpy as np DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype -def precision_from_unit( - in_reso: int, # NPY_DATETIMEUNIT -) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... +def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ... 
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a2bc6d4d52f2e..5ad9a648c52a2 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,11 +1,13 @@ +cimport cython + import numpy as np cimport numpy as cnp from libc.math cimport log10 from numpy cimport ( + float64_t, int32_t, int64_t, - is_datetime64_object, ) cnp.import_array() @@ -31,14 +33,17 @@ from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.dtypes cimport ( abbrev_to_npy_unit, get_supported_reso, + npy_unit_to_attrname, periods_per_second, ) from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, NPY_FR_us, + astype_overflowsafe, check_dts_bounds, convert_reso, + dts_to_iso_string, get_conversion_factor, get_datetime64_unit, get_implementation_bounds, @@ -73,6 +78,7 @@ from pandas._libs.tslibs.tzconversion cimport ( from pandas._libs.tslibs.util cimport ( is_float_object, is_integer_object, + is_nan, ) # ---------------------------------------------------------------------- @@ -85,6 +91,78 @@ TD64NS_DTYPE = np.dtype("m8[ns]") # ---------------------------------------------------------------------- # Unit Conversion Helpers +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.overflowcheck(True) +def cast_from_unit_vectorized( + ndarray values, + str unit, +): + """ + Vectorized analogue to cast_from_unit. + """ + cdef: + int64_t m + int p + NPY_DATETIMEUNIT in_reso, out_reso + Py_ssize_t i + + assert values.dtype.kind == "f" + + if unit in "YM": + if not (((values % 1) == 0) | np.isnan(values)).all(): + # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01, + # but not clear what 2.5 "M" corresponds to, so we will + # disallow that case. + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + + # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y" + # and 150 we'd get 2120-01-01 09:00:00 + values = values.astype(f"M8[{unit}]") + dtype = np.dtype("M8[ns]") + return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8") + + in_reso = abbrev_to_npy_unit(unit) + out_reso = abbrev_to_npy_unit("ns") + m, p = precision_from_unit(in_reso, out_reso) + + cdef: + ndarray[int64_t] base, out + ndarray[float64_t] frac + tuple shape = (values).shape + + out = np.empty(shape, dtype="i8") + base = np.empty(shape, dtype="i8") + frac = np.empty(shape, dtype="f8") + + for i in range(len(values)): + if is_nan(values[i]): + base[i] = NPY_NAT + else: + base[i] = values[i] + frac[i] = values[i] - base[i] + + if p: + frac = np.round(frac, p) + + try: + for i in range(len(values)): + if base[i] == NPY_NAT: + out[i] = NPY_NAT + else: + out[i] = (base[i] * m) + (frac[i] * m) + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err + return out + + cdef int64_t cast_from_unit( object ts, str unit, @@ -154,7 +232,7 @@ cdef int64_t cast_from_unit( ) from err -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns, ): @@ -212,8 +290,13 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? 
-1: if unit != reso: pandas_datetime_to_datetimestruct(ival, unit, &dts) - check_dts_bounds(&dts, reso) - ival = npy_datetimestruct_to_datetime(reso, &dts) + try: + ival = npy_datetimestruct_to_datetime(reso, &dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err return ival @@ -245,8 +328,9 @@ cdef class _TSObject: ) except OverflowError as err: if val is not None: + attrname = npy_unit_to_attrname[creso] raise OutOfBoundsDatetime( - f"Out of bounds nanosecond timestamp: {val}" + f"Out of bounds {attrname} timestamp: {val}" ) from err raise OutOfBoundsDatetime from err @@ -277,11 +361,11 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit, obj = _TSObject() if isinstance(ts, str): - return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + return convert_str_to_tsobject(ts, tz, dayfirst, yearfirst) if checknull_with_nat_and_na(ts): obj.value = NPY_NAT - elif is_datetime64_object(ts): + elif cnp.is_datetime64_object(ts): reso = get_supported_reso(get_datetime64_unit(ts)) obj.creso = reso obj.value = get_datetime64_nanos(ts, reso) @@ -413,14 +497,17 @@ cdef _TSObject convert_datetime_to_tsobject( if nanos: obj.dts.ps = nanos * 1000 - obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts) + try: + obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime(f"Out of bounds {attrname} timestamp") from err if obj.tzinfo is not None and not is_utc(obj.tzinfo): offset = get_utcoffset(obj.tzinfo, ts) pps = periods_per_second(reso) obj.value -= int(offset.total_seconds() * pps) - check_dts_bounds(&obj.dts, reso) check_overflows(obj, reso) return obj @@ -467,7 +554,7 @@ cdef _adjust_tsobject_tz_using_offset(_TSObject obj, tzinfo tz): check_overflows(obj, obj.creso) -cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, +cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, bint dayfirst=False, bint yearfirst=False): """ @@ -483,7 +570,6 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, Value to be converted to _TSObject tz : tzinfo or None timezone for the timezone-aware output - unit : str or None dayfirst : bool, default False When parsing an ambiguous date string, interpret e.g. "3/4/1975" as April 3, as opposed to the standard US interpretation March 4. 
@@ -499,7 +585,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, npy_datetimestruct dts int out_local = 0, out_tzoffset = 0, string_to_dts_failed datetime dt - int64_t ival + int64_t ival, nanos = 0 NPY_DATETIMEUNIT out_bestunit, reso _TSObject obj @@ -551,10 +637,14 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, return obj dt = parse_datetime_string( - ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit + ts, + dayfirst=dayfirst, + yearfirst=yearfirst, + out_bestunit=&out_bestunit, + nanos=&nanos, ) reso = get_supported_reso(out_bestunit) - return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso) + return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso) return convert_datetime_to_tsobject(dt, tz) @@ -585,18 +675,18 @@ cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns): if obj.dts.year == lb.year: if not (obj.value < 0): from pandas._libs.tslibs.timestamps import Timestamp - fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " - f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + fmt = dts_to_iso_string(&obj.dts) + min_ts = (<_Timestamp>Timestamp(0))._as_creso(reso).min raise OutOfBoundsDatetime( - f"Converting {fmt} underflows past {Timestamp.min}" + f"Converting {fmt} underflows past {min_ts}" ) elif obj.dts.year == ub.year: if not (obj.value > 0): from pandas._libs.tslibs.timestamps import Timestamp - fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " - f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + fmt = dts_to_iso_string(&obj.dts) + max_ts = (<_Timestamp>Timestamp(0))._as_creso(reso).max raise OutOfBoundsDatetime( - f"Converting {fmt} overflows past {Timestamp.max}" + f"Converting {fmt} overflows past {max_ts}" ) # ---------------------------------------------------------------------- @@ -710,8 +800,7 @@ cdef int64_t parse_pydatetime( result = _ts.value else: if isinstance(val, _Timestamp): - result = (<_Timestamp>val)._as_creso(creso, round_ok=False)._value + result = (<_Timestamp>val)._as_creso(creso, round_ok=True)._value else: result = pydatetime_to_dt64(val, dts, reso=creso) - check_dts_bounds(dts, creso) return result diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index d8680ed2d27b4..76649aaaa41bf 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -1,19 +1,11 @@ from enum import Enum -from pandas._libs.tslibs.timedeltas import UnitChoices - -# These are not public API, but are exposed in the .pyi file because they -# are imported in tests. -_attrname_to_abbrevs: dict[str, str] -_period_code_map: dict[str, int] OFFSET_TO_PERIOD_FREQSTR: dict[str, str] -OFFSET_DEPR_FREQSTR: dict[str, str] -DEPR_ABBREVS: dict[str, UnitChoices] -def periods_per_day(reso: int) -> int: ... +def periods_per_day(reso: int = ...) -> int: ... def periods_per_second(reso: int) -> int: ... def is_supported_unit(reso: int) -> bool: ... -def npy_unit_to_abbrev(reso: int) -> str: ... +def npy_unit_to_abbrev(unit: int) -> str: ... def get_supported_reso(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str) -> int: ... def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... 
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index a8dc6b12a54b5..17f517e5e7264 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -5,6 +5,7 @@ import warnings from pandas.util._exceptions import find_stack_level +from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, get_conversion_factor, @@ -46,7 +47,7 @@ cdef class PeriodDtypeBase: def _resolution_obj(self) -> "Resolution": fgc = self._freq_group_code freq_group = FreqGroup(fgc) - abbrev = _reverse_period_code_map[freq_group.value].split("-")[0] + abbrev = _period_code_to_abbrev[freq_group.value].split("-")[0] if abbrev == "B": return Resolution.RESO_DAY attrname = _abbrev_to_attrnames[abbrev] @@ -55,7 +56,7 @@ cdef class PeriodDtypeBase: @property def _freqstr(self) -> str: # Will be passed to to_offset in Period._maybe_convert_freq - out = _reverse_period_code_map.get(self._dtype_code) + out = _period_code_to_abbrev.get(self._dtype_code) if self._n == 1: return out return str(self._n) + out @@ -150,27 +151,13 @@ _period_code_map = { "ns": PeriodDtypeCode.N, # Nanosecondly } -_reverse_period_code_map = { +cdef dict _period_code_to_abbrev = { _period_code_map[key]: key for key in _period_code_map} -# Yearly aliases; careful not to put these in _reverse_period_code_map -_period_code_map.update({"Y" + key[1:]: _period_code_map[key] - for key in _period_code_map - if key.startswith("Y-")}) - -_period_code_map.update({ - "Q": 2000, # Quarterly - December year end (default quarterly) - "Y": PeriodDtypeCode.A, # Annual - "W": 4000, # Weekly - "C": 5000, # Custom Business Day -}) - -cdef set _month_names = { - x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("Y-") -} +cdef set _month_names = set(c_MONTH_NUMBERS.keys()) # Map attribute-name resolutions to resolution abbreviations -_attrname_to_abbrevs = { +cdef dict attrname_to_abbrevs = { "year": "Y", "quarter": "Q", "month": "M", @@ -182,16 +169,28 @@ _attrname_to_abbrevs = { "microsecond": "us", "nanosecond": "ns", } -cdef dict attrname_to_abbrevs = _attrname_to_abbrevs cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()} OFFSET_TO_PERIOD_FREQSTR: dict = { "WEEKDAY": "D", "EOM": "M", "BME": "M", + "SME": "M", "BQS": "Q", "QS": "Q", - "BQ": "Q", + "BQE": "Q", + "BQE-DEC": "Q-DEC", + "BQE-JAN": "Q-JAN", + "BQE-FEB": "Q-FEB", + "BQE-MAR": "Q-MAR", + "BQE-APR": "Q-APR", + "BQE-MAY": "Q-MAY", + "BQE-JUN": "Q-JUN", + "BQE-JUL": "Q-JUL", + "BQE-AUG": "Q-AUG", + "BQE-SEP": "Q-SEP", + "BQE-OCT": "Q-OCT", + "BQE-NOV": "Q-NOV", "MS": "M", "D": "D", "B": "B", @@ -230,11 +229,23 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "W": "W", "ME": "M", "Y": "Y", - "BY": "Y", + "BYE": "Y", + "BYE-DEC": "Y-DEC", + "BYE-JAN": "Y-JAN", + "BYE-FEB": "Y-FEB", + "BYE-MAR": "Y-MAR", + "BYE-APR": "Y-APR", + "BYE-MAY": "Y-MAY", + "BYE-JUN": "Y-JUN", + "BYE-JUL": "Y-JUL", + "BYE-AUG": "Y-AUG", + "BYE-SEP": "Y-SEP", + "BYE-OCT": "Y-OCT", + "BYE-NOV": "Y-NOV", "YS": "Y", "BYS": "Y", } -OFFSET_DEPR_FREQSTR: dict[str, str]= { +cdef dict c_OFFSET_DEPR_FREQSTR = { "M": "ME", "Q": "QE", "Q-DEC": "QE-DEC", @@ -275,10 +286,53 @@ OFFSET_DEPR_FREQSTR: dict[str, str]= { "A-SEP": "YE-SEP", "A-OCT": "YE-OCT", "A-NOV": "YE-NOV", + "BY": "BYE", + "BY-DEC": "BYE-DEC", + "BY-JAN": "BYE-JAN", + "BY-FEB": "BYE-FEB", + "BY-MAR": "BYE-MAR", + "BY-APR": "BYE-APR", + "BY-MAY": "BYE-MAY", + "BY-JUN": "BYE-JUN", + "BY-JUL": "BYE-JUL", + "BY-AUG": "BYE-AUG", + "BY-SEP": "BYE-SEP", 
+ "BY-OCT": "BYE-OCT", + "BY-NOV": "BYE-NOV", + "BA": "BYE", + "BA-DEC": "BYE-DEC", + "BA-JAN": "BYE-JAN", + "BA-FEB": "BYE-FEB", + "BA-MAR": "BYE-MAR", + "BA-APR": "BYE-APR", + "BA-MAY": "BYE-MAY", + "BA-JUN": "BYE-JUN", + "BA-JUL": "BYE-JUL", + "BA-AUG": "BYE-AUG", + "BA-SEP": "BYE-SEP", + "BA-OCT": "BYE-OCT", + "BA-NOV": "BYE-NOV", + "BM": "BME", + "CBM": "CBME", + "SM": "SME", + "BQ": "BQE", + "BQ-DEC": "BQE-DEC", + "BQ-JAN": "BQE-JAN", + "BQ-FEB": "BQE-FEB", + "BQ-MAR": "BQE-MAR", + "BQ-APR": "BQE-APR", + "BQ-MAY": "BQE-MAY", + "BQ-JUN": "BQE-JUN", + "BQ-JUL": "BQE-JUL", + "BQ-AUG": "BQE-AUG", + "BQ-SEP": "BQE-SEP", + "BQ-OCT": "BQE-OCT", + "BQ-NOV": "BQE-NOV", } cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR -cdef dict c_OFFSET_DEPR_FREQSTR = OFFSET_DEPR_FREQSTR -cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = {v: k for k, v in OFFSET_DEPR_FREQSTR.items()} +cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = { + v: k for k, v in c_OFFSET_DEPR_FREQSTR.items() +} cpdef freq_to_period_freqstr(freq_n, freq_name): if freq_n == 1: @@ -290,7 +344,7 @@ cpdef freq_to_period_freqstr(freq_n, freq_name): return freqstr # Map deprecated resolution abbreviations to correct resolution abbreviations -DEPR_ABBREVS: dict[str, str]= { +cdef dict c_DEPR_ABBREVS = { "A": "Y", "a": "Y", "A-DEC": "Y-DEC", @@ -344,8 +398,6 @@ DEPR_ABBREVS: dict[str, str]= { "BAS-SEP": "BYS-SEP", "BAS-OCT": "BYS-OCT", "BAS-NOV": "BYS-NOV", - "BM": "BME", - "CBM": "CBME", "H": "h", "BH": "bh", "CBH": "cbh", @@ -359,7 +411,6 @@ DEPR_ABBREVS: dict[str, str]= { "N": "ns", "n": "ns", } -cdef dict c_DEPR_ABBREVS = DEPR_ABBREVS class FreqGroup(Enum): @@ -406,7 +457,7 @@ class Resolution(Enum): @property def attr_abbrev(self) -> str: # string that we can pass to to_offset - return _attrname_to_abbrevs[self.attrname] + return attrname_to_abbrevs[self.attrname] @property def attrname(self) -> str: @@ -450,16 +501,19 @@ class Resolution(Enum): >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ + cdef: + str abbrev try: - if freq in DEPR_ABBREVS: + if freq in c_DEPR_ABBREVS: + abbrev = c_DEPR_ABBREVS[freq] warnings.warn( f"\'{freq}\' is deprecated and will be removed in a future " - f"version. Please use \'{DEPR_ABBREVS.get(freq)}\' " + f"version. Please use \'{abbrev}\' " "instead of \'{freq}\'.", FutureWarning, stacklevel=find_stack_level(), ) - freq = DEPR_ABBREVS[freq] + freq = abbrev attr_name = _abbrev_to_attrnames[freq] except KeyError: # For quarterly and yearly resolutions, we need to chop off @@ -470,15 +524,16 @@ class Resolution(Enum): if split_freq[1] not in _month_names: # i.e. we want e.g. "Q-DEC", not "Q-INVALID" raise - if split_freq[0] in DEPR_ABBREVS: + if split_freq[0] in c_DEPR_ABBREVS: + abbrev = c_DEPR_ABBREVS[split_freq[0]] warnings.warn( f"\'{split_freq[0]}\' is deprecated and will be removed in a " - f"future version. Please use \'{DEPR_ABBREVS.get(split_freq[0])}\' " + f"future version. 
Please use \'{abbrev}\' " f"instead of \'{split_freq[0]}\'.", FutureWarning, stacklevel=find_stack_level(), ) - split_freq[0] = DEPR_ABBREVS[split_freq[0]] + split_freq[0] = abbrev attr_name = _abbrev_to_attrnames[split_freq[0]] return cls.from_attrname(attr_name) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 3416c68a23f63..65db0e05f859c 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -124,11 +124,6 @@ cdef class _NaT(datetime): return NotImplemented def __add__(self, other): - if self is not c_NaT: - # TODO(cython3): remove this it moved to __radd__ - # cython __radd__ semantics - self, other = other, self - if PyDateTime_Check(other): return c_NaT elif PyDelta_Check(other): @@ -158,14 +153,6 @@ cdef class _NaT(datetime): def __sub__(self, other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing # to subclass; allows us to @final(_Timestamp.__sub__) - cdef: - bint is_rsub = False - - if self is not c_NaT: - # cython __rsub__ semantics - # TODO(cython3): remove __rsub__ logic from here - self, other = other, self - is_rsub = True if PyDateTime_Check(other): return c_NaT @@ -180,19 +167,9 @@ cdef class _NaT(datetime): elif util.is_array(other): if other.dtype.kind == "m": - if not is_rsub: - # NaT - timedelta64 we treat NaT as datetime64, so result - # is datetime64 - result = np.empty(other.shape, dtype="datetime64[ns]") - result.fill("NaT") - return result - - # __rsub__ logic here - # TODO(cython3): remove this, move above code out of - # ``if not is_rsub`` block - # timedelta64 - NaT we have to treat NaT as timedelta64 - # for this to be meaningful, and the result is timedelta64 - result = np.empty(other.shape, dtype="timedelta64[ns]") + # NaT - timedelta64 we treat NaT as datetime64, so result + # is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") result.fill("NaT") return result diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 445b832e8a5bd..a87c3d3f0955d 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -6,25 +6,12 @@ from cpython.datetime cimport ( from numpy cimport ( int32_t, int64_t, + npy_datetime, + npy_timedelta, ) # TODO(cython3): most of these can be cimported directly from numpy -cdef extern from "numpy/ndarrayobject.h": - ctypedef int64_t npy_timedelta - ctypedef int64_t npy_datetime - -cdef extern from "numpy/ndarraytypes.h": - ctypedef struct PyArray_DatetimeMetaData: - NPY_DATETIMEUNIT base - int64_t num - -cdef extern from "numpy/arrayscalars.h": - ctypedef struct PyDatetimeScalarObject: - # PyObject_HEAD - npy_datetime obval - PyArray_DatetimeMetaData obmeta - cdef extern from "numpy/ndarraytypes.h": ctypedef struct npy_datetimestruct: int64_t year @@ -60,7 +47,7 @@ cdef extern from "pandas/datetime/pd_datetime.h": npy_datetimestruct *result) nogil npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, - npy_datetimestruct *d) nogil + npy_datetimestruct *d) except? -1 nogil void pandas_timedelta_to_timedeltastruct(npy_timedelta val, NPY_DATETIMEUNIT fr, @@ -80,15 +67,17 @@ cdef inline void import_pandas_datetime() noexcept: cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1 +cdef str dts_to_iso_string(npy_datetimestruct *dts) + cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=?) cdef int64_t pydatetime_to_dt64( datetime val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=? -) +) except? 
-1 cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts) noexcept cdef int64_t pydate_to_dt64( date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=? -) +) except? -1 cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index 0cb0e3b0237d7..c42bc43ac9d89 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -9,7 +9,7 @@ class OutOfBoundsTimedelta(ValueError): ... def py_get_unit_from_dtype(dtype: np.dtype): ... def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ... def astype_overflowsafe( - arr: np.ndarray, + values: np.ndarray, dtype: np.dtype, copy: bool = ..., round_ok: bool = ..., diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 3ffd70f83e88f..9958206c51b7a 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -22,17 +22,25 @@ from libc.stdint cimport INT64_MAX import_datetime() PandasDateTime_IMPORT +import operator + import numpy as np cimport numpy as cnp cnp.import_array() from numpy cimport ( + PyArray_DatetimeMetaData, + PyDatetimeScalarObject, int64_t, ndarray, uint8_t, ) +from pandas._libs.tslibs.dtypes cimport ( + npy_unit_to_abbrev, + npy_unit_to_attrname, +) from pandas._libs.tslibs.util cimport get_c_string_buf_and_size @@ -194,6 +202,11 @@ cdef get_implementation_bounds( raise NotImplementedError(reso) +cdef str dts_to_iso_string(npy_datetimestruct *dts): + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}") + + cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns): """Raises OutOfBoundsDatetime if the given date is outside the range that can be represented by nanosecond-resolution 64-bit integers.""" @@ -209,10 +222,9 @@ cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns): error = True if error: - fmt = (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " - f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}") - # TODO: "nanosecond" in the message assumes NPY_FR_ns - raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {fmt}") + fmt = dts_to_iso_string(dts) + attrname = npy_unit_to_attrname[unit] + raise OutOfBoundsDatetime(f"Out of bounds {attrname} timestamp: {fmt}") # ---------------------------------------------------------------------- @@ -246,12 +258,21 @@ cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts) noexcept: cdef int64_t pydatetime_to_dt64(datetime val, npy_datetimestruct *dts, - NPY_DATETIMEUNIT reso=NPY_FR_ns): + NPY_DATETIMEUNIT reso=NPY_FR_ns) except? -1: """ Note we are assuming that the datetime object is timezone-naive. """ + cdef int64_t result pydatetime_to_dtstruct(val, dts) - return npy_datetimestruct_to_datetime(reso, dts) + try: + result = npy_datetimestruct_to_datetime(reso, dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err + + return result cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept: @@ -265,9 +286,17 @@ cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept: cdef int64_t pydate_to_dt64( date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=NPY_FR_ns -): +) except? 
-1: + cdef int64_t result pydate_to_dtstruct(val, dts) - return npy_datetimestruct_to_datetime(reso, dts) + + try: + result = npy_datetimestruct_to_datetime(reso, dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime(f"Out of bounds {attrname} timestamp: {val}") from err + + return result cdef int string_to_dts( @@ -346,13 +375,10 @@ cpdef ndarray astype_overflowsafe( return values elif from_unit > to_unit: - if round_ok: - # e.g. ns -> us, so there is no risk of overflow, so we can use - # numpy's astype safely. Note there _is_ risk of truncation. - return values.astype(dtype) - else: - iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit) - return iresult2.view(dtype) + iresult2 = _astype_overflowsafe_to_smaller_unit( + values.view("i8"), from_unit, to_unit, round_ok=round_ok + ) + return iresult2.view(dtype) if (values).dtype.byteorder == ">": # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap @@ -407,7 +433,7 @@ cpdef ndarray astype_overflowsafe( return iresult.view(dtype) -# TODO: try to upstream this fix to numpy +# TODO(numpy#16352): try to upstream this fix to numpy def compare_mismatched_resolutions(ndarray left, ndarray right, op): """ Overflow-safe comparison of timedelta64/datetime64 with mismatched resolutions. @@ -465,9 +491,6 @@ def compare_mismatched_resolutions(ndarray left, ndarray right, op): return result -import operator - - cdef int op_to_op_code(op): if op is operator.eq: return Py_EQ @@ -483,13 +506,20 @@ cdef int op_to_op_code(op): return Py_GT -cdef ndarray astype_round_check( +cdef ndarray _astype_overflowsafe_to_smaller_unit( ndarray i8values, NPY_DATETIMEUNIT from_unit, - NPY_DATETIMEUNIT to_unit + NPY_DATETIMEUNIT to_unit, + bint round_ok, ): - # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion - # involves truncation, e.g. 1500ns->1us + """ + Overflow-safe conversion for cases with from_unit > to_unit, e.g. ns->us. + In addition for checking for overflows (which can occur near the lower + implementation bound, see numpy#22346), this checks for truncation, + e.g. 1500ns->1us. + """ + # e.g. test_astype_ns_to_ms_near_bounds is a case with round_ok=True where + # just using numpy's astype silently fails cdef: Py_ssize_t i, N = i8values.size @@ -512,9 +542,7 @@ cdef ndarray astype_round_check( new_value = NPY_DATETIME_NAT else: new_value, mod = divmod(value, mult) - if mod != 0: - # TODO: avoid runtime import - from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev + if not round_ok and mod != 0: from_abbrev = npy_unit_to_abbrev(from_unit) to_abbrev = npy_unit_to_abbrev(to_unit) raise ValueError( @@ -641,7 +669,12 @@ cdef int64_t _convert_reso_with_dtstruct( ) except? 
-1: cdef: npy_datetimestruct dts + int64_t result pandas_datetime_to_datetimestruct(value, from_unit, &dts) - check_dts_bounds(&dts, to_unit) - return npy_datetimestruct_to_datetime(to_unit, &dts) + try: + result = npy_datetimestruct_to_datetime(to_unit, &dts) + except OverflowError as err: + raise OutOfBoundsDatetime from err + + return result diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7f3a72178a359..445330e813d2c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -25,7 +25,6 @@ import numpy as np cimport numpy as cnp from numpy cimport ( int64_t, - is_datetime64_object, ndarray, ) @@ -43,13 +42,13 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.ccalendar import ( MONTH_ALIASES, - MONTH_TO_CAL_NUM, int_to_weekday, weekday_to_int, ) from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.ccalendar cimport ( + MONTH_TO_CAL_NUM, dayofweek, get_days_in_month, get_firstbday, @@ -102,12 +101,6 @@ cdef bint is_tick_object(object obj): return isinstance(obj, Tick) -cdef datetime _as_datetime(datetime obj): - if isinstance(obj, _Timestamp): - return obj.to_pydatetime() - return obj - - cdef bint _is_normalized(datetime dt): if dt.hour != 0 or dt.minute != 0 or dt.second != 0 or dt.microsecond != 0: # Regardless of whether dt is datetime vs Timestamp @@ -159,7 +152,7 @@ def apply_wraps(func): ): # timedelta path return func(self, other) - elif is_datetime64_object(other) or PyDate_Check(other): + elif cnp.is_datetime64_object(other) or PyDate_Check(other): # PyDate_Check includes date, datetime other = Timestamp(other) else: @@ -490,12 +483,7 @@ cdef class BaseOffset: return type(self)(n=1, normalize=self.normalize, **self.kwds) def __add__(self, other): - if not isinstance(self, BaseOffset): - # cython semantics; this is __radd__ - # TODO(cython3): remove this, this moved to __radd__ - return other.__add__(self) - - elif util.is_array(other) and other.dtype == object: + if util.is_array(other) and other.dtype == object: return np.array([self + x for x in other]) try: @@ -512,10 +500,6 @@ cdef class BaseOffset: elif type(other) is type(self): return type(self)(self.n - other.n, normalize=self.normalize, **self.kwds) - elif not isinstance(self, BaseOffset): - # TODO(cython3): remove, this moved to __rsub__ - # cython semantics, this is __rsub__ - return (-other).__add__(self) else: # e.g. 
PeriodIndex return NotImplemented @@ -529,10 +513,6 @@ cdef class BaseOffset: elif is_integer_object(other): return type(self)(n=other * self.n, normalize=self.normalize, **self.kwds) - elif not isinstance(self, BaseOffset): - # TODO(cython3): remove this, this moved to __rmul__ - # cython semantics, this is __rmul__ - return other.__mul__(self) return NotImplemented def __rmul__(self, other): @@ -1026,10 +1006,6 @@ cdef class Tick(SingleConstructorOffset): return self.delta.__gt__(other) def __mul__(self, other): - if not isinstance(self, Tick): - # TODO(cython3), remove this, this moved to __rmul__ - # cython semantics, this is __rmul__ - return other.__mul__(self) if is_float_object(other): n = other * self.n # If the new `n` is an integer, we can represent it using the @@ -1057,11 +1033,6 @@ cdef class Tick(SingleConstructorOffset): return _wrap_timedelta_result(result) def __add__(self, other): - if not isinstance(self, Tick): - # cython semantics; this is __radd__ - # TODO(cython3): remove this, this moved to __radd__ - return other.__add__(self) - if isinstance(other, Tick): if type(self) is type(other): return type(self)(self.n + other.n) @@ -1087,7 +1058,7 @@ cdef class Tick(SingleConstructorOffset): return other + self.delta elif other is NaT: return NaT - elif is_datetime64_object(other) or PyDate_Check(other): + elif cnp.is_datetime64_object(other) or PyDate_Check(other): # PyDate_Check includes date, datetime return Timestamp(other) + self @@ -1345,7 +1316,7 @@ cdef class RelativeDeltaOffset(BaseOffset): if self._use_relativedelta: if isinstance(other, _Timestamp): other_nanos = other.nanosecond - other = _as_datetime(other) + other = other.to_pydatetime(warn=False) if len(self.kwds) > 0: tzinfo = getattr(other, "tzinfo", None) @@ -2434,7 +2405,7 @@ cdef class BYearEnd(YearOffset): _outputName = "BusinessYearEnd" _default_month = 12 - _prefix = "BY" + _prefix = "BYE" _day_opt = "business_end" @@ -2693,7 +2664,7 @@ cdef class BQuarterEnd(QuarterOffset): _output_name = "BusinessQuarterEnd" _default_starting_month = 3 _from_name_starting_month = 12 - _prefix = "BQ" + _prefix = "BQE" _day_opt = "business_end" @@ -3158,7 +3129,7 @@ cdef class SemiMonthEnd(SemiMonthOffset): >>> pd.offsets.SemiMonthEnd().rollforward(ts) Timestamp('2022-01-15 00:00:00') """ - _prefix = "SM" + _prefix = "SME" _min_day_of_month = 1 def is_on_offset(self, dt: datetime) -> bool: @@ -4564,11 +4535,11 @@ prefix_mapping = { YearBegin, # 'YS' YearEnd, # 'YE' BYearBegin, # 'BYS' - BYearEnd, # 'BY' + BYearEnd, # 'BYE' BusinessDay, # 'B' BusinessMonthBegin, # 'BMS' BusinessMonthEnd, # 'BME' - BQuarterEnd, # 'BQ' + BQuarterEnd, # 'BQE' BQuarterBegin, # 'BQS' BusinessHour, # 'bh' CustomBusinessDay, # 'C' @@ -4578,7 +4549,7 @@ prefix_mapping = { MonthEnd, # 'ME' MonthBegin, # 'MS' Nano, # 'ns' - SemiMonthEnd, # 'SM' + SemiMonthEnd, # 'SME' SemiMonthBegin, # 'SMS' Week, # 'W' Second, # 's' @@ -4606,7 +4577,7 @@ _lite_rule_alias = { "YE": "YE-DEC", # YearEnd(month=12), "YS": "YS-JAN", # YearBegin(month=1), - "BY": "BY-DEC", # BYearEnd(month=12), + "BYE": "BYE-DEC", # BYearEnd(month=12), "BYS": "BYS-JAN", # BYearBegin(month=1), "Min": "min", @@ -4752,7 +4723,7 @@ cpdef to_offset(freq, bint is_period=False): for n, (sep, stride, name) in enumerate(tups): if is_period is False and name in c_OFFSET_DEPR_FREQSTR: warnings.warn( - f"\'{name}\' will be deprecated, please use " + f"\'{name}\' is deprecated, please use " f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.", FutureWarning, stacklevel=find_stack_level(), @@ 
-4764,6 +4735,9 @@ cpdef to_offset(freq, bint is_period=False): f"for Period, please use \'Y{name[2:]}\' " f"instead of \'{name}\'" ) + if (name.startswith("B") or + name.startswith("S") or name.startswith("C")): + raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) else: raise ValueError( f"for Period, please use " diff --git a/pandas/_libs/tslibs/parsing.pxd b/pandas/_libs/tslibs/parsing.pxd index 8809c81b530d0..fbe07e68f5adf 100644 --- a/pandas/_libs/tslibs/parsing.pxd +++ b/pandas/_libs/tslibs/parsing.pxd @@ -1,4 +1,5 @@ from cpython.datetime cimport datetime +from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT @@ -10,5 +11,6 @@ cdef datetime parse_datetime_string( str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 9590d7511891f..d0872a509c440 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -34,6 +34,7 @@ from numpy cimport ( PyArray_IterNew, flatiter, float64_t, + int64_t, ) cnp.import_array() @@ -51,7 +52,7 @@ from dateutil.tz import ( from pandas._config import get_option -from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.ccalendar cimport MONTH_TO_CAL_NUM from pandas._libs.tslibs.dtypes cimport ( attrname_to_npy_unit, npy_unit_to_attrname, @@ -272,8 +273,11 @@ def py_parse_datetime_string( # parse_datetime_string cpdef bc it has a pointer argument) cdef: NPY_DATETIMEUNIT out_bestunit + int64_t nanos - return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit) + return parse_datetime_string( + date_string, dayfirst, yearfirst, &out_bestunit, &nanos + ) cdef datetime parse_datetime_string( @@ -283,7 +287,8 @@ cdef datetime parse_datetime_string( str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ Parse datetime string, only returns datetime. 
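With the nanos out-parameter threaded through parse_datetime_string and dateutil_parse here, and filled by the _find_subsecond_reso helper added a little further down, sub-microsecond digits in strings that take the dateutil path are no longer silently dropped. A minimal sketch of the expected effect, assuming that behaviour; the values shown are illustrative:

import pandas as pd

# Slash-separated dates are not ISO 8601, so this string goes through the
# dateutil path rather than string_to_dts.
ts = pd.Timestamp("2023/11/01 12:00:00.000000789")
print(ts.unit)        # expected: "ns"
print(ts.nanosecond)  # expected: 789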
@@ -311,7 +316,7 @@ cdef datetime parse_datetime_string( default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) dt = dateutil_parse(date_string, default=default, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt dt = _parse_delimited_date(date_string, dayfirst, out_bestunit) @@ -330,7 +335,7 @@ cdef datetime parse_datetime_string( dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt @@ -436,7 +441,7 @@ def parse_datetime_string_with_reso( parsed = dateutil_parse(date_string, _DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=&out_bestunit) + ignoretz=False, out_bestunit=&out_bestunit, nanos=NULL) reso = npy_unit_to_attrname[out_bestunit] return parsed, reso @@ -623,7 +628,7 @@ cpdef quarter_to_myear(int year, int quarter, str freq): raise ValueError("Quarter must be 1 <= q <= 4") if freq is not None: - mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1 + mnum = MONTH_TO_CAL_NUM[get_rule_month(freq)] month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 @@ -639,7 +644,8 @@ cdef datetime dateutil_parse( bint ignoretz, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ lifted from dateutil to get resolution""" @@ -671,11 +677,8 @@ cdef datetime dateutil_parse( if reso is None: raise DateParseError(f"Unable to parse datetime string: {timestr}") - if reso == "microsecond": - if repl["microsecond"] == 0: - reso = "second" - elif repl["microsecond"] % 1000 == 0: - reso = "millisecond" + if reso == "microsecond" and repl["microsecond"] % 1000 == 0: + reso = _find_subsecond_reso(timestr, nanos=nanos) try: ret = default.replace(**repl) @@ -745,6 +748,38 @@ cdef datetime dateutil_parse( return ret +cdef object _reso_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P<frac>\d+)") + +cdef _find_subsecond_reso(str timestr, int64_t* nanos): + # GH#55737 + # Check for trailing zeros in a H:M:S.f pattern + match = _reso_pattern.search(timestr) + if not match: + reso = "second" + else: + frac = match.groupdict()["frac"] + if len(frac) <= 3: + reso = "millisecond" + elif len(frac) > 6: + if frac[6:] == "0" * len(frac[6:]): + # corner case where we haven't lost any data + reso = "nanosecond" + elif len(frac) <= 9: + reso = "nanosecond" + if nanos is not NULL: + if len(frac) < 9: + frac = frac + "0" * (9 - len(frac)) + nanos[0] = int(frac[6:]) + else: + # TODO: should we warn/raise in higher-than-nano cases? + reso = "nanosecond" + if nanos is not NULL: + nanos[0] = int(frac[6:9]) + else: + reso = "microsecond" + return reso + + # ---------------------------------------------------------------------- # Parsing for type-inference @@ -916,6 +951,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: yearfirst=False, ignoretz=False, out_bestunit=&out_bestunit, + nanos=NULL, ) except (ValueError, OverflowError, InvalidOperation): # In case the datetime can't be parsed, its format cannot be guessed diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index a4aecd2ce0a09..846d238beadbd 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -89,7 +89,7 @@ class Period(PeriodMixin): @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... 
@classmethod - def now(cls, freq: Frequency = ...) -> Period: ... + def now(cls, freq: Frequency) -> Period: ... def strftime(self, fmt: str | None) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d1f925f3a0b48..22f96f8f6c3fa 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -43,6 +43,8 @@ from pandas._libs.tslibs.dtypes cimport ( freq_to_period_freqstr, ) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + # import datetime C API import_datetime() @@ -52,7 +54,7 @@ from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_D, astype_overflowsafe, - check_dts_bounds, + dts_to_iso_string, import_pandas_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, @@ -1156,14 +1158,20 @@ cpdef int64_t period_ordinal(int y, int m, int d, int h, int min, cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: cdef: npy_datetimestruct dts + int64_t result if ordinal == NPY_NAT: return NPY_NAT get_date_info(ordinal, freq, &dts) - check_dts_bounds(&dts) - return npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_ns, &dts) + try: + result = npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_ns, &dts) + except OverflowError as err: + fmt = dts_to_iso_string(&dts) + raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {fmt}") from err + + return result cdef str period_format(int64_t value, int freq, object fmt=None): @@ -1847,13 +1855,6 @@ cdef class _Period(PeriodMixin): @cython.overflowcheck(True) def __add__(self, other): - if not is_period_object(self): - # cython semantics; this is analogous to a call to __radd__ - # TODO(cython3): remove this - if self is NaT: - return NaT - return other.__add__(self) - if is_any_td_scalar(other): return self._add_timedeltalike_scalar(other) elif is_offset_object(other): @@ -1885,14 +1886,7 @@ cdef class _Period(PeriodMixin): return self.__add__(other) def __sub__(self, other): - if not is_period_object(self): - # cython semantics; this is like a call to __rsub__ - # TODO(cython3): remove this - if self is NaT: - return NaT - return NotImplemented - - elif ( + if ( is_any_td_scalar(other) or is_offset_object(other) or util.is_integer_object(other) diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi index 4565bb7ecf959..0ec1a1e25a2b3 100644 --- a/pandas/_libs/tslibs/strptime.pyi +++ b/pandas/_libs/tslibs/strptime.pyi @@ -8,6 +8,7 @@ def array_strptime( exact: bool = ..., errors: str = ..., utc: bool = ..., + creso: int = ..., # NPY_DATETIMEUNIT ) -> tuple[np.ndarray, np.ndarray]: ... 
# first ndarray is M8[ns], second is object ndarray of tzinfo | None diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index c8fd95be34cc0..f6400ce6bae56 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -43,7 +43,6 @@ import pytz cimport numpy as cnp from numpy cimport ( int64_t, - is_datetime64_object, ndarray, ) @@ -52,6 +51,7 @@ from pandas._libs.tslibs.conversion cimport get_datetime64_nanos from pandas._libs.tslibs.dtypes cimport ( get_supported_reso, npy_unit_to_abbrev, + npy_unit_to_attrname, ) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -60,7 +60,6 @@ from pandas._libs.tslibs.nattype cimport ( from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, - check_dts_bounds, get_datetime64_unit, import_pandas_datetime, npy_datetimestruct, @@ -241,7 +240,7 @@ cdef _get_format_regex(str fmt): cdef class DatetimeParseState: - def __cinit__(self, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns): + def __cinit__(self, NPY_DATETIMEUNIT creso): # found_tz and found_naive are specifically about datetime/Timestamp # objects with and without tzinfos attached. self.found_tz = False @@ -369,7 +368,6 @@ def array_strptime( iresult[i] = pydatetime_to_dt64( val.replace(tzinfo=None), &dts, reso=creso ) - check_dts_bounds(&dts, creso) result_timezone[i] = val.tzinfo continue elif PyDate_Check(val): @@ -378,9 +376,8 @@ def array_strptime( if infer_reso: creso = state.creso iresult[i] = pydate_to_dt64(val, &dts, reso=creso) - check_dts_bounds(&dts, creso) continue - elif is_datetime64_object(val): + elif cnp.is_datetime64_object(val): item_reso = get_supported_reso(get_datetime64_unit(val)) state.update_creso(item_reso) if infer_reso: @@ -413,7 +410,13 @@ def array_strptime( state.update_creso(item_reso) if infer_reso: creso = state.creso - value = npy_datetimestruct_to_datetime(creso, &dts) + try: + value = npy_datetimestruct_to_datetime(creso, &dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[creso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err if out_local == 1: # Store the out_tzoffset in seconds # since we store the total_seconds of @@ -423,7 +426,6 @@ def array_strptime( out_local = 0 out_tzoffset = 0 iresult[i] = value - check_dts_bounds(&dts, creso) continue if parse_today_now(val, &iresult[i], utc, creso, infer_reso=infer_reso): @@ -444,11 +446,17 @@ def array_strptime( tz = _parse_with_format( val, fmt, exact, format_regex, locale_time, &dts, &item_reso ) + state.update_creso(item_reso) if infer_reso: creso = state.creso - iresult[i] = npy_datetimestruct_to_datetime(creso, &dts) - check_dts_bounds(&dts, creso) + try: + iresult[i] = npy_datetimestruct_to_datetime(creso, &dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[creso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err result_timezone[i] = tz except (ValueError, OutOfBoundsDatetime) as ex: @@ -481,10 +489,16 @@ def array_strptime( creso=state.creso, ) - # Otherwise we can use the single reso that we encountered and avoid - # a second pass. - abbrev = npy_unit_to_abbrev(state.creso) - result = iresult.base.view(f"M8[{abbrev}]") + elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". 
This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = iresult.base.view("M8[s]") + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. + abbrev = npy_unit_to_abbrev(state.creso) + result = iresult.base.view(f"M8[{abbrev}]") return result, result_timezone.base @@ -625,7 +639,7 @@ cdef tzinfo _parse_with_format( item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns # Pad to always return nanoseconds s += "0" * (9 - len(s)) - us = long(s) + us = int(s) ns = us % 1000 us = us // 1000 elif parse_code == 11: diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 181703c5f55b2..24ec6c8891a89 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -71,7 +71,7 @@ _S = TypeVar("_S", bound=timedelta) def get_unit_for_round(freq, creso: int) -> int: ... def disallow_ambiguous_unit(unit: str | None) -> None: ... def ints_to_pytimedelta( - arr: npt.NDArray[np.timedelta64], + m8values: npt.NDArray[np.timedelta64], box: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_to_timedelta64( @@ -165,8 +165,10 @@ class Timedelta(timedelta): def __gt__(self, other: timedelta) -> bool: ... def __hash__(self) -> int: ... def isoformat(self) -> str: ... - def to_numpy(self) -> np.timedelta64: ... - def view(self, dtype: npt.DTypeLike = ...) -> object: ... + def to_numpy( + self, dtype: npt.DTypeLike = ..., copy: bool = False + ) -> np.timedelta64: ... + def view(self, dtype: npt.DTypeLike) -> object: ... @property def unit(self) -> str: ... def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ... diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 0a9bd2ac2770b..5852a7d95d994 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -20,8 +20,6 @@ import numpy as np cimport numpy as cnp from numpy cimport ( int64_t, - is_datetime64_object, - is_timedelta64_object, ndarray, ) @@ -253,7 +251,7 @@ cpdef int64_t delta_to_nanoseconds( n = delta._value in_reso = delta._creso - elif is_timedelta64_object(delta): + elif cnp.is_timedelta64_object(delta): in_reso = get_datetime64_unit(delta) if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y or in_reso == NPY_DATETIMEUNIT.NPY_FR_M: raise ValueError( @@ -347,7 +345,7 @@ cdef convert_to_timedelta64(object ts, str unit): ts = ts.as_unit("ns").asm8 else: ts = np.timedelta64(ts._value, "ns") - elif is_timedelta64_object(ts): + elif cnp.is_timedelta64_object(ts): ts = ensure_td64ns(ts) elif is_integer_object(ts): if ts == NPY_NAT: @@ -367,7 +365,7 @@ cdef convert_to_timedelta64(object ts, str unit): if PyDelta_Check(ts): ts = np.timedelta64(delta_to_nanoseconds(ts), "ns") - elif not is_timedelta64_object(ts): + elif not cnp.is_timedelta64_object(ts): raise TypeError(f"Invalid type for timedelta scalar: {type(ts)}") return ts.astype("timedelta64[ns]") @@ -764,7 +762,7 @@ def _binary_op_method_timedeltalike(op, name): if other is NaT: return NaT - elif is_datetime64_object(other) or ( + elif cnp.is_datetime64_object(other) or ( PyDateTime_Check(other) and not isinstance(other, ABCTimestamp) ): # this case is for a datetime object that is specifically @@ -1853,7 +1851,7 @@ class Timedelta(_Timedelta): return cls._from_value_and_reso( new_value, reso=NPY_DATETIMEUNIT.NPY_FR_us ) - elif is_timedelta64_object(value): + elif cnp.is_timedelta64_object(value): # Retain the resolution if possible, otherwise cast to the nearest # supported resolution. 
new_value = cnp.get_timedelta64_value(value) @@ -1904,7 +1902,7 @@ class Timedelta(_Timedelta): f"float, timedelta or convertible, not {type(value).__name__}" ) - if is_timedelta64_object(value): + if cnp.is_timedelta64_object(value): value = value.view("i8") # nat @@ -2279,7 +2277,7 @@ cdef bint is_any_td_scalar(object obj): bool """ return ( - PyDelta_Check(obj) or is_timedelta64_object(obj) or is_tick_object(obj) + PyDelta_Check(obj) or cnp.is_timedelta64_object(obj) or is_tick_object(obj) ) diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index 26018cd904249..bd73c713f6c04 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -26,7 +26,7 @@ cdef class _Timestamp(ABCTimestamp): cdef bint _get_start_end_field(self, str field, freq) cdef _get_date_name_field(self, str field, object locale) - cdef int64_t _maybe_convert_value_to_local(self) + cdef int64_t _maybe_convert_value_to_local(self) except? -1 cdef bint _can_compare(self, datetime other) cpdef to_datetime64(self) cpdef datetime to_pydatetime(_Timestamp self, bint warn=*) diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index e23f01b800874..c78c174e27902 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -183,7 +183,7 @@ class Timestamp(datetime): def is_year_end(self) -> bool: ... def to_pydatetime(self, warn: bool = ...) -> datetime: ... def to_datetime64(self) -> np.datetime64: ... - def to_period(self, freq: BaseOffset | str = ...) -> Period: ... + def to_period(self, freq: BaseOffset | str | None = None) -> Period: ... def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index d0c3c5c23b272..568539b53aee0 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -16,7 +16,6 @@ import numpy as np cimport numpy as cnp from numpy cimport ( int64_t, - is_datetime64_object, ndarray, uint8_t, ) @@ -61,6 +60,7 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.dtypes cimport ( npy_unit_to_abbrev, + npy_unit_to_attrname, periods_per_day, periods_per_second, ) @@ -83,10 +83,10 @@ from pandas._libs.tslibs.nattype cimport ( from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, - check_dts_bounds, cmp_dtstructs, cmp_scalar, convert_reso, + dts_to_iso_string, get_datetime64_unit, get_unit_from_dtype, import_pandas_datetime, @@ -329,7 +329,7 @@ cdef class _Timestamp(ABCTimestamp): ots = other elif other is NaT: return op == Py_NE - elif is_datetime64_object(other): + elif cnp.is_datetime64_object(other): ots = Timestamp(other) elif PyDateTime_Check(other): if self.nanosecond == 0: @@ -448,15 +448,15 @@ cdef class _Timestamp(ABCTimestamp): nanos = other._value try: - new_value = self._value+ nanos + new_value = self._value + nanos result = type(self)._from_value_and_reso( new_value, reso=self._creso, tz=self.tzinfo ) except OverflowError as err: - # TODO: don't hard-code nanosecond here new_value = int(self._value) + int(nanos) + attrname = npy_unit_to_attrname[self._creso] raise OutOfBoundsDatetime( - f"Out of bounds nanosecond timestamp: {new_value}" + f"Out of bounds {attrname} timestamp: {new_value}" ) from err return result @@ -475,11 +475,6 @@ cdef class _Timestamp(ABCTimestamp): dtype=object, ) - elif not isinstance(self, _Timestamp): - # cython semantics, args have been switched and 
this is __radd__ - # TODO(cython3): remove this it moved to __radd__ - return other.__add__(self) - return NotImplemented def __radd__(self, other): @@ -509,17 +504,11 @@ cdef class _Timestamp(ABCTimestamp): return NotImplemented # coerce if necessary if we are a Timestamp-like - if (PyDateTime_Check(self) - and (PyDateTime_Check(other) or is_datetime64_object(other))): + if PyDateTime_Check(other) or cnp.is_datetime64_object(other): # both_timestamps is to determine whether Timedelta(self - other) # should raise the OOB error, or fall back returning a timedelta. - # TODO(cython3): clean out the bits that moved to __rsub__ - both_timestamps = (isinstance(other, _Timestamp) and - isinstance(self, _Timestamp)) - if isinstance(self, _Timestamp): - other = type(self)(other) - else: - self = type(other)(self) + both_timestamps = isinstance(other, _Timestamp) + other = type(self)(other) if (self.tzinfo is None) ^ (other.tzinfo is None): raise TypeError( @@ -536,24 +525,18 @@ cdef class _Timestamp(ABCTimestamp): # scalar Timestamp/datetime - Timestamp/datetime -> yields a # Timedelta try: - res_value = self._value- other._value + res_value = self._value - other._value return Timedelta._from_value_and_reso(res_value, self._creso) except (OverflowError, OutOfBoundsDatetime, OutOfBoundsTimedelta) as err: - if isinstance(other, _Timestamp): - if both_timestamps: - raise OutOfBoundsDatetime( - "Result is too large for pandas.Timedelta. Convert inputs " - "to datetime.datetime with 'Timestamp.to_pydatetime()' " - "before subtracting." - ) from err + if both_timestamps: + raise OutOfBoundsDatetime( + "Result is too large for pandas.Timedelta. Convert inputs " + "to datetime.datetime with 'Timestamp.to_pydatetime()' " + "before subtracting." + ) from err # We get here in stata tests, fall back to stdlib datetime # method and return stdlib timedelta object pass - elif is_datetime64_object(self): - # GH#28286 cython semantics for __rsub__, `other` is actually - # the Timestamp - # TODO(cython3): remove this, this moved to __rsub__ - return type(other)(self) - other return NotImplemented @@ -565,13 +548,13 @@ cdef class _Timestamp(ABCTimestamp): # We get here in stata tests, fall back to stdlib datetime # method and return stdlib timedelta object pass - elif is_datetime64_object(other): + elif cnp.is_datetime64_object(other): return type(self)(other) - self return NotImplemented # ----------------------------------------------------------------- - cdef int64_t _maybe_convert_value_to_local(self): + cdef int64_t _maybe_convert_value_to_local(self) except? -1: """Convert UTC i8 value to local i8 value if tz exists""" cdef: int64_t val @@ -2489,8 +2472,13 @@ default 'raise' # We can avoid going through pydatetime paths, which is robust # to datetimes outside of pydatetime range. 
ts = _TSObject() - check_dts_bounds(&dts, self._creso) - ts.value = npy_datetimestruct_to_datetime(self._creso, &dts) + try: + ts.value = npy_datetimestruct_to_datetime(self._creso, &dts) + except OverflowError as err: + fmt = dts_to_iso_string(&dts) + raise OutOfBoundsDatetime( + f"Out of bounds timestamp: {fmt} with frequency '{self.unit}'" + ) from err ts.dts = dts ts.creso = self._creso ts.fold = fold diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index a354765a348ec..2108fa0f35547 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -10,7 +10,7 @@ from pandas._typing import npt # tz_convert_from_utc_single exposed for testing def tz_convert_from_utc_single( - val: np.int64, tz: tzinfo, creso: int = ... + utc_val: np.int64, tz: tzinfo, creso: int = ... ) -> np.int64: ... def tz_localize_to_utc( vals: npt.NDArray[np.int64], diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e77a385113e93..2c4f0cd14db13 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -416,8 +416,13 @@ timedelta-like} else: delta_idx = bisect_right_i8(info.tdata, new_local, info.ntrans) - - delta_idx = delta_idx - delta_idx_offset + # Logic similar to the precompute section. But check the current + # delta in case we are moving between UTC+0 and non-zero timezone + if (shift_forward or shift_delta > 0) and \ + info.deltas[delta_idx - 1] >= 0: + delta_idx = delta_idx - 1 + else: + delta_idx = delta_idx - delta_idx_offset result[i] = new_local - info.deltas[delta_idx] elif fill_nonexist: result[i] = NPY_NAT diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index ddd07276c7a7e..e4ac3a9e167a3 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,5 +1,6 @@ from cpython.object cimport PyTypeObject +from cpython.unicode cimport PyUnicode_AsUTF8AndSize cdef extern from "Python.h": @@ -10,19 +11,15 @@ cdef extern from "Python.h": bint PyComplex_Check(object obj) nogil bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil - # TODO(cython3): cimport this, xref GH#49670 # Note that following functions can potentially raise an exception, - # thus they cannot be declared 'nogil'. Also PyUnicode_AsUTF8AndSize() can - # potentially allocate memory inside in unlikely case of when underlying - # unicode object was stored as non-utf8 and utf8 wasn't requested before. - const char* PyUnicode_AsUTF8AndSize(object obj, - Py_ssize_t* length) except NULL - + # thus they cannot be declared 'nogil'. object PyUnicode_EncodeLocale(object obj, const char *errors) nogil object PyUnicode_DecodeLocale(const char *str, const char *errors) nogil +cimport numpy as cnp from numpy cimport ( + PyArray_Check, float64_t, int64_t, is_timedelta64_object, @@ -37,7 +34,6 @@ cdef extern from "numpy/ndarrayobject.h": PyTypeObject PyBoolArrType_Type bint PyArray_IsIntegerScalar(obj) nogil - bint PyArray_Check(obj) nogil cdef extern from "numpy/npy_common.h": int64_t NPY_MIN_INT64 @@ -54,7 +50,7 @@ cdef inline bint is_integer_object(object obj) noexcept: """ Cython equivalent of - `isinstance(val, (int, long, np.integer)) and not isinstance(val, bool)` + `isinstance(val, (int, np.integer)) and not isinstance(val, (bool, np.timedelta64))` Parameters ---------- @@ -68,13 +64,13 @@ cdef inline bint is_integer_object(object obj) noexcept: ----- This counts np.timedelta64 objects as integers. 
""" - return (not PyBool_Check(obj) and PyArray_IsIntegerScalar(obj) + return (not PyBool_Check(obj) and isinstance(obj, (int, cnp.integer)) and not is_timedelta64_object(obj)) cdef inline bint is_float_object(object obj) noexcept nogil: """ - Cython equivalent of `isinstance(val, (float, np.float64))` + Cython equivalent of `isinstance(val, (float, np.floating))` Parameters ---------- @@ -90,7 +86,7 @@ cdef inline bint is_float_object(object obj) noexcept nogil: cdef inline bint is_complex_object(object obj) noexcept nogil: """ - Cython equivalent of `isinstance(val, (complex, np.complex128))` + Cython equivalent of `isinstance(val, (complex, np.complexfloating))` Parameters ---------- @@ -179,6 +175,9 @@ cdef inline const char* get_c_string_buf_and_size(str py_string, ------- buf : const char* """ + # Note PyUnicode_AsUTF8AndSize() can + # potentially allocate memory inside in unlikely case of when underlying + # unicode object was stored as non-utf8 and utf8 wasn't requested before. return PyUnicode_AsUTF8AndSize(py_string, length) diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index 3fd9e2501e611..de19f592da62b 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -31,7 +31,7 @@ def get_resolution( reso: int = ..., # NPY_DATETIMEUNIT ) -> Resolution: ... def ints_to_pydatetime( - arr: npt.NDArray[np.int64], + stamps: npt.NDArray[np.int64], tz: tzinfo | None = ..., box: str = ..., reso: int = ..., # NPY_DATETIMEUNIT diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index b926a7cb73425..a6cfbec9b15b9 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -111,8 +111,8 @@ def ewm( com: float, # float64_t adjust: bool, ignore_na: bool, - deltas: np.ndarray, # const float64_t[:] - normalize: bool, + deltas: np.ndarray | None = None, # const float64_t[:] + normalize: bool = True, ) -> np.ndarray: ... # np.ndarray[np.float64] def ewmcov( input_x: np.ndarray, # const float64_t[:] diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 832919db442d4..6f9766f32b894 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -291,24 +291,10 @@ comparison_dunder_methods = ["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"] -def reset_display_options() -> None: - """ - Reset the display options for printing and representing objects. - """ - pd.reset_option("^display.", silent=True) - - # ----------------------------------------------------------------------------- # Comparators -def equalContents(arr1, arr2) -> bool: - """ - Checks if the set of unique elements of arr1 and arr2 are equivalent. - """ - return frozenset(arr1) == frozenset(arr2) - - def box_expected(expected, box_cls, transpose: bool = True): """ Helper function to wrap the expected output of a test in a given box_class. 
@@ -1131,7 +1117,6 @@ def shares_memory(left, right) -> bool: "EMPTY_STRING_PATTERN", "ENDIAN", "ensure_clean", - "equalContents", "external_error_raised", "FLOAT_EA_DTYPES", "FLOAT_NUMPY_DTYPES", @@ -1182,7 +1167,6 @@ def shares_memory(left, right) -> bool: "NULL_OBJECTS", "OBJECT_DTYPES", "raise_assert_detail", - "reset_display_options", "raises_chained_assignment_error", "round_trip_localpath", "round_trip_pathlib", diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 11cf60ef36a9c..f11dc11f6ac0d 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -4,6 +4,7 @@ contextmanager, nullcontext, ) +import inspect import re import sys from typing import ( @@ -13,6 +14,8 @@ ) import warnings +from pandas.compat import PY311 + if TYPE_CHECKING: from collections.abc import ( Generator, @@ -179,6 +182,11 @@ def _assert_caught_no_extra_warnings( # due to these open files. if any("matplotlib" in mod for mod in sys.modules): continue + if PY311 and actual_warning.category == EncodingWarning: + # EncodingWarnings are checked in the CI + # pyproject.toml errors on EncodingWarnings in pandas + # Ignore EncodingWarnings from other libraries + continue extra_warnings.append( ( actual_warning.category.__name__, @@ -206,15 +214,14 @@ def _is_unexpected_warning( def _assert_raised_with_correct_stacklevel( actual_warning: warnings.WarningMessage, ) -> None: - from inspect import ( - getframeinfo, - stack, - ) - - caller = getframeinfo(stack()[4][0]) + # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow + frame = inspect.currentframe() + for _ in range(4): + frame = frame.f_back # type: ignore[union-attr] + caller_filename = inspect.getfile(frame) # type: ignore[arg-type] msg = ( "Warning not set with correct stacklevel. " f"File where warning is raised: {actual_warning.filename} != " - f"{caller.filename}. Warning message: {actual_warning.message}" + f"{caller_filename}. 
Warning message: {actual_warning.message}" ) - assert actual_warning.filename == caller.filename, msg + assert actual_warning.filename == caller_filename, msg diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 8e49fcfb355fa..0d8fb7bfce33a 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -43,7 +43,6 @@ Series, TimedeltaIndex, ) -from pandas.core.algorithms import take_nd from pandas.core.arrays import ( DatetimeArray, ExtensionArray, @@ -246,13 +245,6 @@ def _check_types(left, right, obj: str = "Index") -> None: assert_attr_equal("dtype", left, right, obj=obj) - def _get_ilevel_values(index, level): - # accept level number only - unique = index.levels[level] - level_codes = index.codes[level] - filled = take_nd(unique._values, level_codes, fill_value=unique._na_value) - return unique._shallow_copy(filled, name=index.names[level]) - # instance validation _check_isinstance(left, right, Index) @@ -299,9 +291,8 @@ def _get_ilevel_values(index, level): ) assert_numpy_array_equal(left.codes[level], right.codes[level]) except AssertionError: - # cannot use get_level_values here because it can change dtype - llevel = _get_ilevel_values(left, level) - rlevel = _get_ilevel_values(right, level) + llevel = left.get_level_values(level) + rlevel = right.get_level_values(level) assert_index_equal( llevel, @@ -592,7 +583,7 @@ def raise_assert_detail( {message}""" if isinstance(index_values, Index): - index_values = np.array(index_values) + index_values = np.asarray(index_values) if isinstance(index_values, np.ndarray): msg += f"\n[index]: {pprint_thing(index_values)}" diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c45c2ff6eb6df..1354c0a6db7c5 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,6 +15,8 @@ # Update install.rst & setup.cfg when updating versions! 
VERSIONS = { + "adbc-driver-postgresql": "0.8.0", + "adbc-driver-sqlite": "0.8.0", "bs4": "4.11.2", "blosc": "1.21.3", "bottleneck": "1.3.6", @@ -50,7 +52,7 @@ "zstandard": "0.19.0", "tzdata": "2022.7", "qtpy": "2.3.0", - "pyqt5": "5.15.8", + "pyqt5": "5.15.9", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 8dcb2669aa663..beb4814914101 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -14,6 +14,7 @@ pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") + pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True @@ -21,3 +22,4 @@ pa_version_under13p0 = True pa_version_under14p0 = True pa_version_under14p1 = True + pa_version_under15p0 = True diff --git a/pandas/conftest.py b/pandas/conftest.py index 35a0d3b89400f..b24606f8007d2 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -30,7 +30,6 @@ from decimal import Decimal import operator import os -from pathlib import Path from typing import ( TYPE_CHECKING, Callable, @@ -143,11 +142,13 @@ def pytest_collection_modifyitems(items, config) -> None: ("is_datetime64tz_dtype", "is_datetime64tz_dtype is deprecated"), ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ("is_sparse", "is_sparse is deprecated"), + ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"), ("NDFrame.replace", "The 'method' keyword"), ("NDFrame.replace", "Series.replace without 'value'"), ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), ("Series.idxmax", "The behavior of Series.idxmax"), + ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), # Docstring divides by zero to show behavior difference @@ -773,23 +774,6 @@ def series_with_simple_index(index) -> Series: return _create_series(index) -@pytest.fixture -def series_with_multilevel_index() -> Series: - """ - Fixture with a Series with a 2-level MultiIndex. - """ - arrays = [ - ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], - ["one", "two", "one", "two", "one", "two", "one", "two"], - ] - tuples = zip(*arrays) - index = MultiIndex.from_tuples(tuples) - data = np.random.default_rng(2).standard_normal(8) - ser = Series(data, index=index) - ser.iloc[3] = np.nan - return ser - - _narrow_series = { f"{dtype.__name__}-series": tm.make_rand_series(name="a", dtype=dtype) for dtype in tm.NARROW_NP_DTYPES @@ -863,35 +847,6 @@ def int_frame() -> DataFrame: return DataFrame(tm.getSeriesData()).astype("int64") -@pytest.fixture -def datetime_frame() -> DataFrame: - """ - Fixture for DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D'] - - A B C D - 2000-01-03 -1.122153 0.468535 0.122226 1.693711 - 2000-01-04 0.189378 0.486100 0.007864 -1.216052 - 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 - 2000-01-06 0.430050 0.894352 0.090719 0.036939 - 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 - 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 - 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 - ... ... ... ... ... 
- 2000-02-03 1.642618 -0.579288 0.046005 1.385249 - 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 - 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 - 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 - 2000-02-09 1.377373 0.398619 1.008453 -0.928207 - 2000-02-10 0.473194 -0.636677 0.984058 0.511519 - 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getTimeSeriesData()) - - @pytest.fixture def float_frame() -> DataFrame: """ @@ -921,24 +876,6 @@ def float_frame() -> DataFrame: return DataFrame(tm.getSeriesData()) -@pytest.fixture -def mixed_type_frame() -> DataFrame: - """ - Fixture for DataFrame of float/int/string columns with RangeIndex - Columns are ['a', 'b', 'c', 'float32', 'int32']. - """ - return DataFrame( - { - "a": 1.0, - "b": 2, - "c": "foo", - "float32": np.array([1.0] * 10, dtype="float32"), - "int32": np.array([1] * 10, dtype="int32"), - }, - index=np.arange(10), - ) - - @pytest.fixture def rand_series_with_duplicate_datetimeindex() -> Series: """ @@ -1172,16 +1109,6 @@ def strict_data_files(pytestconfig): return pytestconfig.getoption("--no-strict-data-files") -@pytest.fixture -def tests_path() -> Path: - return Path(__file__).parent / "tests" - - -@pytest.fixture -def tests_io_data_path(tests_path) -> Path: - return tests_path / "io" / "data" - - @pytest.fixture def datapath(strict_data_files: str) -> Callable[..., str]: """ @@ -1216,14 +1143,6 @@ def deco(*args): return deco -@pytest.fixture -def iris(datapath) -> DataFrame: - """ - The iris dataset as a DataFrame. - """ - return pd.read_csv(datapath("io", "data", "csv", "iris.csv")) - - # ---------------------------------------------------------------- # Time zones # ---------------------------------------------------------------- @@ -1903,28 +1822,6 @@ def sort_by_key(request): return request.param -@pytest.fixture() -def fsspectest(): - pytest.importorskip("fsspec") - from fsspec import register_implementation - from fsspec.implementations.memory import MemoryFileSystem - from fsspec.registry import _registry as registry - - class TestMemoryFS(MemoryFileSystem): - protocol = "testmem" - test = [None] - - def __init__(self, **kwargs) -> None: - self.test[0] = kwargs.pop("test", None) - super().__init__(**kwargs) - - register_implementation("testmem", TestMemoryFS, clobber=True) - yield TestMemoryFS() - registry.pop("testmem", None) - TestMemoryFS.test[0] = None - TestMemoryFS.store.clear() - - @pytest.fixture( params=[ ("foo", None, None), @@ -2022,6 +1919,14 @@ def warn_copy_on_write() -> bool: ) +@pytest.fixture +def using_infer_string() -> bool: + """ + Fixture to check if infer_string is enabled. + """ + return pd.options.future.infer_string + + warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index ebe2a752a12f7..ee09c9380fb0f 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -37,6 +37,7 @@ from pandas.core.indexes.base import Index from pandas.core.indexing import _iLocIndexer +from pandas.core.internals import SingleBlockManager from pandas.core.series import Series @@ -387,32 +388,40 @@ def box_series(typ, val, c): Convert a native series structure to a Series object. 
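For context on the using_infer_string fixture added to conftest.py above: a small, hypothetical usage sketch; the test name, data, and assertions are illustrative only.

    import pandas as pd

    def test_string_inference(using_infer_string):
        # The fixture simply reports pd.options.future.infer_string.
        ser = pd.Series(["a", "b"])
        if using_infer_string:
            assert ser.dtype != object  # inferred as the new string dtype
        else:
            assert ser.dtype == object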
""" series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) - class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series)) + series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr)) + mgr_const_obj = c.pyapi.unserialize( + c.pyapi.serialize_object(SingleBlockManager.from_array) + ) index_obj = c.box(typ.index, series.index) array_obj = c.box(typ.as_array, series.values) name_obj = c.box(typ.namety, series.name) - true_obj = c.pyapi.unserialize(c.pyapi.serialize_object(True)) - # This is equivalent of - # pd.Series(data=array_obj, index=index_obj, dtype=None, - # name=name_obj, copy=None, fastpath=True) - series_obj = c.pyapi.call_function_objargs( - class_obj, + # This is basically equivalent of + # pd.Series(data=array_obj, index=index_obj) + # To improve perf, we will construct the Series from a manager + # object to avoid checks. + # We'll also set the name attribute manually to avoid validation + mgr_obj = c.pyapi.call_function_objargs( + mgr_const_obj, ( array_obj, index_obj, - c.pyapi.borrow_none(), - name_obj, - c.pyapi.borrow_none(), - true_obj, ), ) + mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes") + # Series._constructor_from_mgr(mgr, axes) + series_obj = c.pyapi.call_function_objargs( + series_const_obj, (mgr_obj, mgr_axes_obj) + ) + c.pyapi.object_setattr_string(series_obj, "_name", name_obj) # Decrefs - c.pyapi.decref(class_obj) + c.pyapi.decref(series_const_obj) + c.pyapi.decref(mgr_axes_obj) + c.pyapi.decref(mgr_obj) + c.pyapi.decref(mgr_const_obj) c.pyapi.decref(index_obj) c.pyapi.decref(array_obj) c.pyapi.decref(name_obj) - c.pyapi.decref(true_obj) return series_obj diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dd45969a13fd7..b38a27a9f6d0a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1034,7 +1034,10 @@ def mode( values = _ensure_data(values) - npresult = htable.mode(values, dropna=dropna, mask=mask) + npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask) + if res_mask is not None: + return npresult, res_mask # type: ignore[return-value] + try: npresult = np.sort(npresult) except TypeError as err: @@ -1630,11 +1633,12 @@ def safe_sort( if use_na_sentinel: # take_nd is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() - new_codes = take_nd(order2, codes, fill_value=-1) if verify: mask = (codes < -len(values)) | (codes >= len(values)) + codes[mask] = 0 else: mask = None + new_codes = take_nd(order2, codes, fill_value=-1) else: reverse_indexer = np.empty(len(sorter), dtype=int) reverse_indexer.put(sorter, np.arange(len(sorter))) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4bcc03643dac8..d162b66e5d369 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -553,6 +553,18 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. 
+ if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 + if item.start == item.stop: + pass + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): + item = slice(item.start, None, item.step) + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 33b2f65340a3b..a88f40013b3f6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1938,7 +1938,7 @@ def __init__( freq = values.freq elif freq and values.freq: freq = to_offset(freq) - freq, _ = validate_inferred_freq(freq, values.freq, False) + freq = _validate_inferred_freq(freq, values.freq) if dtype is not None and dtype != values.dtype: # TODO: we only have tests for this for DTA, not TDA (2022-07-01) @@ -2025,6 +2025,39 @@ def freq(self, value) -> None: self._freq = value + @final + def _maybe_pin_freq(self, freq, validate_kwds: dict): + """ + Constructor helper to pin the appropriate `freq` attribute. Assumes + that self._freq is currently set to any freq inferred in + _from_sequence_not_strict. + """ + if freq is None: + # user explicitly passed None -> override any inferred_freq + self._freq = None + elif freq == "infer": + # if self._freq is *not* None then we already inferred a freq + # and there is nothing left to do + if self._freq is None: + # Set _freq directly to bypass duplicative _validate_frequency + # check. + self._freq = to_offset(self.inferred_freq) + elif freq is lib.no_default: + # user did not specify anything, keep inferred freq if the original + # data had one, otherwise do nothing + pass + elif self._freq is None: + # We cannot inherit a freq from the data, so we need to validate + # the user-passed freq + freq = to_offset(freq) + type(self)._validate_frequency(self, freq, **validate_kwds) + self._freq = freq + else: + # Otherwise we just need to check that the user-passed freq + # doesn't conflict with the one we already have. + freq = to_offset(freq) + _validate_inferred_freq(freq, self._freq) + @final @classmethod def _validate_frequency(cls, index, freq: BaseOffset, **kwargs): @@ -2281,8 +2314,7 @@ def _concat_same_type( return new_obj def copy(self, order: str = "C") -> Self: - # error: Unexpected keyword argument "order" for "copy" - new_obj = super().copy(order=order) # type: ignore[call-arg] + new_obj = super().copy(order=order) new_obj._freq = self.freq return new_obj @@ -2353,7 +2385,9 @@ def _is_dates_only(self) -> bool: # Shared Constructor Helpers -def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str): +def ensure_arraylike_for_datetimelike( + data, copy: bool, cls_name: str +) -> tuple[ArrayLike, bool]: if not hasattr(data, "dtype"): # e.g. 
list, tuple if not isinstance(data, (list, tuple)) and np.ndim(data) == 0: @@ -2420,15 +2454,23 @@ def validate_periods(periods: int | float | None) -> int | None: """ if periods is not None: if lib.is_float(periods): + warnings.warn( + # GH#56036 + "Non-integer 'periods' in pd.date_range, pd.timedelta_range, " + "pd.period_range, and pd.interval_range are deprecated and " + "will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) periods = int(periods) elif not lib.is_integer(periods): raise TypeError(f"periods must be a number, got {periods}") return periods -def validate_inferred_freq( - freq, inferred_freq, freq_infer -) -> tuple[BaseOffset | None, bool]: +def _validate_inferred_freq( + freq: BaseOffset | None, inferred_freq: BaseOffset | None +) -> BaseOffset | None: """ If the user passes a freq and another freq is inferred from passed data, require that they match. @@ -2437,17 +2479,10 @@ def validate_inferred_freq( ---------- freq : DateOffset or None inferred_freq : DateOffset or None - freq_infer : bool Returns ------- freq : DateOffset or None - freq_infer : bool - - Notes - ----- - We assume at this point that `maybe_infer_freq` has been called, so - `freq` is either a DateOffset object or None. """ if inferred_freq is not None: if freq is not None and freq != inferred_freq: @@ -2458,37 +2493,8 @@ def validate_inferred_freq( ) if freq is None: freq = inferred_freq - freq_infer = False - return freq, freq_infer - - -def maybe_infer_freq(freq): - """ - Comparing a DateOffset to the string "infer" raises, so we need to - be careful about comparisons. Make a dummy variable `freq_infer` to - signify the case where the given freq is "infer" and set freq to None - to avoid comparison trouble later on. - - Parameters - ---------- - freq : {DateOffset, None, str} - - Returns - ------- - freq : {DateOffset, None} - freq_infer : bool - Whether we should inherit the freq of passed data. - """ - freq_infer = False - if not isinstance(freq, BaseOffset): - # if a passed freq is None, don't infer automatically - if freq != "infer": - freq = to_offset(freq) - else: - freq_infer = True - freq = None - return freq, freq_infer + return freq def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6efff2483e1ae..d12c2e7c4ae3b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -76,6 +76,7 @@ from collections.abc import Iterator from pandas._typing import ( + ArrayLike, DateTimeErrorChoices, DtypeObj, IntervalClosedType, @@ -89,6 +90,9 @@ from pandas.core.arrays import PeriodArray +_ITER_CHUNKSIZE = 10_000 + + def tz_to_dtype( tz: tzinfo | None, unit: str = "ns" ) -> np.dtype[np.datetime64] | DatetimeTZDtype: @@ -327,13 +331,10 @@ def _from_sequence_not_strict( dayfirst: bool = False, yearfirst: bool = False, ambiguous: TimeAmbiguous = "raise", - ): + ) -> Self: """ A non-strict version of _from_sequence, called from DatetimeIndex.__new__. """ - explicit_none = freq is None - freq = freq if freq is not lib.no_default else None - freq, freq_infer = dtl.maybe_infer_freq(freq) # if the user either explicitly passes tz=None or a tz-naive dtype, we # disallows inferring a tz. 
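For context on the validate_periods deprecation added above: a hedged sketch of the user-facing effect, assuming a pandas build containing this patch; a non-integer periods argument to the range constructors is expected to emit a FutureWarning before being truncated to int.

    import warnings

    import pandas as pd

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        idx = pd.date_range("2020-01-01", periods=3.0, freq="D")  # float periods

    assert len(idx) == 3
    assert any(issubclass(w.category, FutureWarning) for w in caught)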
@@ -349,13 +350,16 @@ def _from_sequence_not_strict( unit = None if dtype is not None: - if isinstance(dtype, np.dtype): - unit = np.datetime_data(dtype)[0] - else: - # DatetimeTZDtype - unit = dtype.unit + unit = dtl.dtype_to_unit(dtype) + + data, copy = dtl.ensure_arraylike_for_datetimelike( + data, copy, cls_name="DatetimeArray" + ) + inferred_freq = None + if isinstance(data, DatetimeArray): + inferred_freq = data.freq - subarr, tz, inferred_freq = _sequence_to_dt64( + subarr, tz = _sequence_to_dt64( data, copy=copy, tz=tz, @@ -372,26 +376,15 @@ def _from_sequence_not_strict( "Use obj.tz_localize(None) instead." ) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) - if explicit_none: - freq = None - data_unit = np.datetime_data(subarr.dtype)[0] data_dtype = tz_to_dtype(tz, data_unit) - result = cls._simple_new(subarr, freq=freq, dtype=data_dtype) + result = cls._simple_new(subarr, freq=inferred_freq, dtype=data_dtype) if unit is not None and unit != result.unit: # If unit was specified in user-passed dtype, cast to it here result = result.as_unit(unit) - if inferred_freq is None and freq is not None: - # this condition precludes `freq_infer` - cls._validate_frequency(result, freq, ambiguous=ambiguous) - - elif freq_infer: - # Set _freq directly to bypass duplicative _validate_frequency - # check. - result._freq = to_offset(result.inferred_freq) - + validate_kwds = {"ambiguous": ambiguous} + result._maybe_pin_freq(freq, validate_kwds) return result # error: Signature of "_generate_range" incompatible with supertype @@ -664,7 +657,7 @@ def __iter__(self) -> Iterator: # convert in chunks of 10k for efficiency data = self.asi8 length = len(self) - chunksize = 10000 + chunksize = _ITER_CHUNKSIZE chunks = (length // chunksize) + 1 for i in range(chunks): @@ -2180,7 +2173,7 @@ def std( def _sequence_to_dt64( - data, + data: ArrayLike, *, copy: bool = False, tz: tzinfo | None = None, @@ -2192,7 +2185,8 @@ def _sequence_to_dt64( """ Parameters ---------- - data : list-like + data : np.ndarray or ExtensionArray + dtl.ensure_arraylike_for_datetimelike has already been called. copy : bool, default False tz : tzinfo or None, default None dayfirst : bool, default False @@ -2209,21 +2203,11 @@ def _sequence_to_dt64( Where `unit` is "ns" unless specified otherwise by `out_unit`. tz : tzinfo or None Either the user-provided tzinfo or one inferred from the data. - inferred_freq : Tick or None - The inferred frequency of the sequence. 
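For context on the freq handling rework above (_maybe_pin_freq plus the simplified _from_sequence_not_strict): a small sketch of the public-facing cases it centralizes, using daily data as an assumed example.

    import pandas as pd
    from pandas.tseries.frequencies import to_offset

    data = pd.date_range("2020-01-01", periods=3, freq="D")

    # freq="infer": keep the frequency already carried or inferred from the data.
    assert pd.DatetimeIndex(data, freq="infer").freq == to_offset("D")

    # An explicit freq that matches the data is validated and kept.
    assert pd.DatetimeIndex(data, freq="D").freq == to_offset("D")

    # An explicit freq=None discards any inferred frequency.
    assert pd.DatetimeIndex(data, freq=None).freq is None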
Raises ------ TypeError : PeriodDType data is passed """ - inferred_freq = None - - data, copy = dtl.ensure_arraylike_for_datetimelike( - data, copy, cls_name="DatetimeArray" - ) - - if isinstance(data, DatetimeArray): - inferred_freq = data.freq # By this point we are assured to have either a numpy array or Index data, copy = maybe_convert_dtype(data, copy, tz=tz) @@ -2236,8 +2220,10 @@ def _sequence_to_dt64( if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension + data = cast(np.ndarray, data) copy = False if lib.infer_dtype(data, skipna=False) == "integer": + # Much more performant than going through array_to_datetime data = data.astype(np.int64) elif tz is not None and ambiguous == "raise": obj_data = np.asarray(data, dtype=object) @@ -2248,7 +2234,7 @@ def _sequence_to_dt64( yearfirst=yearfirst, creso=abbrev_to_npy_unit(out_unit), ) - return result, tz, None + return result, tz else: converted, inferred_tz = objects_to_datetime64( data, @@ -2261,19 +2247,17 @@ def _sequence_to_dt64( if tz and inferred_tz: # two timezones: convert to intended from base UTC repr # GH#42505 by convention, these are _already_ UTC - assert converted.dtype == out_dtype, converted.dtype - result = converted.view(out_dtype) + result = converted elif inferred_tz: tz = inferred_tz - assert converted.dtype == out_dtype, converted.dtype - result = converted.view(out_dtype) + result = converted else: result, _ = _construct_from_dt64_naive( converted, tz=tz, copy=copy, ambiguous=ambiguous ) - return result, tz, None + return result, tz data_dtype = data.dtype @@ -2281,6 +2265,7 @@ def _sequence_to_dt64( # so we need to handle these types. if isinstance(data_dtype, DatetimeTZDtype): # DatetimeArray -> ndarray + data = cast(DatetimeArray, data) tz = _maybe_infer_tz(tz, data.tz) result = data._ndarray @@ -2289,6 +2274,7 @@ def _sequence_to_dt64( if isinstance(data, DatetimeArray): data = data._ndarray + data = cast(np.ndarray, data) result, copy = _construct_from_dt64_naive( data, tz=tz, copy=copy, ambiguous=ambiguous ) @@ -2299,6 +2285,7 @@ def _sequence_to_dt64( if data.dtype != INT64_DTYPE: data = data.astype(np.int64, copy=False) copy = False + data = cast(np.ndarray, data) result = data.view(out_dtype) if copy: @@ -2308,7 +2295,7 @@ def _sequence_to_dt64( assert result.dtype.kind == "M" assert result.dtype != "M8" assert is_supported_unit(get_unit_from_dtype(result.dtype)) - return result, tz, inferred_freq + return result, tz def _construct_from_dt64_naive( @@ -2395,6 +2382,7 @@ def objects_to_datetime64( Raises ------ ValueError : if data cannot be converted to datetimes + TypeError : When a type cannot be converted to datetime """ assert errors in ["raise", "ignore", "coerce"] @@ -2804,13 +2792,7 @@ def _generate_range( break # faster than cur + offset - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Discarding nonzero nanoseconds in conversion", - category=UserWarning, - ) - next_date = offset._apply(cur) + next_date = offset._apply(cur) next_date = next_date.as_unit(unit) if next_date <= cur: raise ValueError(f"Offset {offset} did not increment date") @@ -2825,13 +2807,7 @@ def _generate_range( break # faster than cur + offset - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Discarding nonzero nanoseconds in conversion", - category=UserWarning, - ) - next_date = offset._apply(cur) + next_date = offset._apply(cur) next_date = next_date.as_unit(unit) if 
next_date >= cur: raise ValueError(f"Offset {offset} did not decrement date") diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 91960173e7c1e..2f1363f180d08 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -391,12 +391,7 @@ def _from_sequence( @classmethod def _from_factorized(cls, values: np.ndarray, original: IntervalArray) -> Self: - if len(values) == 0: - # An empty array returns object-dtype here. We can't create - # a new IA from an (empty) object-dtype array, so turn it into the - # correct dtype. - values = values.astype(original.dtype.subtype) - return cls(values, closed=original.closed) + return cls._from_sequence(values, dtype=original.dtype) _interval_shared_docs["from_breaks"] = textwrap.dedent( """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c8447397c7bfe..58909643ed46a 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -69,6 +69,7 @@ from pandas.core.algorithms import ( factorize_array, isin, + mode, take, ) from pandas.core.array_algos import ( @@ -1069,6 +1070,15 @@ def value_counts(self, dropna: bool = True) -> Series: ) return Series(arr, index=index, name="count", copy=False) + def _mode(self, dropna: bool = True) -> Self: + if dropna: + result = mode(self._data, dropna=dropna, mask=self._mask) + res_mask = np.zeros(result.shape, dtype=np.bool_) + else: + result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) + result = type(self)(result, res_mask) # type: ignore[arg-type] + return result[result.argsort()] + @doc(ExtensionArray.equals) def equals(self, other) -> bool: if type(self) != type(other): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 25659d8738057..57b244e8d02e9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -329,26 +329,25 @@ def _from_datetime64(cls, data, freq, tz=None) -> Self: return cls(data, dtype=dtype) @classmethod - def _generate_range(cls, start, end, periods, freq, fields): + def _generate_range(cls, start, end, periods, freq): periods = dtl.validate_periods(periods) if freq is not None: freq = Period._maybe_convert_freq(freq) - field_count = len(fields) if start is not None or end is not None: - if field_count > 0: - raise ValueError( - "Can either instantiate from fields or endpoints, but not both" - ) subarr, freq = _get_ordinal_range(start, end, periods, freq) - elif field_count > 0: - subarr, freq = _range_from_fields(freq=freq, **fields) else: raise ValueError("Not enough parameters to construct Period range") return subarr, freq + @classmethod + def _from_fields(cls, *, fields: dict, freq) -> Self: + subarr, freq = _range_from_fields(freq=freq, **fields) + dtype = PeriodDtype(freq) + return cls._simple_new(subarr, dtype=dtype) + # ----------------------------------------------------------------- # DatetimeLike Interface diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index cf349220e4ba7..5db77db2a9c66 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1086,9 +1086,10 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: ) elif self.sp_index.npoints == 0: - # Avoid taking from the empty self.sp_values + # Use the old fill_value unless we took for an index of -1 _dtype = np.result_type(self.dtype.subtype, type(fill_value)) taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) + taken[old_fill_indices] = self.fill_value else: 
taken = self.sp_values.take(sp_indexer) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1c2ecd4684e2f..96ebb4901f797 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -527,6 +527,13 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return self._convert_int_dtype(result) + def _str_get_dummies(self, sep: str = "|"): + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + if len(labels) == 0: + return np.empty(shape=(0, 0), dtype=np.int64), labels + dummies = np.vstack(dummies_pa.to_numpy()) + return dummies.astype(np.int64, copy=False), labels + def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b64ab1154ef96..f55d3de8878ad 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -6,7 +6,6 @@ TYPE_CHECKING, cast, ) -import warnings import numpy as np @@ -26,10 +25,8 @@ is_supported_unit, npy_unit_to_abbrev, periods_per_second, - to_offset, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.fields import ( get_timedelta_days, get_timedelta_field, @@ -236,9 +233,7 @@ def _from_sequence(cls, data, *, dtype=None, copy: bool = False) -> Self: if dtype: dtype = _validate_td64_dtype(dtype) - data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) - freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) - freq = cast("Tick | None", freq) + data, freq = sequence_to_td64ns(data, copy=copy, unit=None) if dtype is not None: data = astype_overflowsafe(data, dtype=dtype, copy=False) @@ -256,38 +251,22 @@ def _from_sequence_not_strict( unit=None, ) -> Self: """ - A non-strict version of _from_sequence, called from TimedeltaIndex.__new__. + _from_sequence_not_strict but without responsibility for finding the + result's `freq`. """ if dtype: dtype = _validate_td64_dtype(dtype) assert unit not in ["Y", "y", "M"] # caller is responsible for checking - explicit_none = freq is None - freq = freq if freq is not lib.no_default else None - - freq, freq_infer = dtl.maybe_infer_freq(freq) - data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) - freq = cast("Tick | None", freq) - if explicit_none: - freq = None if dtype is not None: data = astype_overflowsafe(data, dtype=dtype, copy=False) - result = cls._simple_new(data, dtype=data.dtype, freq=freq) - - if inferred_freq is None and freq is not None: - # this condition precludes `freq_infer` - cls._validate_frequency(result, freq) - - elif freq_infer: - # Set _freq directly to bypass duplicative _validate_frequency - # check. 
- result._freq = to_offset(result.inferred_freq) + result = cls._simple_new(data, dtype=data.dtype, freq=inferred_freq) + result._maybe_pin_freq(freq, {}) return result # Signature of "_generate_range" incompatible with supertype @@ -1078,23 +1057,10 @@ def sequence_to_td64ns( data = data._data else: mask = np.isnan(data) - # The next few lines are effectively a vectorized 'cast_from_unit' - m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns")) - with warnings.catch_warnings(): - # Suppress RuntimeWarning about All-NaN slice - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - base = data.astype(np.int64) - frac = data - base - if p: - frac = np.round(frac, p) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") + + data = cast_from_unit_vectorized(data, unit or "ns") data[mask] = iNaT + data = data.view("m8[ns]") copy = False elif lib.is_np_dtype(data.dtype, "m"): diff --git a/pandas/core/common.py b/pandas/core/common.py index 8fd8b10c6fc32..7d864e02be54e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -42,6 +42,7 @@ from pandas.core.dtypes.generic import ( ABCExtensionArray, ABCIndex, + ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.inference import iterable_not_string @@ -121,7 +122,9 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. """ - if isinstance(key, (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray)): + if isinstance( + key, (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray) + ) and not isinstance(key, ABCMultiIndex): if key.dtype == np.object_: key_array = np.asarray(key) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index ce0c50a810ab1..0c99e5e7bdc54 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -389,6 +389,9 @@ def eval( # to use a non-numeric indexer try: with warnings.catch_warnings(record=True): + warnings.filterwarnings( + "always", "Setting a value on a view", FutureWarning + ) # TODO: Filter the warnings we actually care about here. 
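For context on the sequence_to_td64ns change above (float input now converted via cast_from_unit_vectorized): a small public-API sketch of the behaviour it backs, assuming a build with this patch.

    import numpy as np
    import pandas as pd

    # Float data plus a unit is cast in one vectorized step; NaN maps to NaT.
    tdi = pd.to_timedelta(np.array([1.5, np.nan]), unit="s")
    assert tdi[0] == pd.Timedelta(milliseconds=1500)
    assert pd.isna(tdi[1])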
if inplace and isinstance(target, NDFrame): target.loc[:, assigner] = ret diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 4fbc32e5bb103..a0a92a99abe51 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -49,6 +49,7 @@ from pandas.core.dtypes.common import ( is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -548,6 +549,10 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype + object_index = False + if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: + object_index = True + # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -601,6 +606,13 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + if ( + object_index + and using_pyarrow_string_dtype() + and is_string_dtype(subarr) + ): + # Avoid inference when string option is set + subarr = data elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 716d1a78f93c5..792fa4fdc24a5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -20,7 +20,11 @@ from pandas._config import using_pyarrow_string_dtype -from pandas._libs import lib +from pandas._libs import ( + Interval, + Period, + lib, +) from pandas._libs.missing import ( NA, NAType, @@ -357,15 +361,11 @@ def trans(x): # if we don't have any elements, just astype it return trans(result).astype(dtype) - # do a test on the first element, if it fails then we are done - r = result.ravel() - arr = np.array([r[0]]) - - if isna(arr).any(): - # if we have any nulls, then we are done - return result - - elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)): + if isinstance(result, np.ndarray): + element = result.item(0) + else: + element = result.iloc[0] + if not isinstance(element, (np.integer, np.floating, int, float, bool)): # a comparable, e.g. a Decimal may slip in here return result @@ -850,9 +850,9 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]: elif is_complex(val): dtype = np.dtype(np.complex128) - if lib.is_period(val): + if isinstance(val, Period): dtype = PeriodDtype(freq=val.freq) - elif lib.is_interval(val): + elif isinstance(val, Interval): subtype = infer_dtype_from_scalar(val.left)[0] dtype = IntervalDtype(subtype=subtype, closed=val.closed) @@ -1133,7 +1133,7 @@ def convert_dtypes( base_dtype = inferred_dtype if ( base_dtype.kind == "O" # type: ignore[union-attr] - and len(input_array) > 0 + and input_array.size > 0 and isna(input_array).all() ): import pyarrow as pa diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8760c8eeca454..a635ac77566e1 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -562,12 +562,29 @@ def _array_equivalent_datetimelike(left: np.ndarray, right: np.ndarray): def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool): - if not strict_nan: - # isna considers NaN and None to be equivalent. 
- - return lib.array_equivalent_object(ensure_object(left), ensure_object(right)) - - for left_value, right_value in zip(left, right): + left = ensure_object(left) + right = ensure_object(right) + + mask: npt.NDArray[np.bool_] | None = None + if strict_nan: + mask = isna(left) & isna(right) + if not mask.any(): + mask = None + + try: + if mask is None: + return lib.array_equivalent_object(left, right) + if not lib.array_equivalent_object(left[~mask], right[~mask]): + return False + left_remaining = left[mask] + right_remaining = right[mask] + except ValueError: + # can raise a ValueError if left and right cannot be + # compared (e.g. nested arrays) + left_remaining = left + right_remaining = right + + for left_value, right_value in zip(left_remaining, right_remaining): if left_value is NaT and right_value is not NaT: return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3696be0579b0..5d05983529fba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -62,6 +62,7 @@ InvalidIndexError, _chained_assignment_method_msg, _chained_assignment_msg, + _chained_assignment_warning_method_msg, ) from pandas.util._decorators import ( Appender, @@ -909,8 +910,8 @@ def __dataframe__( Parameters ---------- nan_as_null : bool, default False - Whether to tell the DataFrame to overwrite null values in the data - with ``NaN`` (or ``NaT``). + `nan_as_null` is DEPRECATED and has no effect. Please avoid using + it; it will be removed in a future release. allow_copy : bool, default True Whether to allow memory copying when exporting. If set to False it would cause non-zero-copy exports to fail. @@ -925,9 +926,6 @@ def __dataframe__( Details on the interchange protocol: https://data-apis.org/dataframe-protocol/latest/index.html - `nan_as_null` currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. 
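For context on the DataFrame.__dataframe__ docstring change above: a brief sketch of the interchange entry point with nan_as_null left out, since it is now documented as deprecated and ignored; only allow_copy still has an effect.

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.0]})
    xchg = df.__dataframe__(allow_copy=True)  # no need to pass nan_as_null
    assert xchg.num_columns() == 2
    assert list(xchg.column_names()) == ["A", "B"]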
- Examples -------- >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) @@ -947,7 +945,7 @@ def __dataframe__( from pandas.core.interchange.dataframe import PandasDataFrameXchg - return PandasDataFrameXchg(self, nan_as_null, allow_copy) + return PandasDataFrameXchg(self, allow_copy=allow_copy) def __dataframe_consortium_standard__( self, *, api_version: str | None = None @@ -4377,11 +4375,15 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: return self.isetitem(locs, value) - if len(value.columns) != 1: + if len(value.columns) > 1: raise ValueError( "Cannot set a DataFrame with multiple columns to the single " f"column {key}" ) + elif len(value.columns) == 0: + raise ValueError( + f"Cannot set a DataFrame without columns to the column {key}" + ) self[key] = value[value.columns[0]] @@ -4857,6 +4859,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: inplace = validate_bool_kwarg(inplace, "inplace") kwargs["level"] = kwargs.pop("level", 0) + 1 + # TODO(CoW) those index/column resolvers create unnecessary refs to `self` index_resolvers = self._get_index_resolvers() column_resolvers = self._get_cleaned_column_resolvers() resolvers = column_resolvers, index_resolvers @@ -8851,6 +8854,13 @@ def update( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) from pandas.core.computation import expressions diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7918e43b48719..e4e95a973a3c1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -100,6 +100,7 @@ SettingWithCopyError, SettingWithCopyWarning, _chained_assignment_method_msg, + _chained_assignment_warning_method_msg, ) from pandas.util._decorators import ( deprecate_nonkeyword_arguments, @@ -6428,6 +6429,18 @@ def astype( Return a copy when ``copy=True`` (be very careful setting ``copy=False`` as changes to values then may propagate to other pandas objects). + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` errors : {'raise', 'ignore'}, default 'raise' Control raising of exceptions on invalid data for provided dtype. 
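For context on the Copy-on-Write note added to the astype docstring above: a minimal sketch of the future-default behaviour it describes, opting in through the existing option.

    import pandas as pd

    pd.options.mode.copy_on_write = True  # opt in to the behaviour the note describes
    df = pd.DataFrame({"a": [1, 2, 3]})

    out = df.astype("float64", copy=False)  # under CoW the copy is deferred
    out.iloc[0, 0] = 10.0                   # writing triggers the actual copy
    assert df.loc[0, "a"] == 1              # the original frame is untouched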
@@ -6940,36 +6953,16 @@ def convert_dtypes( dtype: string """ check_dtype_backend(dtype_backend) - if self.ndim == 1: - return self._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, - dtype_backend=dtype_backend, - ) - else: - results = [ - col._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, - dtype_backend=dtype_backend, - ) - for col_name, col in self.items() - ] - if len(results) > 0: - result = concat(results, axis=1, copy=False, keys=self.columns) - cons = cast(type["DataFrame"], self._constructor) - result = cons(result) - result = result.__finalize__(self, method="convert_dtypes") - # https://github.com/python/mypy/issues/8354 - return cast(Self, result) - else: - return self.copy(deep=None) + new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr] + infer_objects=infer_objects, + convert_string=convert_string, + convert_integer=convert_integer, + convert_boolean=convert_boolean, + convert_floating=convert_floating, + dtype_backend=dtype_backend, + ) + res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + return res.__finalize__(self, method="convert_dtypes") # ---------------------------------------------------------------------- # Filling NA's @@ -7199,6 +7192,18 @@ def fillna( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) value, method = validate_fillna_kwargs(value, method) if method is not None: @@ -7469,6 +7474,18 @@ def ffill( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) return self._pad_or_backfill( "ffill", @@ -7640,6 +7657,19 @@ def bfill( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + return self._pad_or_backfill( "bfill", axis=axis, @@ -7793,6 +7823,22 @@ def replace( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # in non-CoW mode, chained Series access will populate the + # `_item_cache` which results in an increased ref count not below + # the threshold, while we still need to warn. 
We detect this case + # of a Series derived from a DataFrame through the presence of + # `_cacher` + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) if not is_bool(regex) and to_replace is not None: raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") @@ -8218,6 +8264,18 @@ def interpolate( ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) axis = self._get_axis_number(axis) @@ -9200,11 +9258,11 @@ def resample( closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' for all frequency offsets except for 'ME', 'YE', 'QE', 'BME', - 'BA', 'BQ', and 'W' which all have a default of 'right'. + 'BA', 'BQE', and 'W' which all have a default of 'right'. label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. The default is 'left' for all frequency offsets except for 'ME', 'YE', 'QE', 'BME', - 'BA', 'BQ', and 'W' which all have a default of 'right'. + 'BA', 'BQE', and 'W' which all have a default of 'right'. convention : {{'start', 'end', 's', 'e'}}, default 'start' For `PeriodIndex` only, controls whether to use the start or end of `rule`. @@ -12398,7 +12456,12 @@ def _inplace_method(self, other, op) -> Self: result = op(self, other) - if self.ndim == 1 and result._indexed_same(self) and result.dtype == self.dtype: + if ( + self.ndim == 1 + and result._indexed_same(self) + and result.dtype == self.dtype + and not using_copy_on_write() + ): # GH#36498 this inplace op can _actually_ be inplace. # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager, # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace" diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1bdba5a3e71fb..1fb412e17c4ba 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -901,6 +901,12 @@ def fillna( """ Fill NA/NaN values using the specified method within groups. + .. deprecated:: 2.2.0 + This method is deprecated and will be removed in a future version. + Use the :meth:`.SeriesGroupBy.ffill` or :meth:`.SeriesGroupBy.bfill` + for forward or backward filling instead. If you want to fill with a + single value, use :meth:`Series.fillna` instead. + Parameters ---------- value : scalar, dict, Series, or DataFrame @@ -915,17 +921,8 @@ def fillna( Method to use for filling holes. ``'ffill'`` will propagate the last valid observation forward within a group. ``'bfill'`` will use next valid observation to fill the gap. - - .. deprecated:: 2.1.0 - Use obj.ffill or obj.bfill instead. - axis : {0 or 'index', 1 or 'columns'} Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - inplace : bool, default False Broken. Do not set to True. limit : int, default None @@ -940,8 +937,6 @@ def fillna( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). - .. 
deprecated:: 2.1.0 - Returns ------- Series @@ -973,6 +968,14 @@ def fillna( mouse 0.0 dtype: float64 """ + warnings.warn( + f"{type(self).__name__}.fillna is deprecated and " + "will be removed in a future version. Use obj.ffill() or obj.bfill() " + "for forward or backward filling instead. If you want to fill with a " + f"single value, use {type(self.obj).__name__}.fillna instead", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "fillna", value=value, @@ -2401,6 +2404,12 @@ def fillna( """ Fill NA/NaN values using the specified method within groups. + .. deprecated:: 2.2.0 + This method is deprecated and will be removed in a future version. + Use the :meth:`.DataFrameGroupBy.ffill` or :meth:`.DataFrameGroupBy.bfill` + for forward or backward filling instead. If you want to fill with a + single value, use :meth:`DataFrame.fillna` instead. + Parameters ---------- value : scalar, dict, Series, or DataFrame @@ -2421,11 +2430,6 @@ def fillna( the same results as :meth:`.DataFrame.fillna`. When the :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` or ``axis=1`` here will produce the same results. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - inplace : bool, default False Broken. Do not set to True. limit : int, default None @@ -2440,8 +2444,6 @@ def fillna( or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). - .. deprecated:: 2.1.0 - Returns ------- DataFrame @@ -2516,14 +2518,14 @@ def fillna( 3 3.0 NaN 2.0 4 3.0 NaN NaN """ - if method is not None: - warnings.warn( - f"{type(self).__name__}.fillna with 'method' is deprecated and " - "will raise in a future version. Use obj.ffill() or obj.bfill() " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + warnings.warn( + f"{type(self).__name__}.fillna is deprecated and " + "will be removed in a future version. Use obj.ffill() or obj.bfill() " + "for forward or backward filling instead. If you want to fill with a " + f"single value, use {type(self.obj).__name__}.fillna instead", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "fillna", diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3412f18a40313..2d530b64e104b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2821,11 +2821,27 @@ def _value_counts( for grouping in groupings ): levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( + multi_index = MultiIndex.from_product( levels_list, names=[ping.name for ping in groupings] - ).sortlevel() + ) result_series = result_series.reindex(multi_index, fill_value=0) + if sort: + # Sort by the values + result_series = result_series.sort_values( + ascending=ascending, kind="stable" + ) + if self.sort: + # Sort by the groupings + names = result_series.index.names + # GH#55951 - Temporarily replace names in case they are integers + result_series.index.names = range(len(names)) + index_level = list(range(len(self.grouper.groupings))) + result_series = result_series.sort_index( + level=index_level, sort_remaining=False + ) + result_series.index.names = names + if normalize: # Normalize the results by dividing by the original group sizes. 
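For context on the DataFrameGroupBy/SeriesGroupBy.fillna deprecation above: a short sketch of the replacements named in the new warning message.

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, None, None]})
    gb = df.groupby("key")

    # Deprecated: gb.fillna(method="ffill") -- use the group-wise ffill/bfill:
    ffilled = gb.ffill()

    # Deprecated: gb.fillna(0.0) for a single value -- use DataFrame.fillna:
    filled = df.fillna({"val": 0.0})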
# We are guaranteed to have the first N levels be the @@ -2845,13 +2861,6 @@ def _value_counts( # Handle groups of non-observed categories result_series = result_series.fillna(0.0) - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values(ascending=ascending).sort_index( - level=index_level, sort_remaining=False - ) - result: Series | DataFrame if self.as_index: result = result_series diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1b4e14f075f22..ac4d2976593a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,6 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, + using_pyarrow_string_dtype, ) from pandas._libs import ( @@ -5352,6 +5353,16 @@ def __getitem__(self, key): else: key = np.asarray(key, dtype=bool) + if not isinstance(self.dtype, ExtensionDtype): + if len(key) == 0 and len(key) != len(self): + warnings.warn( + "Using a boolean indexer with length 0 on an Index with " + "length greater than 0 is deprecated and will raise in a " + "future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = getitem(key) # Because we ruled out integer above, we always get an arraylike here if result.ndim > 1: @@ -5814,6 +5825,19 @@ def sort_values( >>> idx.sort_values(ascending=False, return_indexer=True) (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ + if key is None and ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ): + reverse = ascending != self.is_monotonic_increasing + sorted_index = self[::-1] if reverse else self.copy() + if return_indexer: + indexer = np.arange(len(self), dtype=np.intp) + if reverse: + indexer = indexer[::-1] + return sorted_index, indexer + else: + return sorted_index + # GH 35584. Sort missing values according to na_position kwarg # ignore na_position for MultiIndex if not isinstance(self, ABCMultiIndex): @@ -6909,7 +6933,14 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - return Index._with_infer(new_values, name=self.name) + idx = Index._with_infer(new_values, name=self.name) + if ( + using_pyarrow_string_dtype() + and is_string_dtype(idx.dtype) + and new_values.dtype == object + ): + idx = idx.astype(new_values.dtype) + return idx def drop( self, diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index a3e6c50b21642..264ca8aa11495 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -518,7 +518,7 @@ def shift(self, periods: int = 1, freq=None) -> Self: # appropriate timezone from `start` and `end`, so tz does not need # to be passed explicitly. 
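For context on the Index.sort_values fast path added above: a sketch of the shortcut taken for an already-monotonic index when no key is passed.

    import numpy as np
    import pandas as pd

    idx = pd.Index([1, 10, 100, 1000])  # monotonic increasing, key=None

    sorted_idx, indexer = idx.sort_values(ascending=False, return_indexer=True)
    # The fast path reverses the index and returns a reversed arange indexer.
    assert sorted_idx.equals(pd.Index([1000, 100, 10, 1]))
    assert (indexer == np.array([3, 2, 1, 0])).all()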
result = self._data._generate_range( - start=start, end=end, periods=None, freq=self.freq + start=start, end=end, periods=None, freq=self.freq, unit=self.unit ) return type(self)._simple_new(result, name=self.name) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f73e2b5512262..4fcdb87974511 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -22,6 +22,7 @@ ) from pandas._libs.tslibs import ( BaseOffset, + Period, Timedelta, Timestamp, to_offset, @@ -42,7 +43,6 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_float, is_float_dtype, is_integer, is_integer_dtype, @@ -59,6 +59,7 @@ from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.algorithms import unique +from pandas.core.arrays.datetimelike import validate_periods from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -561,7 +562,7 @@ def _maybe_convert_i8(self, key): if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key) - if lib.is_period(key): + if isinstance(key, Period): key_i8 = key.ordinal elif isinstance(key_i8, Timestamp): key_i8 = key_i8._value @@ -1075,10 +1076,7 @@ def interval_range( if not _is_valid_endpoint(end): raise ValueError(f"end must be numeric or datetime-like, got {end}") - if is_float(periods): - periods = int(periods) - elif not is_integer(periods) and periods is not None: - raise TypeError(f"periods must be a number, got {periods}") + periods = validate_periods(periods) if freq is not None and not is_number(freq): try: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index faf058eff4bf4..b2f1933800fd3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -5,6 +5,7 @@ timedelta, ) from typing import TYPE_CHECKING +import warnings import numpy as np @@ -21,6 +22,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.dtypes import PeriodDtype @@ -97,12 +99,33 @@ class PeriodIndex(DatetimeIndexOpsMixin): freq : str or period object, optional One of pandas period strings or corresponding objects. year : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. month : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. quarter : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. day : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. hour : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. minute : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. second : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. 
dtype : str or PeriodDtype, default None Attributes @@ -135,6 +158,8 @@ class PeriodIndex(DatetimeIndexOpsMixin): asfreq strftime to_timestamp + from_fields + from_ordinals See Also -------- @@ -146,7 +171,7 @@ class PeriodIndex(DatetimeIndexOpsMixin): Examples -------- - >>> idx = pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3]) + >>> idx = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3]) >>> idx PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') """ @@ -233,6 +258,24 @@ def __new__( if not set(fields).issubset(valid_field_set): argument = next(iter(set(fields) - valid_field_set)) raise TypeError(f"__new__() got an unexpected keyword argument {argument}") + elif len(fields): + # GH#55960 + warnings.warn( + "Constructing PeriodIndex from fields is deprecated. Use " + "PeriodIndex.from_fields instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if ordinal is not None: + # GH#55960 + warnings.warn( + "The 'ordinal' keyword in PeriodIndex is deprecated and will " + "be removed in a future version. Use PeriodIndex.from_ordinals " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) name = maybe_extract_name(name, data, cls) @@ -241,14 +284,9 @@ def __new__( if not fields: # test_pickle_compat_construction cls._raise_scalar_data_error(None) + data = cls.from_fields(**fields, freq=freq)._data + copy = False - data, freq2 = PeriodArray._generate_range(None, None, None, freq, fields) - # PeriodArray._generate range does validation that fields is - # empty when really using the range-based constructor. - freq = freq2 - - dtype = PeriodDtype(freq) - data = PeriodArray(data, dtype=dtype) elif fields: if data is not None: raise ValueError("Cannot pass both data and fields") @@ -280,6 +318,39 @@ def __new__( return cls._simple_new(data, name=name, refs=refs) + @classmethod + def from_fields( + cls, + *, + year=None, + quarter=None, + month=None, + day=None, + hour=None, + minute=None, + second=None, + freq=None, + ) -> Self: + fields = { + "year": year, + "quarter": quarter, + "month": month, + "day": day, + "hour": hour, + "minute": minute, + "second": second, + } + fields = {key: value for key, value in fields.items() if value is not None} + arr = PeriodArray._from_fields(fields=fields, freq=freq) + return cls._simple_new(arr) + + @classmethod + def from_ordinals(cls, ordinals, *, freq, name=None) -> Self: + ordinals = np.asarray(ordinals, dtype=np.int64) + dtype = PeriodDtype(freq) + data = PeriodArray._simple_new(ordinals, dtype=dtype) + return cls._simple_new(data, name=name) + # ------------------------------------------------------------------------ # Data @@ -537,7 +608,7 @@ def period_range( if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)): freq = "D" - data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={}) + data, freq = PeriodArray._generate_range(start, end, periods, freq) dtype = PeriodDtype(freq) data = PeriodArray(data, dtype=dtype) return PeriodIndex(data, name=name) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 9d8ef5b0a69cd..d33e704d24c9f 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -22,7 +22,6 @@ ) from pandas.core.dtypes.generic import ABCSeries -from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import TimedeltaArray import pandas.core.common as com from pandas.core.indexes.base import ( @@ -338,7 +337,7 @@ def timedelta_range( if freq is None and 
com.any_none(periods, start, end): freq = "D" - freq, _ = dtl.maybe_infer_freq(freq) + freq = to_offset(freq) tdarr = TimedeltaArray._generate_range( start, end, periods, freq, closed=closed, unit=unit ) diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 36f374152709d..4f08b2c2b3a7b 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -27,25 +27,20 @@ class PandasDataFrameXchg(DataFrameXchg): attributes defined on this class. """ - def __init__( - self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True - ) -> None: + def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: """ Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. """ self._df = df - # ``nan_as_null`` is a keyword intended for the consumer to tell the - # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - # This currently has no effect; once support for nullable extension - # dtypes is added, this value should be propagated to columns. - self._nan_as_null = nan_as_null self._allow_copy = allow_copy def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ) -> PandasDataFrameXchg: - return PandasDataFrameXchg(self._df, nan_as_null, allow_copy) + # `nan_as_null` can be removed here once it's removed from + # Dataframe.__dataframe__ + return PandasDataFrameXchg(self._df, allow_copy) @property def metadata(self) -> dict[str, Index]: @@ -84,7 +79,7 @@ def select_columns(self, indices: Sequence[int]) -> PandasDataFrameXchg: indices = list(indices) return PandasDataFrameXchg( - self._df.iloc[:, indices], self._nan_as_null, self._allow_copy + self._df.iloc[:, indices], allow_copy=self._allow_copy ) def select_columns_by_name(self, names: list[str]) -> PandasDataFrameXchg: # type: ignore[override] @@ -93,9 +88,7 @@ def select_columns_by_name(self, names: list[str]) -> PandasDataFrameXchg: # ty if not isinstance(names, list): names = list(names) - return PandasDataFrameXchg( - self._df.loc[:, names], self._nan_as_null, self._allow_copy - ) + return PandasDataFrameXchg(self._df.loc[:, names], allow_copy=self._allow_copy) def get_chunks(self, n_chunks: int | None = None) -> Iterable[PandasDataFrameXchg]: """ @@ -109,8 +102,7 @@ def get_chunks(self, n_chunks: int | None = None) -> Iterable[PandasDataFrameXch for start in range(0, step * n_chunks, step): yield PandasDataFrameXchg( self._df.iloc[start : start + step, :], - self._nan_as_null, - self._allow_copy, + allow_copy=self._allow_copy, ) else: yield self diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 0f8cb9f053174..2eb413440ba9c 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,4 +1,4 @@ -from pandas.core.internals.api import make_block +from pandas.core.internals.api import make_block # 2023-09-18 pyarrow uses this from pandas.core.internals.array_manager import ( ArrayManager, SingleArrayManager, @@ -7,11 +7,6 @@ DataManager, SingleDataManager, ) -from pandas.core.internals.blocks import ( # io.pytables, io.packers - Block, - DatetimeTZBlock, - ExtensionBlock, -) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( BlockManager, @@ -19,9 +14,9 @@ ) __all__ = [ - "Block", - "DatetimeTZBlock", - "ExtensionBlock", + "Block", # pylint: disable=undefined-all-variable + "DatetimeTZBlock", # pylint: disable=undefined-all-variable + "ExtensionBlock", # 
pylint: disable=undefined-all-variable "make_block", "DataManager", "ArrayManager", @@ -34,33 +29,54 @@ def __getattr__(name: str): + # GH#55139 import warnings - from pandas.util._exceptions import find_stack_level - if name == "create_block_manager_from_blocks": # GH#33892 warnings.warn( f"{name} is deprecated and will be removed in a future version. " "Use public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, ) from pandas.core.internals.managers import create_block_manager_from_blocks return create_block_manager_from_blocks - if name in ["NumericBlock", "ObjectBlock"]: + if name in [ + "NumericBlock", + "ObjectBlock", + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + ]: warnings.warn( f"{name} is deprecated and will be removed in a future version. " "Use public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, ) if name == "NumericBlock": from pandas.core.internals.blocks import NumericBlock return NumericBlock + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block else: from pandas.core.internals.blocks import ObjectBlock diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index a3fd77fc8d9ea..b0b3937ca47ea 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -23,9 +23,6 @@ from pandas.core.arrays import DatetimeArray from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( - Block, - DatetimeTZBlock, - ExtensionBlock, check_ndim, ensure_block_shape, extract_pandas_array, @@ -36,6 +33,8 @@ if TYPE_CHECKING: from pandas._typing import Dtype + from pandas.core.internals.blocks import Block + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None @@ -56,6 +55,11 @@ def make_block( values, dtype = extract_pandas_array(values, dtype, ndim) + from pandas.core.internals.blocks import ( + DatetimeTZBlock, + ExtensionBlock, + ) + if klass is ExtensionBlock and isinstance(values.dtype, PeriodDtype): # GH-44681 changed PeriodArray to be stored in the 2D # NDArrayBackedExtensionBlock instead of ExtensionBlock @@ -108,21 +112,44 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int def __getattr__(name: str): + # GH#55139 import warnings - from pandas.util._exceptions import find_stack_level - - if name == "create_block_manager_from_blocks": + if name in [ + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + "create_block_manager_from_blocks", + ]: # GH#33892 warnings.warn( f"{name} is deprecated and will be removed in a future version. 
" "Use public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, ) - from pandas.core.internals.managers import create_block_manager_from_blocks - return create_block_manager_from_blocks + if name == "create_block_manager_from_blocks": + from pandas.core.internals.managers import create_block_manager_from_blocks + + return create_block_manager_from_blocks + + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block + + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock raise AttributeError( f"module 'pandas.core.internals.api' has no attribute '{name}'" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a3c2ede55dabf..535d18f99f0ef 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -33,6 +33,7 @@ from pandas._typing import ( ArrayLike, AxisInt, + DtypeBackend, DtypeObj, F, FillnaOptions, @@ -55,6 +56,7 @@ from pandas.core.dtypes.cast import ( LossySetitemError, can_hold_element, + convert_dtypes, find_result_type, maybe_downcast_to_dtype, np_can_hold_element, @@ -636,6 +638,52 @@ def convert( res_values = maybe_coerce_values(res_values) return [self.make_block(res_values, refs=refs)] + def convert_dtypes( + self, + copy: bool, + using_cow: bool, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, + dtype_backend: DtypeBackend = "numpy_nullable", + ) -> list[Block]: + if infer_objects and self.is_object: + blks = self.convert(copy=False, using_cow=using_cow) + else: + blks = [self] + + if not any( + [convert_floating, convert_integer, convert_boolean, convert_string] + ): + return [b.copy(deep=copy) for b in blks] + + rbs = [] + for blk in blks: + # Determine dtype column by column + sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split() + dtypes = [ + convert_dtypes( + b.values, + convert_string, + convert_integer, + convert_boolean, + convert_floating, + infer_objects, + dtype_backend, + ) + for b in sub_blks + ] + if all(dtype == self.dtype for dtype in dtypes): + # Avoid block splitting if no dtype changes + rbs.append(blk.copy(deep=copy)) + continue + + for dtype, b in zip(dtypes, sub_blks): + rbs.append(b.astype(dtype=dtype, copy=copy, squeeze=b.ndim != 1)) + return rbs + # --------------------------------------------------------------------- # Array-Like Methods @@ -651,6 +699,7 @@ def astype( copy: bool = False, errors: IgnoreRaise = "raise", using_cow: bool = False, + squeeze: bool = False, ) -> Block: """ Coerce to the new dtype. @@ -665,12 +714,18 @@ def astype( - ``ignore`` : suppress exceptions. On error return original object using_cow: bool, default False Signaling if copy on write copy logic is used. 
+ squeeze : bool, default False + squeeze values to ndim=1 if only one column is given Returns ------- Block """ values = self.values + if squeeze and values.ndim == 2: + if values.shape[0] != 1: + raise ValueError("Can not squeeze with more than one column.") + values = values[0, :] # type: ignore[call-overload] new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) @@ -1737,13 +1792,14 @@ def round(self, decimals: int, using_cow: bool = False) -> Self: # has no attribute "round" values = self.values.round(decimals) # type: ignore[union-attr] if values is self.values: - refs = self.refs if not using_cow: # Normally would need to do this before, but # numpy only returns same array when round operation # is no-op # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 values = values.copy() + else: + refs = self.refs return self.make_block_same_class(values, refs=refs) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 13c039cef3f91..843441e4865c7 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -464,6 +464,16 @@ def convert(self, copy: bool | None) -> Self: return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) + def convert_dtypes(self, **kwargs): + if using_copy_on_write(): + copy = False + else: + copy = True + + return self.apply( + "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs + ) + def get_values_for_csv( self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None ) -> Self: @@ -1332,7 +1342,13 @@ def column_setitem( This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ - if using_copy_on_write() and not self._has_no_reference(loc): + if warn_copy_on_write() and not self._has_no_reference(loc): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + elif using_copy_on_write() and not self._has_no_reference(loc): blkno = self.blknos[loc] # Split blocks to only copy the column we want to modify blk_loc = self.blklocs[loc] diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 894791cb46371..843c54de25bc9 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -139,7 +139,8 @@ def compute(self, method: str) -> Series: # arr passed into kth_smallest must be contiguous. 
We copy # here because kth_smallest will modify its input - kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) + # avoid OOB access with kth_smallest_c when n <= 0 + kth_val = libalgos.kth_smallest(arr.copy(order="C"), max(n - 1, 0)) (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e60c42a20a9af..20dc3f20bbb5e 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -757,6 +757,10 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= >>> nanops.nanmedian(s.values) 2.0 """ + # for floats without mask, the data already uses NaN as missing value + # indicator, and `mask` will be calculated from that below -> in those + # cases we never need to set NaN to the masked values + using_nan_sentinel = values.dtype.kind == "f" and mask is None def get_median(x, _mask=None): if _mask is None: @@ -774,7 +778,7 @@ def get_median(x, _mask=None): return res dtype = values.dtype - values, mask = _get_values(values, skipna, mask=mask, fill_value=0) + values, mask = _get_values(values, skipna, mask=mask, fill_value=None) if values.dtype.kind != "f": if values.dtype == object: # GH#34671 avoid casting strings to numeric @@ -786,7 +790,9 @@ def get_median(x, _mask=None): except ValueError as err: # e.g. "could not convert string to float: 'a'" raise TypeError(str(err)) from err - if mask is not None: + if not using_nan_sentinel and mask is not None: + if not values.flags.writeable: + values = values.copy() values[mask] = np.nan notempty = values.size diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 14cd77ec8559b..8af81cd43d62e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2130,7 +2130,7 @@ def __init__( else: freq = to_offset(freq) - end_types = {"ME", "YE", "QE", "BME", "BY", "BQ", "W"} + end_types = {"ME", "YE", "QE", "BME", "BYE", "BQE", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2327,8 +2327,8 @@ def _adjust_bin_edges( # Some hacks for > daily data, see #1471, #1458, #1483 if self.freq.name in ("BME", "ME", "W") or self.freq.name.split("-")[0] in ( - "BQ", - "BY", + "BQE", + "BYE", "QE", "YE", "W", diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 9ebce3a71c966..6963bf677bcfb 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -321,13 +321,15 @@ def get_empty_frame(data) -> DataFrame: return concat(sparse_series, axis=1, copy=False) else: - # take on axis=1 + transpose to ensure ndarray layout is column-major - eye_dtype: NpDtype + # ensure ndarray layout is column-major + shape = len(codes), number_of_cols + dummy_dtype: NpDtype if isinstance(_dtype, np.dtype): - eye_dtype = _dtype + dummy_dtype = _dtype else: - eye_dtype = np.bool_ - dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T + dummy_dtype = np.bool_ + dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F") + dummy_mat[np.arange(len(codes)), codes] = 1 if not dummy_na: # reset NaN GH4446 diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index e333d263a6b7a..bb1cd0d738dac 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -12,7 +12,6 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical from pandas.core.indexes.api import MultiIndex from 
pandas.core.reshape.concat import concat from pandas.core.reshape.util import tile_compat @@ -139,7 +138,7 @@ def melt( return result -def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame: +def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: """ Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. @@ -192,30 +191,20 @@ def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame: 2 Red Sox 2008 545 3 Yankees 2008 526 """ - if isinstance(groups, dict): - keys = list(groups.keys()) - values = list(groups.values()) - else: - keys, values = zip(*groups) - - all_cols = list(set.union(*(set(x) for x in values))) - id_cols = list(data.columns.difference(all_cols)) - - K = len(values[0]) - - for seq in values: - if len(seq) != K: - raise ValueError("All column lists must be same length") - mdata = {} pivot_cols = [] - - for target, names in zip(keys, values): + all_cols: set[Hashable] = set() + K = len(next(iter(groups.values()))) + for target, names in groups.items(): + if len(names) != K: + raise ValueError("All column lists must be same length") to_concat = [data[col]._values for col in names] mdata[target] = concat_compat(to_concat) pivot_cols.append(target) + all_cols = all_cols.union(names) + id_cols = list(data.columns.difference(all_cols)) for col in id_cols: mdata[col] = np.tile(data[col]._values, K) @@ -467,10 +456,10 @@ def wide_to_long( two 2.9 """ - def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]: + def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" pattern = re.compile(regex) - return [col for col in df.columns if pattern.match(col)] + return df.columns[df.columns.str.match(pattern)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( @@ -480,7 +469,6 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): value_name=stub.rstrip(sep), var_name=j, ) - newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) # GH17627 Cast numerics suffixes to int/float @@ -497,7 +485,7 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): else: stubnames = list(stubnames) - if any(col in stubnames for col in df.columns): + if df.columns.isin(stubnames).any(): raise ValueError("stubname can't be identical to a column name") if not is_list_like(i): @@ -508,18 +496,18 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): if df[i].duplicated().any(): raise ValueError("the id variables need to uniquely identify each row") - value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames] - - value_vars_flattened = [e for sublist in value_vars for e in sublist] - id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) + _melted = [] + value_vars_flattened = [] + for stub in stubnames: + value_var = get_var_names(df, stub, sep, suffix) + value_vars_flattened.extend(value_var) + _melted.append(melt_stub(df, stub, i, j, value_var, sep)) - _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] - melted = _melted[0].join(_melted[1:], how="outer") + melted = concat(_melted, axis=1) + id_vars = df.columns.difference(value_vars_flattened) + new = df[id_vars] if len(i) == 1: - new = df[id_vars].set_index(i).join(melted) - return new - - new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) - - return new + return new.set_index(i).join(melted) + else: + return new.merge(melted.reset_index(), 
on=i).set_index(i + [j]) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79354fdd12a2d..c39fbfe6b6d33 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -421,9 +421,10 @@ def _all_key(key): row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack(future_stack=True) - # slight hack - new_order = [len(cols)] + list(range(len(cols))) - row_margin.index = row_margin.index.reorder_levels(new_order) + # GH#26568. Use names instead of indices in case of numeric names + new_order_indices = [len(cols)] + list(range(len(cols))) + new_order_names = [row_margin.index.names[i] for i in new_order_indices] + row_margin.index = row_margin.index.reorder_levels(new_order_names) else: row_margin = data._constructor_sliced(np.nan, index=result.columns) diff --git a/pandas/core/series.py b/pandas/core/series.py index a021ea7961cc0..a9679f22f9933 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -44,6 +44,7 @@ InvalidIndexError, _chained_assignment_method_msg, _chained_assignment_msg, + _chained_assignment_warning_method_msg, ) from pandas.util._decorators import ( Appender, @@ -61,7 +62,6 @@ from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, - convert_dtypes, maybe_box_native, maybe_cast_pointwise_result, ) @@ -167,7 +167,6 @@ CorrelationMethod, DropKeep, Dtype, - DtypeBackend, DtypeObj, FilePath, Frequency, @@ -3562,6 +3561,18 @@ def update(self, other: Series | Sequence | Mapping) -> None: ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) if not isinstance(other, Series): other = Series(other) @@ -5556,39 +5567,6 @@ def between( return lmask & rmask - # ---------------------------------------------------------------------- - # Convert to types that support pd.NA - - def _convert_dtypes( - self, - infer_objects: bool = True, - convert_string: bool = True, - convert_integer: bool = True, - convert_boolean: bool = True, - convert_floating: bool = True, - dtype_backend: DtypeBackend = "numpy_nullable", - ) -> Series: - input_series = self - if infer_objects: - input_series = input_series.infer_objects() - if is_object_dtype(input_series.dtype): - input_series = input_series.copy(deep=None) - - if convert_string or convert_integer or convert_boolean or convert_floating: - inferred_dtype = convert_dtypes( - input_series._values, - convert_string, - convert_integer, - convert_boolean, - convert_floating, - infer_objects, - dtype_backend, - ) - result = input_series.astype(inferred_dtype) - else: - result = input_series.copy(deep=None) - return result - # error: Cannot determine type of 'isna' @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] def isna(self) -> Series: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 1b1d9d7640058..a431842218b3b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -98,8 +98,8 @@ def get_indexer_indexer( sort_remaining=sort_remaining, na_position=na_position, ) - elif (ascending and target.is_monotonic_increasing) or ( - not ascending and target.is_monotonic_decreasing + elif (np.all(ascending) and 
target.is_monotonic_increasing) or ( + not np.any(ascending) and target.is_monotonic_decreasing ): # Check monotonic-ness before sort an index (GH 11080) return None diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ea5e6e46f58ec..e42e89b76e6f2 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -26,12 +26,10 @@ Timestamp, astype_overflowsafe, get_unit_from_dtype, - iNaT, is_supported_unit, timezones as libtimezones, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -497,7 +495,8 @@ def _convert_listlike_datetimes( if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC - dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed)) + out_unit = np.datetime_data(result.dtype)[0] + dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit)) dt64_values = result.view(f"M8[{dtype.unit}]") dta = DatetimeArray._simple_new(dt64_values, dtype=dtype) return DatetimeIndex._simple_new(dta, name=name) @@ -551,23 +550,19 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None elif arg.dtype.kind == "f": - mult, _ = precision_from_unit(abbrev_to_npy_unit(unit)) - - mask = np.isnan(arg) | (arg == iNaT) - fvalues = (arg * mult).astype("f8", copy=False) - fvalues[mask] = 0 - - if (fvalues < Timestamp.min._value).any() or ( - fvalues > Timestamp.max._value - ).any(): - if errors != "raise": - arg = arg.astype(object) - return _to_datetime_with_unit(arg, unit, name, utc, errors) - raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - - arr = fvalues.astype("M8[ns]", copy=False) - arr[mask] = np.datetime64("NaT", "ns") - + with np.errstate(over="raise"): + try: + arr = cast_from_unit_vectorized(arg, unit=unit) + except OutOfBoundsDatetime: + if errors != "raise": + return _to_datetime_with_unit( + arg.astype(object), unit, name, utc, errors + ) + raise OutOfBoundsDatetime( + f"cannot convert input with unit '{unit}'" + ) + + arr = arr.view("M8[ns]") tz_parsed = None else: arg = arg.astype(object, copy=False) @@ -848,8 +843,8 @@ def to_datetime( to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date string), origin is set to Timestamp identified by origin. - - If a float or integer, origin is the millisecond difference - relative to 1970-01-01. + - If a float or integer, origin is the difference + (in units determined by the ``unit`` argument) relative to 1970-01-01. cache : bool, default True If :const:`True`, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 09a612eca0529..e2aa9010dc109 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -503,6 +503,19 @@ class ChainedAssignmentError(Warning): ) +_chained_assignment_warning_method_msg = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment using an inplace method.\n" + "The behavior will change in pandas 3.0. 
This inplace method will " + "never work because the intermediate object on which we are setting " + "values always behaves as a copy.\n\n" + "For example, when doing 'df[col].method(value, inplace=True)', try " + "using 'df.method({col: value}, inplace=True)' or " + "df[col] = df[col].method(value) instead, to perform " + "the operation inplace on the original object.\n\n" +) + + class NumExprClobberingError(NameError): """ Exception raised when trying to use a built-in numexpr name as a variable name. diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index a1d69deb6a21e..5786073c9d9cc 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -218,6 +218,17 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: raise ValueError(e) return frame + def _validate_usecols(self, usecols): + if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): + raise ValueError( + "The pyarrow engine does not allow 'usecols' to be integer " + "column positions. Pass a list of string column names instead." + ) + elif callable(usecols): + raise ValueError( + "The pyarrow engine does not allow 'usecols' to be a callable." + ) + def read(self) -> DataFrame: """ Reads the contents of a CSV file into a DataFrame and @@ -233,12 +244,29 @@ def read(self) -> DataFrame: pyarrow_csv = import_optional_dependency("pyarrow.csv") self._get_pyarrow_options() + try: + convert_options = pyarrow_csv.ConvertOptions(**self.convert_options) + except TypeError: + include = self.convert_options.get("include_columns", None) + if include is not None: + self._validate_usecols(include) + + nulls = self.convert_options.get("null_values", set()) + if not lib.is_list_like(nulls) or not all( + isinstance(x, str) for x in nulls + ): + raise TypeError( + "The 'pyarrow' engine requires all na_values to be strings" + ) + + raise + try: table = pyarrow_csv.read_csv( self.src, read_options=pyarrow_csv.ReadOptions(**self.read_options), parse_options=pyarrow_csv.ParseOptions(**self.parse_options), - convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), + convert_options=convert_options, ) except pa.ArrowInvalid as e: raise ParserError(e) from e diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 83d75508920a4..66990de6d3b89 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1716,7 +1716,10 @@ def _clean_options( # Converting values to NA keep_default_na = options["keep_default_na"] - na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + floatify = engine != "pyarrow" + na_values, na_fvalues = _clean_na_values( + na_values, keep_default_na, floatify=floatify + ) # handle skiprows; this is internally handled by the # c-engine, so only need for python and pyarrow parsers @@ -1928,7 +1931,7 @@ def TextParser(*args, **kwds) -> TextFileReader: return TextFileReader(*args, **kwds) -def _clean_na_values(na_values, keep_default_na: bool = True): +def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): na_fvalues: set | dict if na_values is None: if keep_default_na: @@ -1956,7 +1959,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True): else: if not is_list_like(na_values): na_values = [na_values] - na_values = _stringify_na_values(na_values) + na_values = _stringify_na_values(na_values, floatify) if keep_default_na: na_values = na_values | STR_NA_VALUES @@ -1978,7 +1981,7 @@ def 
_floatify_na_values(na_values): return result -def _stringify_na_values(na_values): +def _stringify_na_values(na_values, floatify: bool): """return a stringified and numeric for these values""" result: list[str | float] = [] for x in na_values: @@ -1993,13 +1996,15 @@ def _stringify_na_values(na_values): result.append(f"{v}.0") result.append(str(v)) - result.append(v) - except (TypeError, ValueError, OverflowError): - pass - try: - result.append(int(x)) + if floatify: + result.append(v) except (TypeError, ValueError, OverflowError): pass + if floatify: + try: + result.append(int(x)) + except (TypeError, ValueError, OverflowError): + pass return set(result) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3659e7247495d..9e0e3686e4aa2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2169,8 +2169,10 @@ def convert( # error: Incompatible types in assignment (expression has type # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type # "Union[Type[Index], Type[DatetimeIndex]]") - factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment] - ordinal=x, **kwds + factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment] + x, freq=kwds.get("freq", None) + )._rename( + kwds["name"] ) # making an Index instance could throw a number of different errors diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c77567214a692..d4b6602d9f0eb 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -32,6 +32,8 @@ import numpy as np +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -45,7 +47,10 @@ is_dict_like, is_list_like, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + DatetimeTZDtype, +) from pandas.core.dtypes.missing import isna from pandas import get_option @@ -56,6 +61,7 @@ from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.common import maybe_make_list from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime @@ -186,7 +192,7 @@ def _wrap_result( dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): - """Wrap result set of query in a DataFrame.""" + """Wrap result set of a SQLAlchemy query in a DataFrame.""" frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend) if dtype: @@ -200,6 +206,26 @@ def _wrap_result( return frame +def _wrap_result_adbc( + df: DataFrame, + *, + index_col=None, + parse_dates=None, + dtype: DtypeArg | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", +) -> DataFrame: + """Wrap result set of a SQLAlchemy query in a DataFrame.""" + if dtype: + df = df.astype(dtype) + + df = _parse_date_columns(df, parse_dates) + + if index_col is not None: + df = df.set_index(index_col) + + return df + + def execute(sql, con, params=None): """ Execute the given SQL query using the provided connection object. @@ -559,12 +585,13 @@ def read_sql( ---------- sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. - con : SQLAlchemy connectable, str, or sqlite3 connection + con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection + ADBC provides high performance I/O with native type support, where available. 
Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible - for engine disposal and connection closure for the SQLAlchemy connectable; str - connections are closed automatically. See - `here `_. + for engine disposal and connection closure for the ADBC connection and + SQLAlchemy connectable; str connections are closed automatically. See + `here `_. index_col : str or list of str, optional, default: None Column(s) to set as index(MultiIndex). coerce_float : bool, default True @@ -648,6 +675,17 @@ def read_sql( int_column date_column 0 0 2012-11-10 1 1 2010-11-12 + + .. versionadded:: 2.2.0 + + pandas now supports reading via ADBC drivers + + >>> from adbc_driver_postgresql import dbapi + >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP + ... pd.read_sql('SELECT int_column FROM test_data', conn) + int_column + 0 0 + 1 1 """ check_dtype_backend(dtype_backend) @@ -719,8 +757,9 @@ def to_sql( frame : DataFrame, Series name : str Name of SQL table. - con : SQLAlchemy connectable(engine/connection) or database string URI + con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection or sqlite3 DBAPI2 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -775,7 +814,8 @@ def to_sql( Notes ----- The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor`` - or SQLAlchemy connectable. The returned value may not reflect the exact number of written + or SQLAlchemy connectable. If using ADBC the returned rows are the result + of ``Cursor.adbc_ingest``. The returned value may not reflect the exact number of written rows as stipulated in the `sqlite3 `__ or `SQLAlchemy `__ @@ -814,7 +854,8 @@ def has_table(table_name: str, con, schema: str | None = None) -> bool: ---------- table_name: string Name of SQL table. - con: SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection + con: ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -856,6 +897,10 @@ def pandasSQL_builder( if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)): return SQLDatabase(con, schema, need_transaction) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(con, adbc.Connection): + return ADBCDatabase(con) + warnings.warn( "pandas only supports SQLAlchemy connectable (engine/connection) or " "database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 " @@ -2034,6 +2079,356 @@ def _create_sql_schema( # ---- SQL without SQLAlchemy --- + + +class ADBCDatabase(PandasSQL): + """ + This class enables conversion between DataFrame and SQL databases + using ADBC to handle DataBase abstraction. 
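[Editor's note, not part of the diff: the ADBC path is selected by pandasSQL_builder whenever the connection is an adbc_driver_manager.dbapi.Connection, so it is exercised through the existing read_sql/to_sql entry points. A minimal sketch, assuming the optional adbc-driver-sqlite package is installed and that its dbapi.connect() with no arguments opens an in-memory database.]

import pandas as pd
from adbc_driver_sqlite import dbapi  # optional ADBC driver, assumed installed

df = pd.DataFrame({"int_column": [0, 1], "date_column": ["2012-11-10", "2010-11-12"]})

with dbapi.connect() as conn:
    # DataFrame.to_sql dispatches to ADBCDatabase.to_sql, which converts the frame
    # to a pyarrow.Table, ingests it via Cursor.adbc_ingest, and commits.
    df.to_sql("test_data", conn, index=False)
    # read_sql with a query string dispatches to ADBCDatabase.read_query, fetching
    # an Arrow table and converting it with the requested dtype_backend mapping.
    result = pd.read_sql("SELECT int_column FROM test_data", conn)

print(result)
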
+ + Parameters + ---------- + con : adbc_driver_manager.dbapi.Connection + """ + + def __init__(self, con) -> None: + self.con = con + + @contextmanager + def run_transaction(self): + with self.con.cursor() as cur: + try: + yield cur + except Exception: + self.con.rollback() + raise + self.con.commit() + + def execute(self, sql: str | Select | TextClause, params=None): + if not isinstance(sql, str): + raise TypeError("Query must be a string unless using sqlalchemy.") + args = [] if params is None else [params] + cur = self.con.cursor() + try: + cur.execute(sql, *args) + return cur + except Exception as exc: + try: + self.con.rollback() + except Exception as inner_exc: # pragma: no cover + ex = DatabaseError( + f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" + ) + raise ex from inner_exc + + ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}") + raise ex from exc + + def read_table( + self, + table_name: str, + index_col: str | list[str] | None = None, + coerce_float: bool = True, + parse_dates=None, + columns=None, + schema: str | None = None, + chunksize: int | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", + ) -> DataFrame | Iterator[DataFrame]: + """ + Read SQL database table into a DataFrame. + + Parameters + ---------- + table_name : str + Name of SQL table in database. + coerce_float : bool, default True + Raises NotImplementedError + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg}``, where the arg corresponds + to the keyword arguments of :func:`pandas.to_datetime`. + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table. + schema : string, default None + Name of SQL schema in database to query (if database flavor + supports this). If specified, this overwrites the default + schema of the SQL database object. + chunksize : int, default None + Raises NotImplementedError + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. 
versionadded:: 2.0 + + Returns + ------- + DataFrame + + See Also + -------- + pandas.read_sql_table + SQLDatabase.read_query + + """ + if coerce_float is not True: + raise NotImplementedError( + "'coerce_float' is not implemented for ADBC drivers" + ) + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + + if columns: + if index_col: + index_select = maybe_make_list(index_col) + else: + index_select = [] + to_select = index_select + columns + select_list = ", ".join(f'"{x}"' for x in to_select) + else: + select_list = "*" + if schema: + stmt = f"SELECT {select_list} FROM {schema}.{table_name}" + else: + stmt = f"SELECT {select_list} FROM {table_name}" + + mapping: type[ArrowDtype] | None | Callable + if dtype_backend == "pyarrow": + mapping = ArrowDtype + elif dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping().get + elif using_pyarrow_string_dtype(): + from pandas.io._util import arrow_string_types_mapper + + arrow_string_types_mapper() + else: + mapping = None + + with self.con.cursor() as cur: + cur.execute(stmt) + df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + + return _wrap_result_adbc( + df, + index_col=index_col, + parse_dates=parse_dates, + ) + + def read_query( + self, + sql: str, + index_col: str | list[str] | None = None, + coerce_float: bool = True, + parse_dates=None, + params=None, + chunksize: int | None = None, + dtype: DtypeArg | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", + ) -> DataFrame | Iterator[DataFrame]: + """ + Read SQL query into a DataFrame. + + Parameters + ---------- + sql : str + SQL query to be executed. + index_col : string, optional, default: None + Column name to use as index for the returned DataFrame object. + coerce_float : bool, default True + Raises NotImplementedError + params : list, tuple or dict, optional, default: None + Raises NotImplementedError + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict + corresponds to the keyword arguments of + :func:`pandas.to_datetime` Especially useful with databases + without native Datetime support, such as SQLite. + chunksize : int, default None + Raises NotImplementedError + dtype : Type name or dict of columns + Data type for data or columns. E.g. np.float64 or + {'a': np.float64, 'b': np.int32, 'c': 'Int64'} + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. 
+ read_sql + + """ + if coerce_float is not True: + raise NotImplementedError( + "'coerce_float' is not implemented for ADBC drivers" + ) + if params: + raise NotImplementedError("'params' is not implemented for ADBC drivers") + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + + mapping: type[ArrowDtype] | None | Callable + if dtype_backend == "pyarrow": + mapping = ArrowDtype + elif dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping().get + else: + mapping = None + + with self.con.cursor() as cur: + cur.execute(sql) + df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + + return _wrap_result_adbc( + df, + index_col=index_col, + parse_dates=parse_dates, + dtype=dtype, + ) + + read_sql = read_query + + def to_sql( + self, + frame, + name: str, + if_exists: Literal["fail", "replace", "append"] = "fail", + index: bool = True, + index_label=None, + schema: str | None = None, + chunksize: int | None = None, + dtype: DtypeArg | None = None, + method: Literal["multi"] | Callable | None = None, + engine: str = "auto", + **engine_kwargs, + ) -> int | None: + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Raises NotImplementedError + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. 
+ chunksize : int, default None + Raises NotImplementedError + dtype : single type or dict of column name to SQL type, default None + Raises NotImplementedError + method : {None', 'multi', callable}, default None + Raises NotImplementedError + engine : {'auto', 'sqlalchemy'}, default 'auto' + Raises NotImplementedError if not set to 'auto' + """ + if index_label: + raise NotImplementedError( + "'index_label' is not implemented for ADBC drivers" + ) + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + if dtype: + raise NotImplementedError("'dtype' is not implemented for ADBC drivers") + if method: + raise NotImplementedError("'method' is not implemented for ADBC drivers") + if engine != "auto": + raise NotImplementedError( + "engine != 'auto' not implemented for ADBC drivers" + ) + + if schema: + table_name = f"{schema}.{name}" + else: + table_name = name + + # pandas if_exists="append" will still create the + # table if it does not exist; ADBC is more explicit with append/create + # as applicable modes, so the semantics get blurred across + # the libraries + mode = "create" + if self.has_table(name, schema): + if if_exists == "fail": + raise ValueError(f"Table '{table_name}' already exists.") + elif if_exists == "replace": + with self.con.cursor() as cur: + cur.execute(f"DROP TABLE {table_name}") + elif if_exists == "append": + mode = "append" + + import pyarrow as pa + + try: + tbl = pa.Table.from_pandas(frame, preserve_index=index) + except pa.ArrowNotImplementedError as exc: + raise ValueError("datatypes not supported") from exc + + with self.con.cursor() as cur: + total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) + + self.con.commit() + return total_inserted + + def has_table(self, name: str, schema: str | None = None) -> bool: + meta = self.con.adbc_get_objects( + db_schema_filter=schema, table_name_filter=name + ).read_all() + + for catalog_schema in meta["catalog_db_schemas"].to_pylist(): + if not catalog_schema: + continue + for schema_record in catalog_schema: + if not schema_record: + continue + + for table_record in schema_record["db_schema_tables"]: + if table_record["table_name"] == name: + return True + + return False + + def _create_sql_schema( + self, + frame: DataFrame, + table_name: str, + keys: list[str] | None = None, + dtype: DtypeArg | None = None, + schema: str | None = None, + ): + raise NotImplementedError("not implemented for adbc") + + # sqlite-specific sql strings and handler class # dictionary used for readability purposes _SQL_TYPES = { @@ -2507,9 +2902,10 @@ def get_schema( name of SQL table keys : string or sequence, default: None columns to use a primary key - con: an open SQL database connection object or a SQLAlchemy connectable + con: ADBC Connection, SQLAlchemy connectable, sqlite3 connection, default: None + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that - library, default: None + library If a DBAPI2 object, only sqlite3 is supported. dtype : dict of column name to SQL type, default None Optional specifying the datatype for columns. 
The SQL type should diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 05e6d2bcfb46a..d2b76decaa75d 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -15,6 +15,7 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_dict_like +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import remove_na_arraylike import pandas as pd @@ -362,7 +363,7 @@ def boxplot( if return_type not in BoxPlot._valid_return_types: raise ValueError("return_type must be {'axes', 'dict', 'both'}") - if isinstance(data, pd.Series): + if isinstance(data, ABCSeries): data = data.to_frame("x") column = "x" diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 5a7ceabbf554e..0eb3318ac96c5 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -64,6 +64,8 @@ if TYPE_CHECKING: from collections.abc import Generator + from matplotlib.axis import Axis + from pandas._libs.tslibs.offsets import BaseOffset @@ -187,7 +189,7 @@ class TimeFormatter(Formatter): def __init__(self, locs) -> None: self.locs = locs - def __call__(self, x, pos: int = 0) -> str: + def __call__(self, x, pos: int | None = 0) -> str: """ Return the time of day as a formatted string. @@ -364,8 +366,14 @@ def get_locator(self, dmin, dmax): locator = MilliSecondLocator(self.tz) locator.set_axis(self.axis) - locator.axis.set_view_interval(*self.axis.get_view_interval()) - locator.axis.set_data_interval(*self.axis.get_data_interval()) + # error: Item "None" of "Axis | _DummyAxis | _AxisWrapper | None" + # has no attribute "get_data_interval" + locator.axis.set_view_interval( # type: ignore[union-attr] + *self.axis.get_view_interval() # type: ignore[union-attr] + ) + locator.axis.set_data_interval( # type: ignore[union-attr] + *self.axis.get_data_interval() # type: ignore[union-attr] + ) return locator return mdates.AutoDateLocator.get_locator(self, dmin, dmax) @@ -950,6 +958,8 @@ class TimeSeries_DateLocator(Locator): day : {int}, optional """ + axis: Axis + def __init__( self, freq: BaseOffset, @@ -973,10 +983,7 @@ def __init__( def _get_default_locs(self, vmin, vmax): """Returns the default locations of ticks.""" - if self.plot_obj.date_axis_info is None: - self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) - - locator = self.plot_obj.date_axis_info + locator = self.finder(vmin, vmax, self.freq) if self.isminor: return np.compress(locator["min"], locator["val"]) @@ -987,9 +994,6 @@ def __call__(self): # axis calls Locator.set_axis inside set_m_formatter vi = tuple(self.axis.get_view_interval()) - if vi != self.plot_obj.view_interval: - self.plot_obj.date_axis_info = None - self.plot_obj.view_interval = vi vmin, vmax = vi if vmax < vmin: vmin, vmax = vmax, vmin @@ -999,7 +1003,9 @@ def __call__(self): base = self.base (d, m) = divmod(vmin, base) vmin = (d + 1) * base - locs = list(range(vmin, vmax + 1, base)) + # error: No overload variant of "range" matches argument types "float", + # "float", "int" + locs = list(range(vmin, vmax + 1, base)) # type: ignore[call-overload] return locs def autoscale(self): @@ -1038,6 +1044,8 @@ class TimeSeries_DateFormatter(Formatter): Whether the formatter works in dynamic mode or not. 
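[Editor's note, not part of the diff: the locator/formatter classes touched here are not called directly; they are installed when a Series backed by a PeriodIndex or DatetimeIndex is plotted. A minimal, hedged sketch of how they get exercised.]

import matplotlib.pyplot as plt
import pandas as pd

s = pd.Series(range(12), index=pd.period_range("2020-01", periods=12, freq="M"))

# Plotting a PeriodIndex-backed Series routes tick placement and labelling
# through TimeSeries_DateLocator / TimeSeries_DateFormatter in
# pandas/plotting/_matplotlib/converter.py.
s.plot()
plt.show()
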
""" + axis: Axis + def __init__( self, freq: BaseOffset, @@ -1058,9 +1066,7 @@ def __init__( def _set_default_format(self, vmin, vmax): """Returns the default ticks spacing.""" - if self.plot_obj.date_axis_info is None: - self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) - info = self.plot_obj.date_axis_info + info = self.finder(vmin, vmax, self.freq) if self.isminor: format = np.compress(info["min"] & np.logical_not(info["maj"]), info) @@ -1076,15 +1082,12 @@ def set_locs(self, locs) -> None: self.locs = locs - (vmin, vmax) = vi = tuple(self.axis.get_view_interval()) - if vi != self.plot_obj.view_interval: - self.plot_obj.date_axis_info = None - self.plot_obj.view_interval = vi + (vmin, vmax) = tuple(self.axis.get_view_interval()) if vmax < vmin: (vmin, vmax) = (vmax, vmin) self._set_default_format(vmin, vmax) - def __call__(self, x, pos: int = 0) -> str: + def __call__(self, x, pos: int | None = 0) -> str: if self.formatdict is None: return "" else: @@ -1107,6 +1110,8 @@ class TimeSeries_TimedeltaFormatter(Formatter): Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. """ + axis: Axis + @staticmethod def format_timedelta_ticks(x, pos, n_decimals: int) -> str: """ @@ -1124,7 +1129,7 @@ def format_timedelta_ticks(x, pos, n_decimals: int) -> str: s = f"{int(d):d} days {s}" return s - def __call__(self, x, pos: int = 0) -> str: + def __call__(self, x, pos: int | None = 0) -> str: (vmin, vmax) = tuple(self.axis.get_view_interval()) n_decimals = min(int(np.ceil(np.log10(100 * 10**9 / abs(vmax - vmin)))), 9) return self.format_timedelta_ticks(x, pos, n_decimals) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 76b68c6b03dd2..7096c1281a1b6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -504,7 +504,7 @@ def generate(self) -> None: self._adorn_subplots(fig) for ax in self.axes: - self._post_plot_logic_common(ax, self.data) + self._post_plot_logic_common(ax) self._post_plot_logic(ax, self.data) @final @@ -529,9 +529,16 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: # otherwise, create twin axes orig_ax, new_ax = ax, ax.twinx() # TODO: use Matplotlib public API when available - new_ax._get_lines = orig_ax._get_lines - new_ax._get_patches_for_fill = orig_ax._get_patches_for_fill - orig_ax.right_ax, new_ax.left_ax = new_ax, orig_ax + new_ax._get_lines = orig_ax._get_lines # type: ignore[attr-defined] + # TODO #54485 + new_ax._get_patches_for_fill = ( # type: ignore[attr-defined] + orig_ax._get_patches_for_fill # type: ignore[attr-defined] + ) + # TODO #54485 + orig_ax.right_ax, new_ax.left_ax = ( # type: ignore[attr-defined] + new_ax, + orig_ax, + ) if not self._has_plotted_object(orig_ax): # no data on left y orig_ax.get_yaxis().set_visible(False) @@ -540,7 +547,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: new_ax.set_yscale("log") elif self.logy == "sym" or self.loglog == "sym": new_ax.set_yscale("symlog") - return new_ax + return new_ax # type: ignore[return-value] @final @cache_readonly @@ -690,7 +697,7 @@ def _compute_plot_data(self): if is_empty: raise TypeError("no numeric data to plot") - self.data = numeric_data.apply(self._convert_to_ndarray) + self.data = numeric_data.apply(type(self)._convert_to_ndarray) def _make_plot(self, fig: Figure): raise AbstractMethodError(self) @@ -707,21 +714,29 @@ def _add_table(self) -> None: tools.table(ax, data) @final - def _post_plot_logic_common(self, ax: Axes, data) -> None: + def 
_post_plot_logic_common(self, ax: Axes) -> None: """Common post process for each axes""" if self.orientation == "vertical" or self.orientation is None: - self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize) - self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + ax.xaxis, rot=self.rot, fontsize=self.fontsize + ) + type(self)._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) if hasattr(ax, "right_ax"): - self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + ax.right_ax.yaxis, fontsize=self.fontsize + ) elif self.orientation == "horizontal": - self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) - self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + ax.yaxis, rot=self.rot, fontsize=self.fontsize + ) + type(self)._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) if hasattr(ax, "right_ax"): - self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + ax.right_ax.yaxis, fontsize=self.fontsize + ) else: # pragma no cover raise ValueError @@ -792,8 +807,9 @@ def _adorn_subplots(self, fig: Figure): self.axes[0].set_title(self.title) @final + @staticmethod def _apply_axis_properties( - self, axis: Axis, rot=None, fontsize: int | None = None + axis: Axis, rot=None, fontsize: int | None = None ) -> None: """ Tick creation within matplotlib is reasonably expensive and is @@ -1021,16 +1037,6 @@ def _get_ax(self, i: int): ax.get_yaxis().set_visible(True) return ax - @final - @classmethod - def get_default_ax(cls, ax) -> None: - import matplotlib.pyplot as plt - - if ax is None and len(plt.get_fignums()) > 0: - with plt.rc_context(): - ax = plt.gca() - ax = cls._get_ax_layer(ax) - @final def on_right(self, i: int): if isinstance(self.secondary_y, bool): @@ -1185,7 +1191,7 @@ def match_labels(data, e): @final def _get_errorbars( self, label=None, index=None, xerr: bool = True, yerr: bool = True - ): + ) -> dict[str, Any]: errors = {} for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]): @@ -1206,12 +1212,15 @@ def _get_errorbars( @final def _get_subplots(self, fig: Figure): - from matplotlib.axes import Subplot + if Version(mpl.__version__) < Version("3.8"): + from matplotlib.axes import Subplot as Klass + else: + from matplotlib.axes import Axes as Klass return [ ax for ax in fig.get_axes() - if (isinstance(ax, Subplot) and ax.get_subplotspec() is not None) + if (isinstance(ax, Klass) and ax.get_subplotspec() is not None) ] @final @@ -1255,8 +1264,10 @@ def _post_plot_logic(self, ax: Axes, data) -> None: x, y = self.x, self.y xlabel = self.xlabel if self.xlabel is not None else pprint_thing(x) ylabel = self.ylabel if self.ylabel is not None else pprint_thing(y) - ax.set_xlabel(xlabel) - ax.set_ylabel(ylabel) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible + # type "Hashable"; expected "str" + ax.set_xlabel(xlabel) # type: ignore[arg-type] + ax.set_ylabel(ylabel) # type: ignore[arg-type] @final def _plot_colorbar(self, ax: Axes, *, fig: Figure, **kwds): @@ -1393,7 +1404,7 @@ def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): else: cmap = None - if color_by_categorical: + if color_by_categorical and cmap is not None: from matplotlib import colors n_cats = len(self.data[c].cat.categories) @@ -1578,19 +1589,19 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds): freq, data = maybe_resample(data, 
ax, kwds) # Set ax with freq info - decorate_axes(ax, freq, kwds) + decorate_axes(ax, freq) # digging deeper if hasattr(ax, "left_ax"): - decorate_axes(ax.left_ax, freq, kwds) + decorate_axes(ax.left_ax, freq) if hasattr(ax, "right_ax"): - decorate_axes(ax.right_ax, freq, kwds) - ax._plot_data.append((data, self._kind, kwds)) + decorate_axes(ax.right_ax, freq) + # TODO #54485 + ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined] lines = self._plot(ax, data.index, np.asarray(data.values), style=style, **kwds) # set date formatter, locators and rescale limits - # error: Argument 3 to "format_dateaxis" has incompatible type "Index"; - # expected "DatetimeIndex | PeriodIndex" - format_dateaxis(ax, ax.freq, data.index) # type: ignore[arg-type] + # TODO #54485 + format_dateaxis(ax, ax.freq, data.index) # type: ignore[arg-type, attr-defined] return lines @final @@ -1606,11 +1617,15 @@ def _initialize_stacker(cls, ax: Axes, stacking_id, n: int) -> None: if stacking_id is None: return if not hasattr(ax, "_stacker_pos_prior"): - ax._stacker_pos_prior = {} + # TODO #54485 + ax._stacker_pos_prior = {} # type: ignore[attr-defined] if not hasattr(ax, "_stacker_neg_prior"): - ax._stacker_neg_prior = {} - ax._stacker_pos_prior[stacking_id] = np.zeros(n) - ax._stacker_neg_prior[stacking_id] = np.zeros(n) + # TODO #54485 + ax._stacker_neg_prior = {} # type: ignore[attr-defined] + # TODO #54485 + ax._stacker_pos_prior[stacking_id] = np.zeros(n) # type: ignore[attr-defined] + # TODO #54485 + ax._stacker_neg_prior[stacking_id] = np.zeros(n) # type: ignore[attr-defined] @final @classmethod @@ -1624,9 +1639,17 @@ def _get_stacked_values( cls._initialize_stacker(ax, stacking_id, len(values)) if (values >= 0).all(): - return ax._stacker_pos_prior[stacking_id] + values + # TODO #54485 + return ( + ax._stacker_pos_prior[stacking_id] # type: ignore[attr-defined] + + values + ) elif (values <= 0).all(): - return ax._stacker_neg_prior[stacking_id] + values + # TODO #54485 + return ( + ax._stacker_neg_prior[stacking_id] # type: ignore[attr-defined] + + values + ) raise ValueError( "When stacked is True, each column must be either " @@ -1640,9 +1663,11 @@ def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None: if stacking_id is None: return if (values >= 0).all(): - ax._stacker_pos_prior[stacking_id] += values + # TODO #54485 + ax._stacker_pos_prior[stacking_id] += values # type: ignore[attr-defined] elif (values <= 0).all(): - ax._stacker_neg_prior[stacking_id] += values + # TODO #54485 + ax._stacker_neg_prior[stacking_id] += values # type: ignore[attr-defined] def _post_plot_logic(self, ax: Axes, data) -> None: from matplotlib.ticker import FixedLocator @@ -1658,7 +1683,9 @@ def get_label(i): if self._need_to_set_index: xticks = ax.get_xticks() xticklabels = [get_label(x) for x in xticks] - ax.xaxis.set_major_locator(FixedLocator(xticks)) + # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any, + # Any]"; expected "Sequence[float]" + ax.xaxis.set_major_locator(FixedLocator(xticks)) # type: ignore[arg-type] ax.set_xticklabels(xticklabels) # If the index is an irregular time series, then by default @@ -1737,9 +1764,11 @@ def _plot( # type: ignore[override] if stacking_id is None: start = np.zeros(len(y)) elif (y >= 0).all(): - start = ax._stacker_pos_prior[stacking_id] + # TODO #54485 + start = ax._stacker_pos_prior[stacking_id] # type: ignore[attr-defined] elif (y <= 0).all(): - start = ax._stacker_neg_prior[stacking_id] + # TODO #54485 + start = 
ax._stacker_neg_prior[stacking_id] # type: ignore[attr-defined] else: start = np.zeros(len(y)) @@ -2005,7 +2034,9 @@ def _decorate_ticks( ax.set_yticklabels(ticklabels) if name is not None and self.use_index: ax.set_ylabel(name) - ax.set_xlabel(self.xlabel) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible type + # "Hashable | None"; expected "str" + ax.set_xlabel(self.xlabel) # type: ignore[arg-type] class PiePlot(MPLPlot): diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py index d32741a1d1803..cbb66065a8039 100644 --- a/pandas/plotting/_matplotlib/groupby.py +++ b/pandas/plotting/_matplotlib/groupby.py @@ -7,9 +7,7 @@ from pandas.core.dtypes.missing import remove_na_arraylike from pandas import ( - DataFrame, MultiIndex, - Series, concat, ) @@ -20,6 +18,11 @@ from pandas._typing import IndexLabel + from pandas import ( + DataFrame, + Series, + ) + def create_iter_data_given_by( data: DataFrame, kind: str = "hist" @@ -48,10 +51,10 @@ def create_iter_data_given_by( >>> import numpy as np >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] - >>> mi = MultiIndex.from_tuples(tuples) + >>> mi = pd.MultiIndex.from_tuples(tuples) >>> value = [[1, 3, np.nan, np.nan], ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] - >>> data = DataFrame(value, columns=mi) + >>> data = pd.DataFrame(value, columns=mi) >>> create_iter_data_given_by(data) {'h1': h1 a b @@ -104,7 +107,7 @@ def reconstruct_data_with_by( Examples -------- >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} - >>> df = DataFrame(d) + >>> df = pd.DataFrame(d) >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b']) h1 h2 a b a b diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index f5b415f12f37d..e610f1adb602c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -161,7 +161,7 @@ def _make_plot(self, fig: Figure) -> None: kwds.pop("color") if self.weights is not None: - kwds["weights"] = self._get_column_weights(self.weights, i, y) + kwds["weights"] = type(self)._get_column_weights(self.weights, i, y) y = reformat_hist_y_given_by(y, self.by) @@ -199,11 +199,21 @@ def _get_column_weights(weights, i: int, y): def _post_plot_logic(self, ax: Axes, data) -> None: if self.orientation == "horizontal": - ax.set_xlabel("Frequency" if self.xlabel is None else self.xlabel) - ax.set_ylabel(self.ylabel) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible + # type "Hashable"; expected "str" + ax.set_xlabel( + "Frequency" + if self.xlabel is None + else self.xlabel # type: ignore[arg-type] + ) + ax.set_ylabel(self.ylabel) # type: ignore[arg-type] else: - ax.set_xlabel(self.xlabel) - ax.set_ylabel("Frequency" if self.ylabel is None else self.ylabel) + ax.set_xlabel(self.xlabel) # type: ignore[arg-type] + ax.set_ylabel( + "Frequency" + if self.ylabel is None + else self.ylabel # type: ignore[arg-type] + ) @property def orientation(self) -> PlottingOrientation: @@ -274,7 +284,7 @@ def _plot( # type: ignore[override] def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None: kwds["bw_method"] = self.bw_method - kwds["ind"] = self._get_ind(y, ind=self.ind) + kwds["ind"] = type(self)._get_ind(y, ind=self.ind) def _post_plot_logic(self, ax: Axes, data) -> None: ax.set_ylabel("Density") @@ -282,7 +292,7 @@ def _post_plot_logic(self, ax: Axes, data) -> None: def _grouped_plot( plotf, - data, + data: Series | DataFrame, column=None, by=None, numeric_only: 
bool = True, @@ -325,7 +335,7 @@ def _grouped_plot( def _grouped_hist( - data, + data: Series | DataFrame, column=None, by=None, ax=None, @@ -407,7 +417,7 @@ def plot_group(group, ax) -> None: def hist_series( - self, + self: Series, by=None, ax=None, grid: bool = True, @@ -447,8 +457,14 @@ def hist_series( ax.grid(grid) axes = np.array([ax]) + # error: Argument 1 to "set_ticks_props" has incompatible type "ndarray[Any, + # dtype[Any]]"; expected "Axes | Sequence[Axes]" set_ticks_props( - axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + axes, # type: ignore[arg-type] + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, ) else: @@ -479,7 +495,7 @@ def hist_series( def hist_frame( - data, + data: DataFrame, column=None, by=None, grid: bool = True, diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index a5f34e9434cb7..bf4e4be3bfd82 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -269,7 +269,9 @@ def _is_single_string_color(color: Color) -> bool: """ conv = matplotlib.colors.ColorConverter() try: - conv.to_rgba(color) + # error: Argument 1 to "to_rgba" of "ColorConverter" has incompatible type + # "str | Sequence[float]"; expected "tuple[float, float, float] | ..." + conv.to_rgba(color) # type: ignore[arg-type] except ValueError: return False else: diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 5471305b03baf..6d3579c7f7adb 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -111,8 +111,8 @@ def _is_sup(f1: str, f2: str) -> bool: def _upsample_others(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]) -> None: legend = ax.get_legend() - lines, labels = _replot_ax(ax, freq, kwargs) - _replot_ax(ax, freq, kwargs) + lines, labels = _replot_ax(ax, freq) + _replot_ax(ax, freq) other_ax = None if hasattr(ax, "left_ax"): @@ -121,25 +121,26 @@ def _upsample_others(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]) -> None other_ax = ax.right_ax if other_ax is not None: - rlines, rlabels = _replot_ax(other_ax, freq, kwargs) + rlines, rlabels = _replot_ax(other_ax, freq) lines.extend(rlines) labels.extend(rlabels) if legend is not None and kwargs.get("legend", True) and len(lines) > 0: - title = legend.get_title().get_text() + title: str | None = legend.get_title().get_text() if title == "None": title = None ax.legend(lines, labels, loc="best", title=title) -def _replot_ax(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]): +def _replot_ax(ax: Axes, freq: BaseOffset): data = getattr(ax, "_plot_data", None) # clear current axes and data - ax._plot_data = [] + # TODO #54485 + ax._plot_data = [] # type: ignore[attr-defined] ax.clear() - decorate_axes(ax, freq, kwargs) + decorate_axes(ax, freq) lines = [] labels = [] @@ -148,7 +149,8 @@ def _replot_ax(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]): series = series.copy() idx = series.index.asfreq(freq, how="S") series.index = idx - ax._plot_data.append((series, plotf, kwds)) + # TODO #54485 + ax._plot_data.append((series, plotf, kwds)) # type: ignore[attr-defined] # for tsplot if isinstance(plotf, str): @@ -162,20 +164,17 @@ def _replot_ax(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]): return lines, labels -def decorate_axes(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]) -> None: +def decorate_axes(ax: Axes, freq: BaseOffset) -> None: """Initialize axes for time-series plotting""" if not 
hasattr(ax, "_plot_data"): - ax._plot_data = [] + # TODO #54485 + ax._plot_data = [] # type: ignore[attr-defined] - ax.freq = freq + # TODO #54485 + ax.freq = freq # type: ignore[attr-defined] xaxis = ax.get_xaxis() - xaxis.freq = freq - if not hasattr(ax, "legendlabels"): - ax.legendlabels = [kwargs.get("label", None)] - else: - ax.legendlabels.append(kwargs.get("label", None)) - ax.view_interval = None - ax.date_axis_info = None + # TODO #54485 + xaxis.freq = freq # type: ignore[attr-defined] def _get_ax_freq(ax: Axes): diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 8c0e401f991a6..898b5b25e7b01 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -52,10 +52,12 @@ def maybe_adjust_figure(fig: Figure, *args, **kwargs) -> None: def format_date_labels(ax: Axes, rot) -> None: # mini version of autofmt_xdate for label in ax.get_xticklabels(): - label.set_ha("right") + label.set_horizontalalignment("right") label.set_rotation(rot) fig = ax.get_figure() - maybe_adjust_figure(fig, bottom=0.2) + if fig is not None: + # should always be a Figure but can technically be None + maybe_adjust_figure(fig, bottom=0.2) def table( @@ -76,8 +78,14 @@ def table( cellText = data.values + # error: Argument "cellText" to "table" has incompatible type "ndarray[Any, + # Any]"; expected "Sequence[Sequence[str]] | None" return matplotlib.table.table( - ax, cellText=cellText, rowLabels=rowLabels, colLabels=colLabels, **kwargs + ax, + cellText=cellText, # type: ignore[arg-type] + rowLabels=rowLabels, + colLabels=colLabels, + **kwargs, ) @@ -369,12 +377,12 @@ def _has_externally_shared_axis(ax1: Axes, compare_axis: str) -> bool: "_has_externally_shared_axis() needs 'x' or 'y' as a second parameter" ) - axes = axes.get_siblings(ax1) + axes_siblings = axes.get_siblings(ax1) # Retain ax1 and any of its siblings which aren't in the same position as it ax1_points = ax1.get_position().get_points() - for ax2 in axes: + for ax2 in axes_siblings: if not np.array_equal(ax1_points, ax2.get_position().get_points()): return True diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py deleted file mode 100644 index acccdd845b53c..0000000000000 --- a/pandas/tests/apply/conftest.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame - - -@pytest.fixture -def int_frame_const_col(): - """ - Fixture for DataFrame of ints which are constant per column - - Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] - """ - df = DataFrame( - np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, - columns=["A", "B", "C"], - ) - return df - - -@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) -def engine(request): - if request.param == "numba": - pytest.importorskip("numba") - return request.param - - -@pytest.fixture(params=[0, 1]) -def apply_axis(request): - return request.param diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 5efc0dd2cd4e3..2d7549e09a986 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,6 +18,27 @@ from pandas.tests.frame.common import zip_frames +@pytest.fixture +def int_frame_const_col(): + """ + Fixture for DataFrame of ints which are constant per column + + Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] + """ + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + 
columns=["A", "B", "C"], + ) + return df + + +@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + def test_apply(float_frame, engine, request): if engine == "numba": mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") @@ -269,7 +290,7 @@ def test_apply_raw_float_frame_no_reduction(float_frame, engine): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_mixed_type_frame(mixed_type_frame, axis, engine): +def test_apply_raw_mixed_type_frame(axis, engine): if engine == "numba": pytest.skip("isinstance check doesn't work with numba") @@ -278,7 +299,17 @@ def _assert_raw(x): assert x.ndim == 1 # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + df.apply(_assert_raw, axis=axis, engine=engine, raw=True) def test_apply_axis1(float_frame): @@ -1272,7 +1303,7 @@ def test_nuiscance_columns(): result = df.agg(["min"]) expected = DataFrame( - [[1, 1.0, "bar", Timestamp("20130101")]], + [[1, 1.0, "bar", Timestamp("20130101").as_unit("ns")]], index=["min"], columns=df.columns, ) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 44829c598253d..9f5157181843e 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -24,9 +24,12 @@ @pytest.mark.parametrize("result_type", ["foo", 1]) -def test_result_type_error(result_type, int_frame_const_col): +def test_result_type_error(result_type): # allowed result_type - df = int_frame_const_col + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) msg = ( "invalid value for result_type, must be one of " @@ -282,8 +285,11 @@ def test_transform_none_to_type(): lambda x: Series([1, 2]), ], ) -def test_apply_broadcast_error(int_frame_const_col, func): - df = int_frame_const_col +def test_apply_broadcast_error(func): + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) # > 1 ndim msg = "too many dims to broadcast|cannot broadcast result" diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 3924d8e74e156..ee239568d057d 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -12,6 +12,11 @@ pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] +@pytest.fixture(params=[0, 1]) +def apply_axis(request): + return request.param + + def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 2557b1eab4029..177dff2d771d4 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -150,39 +150,45 @@ def func(x): tm.assert_series_equal(result, expected) -def test_apply_box(): +def test_apply_box_dt64(): # ufunc will not be boxed. 
Same test cases as the test_map_box vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" + ser = Series(vals, dtype="M8[ns]") + assert ser.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") + res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) + +def test_apply_box_dt64tz(): vals = [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), ] - s = Series(vals) - assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") + ser = Series(vals, dtype="M8[ns, US/Eastern]") + assert ser.dtype == "datetime64[ns, US/Eastern]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) + +def test_apply_box_td64(): # timedelta vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat") + ser = Series(vals) + assert ser.dtype == "timedelta64[ns]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat") exp = Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) + +def test_apply_box_period(): # period vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] - s = Series(vals) - assert s.dtype == "Period[M]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat") + ser = Series(vals) + assert ser.dtype == "Period[M]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat") exp = Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index c046c60e174b3..17e8322dc40e1 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -261,7 +261,11 @@ def test_transform_groupby_kernel_series(request, string_series, op): ) args = [0.0] if op == "fillna" else [] ones = np.ones(string_series.shape[0]) - expected = string_series.groupby(ones).transform(op, *args) + + warn = FutureWarning if op == "fillna" else None + msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + expected = string_series.groupby(ones).transform(op, *args) result = string_series.transform(op, 0, *args) tm.assert_series_equal(result, expected) @@ -285,7 +289,12 @@ def test_transform_groupby_kernel_frame(request, axis, float_frame, op): with tm.assert_produces_warning(FutureWarning, match=msg): gb = float_frame.groupby(ones, axis=axis) - expected = gb.transform(op, *args) + + warn = FutureWarning if op == "fillna" else None + op_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=op_msg): + expected = gb.transform(op, *args) + result = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result, expected) @@ -300,7 +309,10 @@ def test_transform_groupby_kernel_frame(request, axis, float_frame, op): ones = np.ones(float_frame.shape[1]) with tm.assert_produces_warning(FutureWarning, match=msg): gb2 = float_frame.groupby(ones, axis=axis) - expected2 = gb2.transform(op, *args) + warn = FutureWarning if op == "fillna" else 
None + op_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=op_msg): + expected2 = gb2.transform(op, *args) result2 = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result2, expected2) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index f77b81574e1c1..c7703b34a5e38 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -2,11 +2,7 @@ import pytest import pandas as pd -from pandas import ( - Index, - RangeIndex, -) -import pandas._testing as tm +from pandas import Index @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) @@ -63,27 +59,6 @@ def zero(request): return request.param -# ------------------------------------------------------------------ -# Vector Fixtures - - -@pytest.fixture( - params=[ - # TODO: add more dtypes here - Index(np.arange(5, dtype="float64")), - Index(np.arange(5, dtype="int64")), - Index(np.arange(5, dtype="uint64")), - RangeIndex(5), - ], - ids=lambda x: type(x).__name__, -) -def numeric_idx(request): - """ - Several types of numeric-dtypes Index objects - """ - return request.param - - # ------------------------------------------------------------------ # Scalar Fixtures @@ -148,22 +123,6 @@ def two_hours(request): ] -@pytest.fixture( - params=[ - pd.Timedelta(minutes=30).to_pytimedelta(), - np.timedelta64(30, "s"), - pd.Timedelta(seconds=30), - ] - + _common_mismatch -) -def not_hourly(request): - """ - Several timedelta-like and DateOffset instances that are _not_ - compatible with Hourly frequencies. - """ - return request.param - - @pytest.fixture( params=[ np.timedelta64(4, "h"), @@ -178,33 +137,3 @@ def not_daily(request): compatible with Daily frequencies. """ return request.param - - -@pytest.fixture( - params=[ - np.timedelta64(365, "D"), - pd.Timedelta(days=365).to_pytimedelta(), - pd.Timedelta(days=365), - ] - + _common_mismatch -) -def mismatched_freq(request): - """ - Several timedelta-like and DateOffset instances that are _not_ - compatible with Monthly or Annual frequencies. 
- """ - return request.param - - -# ------------------------------------------------------------------ - - -@pytest.fixture( - params=[Index, pd.Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ -) -def box_1d_array(request): - """ - Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list - classes - """ - return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index a62bebec61ada..a63bfbf1835a9 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -414,7 +414,7 @@ def test_dti_cmp_datetimelike(self, other, tz_naive_fixture): dti = date_range("2016-01-01", periods=2, tz=tz) if tz is not None: if isinstance(other, np.datetime64): - pytest.skip("no tzaware version available") + pytest.skip(f"{type(other).__name__} is not tz aware") other = localize_pydatetime(other, dti.tzinfo) result = dti == other @@ -1286,7 +1286,7 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"], freq="h", tz=tz, - ) + ).as_unit("ns") dates = tm.box_expected(dates, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1580,7 +1580,7 @@ def test_dti_add_sub_nonzero_mth_offset( mth = getattr(date, op) result = mth(offset) - expected = DatetimeIndex(exp, tz=tz) + expected = DatetimeIndex(exp, tz=tz).as_unit("ns") expected = tm.box_expected(expected, box_with_array, False) tm.assert_equal(result, expected) @@ -2286,7 +2286,7 @@ def test_dti_add_series(self, tz_naive_fixture, names): tz = tz_naive_fixture index = DatetimeIndex( ["2016-06-28 05:30", "2016-06-28 05:31"], tz=tz, name=names[0] - ) + ).as_unit("ns") ser = Series([Timedelta(seconds=5)] * 2, index=index, name=names[1]) expected = Series(index + Timedelta(seconds=5), index=index, name=names[2]) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 8d49171cc43f3..f89711c0edee7 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -44,6 +44,34 @@ def box_pandas_1d_array(request): return request.param +@pytest.fixture( + params=[ + # TODO: add more dtypes here + Index(np.arange(5, dtype="float64")), + Index(np.arange(5, dtype="int64")), + Index(np.arange(5, dtype="uint64")), + RangeIndex(5), + ], + ids=lambda x: type(x).__name__, +) +def numeric_idx(request): + """ + Several types of numeric-dtypes Index objects + """ + return request.param + + +@pytest.fixture( + params=[Index, Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ +) +def box_1d_array(request): + """ + Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list + classes + """ + return request.param + + def adjust_negative_zero(zero, expected): """ Helper to adjust the expected result if we are dividing by -0.0 @@ -384,7 +412,7 @@ def test_divmod_zero(self, zero, numeric_idx): def test_div_negative_zero(self, zero, numeric_idx, op): # Check that -1 / -0.0 returns np.inf, not -np.inf if numeric_idx.dtype == np.uint64: - pytest.skip(f"Not relevant for {numeric_idx.dtype}") + pytest.skip(f"Div by negative 0 not relevant for {numeric_idx.dtype}") idx = numeric_idx - 3 expected = Index([-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 5af63258921ed..88c633f5e747f 100644 --- a/pandas/tests/arithmetic/test_period.py +++ 
b/pandas/tests/arithmetic/test_period.py @@ -31,6 +31,45 @@ get_upcast_box, ) +_common_mismatch = [ + pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute(), +] + + +@pytest.fixture( + params=[ + Timedelta(minutes=30).to_pytimedelta(), + np.timedelta64(30, "s"), + Timedelta(seconds=30), + ] + + _common_mismatch +) +def not_hourly(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Hourly frequencies. + """ + return request.param + + +@pytest.fixture( + params=[ + np.timedelta64(365, "D"), + Timedelta(days=365).to_pytimedelta(), + Timedelta(days=365), + ] + + _common_mismatch +) +def mismatched_freq(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Monthly or Annual frequencies. + """ + return request.param + + # ------------------------------------------------------------------ # Comparisons @@ -1310,6 +1349,42 @@ def test_parr_add_sub_object_array(self): expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object) tm.assert_equal(result, expected) + def test_period_add_timestamp_raises(self, box_with_array): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + arr = pd.Index([per], dtype="Period[M]") + arr = tm.box_expected(arr, box_with_array) + + msg = "cannot add PeriodArray and Timestamp" + with pytest.raises(TypeError, match=msg): + arr + ts + with pytest.raises(TypeError, match=msg): + ts + arr + msg = "cannot add PeriodArray and DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + Series([ts]) + with pytest.raises(TypeError, match=msg): + Series([ts]) + arr + with pytest.raises(TypeError, match=msg): + arr + pd.Index([ts]) + with pytest.raises(TypeError, match=msg): + pd.Index([ts]) + arr + + if box_with_array is pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + pd.DataFrame([ts]) + if box_with_array is pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'" + with pytest.raises(TypeError, match=msg): + pd.DataFrame([ts]) + arr + class TestPeriodSeriesArithmetic: def test_parr_add_timedeltalike_scalar(self, three_days, box_with_array): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 18c6b437743e2..e88bde07aee90 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -344,12 +344,14 @@ def test_subtraction_ops(self): result = dti - td expected = DatetimeIndex( - ["20121231", "20130101", "20130102"], freq="D", name="bar" + ["20121231", "20130101", "20130102"], dtype="M8[ns]", freq="D", name="bar" ) tm.assert_index_equal(result, expected) result = dt - tdi - expected = DatetimeIndex(["20121231", NaT, "20121230"], name="foo") + expected = DatetimeIndex( + ["20121231", NaT, "20121230"], dtype="M8[ns]", name="foo" + ) tm.assert_index_equal(result, expected) def test_subtraction_ops_with_tz(self, box_with_array): @@ -432,7 +434,9 @@ def _check(result, expected): _check(result, expected) result = dti_tz - td - expected = DatetimeIndex(["20121231", "20130101", "20130102"], tz="US/Eastern") + expected = DatetimeIndex( + ["20121231", "20130101", "20130102"], tz="US/Eastern" + ).as_unit("ns") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, 
expected) @@ -450,7 +454,7 @@ def test_dti_tdi_numeric_ops(self): tm.assert_index_equal(result, expected) result = dti - tdi # name will be reset - expected = DatetimeIndex(["20121231", NaT, "20130101"]) + expected = DatetimeIndex(["20121231", NaT, "20130101"], dtype="M8[ns]") tm.assert_index_equal(result, expected) def test_addition_ops(self): @@ -461,11 +465,15 @@ def test_addition_ops(self): dt = Timestamp("20130101") result = tdi + dt - expected = DatetimeIndex(["20130102", NaT, "20130103"], name="foo") + expected = DatetimeIndex( + ["20130102", NaT, "20130103"], dtype="M8[ns]", name="foo" + ) tm.assert_index_equal(result, expected) result = dt + tdi - expected = DatetimeIndex(["20130102", NaT, "20130103"], name="foo") + expected = DatetimeIndex( + ["20130102", NaT, "20130103"], dtype="M8[ns]", name="foo" + ) tm.assert_index_equal(result, expected) result = td + tdi @@ -492,11 +500,11 @@ def test_addition_ops(self): # pytest.raises(TypeError, lambda : Index([1,2,3]) + tdi) result = tdi + dti # name will be reset - expected = DatetimeIndex(["20130102", NaT, "20130105"]) + expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[ns]") tm.assert_index_equal(result, expected) result = dti + tdi # name will be reset - expected = DatetimeIndex(["20130102", NaT, "20130105"]) + expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[ns]") tm.assert_index_equal(result, expected) result = dt + td @@ -869,7 +877,7 @@ def test_operators_timedelta64(self): # timestamp on lhs result = resultb + df["A"] values = [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] - expected = Series(values, name="A") + expected = Series(values, dtype="M8[ns]", name="A") tm.assert_series_equal(result, expected) # datetimes on rhs @@ -1031,7 +1039,7 @@ def test_td64arr_add_datetime64_nat(self, box_with_array): other = np.datetime64("NaT") tdi = timedelta_range("1 day", periods=3) - expected = DatetimeIndex(["NaT", "NaT", "NaT"]) + expected = DatetimeIndex(["NaT", "NaT", "NaT"], dtype="M8[ns]") tdser = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) diff --git a/pandas/tests/arrays/categorical/conftest.py b/pandas/tests/arrays/categorical/conftest.py index d5b49e3e5e8c8..37249210f28f4 100644 --- a/pandas/tests/arrays/categorical/conftest.py +++ b/pandas/tests/arrays/categorical/conftest.py @@ -3,12 +3,6 @@ from pandas import Categorical -@pytest.fixture(params=[True, False]) -def allow_fill(request): - """Boolean 'allow_fill' parameter for Categorical.take""" - return request.param - - @pytest.fixture def factor(): """Fixture returning a Categorical object""" diff --git a/pandas/tests/arrays/categorical/test_take.py b/pandas/tests/arrays/categorical/test_take.py index fb79fe4923522..373f1b30a13c2 100644 --- a/pandas/tests/arrays/categorical/test_take.py +++ b/pandas/tests/arrays/categorical/test_take.py @@ -5,6 +5,12 @@ import pandas._testing as tm +@pytest.fixture(params=[True, False]) +def allow_fill(request): + """Boolean 'allow_fill' parameter for Categorical.take""" + return request.param + + class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 024721896cc58..be4b2c3e7e74c 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -229,178 +229,3 @@ def test_min_max(self, left_right_dtypes, index_or_series_or_array): res = arr_na.max(skipna=True) assert res 
== MAX assert type(res) == type(MAX) - - -# ---------------------------------------------------------------------------- -# Arrow interaction - - -def test_arrow_extension_type(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - p1 = ArrowIntervalType(pa.int64(), "left") - p2 = ArrowIntervalType(pa.int64(), "left") - p3 = ArrowIntervalType(pa.int64(), "right") - - assert p1.closed == "left" - assert p1 == p2 - assert p1 != p3 - assert hash(p1) == hash(p2) - assert hash(p1) != hash(p3) - - -def test_arrow_array(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - intervals = pd.interval_range(1, 5, freq=1).array - - result = pa.array(intervals) - assert isinstance(result.type, ArrowIntervalType) - assert result.type.closed == intervals.closed - assert result.type.subtype == pa.int64() - assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) - assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) - - expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) - assert result.storage.equals(expected) - - # convert to its storage type - result = pa.array(intervals, type=expected.type) - assert result.equals(expected) - - # unsupported conversions - with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): - pa.array(intervals, type="float64") - - with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): - pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) - - -def test_arrow_array_missing(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) - arr[1] = None - - result = pa.array(arr) - assert isinstance(result.type, ArrowIntervalType) - assert result.type.closed == arr.closed - assert result.type.subtype == pa.float64() - - # fields have missing values (not NaN) - left = pa.array([0.0, None, 2.0], type="float64") - right = pa.array([1.0, None, 3.0], type="float64") - assert result.storage.field("left").equals(left) - assert result.storage.field("right").equals(right) - - # structarray itself also has missing values on the array level - vals = [ - {"left": 0.0, "right": 1.0}, - {"left": None, "right": None}, - {"left": 2.0, "right": 3.0}, - ] - expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) - assert result.storage.equals(expected) - - -@pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) -@pytest.mark.parametrize( - "breaks", - [[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")], - ids=["float", "datetime64[ns]"], -) -def test_arrow_table_roundtrip(breaks): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - arr = IntervalArray.from_breaks(breaks) - arr[1] = None - df = pd.DataFrame({"a": arr}) - - table = pa.table(df) - assert isinstance(table.field("a").type, ArrowIntervalType) - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.IntervalDtype) - tm.assert_frame_equal(result, df) - - table2 = pa.concat_tables([table, table]) - result = table2.to_pandas() - expected = pd.concat([df, df], ignore_index=True) - tm.assert_frame_equal(result, expected) - - # GH-41040 - table = pa.table( - [pa.chunked_array([], type=table.column(0).type)], 
schema=table.schema - ) - result = table.to_pandas() - tm.assert_frame_equal(result, expected[0:0]) - - -@pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" -) -@pytest.mark.parametrize( - "breaks", - [[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")], - ids=["float", "datetime64[ns]"], -) -def test_arrow_table_roundtrip_without_metadata(breaks): - pa = pytest.importorskip("pyarrow") - - arr = IntervalArray.from_breaks(breaks) - arr[1] = None - df = pd.DataFrame({"a": arr}) - - table = pa.table(df) - # remove the metadata - table = table.replace_schema_metadata() - assert table.schema.metadata is None - - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.IntervalDtype) - tm.assert_frame_equal(result, df) - - -def test_from_arrow_from_raw_struct_array(): - # in case pyarrow lost the Interval extension type (eg on parquet roundtrip - # with datetime64[ns] subtype, see GH-45881), still allow conversion - # from arrow to IntervalArray - pa = pytest.importorskip("pyarrow") - - arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) - dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") - - result = dtype.__from_arrow__(arr) - expected = IntervalArray.from_breaks( - np.array([0, 1, 2], dtype="int64"), closed="neither" - ) - tm.assert_extension_array_equal(result, expected) - - result = dtype.__from_arrow__(pa.chunked_array([arr])) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("timezone", ["UTC", "US/Pacific", "GMT"]) -def test_interval_index_subtype(timezone, inclusive_endpoints_fixture): - # GH 46999 - dates = date_range("2022", periods=3, tz=timezone) - dtype = f"interval[datetime64[ns, {timezone}], {inclusive_endpoints_fixture}]" - result = IntervalIndex.from_arrays( - ["2022-01-01", "2022-01-02"], - ["2022-01-02", "2022-01-03"], - closed=inclusive_endpoints_fixture, - dtype=dtype, - ) - expected = IntervalIndex.from_arrays( - dates[:-1], dates[1:], closed=inclusive_endpoints_fixture - ) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py new file mode 100644 index 0000000000000..ef8701be81e2b --- /dev/null +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -0,0 +1,160 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +def test_arrow_extension_type(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert p1 != p3 + assert hash(p1) == hash(p2) + assert hash(p1) != hash(p3) + + +def test_arrow_array(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert 
result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +def test_arrow_array_missing(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) + + # GH#41040 + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + tm.assert_frame_equal(result, expected[0:0]) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) +def test_arrow_table_roundtrip_without_metadata(breaks): + pa = pytest.importorskip("pyarrow") + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + # remove the metadata + table = table.replace_schema_metadata() + assert table.schema.metadata is None + + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + +def test_from_arrow_from_raw_struct_array(): + # in case pyarrow lost the Interval extension type (eg on parquet roundtrip + # with datetime64[ns] subtype, see GH-45881), still allow conversion + # from arrow to IntervalArray + pa = pytest.importorskip("pyarrow") + + arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) + dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") + 
+ result = dtype.__from_arrow__(arr) + expected = IntervalArray.from_breaks( + np.array([0, 1, 2], dtype="int64"), closed="neither" + ) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/period/test_astype.py b/pandas/tests/arrays/period/test_astype.py index bb9b07a113ed3..9976c3a32580d 100644 --- a/pandas/tests/arrays/period/test_astype.py +++ b/pandas/tests/arrays/period/test_astype.py @@ -63,5 +63,5 @@ def test_astype_datetime(dtype): else: # GH#45038 allow period->dt64 because we allow dt64->period result = arr.astype(dtype) - expected = pd.DatetimeIndex(["2000", "2001", pd.NaT])._data + expected = pd.DatetimeIndex(["2000", "2001", pd.NaT], dtype=dtype)._data tm.assert_datetime_array_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_indexing.py b/pandas/tests/arrays/sparse/test_indexing.py index d63d0fb07b404..60029ac06ddb4 100644 --- a/pandas/tests/arrays/sparse/test_indexing.py +++ b/pandas/tests/arrays/sparse/test_indexing.py @@ -166,9 +166,16 @@ def test_take(self, arr_data, arr): tm.assert_sp_array_equal(arr.take([0, 1, 2]), exp) def test_take_all_empty(self): - a = pd.array([0, 0], dtype=SparseDtype("int64")) - result = a.take([0, 1], allow_fill=True, fill_value=np.nan) - tm.assert_sp_array_equal(a, result) + sparse = pd.array([0, 0], dtype=SparseDtype("int64")) + result = sparse.take([0, 1], allow_fill=True, fill_value=np.nan) + tm.assert_sp_array_equal(sparse, result) + + def test_take_different_fill_value(self): + # Take with a different fill value shouldn't overwrite the original + sparse = pd.array([0.0], dtype=SparseDtype("float64", fill_value=0.0)) + result = sparse.take([0, -1], allow_fill=True, fill_value=np.nan) + expected = pd.array([0, np.nan], dtype=sparse.dtype) + tm.assert_sp_array_equal(expected, result) def test_take_fill_value(self): data = np.array([1, np.nan, 0, 3, 0]) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 92536a222296e..eb6e93b490574 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -113,17 +113,17 @@ def test_dt64_array(dtype_unit): ( pd.DatetimeIndex(["2000", "2001"]), np.dtype("datetime64[ns]"), - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), ( ["2000", "2001"], np.dtype("datetime64[ns]"), - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), # Datetime (tz-aware) ( diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 5a5abcc4aa85d..5996d8797d143 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -853,10 +853,14 @@ def test_concat_same_type_invalid(self, arr1d): with pytest.raises(ValueError, match="to_concat must have the same"): arr._concat_same_type([arr, other]) - def test_concat_same_type_different_freq(self): + def test_concat_same_type_different_freq(self, unit): # we *can* concatenate DTI with different freqs. 
- a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) - b = DatetimeArray(pd.date_range("2000", periods=2, freq="h", tz="US/Central")) + a = DatetimeArray( + pd.date_range("2000", periods=2, freq="D", tz="US/Central", unit=unit) + ) + b = DatetimeArray( + pd.date_range("2000", periods=2, freq="h", tz="US/Central", unit=unit) + ) result = DatetimeArray._concat_same_type([a, b]) expected = DatetimeArray( pd.to_datetime( @@ -866,7 +870,9 @@ def test_concat_same_type_different_freq(self): "2000-01-01 00:00:00", "2000-01-01 01:00:00", ] - ).tz_localize("US/Central") + ) + .tz_localize("US/Central") + .as_unit(unit) ) tm.assert_datetime_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 566117899cfc5..1fa55f13af2a8 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -309,6 +309,22 @@ def test_cmp_dt64_arraylike_tznaive(self, comparison_op): class TestDatetimeArray: + def test_astype_ns_to_ms_near_bounds(self): + # GH#55979 + ts = pd.Timestamp("1677-09-21 00:12:43.145225") + target = ts.as_unit("ms") + + dta = DatetimeArray._from_sequence([ts], dtype="M8[ns]") + assert (dta.view("i8") == ts.as_unit("ns").value).all() + + result = dta.astype("M8[ms]") + assert result[0] == target + + expected = DatetimeArray._from_sequence([ts], dtype="M8[ms]") + assert (expected.view("i8") == target._value).all() + + tm.assert_datetime_array_equal(result, expected) + def test_astype_non_nano_tznaive(self): dti = pd.date_range("2016-01-01", periods=3) @@ -756,9 +772,7 @@ def test_iter_zoneinfo_fold(self, tz): ) def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): # GH#9586, GH#54275 - depr_msg = ( - f"'{freq_depr[1:]}' will be deprecated, please use '{freq[1:]}' instead." - ) + depr_msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." 
expected = pd.date_range("1/1/2000", periods=4, freq=freq) with tm.assert_produces_warning(FutureWarning, match=depr_msg): diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 2311ce928ee27..fe49446424de1 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -63,7 +63,7 @@ reason=f"numexpr enabled->{USE_NUMEXPR}, " f"installed->{NUMEXPR_INSTALLED}", ), - td.skip_if_no_ne, + td.skip_if_no("numexpr"), ], ) for engine in ENGINES @@ -1164,7 +1164,9 @@ def test_assignment_single_assign_new(self): df.eval("c = a + b", inplace=True) tm.assert_frame_equal(df, expected) - def test_assignment_single_assign_local_overlap(self): + # TODO(CoW-warn) this should not warn (DataFrame.eval creates refs to self) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + def test_assignment_single_assign_local_overlap(self, warn_copy_on_write): df = DataFrame( np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab") ) @@ -1218,6 +1220,8 @@ def test_column_in(self): tm.assert_series_equal(result, expected, check_names=False) @pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.") + # TODO(CoW-warn) this should not warn (DataFrame.eval creates refs to self) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_assignment_not_inplace(self): # see gh-9297 df = DataFrame( @@ -1683,14 +1687,14 @@ def test_empty_globals(self, engine, parser): pd.eval(e, engine=engine, parser=parser, global_dict={}) -@td.skip_if_no_ne +@td.skip_if_no("numexpr") def test_invalid_engine(): msg = "Invalid engine 'asdf' passed" with pytest.raises(KeyError, match=msg): pd.eval("x + y", local_dict={"x": 1, "y": 2}, engine="asdf") -@td.skip_if_no_ne +@td.skip_if_no("numexpr") @pytest.mark.parametrize( ("use_numexpr", "expected"), ( @@ -1707,7 +1711,7 @@ def test_numexpr_option_respected(use_numexpr, expected): assert result == expected -@td.skip_if_no_ne +@td.skip_if_no("numexpr") def test_numexpr_option_incompatible_op(): # GH 32556 with pd.option_context("compute.use_numexpr", False): @@ -1719,7 +1723,7 @@ def test_numexpr_option_incompatible_op(): tm.assert_frame_equal(result, expected) -@td.skip_if_no_ne +@td.skip_if_no("numexpr") def test_invalid_parser(): msg = "Invalid parser 'asdf' passed" with pytest.raises(KeyError, match=msg): @@ -1897,13 +1901,14 @@ def test_eval_no_support_column_name(request, column): tm.assert_frame_equal(result, expected) -def test_set_inplace(using_copy_on_write): +def test_set_inplace(using_copy_on_write, warn_copy_on_write): # https://github.com/pandas-dev/pandas/issues/47449 # Ensure we don't only update the DataFrame inplace, but also the actual # column values, such that references to this column also get updated df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) result_view = df[:] ser = df["A"] + # with tm.assert_cow_warning(warn_copy_on_write): df.eval("A = B + C", inplace=True) expected = DataFrame({"A": [11, 13, 15], "B": [4, 5, 6], "C": [7, 8, 9]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 221bf8759875f..89384a4ef6ba6 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -242,8 +242,8 @@ def test_dataframe_from_dict_of_series( assert np.shares_memory(get_array(result, "a"), get_array(s1)) # mutating the new dataframe doesn't mutate original - # TODO(CoW-warn) 
this should also warn - result.iloc[0, 0] = 10 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0, 0] = 10 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(s1)) tm.assert_series_equal(s1, s1_orig) diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py index bfdf2b92fd326..8dc80c5cc0e0e 100644 --- a/pandas/tests/copy_view/test_core_functionalities.py +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -28,27 +28,33 @@ def test_setitem_dont_track_unnecessary_references(using_copy_on_write): assert np.shares_memory(arr, get_array(df, "a")) -def test_setitem_with_view_copies(using_copy_on_write): +def test_setitem_with_view_copies(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) view = df[:] expected = df.copy() df["b"] = 100 arr = get_array(df, "a") - df.iloc[0, 0] = 100 # Check that we correctly track reference + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 # Check that we correctly track reference if using_copy_on_write: assert not np.shares_memory(arr, get_array(df, "a")) tm.assert_frame_equal(view, expected) -def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, request): +def test_setitem_with_view_invalidated_does_not_copy( + using_copy_on_write, warn_copy_on_write, request +): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) view = df[:] df["b"] = 100 arr = get_array(df, "a") view = None # noqa: F841 - df.iloc[0, 0] = 100 + # TODO(CoW-warn) false positive? -> block gets split because of `df["b"] = 100` + # which introduces additional refs, even when those of `view` go out of scopes + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: # Setitem split the block. Since the old block shared data with view # all the new blocks are referencing view and each other. 
When view diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index c4d5e9dbce72a..2e623f885b648 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -101,7 +101,7 @@ def test_subset_column_selection_modify_parent(backend, using_copy_on_write): tm.assert_frame_equal(subset, expected) -def test_subset_row_slice(backend, using_copy_on_write): +def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write): # Case: taking a subset of the rows of a DataFrame using a slice # + afterwards modifying the subset _, DataFrame, _ = backend @@ -121,7 +121,8 @@ def test_subset_row_slice(backend, using_copy_on_write): # INFO this no longer raise warning since pandas 1.4 # with pd.option_context("chained_assignment", "warn"): # with tm.assert_produces_warning(SettingWithCopyWarning): - subset.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0, 0] = 0 subset._mgr._verify_integrity() @@ -162,7 +163,6 @@ def test_subset_column_slice( subset.iloc[0, 0] = 0 assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b")) elif warn_copy_on_write: - # TODO(CoW-warn) should warn with tm.assert_cow_warning(single_block): subset.iloc[0, 0] = 0 else: @@ -333,9 +333,11 @@ def test_subset_set_with_row_indexer( ): pytest.skip("setitem with labels selects on columns") - # TODO(CoW-warn) should warn - if using_copy_on_write or warn_copy_on_write: + if using_copy_on_write: indexer_si(subset)[indexer] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + indexer_si(subset)[indexer] = 0 else: # INFO iloc no longer raises warning since pandas 1.4 warn = SettingWithCopyWarning if indexer_si is tm.setitem else None @@ -365,7 +367,8 @@ def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write): mask = subset > 3 - # TODO(CoW-warn) should warn + # TODO(CoW-warn) should warn -> mask is a DataFrame, which ends up going through + # DataFrame._where(..., inplace=True) if using_copy_on_write or warn_copy_on_write: subset[mask] = 0 else: @@ -399,7 +402,6 @@ def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): else: arr = pd.array([10, 11], dtype="Int64") - # TODO(CoW-warn) should warn if using_copy_on_write or warn_copy_on_write: subset["a"] = arr else: @@ -419,7 +421,7 @@ def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) def test_subset_set_column_with_loc( - backend, using_copy_on_write, using_array_manager, dtype + backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype ): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value @@ -432,6 +434,9 @@ def test_subset_set_column_with_loc( if using_copy_on_write: subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning( @@ -455,7 +460,9 @@ def test_subset_set_column_with_loc( tm.assert_frame_equal(df, df_orig) -def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_manager): +def test_subset_set_column_with_loc2( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager +): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value # separate test for case of 
DataFrame of a single column -> takes a separate @@ -467,6 +474,9 @@ def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_m if using_copy_on_write: subset.loc[:, "a"] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, "a"] = 0 else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning( @@ -500,7 +510,6 @@ def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dt df_orig = df.copy() subset = df[1:3] - # TODO(CoW-warn) should warn if using_copy_on_write or warn_copy_on_write: subset[["a", "c"]] = 0 else: @@ -528,7 +537,9 @@ def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dt [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], ids=["slice", "mask", "array"], ) -def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write): +def test_subset_set_with_column_indexer( + backend, indexer, using_copy_on_write, warn_copy_on_write +): # Case: setting multiple columns with a column indexer on a viewing subset # -> subset.loc[:, [col1, col2]] = value _, DataFrame, _ = backend @@ -538,6 +549,9 @@ def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write): if using_copy_on_write: subset.loc[:, indexer] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, indexer] = 0 else: with pd.option_context("chained_assignment", "warn"): # As of 2.0, this setitem attempts (successfully) to set values @@ -659,10 +673,7 @@ def test_subset_chained_getitem_column( # modify parent -> don't modify subset subset = df[:]["a"][0:2] df._clear_item_cache() - # TODO(CoW-warn) should also warn for mixed block and nullable dtypes - with tm.assert_cow_warning( - warn_copy_on_write and dtype == "int64" and dtype_backend == "numpy" - ): + with tm.assert_cow_warning(warn_copy_on_write): df.iloc[0, 0] = 0 expected = Series([1, 2], name="a") if using_copy_on_write: @@ -765,8 +776,7 @@ def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write): assert df2 is not df # and those trigger CoW when mutated - # TODO(CoW-warn) should also warn for nullable dtypes - with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"): + with tm.assert_cow_warning(warn_copy_on_write): df2.iloc[0, 0] = 0 if using_copy_on_write: tm.assert_frame_equal(df, df_orig) @@ -864,6 +874,8 @@ def test_series_subset_set_with_indexer( ) if warn_copy_on_write: # TODO(CoW-warn) should also warn for setting with mask + # -> Series.__setitem__ with boolean mask ends up using Series._set_values + # or Series._where depending on value being set with tm.assert_cow_warning( not is_mask, raise_on_extra_warnings=warn is not None ): @@ -884,10 +896,10 @@ def test_series_subset_set_with_indexer( # del operator -def test_del_frame(backend, using_copy_on_write): +def test_del_frame(backend, using_copy_on_write, warn_copy_on_write): # Case: deleting a column with `del` on a viewing child dataframe should # not modify parent + update the references - _, DataFrame, _ = backend + dtype_backend, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df[:] @@ -901,11 +913,14 @@ def test_del_frame(backend, using_copy_on_write): tm.assert_frame_equal(df2, df_orig[["a", "c"]]) df2._mgr._verify_integrity() - df.loc[0, "b"] = 200 + # TODO(CoW-warn) false positive, this should not warn? 
+ with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"): + df.loc[0, "b"] = 200 assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) df_orig = df.copy() - df2.loc[0, "a"] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df2.loc[0, "a"] = 100 if using_copy_on_write: # modifying child after deleting a column still doesn't update parent tm.assert_frame_equal(df, df_orig) @@ -990,6 +1005,7 @@ def test_column_as_series_set_with_upcast( s[0] = "foo" expected = Series([1, 2, 3], name="a") elif using_copy_on_write or warn_copy_on_write or using_array_manager: + # TODO(CoW-warn) assert the FutureWarning for CoW is also raised with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): s[0] = "foo" expected = Series(["foo", 2, 3], dtype=object, name="a") @@ -1109,22 +1125,18 @@ def test_set_value_copy_only_necessary_column( ): # When setting inplace, only copy column that is modified instead of the whole # block (by splitting the block) - single_block = isinstance(col[0], int) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col}) df_orig = df.copy() view = df[:] if val == "a" and indexer[0] != slice(None): + # TODO(CoW-warn) assert the FutureWarning for CoW is also raised with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype is deprecated" ): indexer_func(df)[indexer] = val else: - # TODO(CoW-warn) should also warn in the other cases - with tm.assert_cow_warning( - warn_copy_on_write - and not (indexer[0] == slice(None) or (not single_block and val == 100)) - ): + with tm.assert_cow_warning(warn_copy_on_write): indexer_func(df)[indexer] = val if using_copy_on_write: @@ -1143,6 +1155,8 @@ def test_series_midx_slice(using_copy_on_write): ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])) result = ser[1] assert np.shares_memory(get_array(ser), get_array(result)) + # TODO(CoW-warn) should warn -> reference is only tracked in CoW mode, so + # warning is not triggered result.iloc[0] = 100 if using_copy_on_write: expected = Series( @@ -1151,7 +1165,9 @@ def test_series_midx_slice(using_copy_on_write): tm.assert_series_equal(ser, expected) -def test_getitem_midx_slice(using_copy_on_write, using_array_manager): +def test_getitem_midx_slice( + using_copy_on_write, warn_copy_on_write, using_array_manager +): df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2}) df_orig = df.copy() new_df = df[("a",)] @@ -1164,6 +1180,15 @@ def test_getitem_midx_slice(using_copy_on_write, using_array_manager): if using_copy_on_write: new_df.iloc[0, 0] = 100 tm.assert_frame_equal(df_orig, df) + else: + if warn_copy_on_write: + with tm.assert_cow_warning(): + new_df.iloc[0, 0] = 100 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + new_df.iloc[0, 0] = 100 + assert df.iloc[0, 0] == 100 def test_series_midx_tuples_slice(using_copy_on_write): @@ -1173,6 +1198,8 @@ def test_series_midx_tuples_slice(using_copy_on_write): ) result = ser[(1, 2)] assert np.shares_memory(get_array(ser), get_array(result)) + # TODO(CoW-warn) should warn -> reference is only tracked in CoW mode, so + # warning is not triggered result.iloc[0] = 100 if using_copy_on_write: expected = Series( diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 5507e81d04e2a..cdb06de90a568 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ 
-10,6 +10,7 @@ Series, Timestamp, interval_range, + option_context, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -324,11 +325,12 @@ def test_fillna_ea_noop_shares_memory( def test_fillna_inplace_ea_noop_shares_memory( - using_copy_on_write, any_numeric_ea_and_arrow_dtype + using_copy_on_write, warn_copy_on_write, any_numeric_ea_and_arrow_dtype ): df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) df_orig = df.copy() view = df[:] + # TODO(CoW-warn) df.fillna(100, inplace=True) if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: @@ -342,7 +344,9 @@ def test_fillna_inplace_ea_noop_shares_memory( assert not df._mgr._has_no_reference(1) assert not view._mgr._has_no_reference(1) - df.iloc[0, 1] = 100 + # TODO(CoW-warn) should this warn for ArrowDtype? + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 1] = 100 if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: tm.assert_frame_equal(df_orig, view) else: @@ -361,6 +365,17 @@ def test_fillna_chained_assignment(using_copy_on_write): with tm.raises_chained_assignment_error(): df[["a"]].fillna(100, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[["a"]].fillna(100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[df.a > 5].fillna(100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].fillna(100, inplace=True) @pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"]) @@ -375,3 +390,14 @@ def test_interpolate_chained_assignment(using_copy_on_write, func): with tm.raises_chained_assignment_error(): getattr(df[["a"]], func)(inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + getattr(df["a"], func)(inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + getattr(df[["a"]], func)(inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + getattr(df[df["a"] > 1], func)(inplace=True) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 73bb9b4a71741..6416dd50daddc 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -12,6 +12,7 @@ Series, Timestamp, date_range, + option_context, period_range, ) import pandas._testing as tm @@ -39,7 +40,7 @@ def test_copy(using_copy_on_write): assert df.iloc[0, 0] == 1 -def test_copy_shallow(using_copy_on_write): +def test_copy_shallow(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy(deep=False) @@ -69,7 +70,8 @@ def test_copy_shallow(using_copy_on_write): assert np.shares_memory(get_array(df_copy, "c"), get_array(df, "c")) else: # mutating shallow copy does mutate original - df_copy.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 0 # and still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) @@ -546,7 +548,7 @@ def test_shift_columns(using_copy_on_write, warn_copy_on_write): 
tm.assert_frame_equal(df2, expected) -def test_pop(using_copy_on_write): +def test_pop(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() view_original = df[:] @@ -558,7 +560,8 @@ def test_pop(using_copy_on_write): if using_copy_on_write: result.iloc[0] = 0 assert not np.shares_memory(result.values, get_array(view_original, "a")) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) tm.assert_frame_equal(view_original, df_orig) @@ -744,7 +747,7 @@ def test_swapaxes_read_only_array(): ], ids=["shallow-copy", "reset_index", "rename", "select_dtypes"], ) -def test_chained_methods(request, method, idx, using_copy_on_write): +def test_chained_methods(request, method, idx, using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() @@ -753,13 +756,15 @@ def test_chained_methods(request, method, idx, using_copy_on_write): # modify df2 -> don't modify df df2 = method(df) - df2.iloc[0, idx] = 0 + with tm.assert_cow_warning(warn_copy_on_write and df2_is_view): + df2.iloc[0, idx] = 0 if not df2_is_view: tm.assert_frame_equal(df, df_orig) # modify df -> don't modify df2 df2 = method(df) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write and df2_is_view): + df.iloc[0, 0] = 0 if not df2_is_view: tm.assert_frame_equal(df2.iloc[:, idx:], df_orig) @@ -908,7 +913,7 @@ def test_dropna_series(using_copy_on_write, val): lambda df: df.tail(3), ], ) -def test_head_tail(method, using_copy_on_write): +def test_head_tail(method, using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = method(df) @@ -921,14 +926,16 @@ def test_head_tail(method, using_copy_on_write): assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) # modify df2 to trigger CoW for that block - df2.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: # without CoW enabled, head and tail return views. Mutating df2 also mutates df. 
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - df2.iloc[0, 0] = 1 + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 1 tm.assert_frame_equal(df, df_orig) @@ -1160,7 +1167,7 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ @pytest.mark.parametrize("decimals", [-1, 0, 1]) -def test_round(using_copy_on_write, decimals): +def test_round(using_copy_on_write, warn_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) df_orig = df.copy() df2 = df.round(decimals=decimals) @@ -1175,6 +1182,7 @@ def test_round(using_copy_on_write, decimals): assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 1] = "d" df2.iloc[0, 0] = 4 @@ -1677,7 +1685,7 @@ def test_get(using_copy_on_write, warn_copy_on_write, key): warn = FutureWarning if isinstance(key, str) else None else: warn = SettingWithCopyWarning if isinstance(key, list) else None - with pd.option_context("chained_assignment", "warn"): + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): result.iloc[0] = 0 @@ -1714,7 +1722,7 @@ def test_xs( with tm.assert_cow_warning(single_block or axis == 1): result.iloc[0] = 0 else: - with pd.option_context("chained_assignment", "warn"): + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(SettingWithCopyWarning): result.iloc[0] = 0 @@ -1749,20 +1757,22 @@ def test_xs_multiindex( warn = SettingWithCopyWarning else: warn = None - with pd.option_context("chained_assignment", "warn"): + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): result.iloc[0, 0] = 0 tm.assert_frame_equal(df, df_orig) -def test_update_frame(using_copy_on_write): +def test_update_frame(using_copy_on_write, warn_copy_on_write): df1 = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) df2 = DataFrame({"b": [100.0]}, index=[1]) df1_orig = df1.copy() view = df1[:] - df1.update(df2) + # TODO(CoW) better warning message? + with tm.assert_cow_warning(warn_copy_on_write): + df1.update(df2) expected = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 100.0, 6.0]}) tm.assert_frame_equal(df1, expected) @@ -1804,14 +1814,35 @@ def test_update_chained_assignment(using_copy_on_write): with tm.raises_chained_assignment_error(): df[["a"]].update(ser2.to_frame()) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].update(ser2) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[["a"]].update(ser2.to_frame()) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[df["a"] > 1].update(ser2.to_frame()) -def test_inplace_arithmetic_series(): + +def test_inplace_arithmetic_series(using_copy_on_write): ser = Series([1, 2, 3]) + ser_orig = ser.copy() data = get_array(ser) ser *= 2 - assert np.shares_memory(get_array(ser), data) - tm.assert_numpy_array_equal(data, get_array(ser)) + if using_copy_on_write: + # https://github.com/pandas-dev/pandas/pull/55745 + # changed to NOT update inplace because there is no benefit (actual + # operation already done non-inplace). 
This was only for the optics + # of updating the backing array inplace, but we no longer want to make + # that guarantee + assert not np.shares_memory(get_array(ser), data) + tm.assert_numpy_array_equal(data, get_array(ser_orig)) + else: + assert np.shares_memory(get_array(ser), data) + tm.assert_numpy_array_equal(data, get_array(ser)) def test_inplace_arithmetic_series_with_reference( @@ -1947,7 +1978,7 @@ def test_eval(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_eval_inplace(using_copy_on_write): +def test_eval_inplace(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() df_view = df[:] @@ -1955,6 +1986,7 @@ def test_eval_inplace(using_copy_on_write): df.eval("c = a+b", inplace=True) assert np.shares_memory(get_array(df, "a"), get_array(df_view, "a")) - df.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: tm.assert_frame_equal(df_view, df_orig) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 085f355dc4377..d11a2893becdc 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -4,6 +4,7 @@ from pandas import ( Categorical, DataFrame, + option_context, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -395,6 +396,17 @@ def test_replace_chained_assignment(using_copy_on_write): with tm.raises_chained_assignment_error(): df[["a"]].replace(1, 100, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[["a"]].replace(1, 100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + with option_context("mode.chained_assignment", None): + df[df.a > 5].replace(1, 100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].replace(1, 100, inplace=True) def test_replace_listlike(using_copy_on_write): diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 4e08e00dac2b2..bc3b939734534 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -1,5 +1,4 @@ import numpy as np -import pytest from pandas import ( DataFrame, @@ -67,8 +66,6 @@ def test_set_column_with_index(using_copy_on_write): assert not np.shares_memory(get_array(df, "d"), arr) -# TODO(CoW-warn) this should NOT warn -@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_set_columns_with_dataframe(using_copy_on_write): # Case: setting a DataFrame as new columns copies that data # (with delayed copy with CoW) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8e99c074a2c55..d946b8da01fad 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -196,7 +196,7 @@ def test_is_object(): @pytest.mark.parametrize( - "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] ) def test_is_sparse(check_scipy): msg = "is_sparse is deprecated" @@ -632,7 +632,7 @@ def test_is_bool_dtype_numpy_error(): @pytest.mark.parametrize( - "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] ) def 
test_is_extension_array_dtype(check_scipy): assert not com.is_extension_array_dtype([1, 2, 3]) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 04b3feabb9854..1c5514eff46d5 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -200,7 +200,8 @@ def test_is_boolean(self, categories, expected): def test_dtype_specific_categorical_dtype(self): expected = "datetime64[ns]" - result = str(Categorical(DatetimeIndex([])).categories.dtype) + dti = DatetimeIndex([], dtype=expected) + result = str(Categorical(dti).categories.dtype) assert result == expected def test_not_string(self): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index df7c787d2b9bf..32c8def669c21 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1622,11 +1622,23 @@ def test_to_object_array_width(self): tm.assert_numpy_array_equal(out, expected) def test_is_period(self): - assert lib.is_period(Period("2011-01", freq="M")) - assert not lib.is_period(PeriodIndex(["2011-01"], freq="M")) - assert not lib.is_period(Timestamp("2011-01")) - assert not lib.is_period(1) - assert not lib.is_period(np.nan) + # GH#55264 + msg = "is_period is deprecated and will be removed in a future version" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert lib.is_period(Period("2011-01", freq="M")) + assert not lib.is_period(PeriodIndex(["2011-01"], freq="M")) + assert not lib.is_period(Timestamp("2011-01")) + assert not lib.is_period(1) + assert not lib.is_period(np.nan) + + def test_is_interval(self): + # GH#55264 + msg = "is_interval is deprecated and will be removed in a future version" + item = Interval(1, 2) + with tm.assert_produces_warning(FutureWarning, match=msg): + assert lib.is_interval(item) + assert not lib.is_interval(pd.IntervalIndex([item])) + assert not lib.is_interval(pd.IntervalIndex([item])._engine) def test_categorical(self): # GH 8974 diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index b995dc591c749..1cc1e2725b9c7 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -560,9 +560,7 @@ def test_array_equivalent_str(dtype): ) -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested(strict_nan): # reached in groupby aggregations, make sure we use np.any when checking # if the comparison is truthy @@ -585,9 +583,7 @@ def test_array_equivalent_nested(strict_nan): @pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning") -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested2(strict_nan): # more than one level of nesting left = np.array( @@ -612,9 +608,7 @@ def test_array_equivalent_nested2(strict_nan): assert not array_equivalent(left, right, strict_nan=strict_nan) -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested_list(strict_nan): left = np.array([[50, 70, 90], [20, 30]], dtype=object) right = np.array([[50, 70, 90], [20, 30]], dtype=object) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 
7215e910365cf..8828f33b7c62c 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -35,8 +35,7 @@ def test_series_constructor(self, data): if hasattr(result._mgr, "blocks"): assert isinstance(result2._mgr.blocks[0], EABackedBlock) - def test_series_constructor_no_data_with_index(self, dtype): - na_value = dtype.na_value + def test_series_constructor_no_data_with_index(self, dtype, na_value): result = pd.Series(index=[1, 2, 3], dtype=dtype) expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) @@ -46,8 +45,7 @@ def test_series_constructor_no_data_with_index(self, dtype): expected = pd.Series([], index=pd.Index([], dtype="object"), dtype=dtype) tm.assert_series_equal(result, expected) - def test_series_constructor_scalar_na_with_index(self, dtype): - na_value = dtype.na_value + def test_series_constructor_scalar_na_with_index(self, dtype, na_value): result = pd.Series(na_value, index=[1, 2, 3], dtype=dtype) expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 12006248b1db3..132cda5a94ed0 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -29,7 +29,7 @@ def skip_if_doesnt_support_2d(self, dtype, request): test_func = node._obj if test_func.__qualname__.startswith("Dim2CompatTests"): # TODO: is there a less hacky way of checking this? - pytest.skip("Test is only for EAs that support 2D.") + pytest.skip(f"{dtype} does not support 2D.") def test_transpose(self, data): arr2d = data.repeat(2).reshape(-1, 2) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index be16a105384c6..5f0c1b960a475 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -148,8 +148,7 @@ def test_getitem_invalid(self, data): with pytest.raises(IndexError, match=msg): data[-ub - 1] - def test_getitem_scalar_na(self, data_missing, na_cmp): - na_value = data_missing.dtype.na_value + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): result = data_missing[0] assert na_cmp(result, na_value) @@ -349,8 +348,7 @@ def test_take_sequence(self, data): assert result.iloc[1] == data[1] assert result.iloc[2] == data[3] - def test_take(self, data, na_cmp): - na_value = data.dtype.na_value + def test_take(self, data, na_value, na_cmp): result = data.take([0, -1]) assert result.dtype == data.dtype assert result[0] == data[0] @@ -363,8 +361,7 @@ def test_take(self, data, na_cmp): with pytest.raises(IndexError, match="out of bounds"): data.take([len(data) + 1]) - def test_take_empty(self, data, na_cmp): - na_value = data.dtype.na_value + def test_take_empty(self, data, na_value, na_cmp): empty = data[:0] result = empty.take([-1], allow_fill=True) @@ -396,8 +393,7 @@ def test_take_non_na_fill_value(self, data_missing): expected = arr.take([1, 1]) tm.assert_extension_array_equal(result, expected) - def test_take_pandas_style_negative_raises(self, data): - na_value = data.dtype.na_value + def test_take_pandas_style_negative_raises(self, data, na_value): with pytest.raises(ValueError, match=""): data.take([0, -2], fill_value=na_value, allow_fill=True) @@ -417,8 +413,7 @@ def test_take_series(self, data): ) tm.assert_series_equal(result, expected) - def test_reindex(self, data): - na_value = data.dtype.na_value + def test_reindex(self, data, 
na_value): s = pd.Series(data) result = s.reindex([0, 1, 3]) expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3]) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index f19561e5a862b..08e385c9389d9 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -103,7 +103,7 @@ def test_copy(self, data): result = data.copy() if data.dtype._is_immutable: - pytest.skip("test_copy assumes mutability") + pytest.skip(f"test_copy assumes mutability and {data.dtype} is immutable") data[1] = data[0] assert result[1] != result[0] @@ -118,7 +118,7 @@ def test_view(self, data): assert type(result) == type(data) if data.dtype._is_immutable: - pytest.skip("test_view assumes mutability") + pytest.skip(f"test_view assumes mutability and {data.dtype} is immutable") result[1] = result[0] assert data[1] == data[0] diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index e10c6ef9a7018..b9407c7197f20 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -121,7 +121,7 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.intp)) tm.assert_series_equal(result, expected) - def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting): + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): # GH 24382 is_bool = data_for_sorting.dtype._is_boolean @@ -154,10 +154,9 @@ def test_argmin_argmax_empty_array(self, method, data): getattr(data[:0], method)() @pytest.mark.parametrize("method", ["argmax", "argmin"]) - def test_argmin_argmax_all_na(self, method, data): + def test_argmin_argmax_all_na(self, method, data, na_value): # all missing with skipna=True is the same as empty err_msg = "attempt to get" - na_value = data.dtype.na_value data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype) with pytest.raises(ValueError, match=err_msg): getattr(data_na, method)() @@ -556,8 +555,7 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): sorter = np.array([1, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 - def test_where_series(self, data, as_frame): - na_value = data.dtype.na_value + def test_where_series(self, data, na_value, as_frame): assert data[0] != data[1] cls = type(data) a, b = data[:2] @@ -684,8 +682,7 @@ def test_insert_invalid_loc(self, data): data.insert(1.5, data[0]) @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) - def test_equals(self, data, as_series, box): - na_value = data.dtype.na_value + def test_equals(self, data, na_value, as_series, box): data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype) data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 4477ba17c1683..6ea1b3a6fbe9d 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -120,7 +120,7 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) if not is_numeric_dtype(ser.dtype): - pytest.skip("not numeric dtype") + pytest.skip(f"{ser.dtype} is not numeric dtype") if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an array method") diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py 
index c320ccfe4c548..4550e3b055cfe 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -72,8 +72,7 @@ def test_concat_mixed_dtypes(self, data): expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) tm.assert_series_equal(result, expected) - def test_concat_columns(self, data): - na_value = data.dtype.na_value + def test_concat_columns(self, data, na_value): df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"B": [1, 2, 3]}) @@ -97,9 +96,8 @@ def test_concat_columns(self, data): result = pd.concat([df1["A"], df2["B"]], axis=1) tm.assert_frame_equal(result, expected) - def test_concat_extension_arrays_copy_false(self, data): + def test_concat_extension_arrays_copy_false(self, data, na_value): # GH 20756 - na_value = data.dtype.na_value df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"B": data[3:7]}) expected = pd.DataFrame( @@ -124,8 +122,7 @@ def test_concat_with_reindex(self, data): ) tm.assert_frame_equal(result, expected) - def test_align(self, data): - na_value = data.dtype.na_value + def test_align(self, data, na_value): a = data[:3] b = data[2:5] r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) @@ -136,8 +133,7 @@ def test_align(self, data): tm.assert_series_equal(r1, e1) tm.assert_series_equal(r2, e2) - def test_align_frame(self, data): - na_value = data.dtype.na_value + def test_align_frame(self, data, na_value): a = data[:3] b = data[2:5] r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3])) @@ -152,9 +148,8 @@ def test_align_frame(self, data): tm.assert_frame_equal(r1, e1) tm.assert_frame_equal(r2, e2) - def test_align_series_frame(self, data): + def test_align_series_frame(self, data, na_value): # https://github.com/pandas-dev/pandas/issues/20576 - na_value = data.dtype.na_value ser = pd.Series(data, name="a") df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) r1, r2 = ser.align(df) @@ -185,7 +180,7 @@ def test_set_frame_overwrite_object(self, data): df["A"] = data assert df.dtypes["A"] == data.dtype - def test_merge(self, data): + def test_merge(self, data, na_value): # GH-20743 df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]}) df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]}) @@ -210,8 +205,7 @@ def test_merge(self, data): "int2": [1, 2, 3, np.nan, 4], "key": [0, 0, 1, 2, 3], "ext": data._from_sequence( - [data[0], data[0], data[1], data[2], data.dtype.na_value], - dtype=data.dtype, + [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype ), } ) @@ -343,7 +337,7 @@ def test_ravel(self, data): assert type(result) == type(data) if data.dtype._is_immutable: - pytest.skip("test_ravel assumes mutability") + pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable") # Check that we have a view, not a copy result[0] = result[1] @@ -360,7 +354,9 @@ def test_transpose(self, data): assert result.shape == data.shape[::-1] if data.dtype._is_immutable: - pytest.skip("test_transpose assumes mutability") + pytest.skip( + f"test_transpose assumes mutability and {data.dtype} is immutable" + ) # Check that we have a view, not a copy result[0] = result[1] diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 83e2c795b8b5e..067b401ce2f23 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -43,7 +43,7 @@ def skip_if_immutable(self, dtype, request): # This fixture is auto-used, but we want to not-skip # test_is_immutable. 
return - pytest.skip("__setitem__ test not applicable with immutable dtype") + pytest.skip(f"__setitem__ test not applicable with immutable dtype {dtype}") def test_is_immutable(self, data): if data.dtype._is_immutable: @@ -359,8 +359,7 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): tm.assert_frame_equal(result, expected) - def test_setitem_with_expansion_row(self, data): - na_value = data.dtype.na_value + def test_setitem_with_expansion_row(self, data, na_value): df = pd.DataFrame({"data": data[:1]}) df.loc[1, "data"] = data[1] @@ -402,7 +401,7 @@ def test_setitem_frame_2d_values(self, data): orig = df.copy() - df.iloc[:] = df + df.iloc[:] = df.copy() tm.assert_frame_equal(df, orig) df.iloc[:-1] = df.iloc[:-1].copy() diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 30b10541c8f66..c5b1295ee4a7d 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -37,7 +37,7 @@ def data_for_twos(dtype): if not (dtype._is_numeric or dtype.kind == "m"): # Object-dtypes may want to allow this, but for the most part # only numeric and timedelta-like dtypes will need to implement this. - pytest.skip("Not a numeric dtype") + pytest.skip(f"{dtype} is not a numeric dtype") raise NotImplementedError @@ -118,6 +118,16 @@ def na_cmp(): return operator.is_ +@pytest.fixture +def na_value(dtype): + """ + The scalar missing value for this type. Default dtype.na_value. + + TODO: can be removed in 3.x (see https://github.com/pandas-dev/pandas/pull/54930) + """ + return dtype.na_value + + @pytest.fixture def data_for_grouping(): """ diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 71133030a5c18..592b88c4701d2 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -97,24 +97,24 @@ def test_from_dtype(self, data): super().test_from_dtype(data) @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_no_data_with_index(self, dtype): + def test_series_constructor_no_data_with_index(self, dtype, na_value): # RecursionError: maximum recursion depth exceeded in comparison rec_limit = sys.getrecursionlimit() try: # Limit to avoid stack overflow on Windows CI sys.setrecursionlimit(100) - super().test_series_constructor_no_data_with_index(dtype) + super().test_series_constructor_no_data_with_index(dtype, na_value) finally: sys.setrecursionlimit(rec_limit) @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_scalar_na_with_index(self, dtype): + def test_series_constructor_scalar_na_with_index(self, dtype, na_value): # RecursionError: maximum recursion depth exceeded in comparison rec_limit = sys.getrecursionlimit() try: # Limit to avoid stack overflow on Windows CI sys.setrecursionlimit(100) - super().test_series_constructor_scalar_na_with_index(dtype) + super().test_series_constructor_scalar_na_with_index(dtype, na_value) finally: sys.setrecursionlimit(rec_limit) @@ -214,19 +214,19 @@ def test_combine_first(self, data): super().test_combine_first(data) @pytest.mark.xfail(reason="broadcasting error") - def test_where_series(self, data): + def test_where_series(self, data, na_value): # Fails with # *** ValueError: operands could not be broadcast together # with shapes (4,) (4,) (0,) - super().test_where_series(data) + super().test_where_series(data, na_value) @pytest.mark.xfail(reason="Can't compare dicts.") def test_searchsorted(self, data_for_sorting): 
super().test_searchsorted(data_for_sorting) @pytest.mark.xfail(reason="Can't compare dicts.") - def test_equals(self, data, as_series): - super().test_equals(data, as_series) + def test_equals(self, data, na_value, as_series): + super().test_equals(data, na_value, as_series) @pytest.mark.skip("fill-value is interpreted as a dict of values") def test_fillna_copy_frame(self, data_missing): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index adbc4a1bb729b..8298d39a5eca9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -503,7 +503,9 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna, request): + def test_reduce_series_boolean( + self, data, all_boolean_reductions, skipna, na_value, request + ): pa_dtype = data.dtype.pyarrow_dtype xfail_mark = pytest.mark.xfail( raises=TypeError, diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index f37ac4b289852..b427c12d2d40c 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -56,7 +56,7 @@ def data_missing(): @pytest.fixture def data_for_twos(): - pytest.skip("Not a numeric dtype") + pytest.skip("Interval is not a numeric dtype") @pytest.fixture diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 04b25fd0566da..f1939ea174841 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -151,7 +151,7 @@ def data_for_grouping(allow_in_pandas, dtype): @pytest.fixture def data_for_twos(dtype): if dtype.kind == "O": - pytest.skip("Not a numeric dtype") + pytest.skip(f"{dtype} is not a numeric dtype") arr = np.ones(100) * 2 return NumpyExtensionArray._from_sequence(arr, dtype=dtype) @@ -323,7 +323,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = exp_op(skipna=skipna) tm.assert_almost_equal(result, expected) - @pytest.mark.skip("tests not written yet") + @pytest.mark.skip("TODO: tests not written yet") @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5c793e53cf759..8e46b90d4df1e 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -149,29 +149,29 @@ def test_concat_mixed_dtypes(self, data): def test_stack(self, data, columns, future_stack): super().test_stack(data, columns, future_stack) - def test_concat_columns(self, data): + def test_concat_columns(self, data, na_value): self._check_unsupported(data) - super().test_concat_columns(data) + super().test_concat_columns(data, na_value) - def test_concat_extension_arrays_copy_false(self, data): + def test_concat_extension_arrays_copy_false(self, data, na_value): self._check_unsupported(data) - super().test_concat_extension_arrays_copy_false(data) + super().test_concat_extension_arrays_copy_false(data, na_value) - def test_align(self, data): + def test_align(self, data, na_value): self._check_unsupported(data) - super().test_align(data) + super().test_align(data, na_value) - def test_align_frame(self, data): + def test_align_frame(self, data, na_value): self._check_unsupported(data) - 
super().test_align_frame(data) + super().test_align_frame(data, na_value) - def test_align_series_frame(self, data): + def test_align_series_frame(self, data, na_value): self._check_unsupported(data) - super().test_align_series_frame(data) + super().test_align_series_frame(data, na_value) - def test_merge(self, data): + def test_merge(self, data, na_value): self._check_unsupported(data) - super().test_merge(data) + super().test_merge(data, na_value) class TestGetitem(BaseSparseTests, base.BaseGetitemTests): @@ -183,9 +183,9 @@ def test_get(self, data): assert ser.get(4) == ser.iloc[2] assert ser.get(2) == ser.iloc[1] - def test_reindex(self, data): + def test_reindex(self, data, na_value): self._check_unsupported(data) - super().test_reindex(data) + super().test_reindex(data, na_value) class TestSetitem(BaseSparseTests, base.BaseSetitemTests): @@ -285,7 +285,7 @@ def test_fillna_copy_series(self, data_missing, using_copy_on_write): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) - def test_where_series(self, data): + def test_where_series(self, data, na_value): assert data[0] != data[1] cls = type(data) a, b = data[:2] @@ -296,7 +296,6 @@ def test_where_series(self, data): result = ser.where(cond) new_dtype = SparseDtype("float", 0.0) - na_value = data.dtype.na_value expected = pd.Series( cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype) ) @@ -320,15 +319,15 @@ def test_shift_0_periods(self, data): assert result._sparse_values[0] != result._sparse_values[1] @pytest.mark.parametrize("method", ["argmax", "argmin"]) - def test_argmin_argmax_all_na(self, method, data): + def test_argmin_argmax_all_na(self, method, data, na_value): # overriding because Sparse[int64, 0] cannot handle na_value self._check_unsupported(data) - super().test_argmin_argmax_all_na(method, data) + super().test_argmin_argmax_all_na(method, data, na_value) @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) - def test_equals(self, data, as_series, box): + def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) - super().test_equals(data, as_series, box) + super().test_equals(data, na_value, as_series, box) @pytest.mark.parametrize( "func, na_action, expected", diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 840dd1057745f..8c5098a53adfc 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -26,22 +26,21 @@ from pandas.tests.extension import base -def split_array(arr): - if arr.dtype.storage != "pyarrow": - pytest.skip("only applicable for pyarrow chunked array n/a") - - def _split_array(arr): - import pyarrow as pa - - arrow_array = arr._pa_array - split = len(arrow_array) // 2 - arrow_array = pa.chunked_array( - [*arrow_array[:split].chunks, *arrow_array[split:].chunks] - ) - assert arrow_array.num_chunks == 2 - return type(arr)(arrow_array) - - return _split_array(arr) +def maybe_split_array(arr, chunked): + if not chunked: + return arr + elif arr.dtype.storage != "pyarrow": + return arr + + pa = pytest.importorskip("pyarrow") + + arrow_array = arr._pa_array + split = len(arrow_array) // 2 + arrow_array = pa.chunked_array( + [*arrow_array[:split].chunks, *arrow_array[split:].chunks] + ) + assert arrow_array.num_chunks == 2 + return type(arr)(arrow_array) @pytest.fixture(params=[True, False]) @@ -61,26 +60,26 @@ def data(dtype, chunked): strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=100) 
arr = dtype.construct_array_type()._from_sequence(strings) - return split_array(arr) if chunked else arr + return maybe_split_array(arr, chunked) @pytest.fixture def data_missing(dtype, chunked): """Length 2 array with [NA, Valid]""" arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"]) - return split_array(arr) if chunked else arr + return maybe_split_array(arr, chunked) @pytest.fixture def data_for_sorting(dtype, chunked): arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"]) - return split_array(arr) if chunked else arr + return maybe_split_array(arr, chunked) @pytest.fixture def data_missing_for_sorting(dtype, chunked): arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) - return split_array(arr) if chunked else arr + return maybe_split_array(arr, chunked) @pytest.fixture @@ -88,7 +87,7 @@ def data_for_grouping(dtype, chunked): arr = dtype.construct_array_type()._from_sequence( ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"] ) - return split_array(arr) if chunked else arr + return maybe_split_array(arr, chunked) class TestDtype(base.BaseDtypeTests): diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index fb2df0b82e5f4..f7ed5180b46d9 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -10,75 +10,32 @@ @pytest.fixture -def float_frame_with_na(): +def datetime_frame() -> DataFrame: """ - Fixture for DataFrame of floats with index of unique strings + Fixture for DataFrame of floats with DatetimeIndex - Columns are ['A', 'B', 'C', 'D']; some entries are missing + Columns are ['A', 'B', 'C', 'D'] A B C D - ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 - DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 - neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 - 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 - 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 - soujjZ0A08 NaN NaN NaN NaN - 7W6NLGsjB9 NaN NaN NaN NaN + 2000-01-03 -1.122153 0.468535 0.122226 1.693711 + 2000-01-04 0.189378 0.486100 0.007864 -1.216052 + 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 + 2000-01-06 0.430050 0.894352 0.090719 0.036939 + 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 + 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 + 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 ... ... ... ... ... 
- uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 - n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 - ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 - uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 - 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 - 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 - sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 + 2000-02-03 1.642618 -0.579288 0.046005 1.385249 + 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 + 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 + 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 + 2000-02-09 1.377373 0.398619 1.008453 -0.928207 + 2000-02-10 0.473194 -0.636677 0.984058 0.511519 + 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 [30 rows x 4 columns] """ - df = DataFrame(tm.getSeriesData()) - # set some NAs - df.iloc[5:10] = np.nan - df.iloc[15:20, -2:] = np.nan - return df - - -@pytest.fixture -def bool_frame_with_na(): - """ - Fixture for DataFrame of booleans with index of unique strings - - Columns are ['A', 'B', 'C', 'D']; some entries are missing - - A B C D - zBZxY2IDGd False False False False - IhBWBMWllt False True True True - ctjdvZSR6R True False True True - AVTujptmxb False True False True - G9lrImrSWq False False False True - sFFwdIUfz2 NaN NaN NaN NaN - s15ptEJnRb NaN NaN NaN NaN - ... ... ... ... ... - UW41KkDyZ4 True True False False - l9l6XkOdqV True False False False - X2MeZfzDYA False True False False - xWkIKU7vfX False True False True - QOhL6VmpGU False False False True - 22PwkRJdat False True False False - kfboQ3VeIK True False True False - - [30 rows x 4 columns] - """ - df = DataFrame(tm.getSeriesData()) > 0 - df = df.astype(object) - # set some NAs - df.iloc[5:10] = np.nan - df.iloc[15:20, -2:] = np.nan - - # For `any` tests we need to have at least one True before the first NaN - # in each column - for i in range(4): - df.iloc[i, i] = True - return df + return DataFrame(tm.getTimeSeriesData()) @pytest.fixture @@ -202,60 +159,3 @@ def timezone_frame(): df.iloc[1, 1] = NaT df.iloc[1, 2] = NaT return df - - -@pytest.fixture -def uint64_frame(): - """ - Fixture for DataFrame with uint64 values - - Columns are ['A', 'B'] - """ - return DataFrame( - {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, dtype=np.uint64 - ) - - -@pytest.fixture -def simple_frame(): - """ - Fixture for simple 3x3 DataFrame - - Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. - - one two three - a 1.0 2.0 3.0 - b 4.0 5.0 6.0 - c 7.0 8.0 9.0 - """ - arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) - - return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) - - -@pytest.fixture -def frame_of_index_cols(): - """ - Fixture for DataFrame of columns that can be used for indexing - - Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; - 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
- - A B C D E (tuple, as, label) - 0 foo one a 0.608477 -0.012500 -1.664297 - 1 foo two b -0.633460 0.249614 -0.364411 - 2 foo three c 0.615256 2.154968 -0.834666 - 3 bar one d 0.234246 1.085675 0.718445 - 4 bar two e 0.533841 -0.005702 -3.533912 - """ - df = DataFrame( - { - "A": ["foo", "foo", "foo", "bar", "bar"], - "B": ["one", "two", "three", "one", "two"], - "C": ["a", "b", "c", "d", "e"], - "D": np.random.default_rng(2).standard_normal(5), - "E": np.random.default_rng(2).standard_normal(5), - ("tuple", "as", "label"): np.random.default_rng(2).standard_normal(5), - } - ) - return df diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 93f391b16817c..135a86cad1395 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -48,7 +48,7 @@ def test_getitem(self, float_frame): # Column access for _, series in sl.items(): assert len(series.index) == 20 - assert tm.equalContents(series.index, sl.index) + tm.assert_index_equal(series.index, sl.index) for key, _ in float_frame._series.items(): assert float_frame[key] is not None @@ -561,7 +561,7 @@ def test_getitem_setitem_integer_slice_keyerrors(self): @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view def test_fancy_getitem_slice_mixed( - self, float_frame, float_string_frame, using_copy_on_write + self, float_frame, float_string_frame, using_copy_on_write, warn_copy_on_write ): sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 @@ -573,7 +573,8 @@ def test_fancy_getitem_slice_mixed( assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values) - sliced.loc[:, "C"] = 4.0 + with tm.assert_cow_warning(warn_copy_on_write): + sliced.loc[:, "C"] = 4.0 if not using_copy_on_write: assert (float_frame["C"] == 4).all() @@ -1049,7 +1050,7 @@ def test_iloc_row(self): expected = df.reindex(df.index[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_row_slice_view(self, using_copy_on_write, request): + def test_iloc_row_slice_view(self, using_copy_on_write, warn_copy_on_write): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2) ) @@ -1062,9 +1063,9 @@ def test_iloc_row_slice_view(self, using_copy_on_write, request): assert np.shares_memory(df[2], subset[2]) exp_col = original[2].copy() - subset.loc[:, 2] = 0.0 - if not using_copy_on_write: + with tm.assert_cow_warning(warn_copy_on_write): subset.loc[:, 2] = 0.0 + if not using_copy_on_write: exp_col._values[4:8] = 0.0 # With the enforcement of GH#45333 in 2.0, this remains a view @@ -1094,7 +1095,9 @@ def test_iloc_col(self): expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_col_slice_view(self, using_array_manager, using_copy_on_write): + def test_iloc_col_slice_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): df = DataFrame( np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2) ) @@ -1105,7 +1108,8 @@ def test_iloc_col_slice_view(self, using_array_manager, using_copy_on_write): # verify slice is view assert np.shares_memory(df[8]._values, subset[8]._values) - subset.loc[:, 8] = 0.0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.loc[:, 8] = 0.0 assert (df[8] == 0).all() @@ -1388,12 +1392,13 @@ def test_loc_setitem_rhs_frame(self, idxr, val, warn): tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def 
test_iloc_setitem_enlarge_no_warning(self): + def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write): # GH#47381 df = DataFrame(columns=["a", "b"]) expected = df.copy() view = df[:] - with tm.assert_produces_warning(None): + # TODO(CoW-warn) false positive: shouldn't warn in case of enlargement? + with tm.assert_produces_warning(FutureWarning if warn_copy_on_write else None): df.iloc[:, 0] = np.array([1, 2], dtype=np.float64) tm.assert_frame_equal(view, expected) @@ -1461,8 +1466,6 @@ def test_loc_named_tuple_for_midx(self): ) tm.assert_frame_equal(result, expected) - # TODO(CoW-warn) shouldn't warn, but does because of item cache - @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("indexer", [["a"], "a"]) @pytest.mark.parametrize("col", [{}, {"b": 1}]) def test_set_2d_casting_date_to_int(self, col, indexer): @@ -1566,8 +1569,11 @@ def test_setitem_value_coercing_dtypes(self, indexer, idx): class TestDataFrameIndexingUInt64: - def test_setitem(self, uint64_frame): - df = uint64_frame + def test_setitem(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) idx = df["A"].rename("foo") # setitem @@ -1924,7 +1930,7 @@ def test_add_new_column_infer_string(): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype=object), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 49dd7a3c4df9b..bc632209ff7e1 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -93,6 +93,11 @@ def test_setitem_error_msmgs(self): with pytest.raises(ValueError, match=msg): df["gr"] = df.groupby(["b", "c"]).count() + # GH 55956, specific message for zero columns + msg = "Cannot set a DataFrame without columns to the column gr" + with pytest.raises(ValueError, match=msg): + df["gr"] = DataFrame() + def test_setitem_benchmark(self): # from the vb_suite/frame_methods/frame_insert_columns N = 10 @@ -753,6 +758,15 @@ def test_setitem_frame_overwrite_with_ea_dtype(self, any_numeric_ea_dtype): ) tm.assert_frame_equal(df, expected) + def test_setitem_string_option_object_index(self): + # GH#55638 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}) + with pd.option_context("future.infer_string", True): + df["b"] = Index(["a", "b"], dtype=object) + expected = DataFrame({"a": [1, 2], "b": Series(["a", "b"], dtype=object)}) + tm.assert_frame_equal(df, expected) + def test_setitem_frame_midx_columns(self): # GH#49121 df = DataFrame({("a", "b"): [10]}) @@ -816,7 +830,7 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): class TestDataFrameSetItemWithExpansion: - def test_setitem_listlike_views(self, using_copy_on_write): + def test_setitem_listlike_views(self, using_copy_on_write, warn_copy_on_write): # GH#38148 df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) @@ -827,7 +841,8 @@ def test_setitem_listlike_views(self, using_copy_on_write): df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]]) # edit in place the first column to check view semantics - df.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: expected = Series([1, 2, 3], name="a") diff --git a/pandas/tests/frame/methods/test_asfreq.py 
b/pandas/tests/frame/methods/test_asfreq.py index 616ad5133e778..6926917a09602 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -32,6 +32,7 @@ def test_asfreq2(self, frame_or_series): datetime(2009, 11, 30), datetime(2009, 12, 31), ], + dtype="M8[ns]", freq="BME", ), ) @@ -240,17 +241,18 @@ def test_asfreq_2ME(self, freq, freq_half): ("2ME", "2M"), ("2QE", "2Q"), ("2QE-SEP", "2Q-SEP"), + ("1BQE", "1BQ"), + ("2BQE-SEP", "2BQ-SEP"), ("1YE", "1Y"), ("2YE-MAR", "2Y-MAR"), ("1YE", "1A"), ("2YE-MAR", "2A-MAR"), + ("2BYE-MAR", "2BA-MAR"), ], ) def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): - # GH#9586 - depr_msg = ( - f"'{freq_depr[1:]}' will be deprecated, please use '{freq[1:]}' instead." - ) + # GH#9586, #55978 + depr_msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index eac10d307c61c..2578dfb622fbf 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -67,9 +67,19 @@ def test_astype_mixed_float(self, mixed_float_frame): casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") _check_cast(casted, "float16") - def test_astype_mixed_type(self, mixed_type_frame): + def test_astype_mixed_type(self): # mixed casting - mn = mixed_type_frame._get_numeric_data().copy() + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + mn = df._get_numeric_data().copy() mn["little_float"] = np.array(12345.0, dtype="float16") mn["big_float"] = np.array(123456789101112.0, dtype="float64") diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index ed8ccaea92c58..f783a388d7517 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -94,9 +94,13 @@ def test_clip_against_series(self, inplace): (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), ], ) - def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): + def test_clip_against_list_like(self, inplace, lower, axis, res): # GH#15390 - original = simple_frame.copy(deep=True) + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + original = DataFrame( + arr, columns=["one", "two", "three"], index=["a", "b", "c"] + ) result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 1779f703dd2b7..0335279b3a123 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -37,7 +37,7 @@ def test_combine_first(self, float_frame): combined = head.combine_first(tail) reordered_frame = float_frame.reindex(combined.index) tm.assert_frame_equal(combined, reordered_frame) - assert tm.equalContents(combined.columns, float_frame.columns) + tm.assert_index_equal(combined.columns, float_frame.columns) tm.assert_series_equal(combined["A"], reordered_frame["A"]) # same index diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index c2b1016e88402..4c371afcc4e00 100644 --- 
a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -175,3 +175,17 @@ def test_convert_dtypes_pyarrow_timestamp(self): expected = ser.astype("timestamp[ms][pyarrow]") result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) + + def test_convert_dtypes_avoid_block_splitting(self): + # GH#55341 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) + result = df.convert_dtypes(convert_integer=False) + expected = pd.DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + "c": pd.Series(["a"] * 3, dtype="string[python]"), + } + ) + tm.assert_frame_equal(result, expected) + assert result._mgr.nblocks == 2 diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 1b7e669232592..bef18dbaf8a8a 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -89,7 +89,7 @@ def test_diff_datetime_with_nat_zero_periods(self, tz): # diff on NaT values should give NaT, not timedelta64(0) dti = date_range("2016-01-01", periods=4, tz=tz) ser = Series(dti) - df = ser.to_frame() + df = ser.to_frame().copy() df[1] = ser.copy() diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 52b4b64ee279f..f80ef09b50629 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -20,14 +20,18 @@ class TestFillNA: - def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write): + def test_fillna_dict_inplace_nonunique_columns( + self, using_copy_on_write, warn_copy_on_write + ): df = DataFrame( {"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]} ) df.columns = ["A", "A", "A"] orig = df[:] - df.fillna({"A": 2}, inplace=True) + # TODO(CoW-warn) better warning message + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna({"A": 2}, inplace=True) # The first and third columns can be set inplace, while the second cannot. expected = DataFrame( @@ -54,7 +58,8 @@ def test_fillna_on_column_view(self, using_copy_on_write): df[0].fillna(-1, inplace=True) assert np.isnan(arr[:, 0]).all() else: - df[0].fillna(-1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df[0].fillna(-1, inplace=True) assert (arr[:, 0] == -1).all() # i.e. we didn't create a new 49-column block @@ -733,12 +738,16 @@ def test_fillna_inplace_with_columns_limit_and_value(self): @td.skip_array_manager_invalid_test @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) - def test_inplace_dict_update_view(self, val, using_copy_on_write): + def test_inplace_dict_update_view( + self, val, using_copy_on_write, warn_copy_on_write + ): # GH#47188 df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]}) df_orig = df.copy() result_view = df[:] - df.fillna(val, inplace=True) + # TODO(CoW-warn) better warning message + should warn in all cases + with tm.assert_cow_warning(warn_copy_on_write and not isinstance(val, int)): + df.fillna(val, inplace=True) expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]}) tm.assert_frame_equal(df, expected) if using_copy_on_write: diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index bb12d7e202e09..5f37ed6d9e18a 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -322,7 +322,7 @@ def test_rowwise_alt(self): # TODO: assert something? 
@pytest.mark.parametrize( - "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] ) def test_interp_leading_nans(self, check_scipy): df = DataFrame( @@ -378,7 +378,8 @@ def test_interp_inplace(self, using_copy_on_write): assert return_value is None tm.assert_frame_equal(result, expected_cow) else: - return_value = result["a"].interpolate(inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + return_value = result["a"].interpolate(inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index b57b4f4422888..b6a6334b89fc1 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -624,7 +624,7 @@ def test_reindex(self, float_frame, using_copy_on_write): assert np.isnan(val) for col, series in newFrame.items(): - assert tm.equalContents(series.index, newFrame.index) + tm.assert_index_equal(series.index, newFrame.index) emptyFrame = float_frame.reindex(Index([])) assert len(emptyFrame.index) == 0 @@ -642,7 +642,7 @@ def test_reindex(self, float_frame, using_copy_on_write): assert np.isnan(val) for col, series in nonContigFrame.items(): - assert tm.equalContents(series.index, nonContigFrame.index) + tm.assert_index_equal(series.index, nonContigFrame.index) # corner cases diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index b965a5d973fb6..c3bc96b44c807 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -164,12 +164,13 @@ def test_rename_multiindex(self): renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) - def test_rename_nocopy(self, float_frame, using_copy_on_write): + def test_rename_nocopy(self, float_frame, using_copy_on_write, warn_copy_on_write): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) - renamed.loc[:, "foo"] = 1.0 + with tm.assert_cow_warning(warn_copy_on_write): + renamed.loc[:, "foo"] = 1.0 if using_copy_on_write: assert not (float_frame["C"] == 1.0).all() else: diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 9b87ffb0241ef..98113b6c41821 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -24,6 +24,34 @@ import pandas._testing as tm +@pytest.fixture +def frame_of_index_cols(): + """ + Fixture for DataFrame of columns that can be used for indexing + + Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; + 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
+ + A B C D E (tuple, as, label) + 0 foo one a 0.608477 -0.012500 -1.664297 + 1 foo two b -0.633460 0.249614 -0.364411 + 2 foo three c 0.615256 2.154968 -0.834666 + 3 bar one d 0.234246 1.085675 0.718445 + 4 bar two e 0.533841 -0.005702 -3.533912 + """ + df = DataFrame( + { + "A": ["foo", "foo", "foo", "bar", "bar"], + "B": ["one", "two", "three", "one", "two"], + "C": ["a", "b", "c", "d", "e"], + "D": np.random.default_rng(2).standard_normal(5), + "E": np.random.default_rng(2).standard_normal(5), + ("tuple", "as", "label"): np.random.default_rng(2).standard_normal(5), + } + ) + return df + + class TestSetIndex: def test_set_index_multiindex(self): # segfault in GH#3308 diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 471b9eaf936ad..28973fe0d7900 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -30,6 +30,7 @@ def test_copy_blocks(self, float_frame): # make sure we did not change the original DataFrame assert _last_df is not None and not _last_df[column].equals(df[column]) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_no_copy_blocks(self, float_frame, using_copy_on_write): # GH#9607 df = DataFrame(float_frame, copy=True) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 50fc6fe6984e7..93a2f8d80019c 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -29,7 +29,8 @@ def test_transpose_td64_intervals(self): def test_transpose_empty_preserves_datetimeindex(self): # GH#41382 - df = DataFrame(index=DatetimeIndex([])) + dti = DatetimeIndex([], dtype="M8[ns]") + df = DataFrame(index=dti) expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None) @@ -87,9 +88,13 @@ def test_transpose_object_to_tzaware_mixed_tz(self): res2 = df2.T assert (res2.dtypes == object).all() - def test_transpose_uint64(self, uint64_frame): - result = uint64_frame.T - expected = DataFrame(uint64_frame.values.T) + def test_transpose_uint64(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) + result = df.T + expected = DataFrame(df.values.T) expected.index = ["A", "B"] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 5738a25f26fcb..0d32788b04b03 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -154,13 +154,15 @@ def test_update_with_different_dtype(self, using_copy_on_write): tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self, using_copy_on_write): + def test_update_modify_view(self, using_copy_on_write, warn_copy_on_write): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) df2_orig = df2.copy() result_view = df2[:] - df2.update(df) + # TODO(CoW-warn) better warning message + with tm.assert_cow_warning(warn_copy_on_write): + df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) if using_copy_on_write: diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 8b37d8fc2327b..8083795a69413 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ 
b/pandas/tests/frame/test_arithmetic.py @@ -28,6 +28,23 @@ ) +@pytest.fixture +def simple_frame(): + """ + Fixture for simple 3x3 DataFrame + + Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. + + one two three + a 1.0 2.0 3.0 + b 4.0 5.0 6.0 + c 7.0 8.0 9.0 + """ + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) + + @pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"]) def switch_numexpr_min_elements(request, monkeypatch): with monkeypatch.context() as m: @@ -1012,6 +1029,7 @@ def test_frame_with_frame_reindex(self): "bar": [pd.Timestamp("2018"), pd.Timestamp("2021")], }, columns=["foo", "bar"], + dtype="M8[ns]", ) df2 = df[["foo"]] diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 8644e11db9b0d..e56ee31abc402 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -429,7 +429,8 @@ def test_update_inplace_sets_valid_block_values(using_copy_on_write): with tm.raises_chained_assignment_error(): df["a"].fillna(1, inplace=True) else: - df["a"].fillna(1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].fillna(1, inplace=True) # check we haven't put a Series into any block.values assert isinstance(df._mgr.blocks[0].values, Categorical) diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py index 216fd5c2f70c5..a1c23ff05f3e1 100644 --- a/pandas/tests/frame/test_iteration.py +++ b/pandas/tests/frame/test_iteration.py @@ -1,6 +1,7 @@ import datetime import numpy as np +import pytest from pandas.compat import ( IS64, @@ -39,7 +40,7 @@ def test_items_names(self, float_string_frame): assert v.name == k def test_iter(self, float_frame): - assert tm.equalContents(list(float_frame), float_frame.columns) + assert list(float_frame) == list(float_frame.columns) def test_iterrows(self, float_frame, float_string_frame): for k, v in float_frame.iterrows(): @@ -91,6 +92,7 @@ def test_itertuples(self, float_frame): expected = float_frame.iloc[i, :].reset_index(drop=True) tm.assert_series_equal(ser, expected) + def test_itertuples_index_false(self): df = DataFrame( {"floats": np.random.default_rng(2).standard_normal(5), "ints": range(5)}, columns=["floats", "ints"], @@ -99,6 +101,7 @@ def test_itertuples(self, float_frame): for tup in df.itertuples(index=False): assert isinstance(tup[1], int) + def test_itertuples_duplicate_cols(self): df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) dfaa = df[["a", "a"]] @@ -111,32 +114,27 @@ def test_itertuples(self, float_frame): == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]" ) + def test_itertuples_tuple_name(self): + df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) tup = next(df.itertuples(name="TestName")) assert tup._fields == ("Index", "a", "b") assert (tup.Index, tup.a, tup.b) == tup assert type(tup).__name__ == "TestName" - df.columns = ["def", "return"] + def test_itertuples_disallowed_col_labels(self): + df = DataFrame(data={"def": [1, 2, 3], "return": [4, 5, 6]}) tup2 = next(df.itertuples(name="TestName")) assert tup2 == (0, 1, 4) assert tup2._fields == ("Index", "_1", "_2") - df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) - # will raise SyntaxError if trying to create namedtuple - tup3 = next(df3.itertuples()) - assert isinstance(tup3, tuple) - assert hasattr(tup3, "_fields") - + @pytest.mark.parametrize("limit", [254, 255, 1024]) 
+ @pytest.mark.parametrize("index", [True, False]) + def test_itertuples_py2_3_field_limit_namedtuple(self, limit, index): # GH#28282 - df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) - result_254_columns = next(df_254_columns.itertuples(index=False)) - assert isinstance(result_254_columns, tuple) - assert hasattr(result_254_columns, "_fields") - - df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) - result_255_columns = next(df_255_columns.itertuples(index=False)) - assert isinstance(result_255_columns, tuple) - assert hasattr(result_255_columns, "_fields") + df = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(limit)}]) + result = next(df.itertuples(index=index)) + assert isinstance(result, tuple) + assert hasattr(result, "_fields") def test_sequence_like_with_categorical(self): # GH#7839 diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 72e8236159bda..2e8c5258547c3 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -27,7 +27,8 @@ def parser(request): @pytest.fixture( - params=["python", pytest.param("numexpr", marks=td.skip_if_no_ne)], ids=lambda x: x + params=["python", pytest.param("numexpr", marks=td.skip_if_no("numexpr"))], + ids=lambda x: x, ) def engine(request): return request.param @@ -35,7 +36,7 @@ def engine(request): def skip_if_no_pandas_parser(parser): if parser != "pandas": - pytest.skip(f"cannot evaluate with parser {repr(parser)}") + pytest.skip(f"cannot evaluate with parser={parser}") class TestCompat: @@ -387,7 +388,7 @@ def to_series(mi, level): raise AssertionError("object must be a Series or Index") -@td.skip_if_no_ne +@td.skip_if_no("numexpr") class TestDataFrameQueryNumExprPandas: @pytest.fixture def engine(self): @@ -765,7 +766,7 @@ def test_method_calls_in_query(self, engine, parser): tm.assert_frame_equal(result, expected) -@td.skip_if_no_ne +@td.skip_if_no("numexpr") class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): @pytest.fixture def engine(self): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 25d6dcaf53bfb..20ad93e6dce4d 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -147,6 +147,78 @@ def wrapper(x): tm.assert_series_equal(r1, expected) +@pytest.fixture +def bool_frame_with_na(): + """ + Fixture for DataFrame of booleans with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + zBZxY2IDGd False False False False + IhBWBMWllt False True True True + ctjdvZSR6R True False True True + AVTujptmxb False True False True + G9lrImrSWq False False False True + sFFwdIUfz2 NaN NaN NaN NaN + s15ptEJnRb NaN NaN NaN NaN + ... ... ... ... ... 
+ UW41KkDyZ4 True True False False + l9l6XkOdqV True False False False + X2MeZfzDYA False True False False + xWkIKU7vfX False True False True + QOhL6VmpGU False False False True + 22PwkRJdat False True False False + kfboQ3VeIK True False True False + + [30 rows x 4 columns] + """ + df = DataFrame(tm.getSeriesData()) > 0 + df = df.astype(object) + # set some NAs + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan + + # For `any` tests we need to have at least one True before the first NaN + # in each column + for i in range(4): + df.iloc[i, i] = True + return df + + +@pytest.fixture +def float_frame_with_na(): + """ + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 + DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 + neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 + 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 + 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 + soujjZ0A08 NaN NaN NaN NaN + 7W6NLGsjB9 NaN NaN NaN NaN + ... ... ... ... ... + uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 + n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 + ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 + uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 + 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 + 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 + sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 + + [30 rows x 4 columns] + """ + df = DataFrame(tm.getSeriesData()) + # set some NAs + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan + return df + + class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions @@ -165,8 +237,8 @@ class TestDataFrameAnalytics: "var", "std", "sem", - pytest.param("skew", marks=td.skip_if_no_scipy), - pytest.param("kurt", marks=td.skip_if_no_scipy), + pytest.param("skew", marks=td.skip_if_no("scipy")), + pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): @@ -219,8 +291,8 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): "var", "std", "sem", - pytest.param("skew", marks=td.skip_if_no_scipy), - pytest.param("kurt", marks=td.skip_if_no_scipy), + pytest.param("skew", marks=td.skip_if_no("scipy")), + pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) def test_stat_op_api_float_frame(self, float_frame, axis, opname): diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index f750074a36e91..98962b3003b6d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -24,8 +24,6 @@ ) import pandas._testing as tm -import pandas.io.formats.format as fmt - class TestDataFrameRepr: def test_repr_should_return_str(self): @@ -220,16 +218,14 @@ def test_repr_unsortable(self): def test_repr_float_frame_options(self, float_frame): repr(float_frame) - fmt.set_option("display.precision", 3) - repr(float_frame) + with option_context("display.precision", 3): + repr(float_frame) - fmt.set_option("display.max_rows", 10, "display.max_columns", 2) - repr(float_frame) - - fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000) - repr(float_frame) + with option_context("display.max_rows", 10, "display.max_columns", 2): + repr(float_frame) - tm.reset_display_options() + with option_context("display.max_rows", 1000, "display.max_columns", 1000): + repr(float_frame) def test_repr_unicode(self): uval = 
"\u03c3\u03c3\u03c3\u03c3" diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 882f42ff18bdd..45884a4b3c20f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -178,8 +178,8 @@ def test_agg_grouping_is_list_tuple(ts): tm.assert_frame_equal(result, expected) -def test_agg_python_multiindex(mframe): - grouped = mframe.groupby(["A", "B"]) +def test_agg_python_multiindex(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(["A", "B"]) result = grouped.agg("mean") expected = grouped.mean() @@ -356,6 +356,12 @@ def test_agg_multiple_functions_maintain_order(df): tm.assert_index_equal(result.columns, exp_cols) +def test_series_index_name(df): + grouped = df.loc[:, ["C"]].groupby(df["A"]) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == "A" + + def test_agg_multiple_functions_same_name(): # GH 30880 df = DataFrame( diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 49fa9dc51f0d3..b8fb3b7fff676 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -24,21 +24,11 @@ def dropna(request): return request.param -@pytest.fixture(params=[True, False]) -def skipna(request): - return request.param - - @pytest.fixture(params=[True, False]) def observed(request): return request.param -@pytest.fixture -def mframe(multiindex_dataframe_random_data): - return multiindex_dataframe_random_data - - @pytest.fixture def df(): return DataFrame( @@ -57,25 +47,8 @@ def ts(): @pytest.fixture -def tsd(): - return tm.getTimeSeriesData() - - -@pytest.fixture -def tsframe(tsd): - return DataFrame(tsd) - - -@pytest.fixture -def df_mixed_floats(): - return DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": ["one", "one", "two", "three", "two", "two", "one", "three"], - "C": np.random.default_rng(2).standard_normal(8), - "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), - } - ) +def tsframe(): + return DataFrame(tm.getTimeSeriesData()) @pytest.fixture diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py index f38de8faddb59..ee8f93bf3b549 100644 --- a/pandas/tests/groupby/methods/test_describe.py +++ b/pandas/tests/groupby/methods/test_describe.py @@ -11,8 +11,8 @@ import pandas._testing as tm -def test_apply_describe_bug(mframe): - grouped = mframe.groupby(level="first") +def test_apply_describe_bug(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(level="first") grouped.describe() # it works! @@ -219,3 +219,73 @@ def test_describe_duplicate_columns(): ) expected.index.names = [1] tm.assert_frame_equal(result, expected) + + +class TestGroupByNonCythonPaths: + # GH#5610 non-cython calls should not include the grouper + # Tests for code not expected to go through cython paths. 
+ + @pytest.fixture + def df(self): + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], + columns=["A", "B", "C"], + ) + return df + + @pytest.fixture + def gb(self, df): + gb = df.groupby("A") + return gb + + @pytest.fixture + def gni(self, df): + gni = df.groupby("A", as_index=False) + return gni + + def test_describe(self, df, gb, gni): + # describe + expected_index = Index([1, 3], name="A") + expected_col = MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], + ) + expected = DataFrame( + [ + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + ], + index=expected_index, + columns=expected_col, + ) + result = gb.describe() + tm.assert_frame_equal(result, expected) + + expected = expected.reset_index() + result = gni.describe() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [int, float, object]) +@pytest.mark.parametrize( + "kwargs", + [ + {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None}, + {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]}, + {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, + ], +) +def test_groupby_empty_dataset(dtype, kwargs): + # GH#41575 + df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) + df["B"] = df["B"].astype(int) + df["C"] = df["C"].astype(float) + + result = df.iloc[:0].groupby("A").describe(**kwargs) + expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].groupby("A").B.describe(**kwargs) + expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] + expected.index = Index([]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py index b21060e7f1247..e39cfd520ba1a 100644 --- a/pandas/tests/groupby/methods/test_nth.py +++ b/pandas/tests/groupby/methods/test_nth.py @@ -44,7 +44,9 @@ def test_first_last_nth(df): grouped["B"].last() grouped["B"].nth(0) + df = df.copy() df.loc[df["A"] == "foo", "B"] = np.nan + grouped = df.groupby("A") assert isna(grouped["B"].first()["foo"]) assert isna(grouped["B"].last()["foo"]) assert isna(grouped["B"].nth(0).iloc[0]) @@ -120,8 +122,15 @@ def test_first_last_with_None_expanded(method, df, expected): tm.assert_frame_equal(result, expected) -def test_first_last_nth_dtypes(df_mixed_floats): - df = df_mixed_floats.copy() +def test_first_last_nth_dtypes(): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), + } + ) df["E"] = True df["F"] = 1 @@ -286,8 +295,10 @@ def test_nth5(): tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]]) -def test_nth_bdays(): - business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") +def test_nth_bdays(unit): + business_dates = pd.date_range( + start="4/1/2014", end="6/30/2014", freq="B", unit=unit + ) df = DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, fourth and last two business days for each month key = [df.index.year, df.index.month] @@ -307,7 +318,7 @@ def test_nth_bdays(): "2014/6/27", "2014/6/30", ] - ) + ).as_unit(unit) expected = DataFrame(1, columns=["a", 
"b"], index=expected_dates) tm.assert_frame_equal(result, expected) @@ -401,14 +412,15 @@ def test_first_last_tz(data, expected_first, expected_last): ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], ], ) -def test_first_last_tz_multi_column(method, ts, alpha): +def test_first_last_tz_multi_column(method, ts, alpha, unit): # GH 21603 category_string = Series(list("abc")).astype("category") + dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit) df = DataFrame( { "group": [1, 1, 2], "category_string": category_string, - "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "datetimetz": dti, } ) result = getattr(df.groupby("group"), method)() @@ -421,6 +433,7 @@ def test_first_last_tz_multi_column(method, ts, alpha): }, index=Index([1, 2], name="group"), ) + expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index fcb9701e9881b..361a8c27fbf9d 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -415,13 +415,14 @@ def test_columns_groupby_quantile(): tm.assert_frame_equal(result, expected) -def test_timestamp_groupby_quantile(): +def test_timestamp_groupby_quantile(unit): # GH 33168 + dti = pd.date_range( + start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit + ).floor("1h") df = DataFrame( { - "timestamp": pd.date_range( - start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC" - ).floor("1h"), + "timestamp": dti, "category": list(range(1, 101)), "value": list(range(101, 201)), } @@ -429,6 +430,7 @@ def test_timestamp_groupby_quantile(): result = df.groupby("timestamp").quantile([0.2, 0.8]) + mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None)) expected = DataFrame( [ {"category": 12.8, "value": 112.8}, @@ -436,15 +438,7 @@ def test_timestamp_groupby_quantile(): {"category": 68.8, "value": 168.8}, {"category": 92.2, "value": 192.2}, ], - index=pd.MultiIndex.from_tuples( - [ - (pd.Timestamp("2020-04-19 00:00:00+00:00"), 0.2), - (pd.Timestamp("2020-04-19 00:00:00+00:00"), 0.8), - (pd.Timestamp("2020-04-19 01:00:00+00:00"), 0.2), - (pd.Timestamp("2020-04-19 01:00:00+00:00"), 0.8), - ], - names=("timestamp", None), - ), + index=mi, ) tm.assert_frame_equal(result, expected) @@ -453,8 +447,7 @@ def test_timestamp_groupby_quantile(): def test_groupby_quantile_dt64tz_period(): # GH#51373 dti = pd.date_range("2016-01-01", periods=1000) - ser = pd.Series(dti) - df = ser.to_frame() + df = pd.Series(dti).to_frame().copy() df[1] = dti.tz_localize("US/Pacific") df[2] = dti.to_period("D") df[3] = dti - dti[0] diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index b82908ef2aa21..2fa79c815d282 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby( "sort, ascending, expected_rows, expected_count, expected_group_size", [ (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), - (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]), - (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]), + (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]), + (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]), ], ) def test_compound( @@ -811,19 +811,19 @@ def 
test_categorical_single_grouper_observed_false( ("FR", "female", "high"), ("FR", "male", "medium"), ("FR", "female", "low"), - ("FR", "male", "high"), ("FR", "female", "medium"), + ("FR", "male", "high"), ("US", "female", "high"), ("US", "male", "low"), - ("US", "male", "medium"), - ("US", "male", "high"), - ("US", "female", "medium"), ("US", "female", "low"), - ("ASIA", "male", "low"), - ("ASIA", "male", "high"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), ("ASIA", "female", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "male", "low"), ("ASIA", "male", "medium"), ] @@ -1135,7 +1135,7 @@ def test_subset_duplicate_columns(): @pytest.mark.parametrize("utc", [True, False]) -def test_value_counts_time_grouper(utc): +def test_value_counts_time_grouper(utc, unit): # GH#50486 df = DataFrame( { @@ -1152,12 +1152,12 @@ def test_value_counts_time_grouper(utc): } ).drop([3]) - df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s") + df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s").dt.as_unit(unit) gb = df.groupby(Grouper(freq="1D", key="Datetime")) result = gb.value_counts() dates = to_datetime( ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc - ) + ).as_unit(unit) timestamps = df["Timestamp"].unique() index = MultiIndex( levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]], @@ -1177,3 +1177,65 @@ def test_value_counts_integer_columns(): {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort(sort, vc_sort, normalize): + # GH#55951 + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}) + gb = df.groupby("a", sort=sort) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0] + else: + values = [2, 1, 1] + index = MultiIndex( + levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0] + ) + expected = Series(values, index=index, name="proportion" if normalize else "count") + if sort and vc_sort: + taker = [0, 1, 2] + elif sort and not vc_sort: + taker = [0, 1, 2] + elif not sort and vc_sort: + taker = [0, 2, 1] + else: + taker = [2, 1, 0] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort_categorical(sort, vc_sort, normalize): + # GH#55951 + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category") + gb = df.groupby("a", sort=sort, observed=True) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0, 0.0] + else: + values = [2, 1, 1, 0] + name = "proportion" if normalize else "count" + expected = DataFrame( + { + "a": Categorical([1, 1, 2, 2]), + 0: Categorical([3, 4, 3, 4]), + name: values, + } + ).set_index(["a", 0])[name] + if sort and vc_sort: + taker = [0, 1, 2, 3] + elif sort and not vc_sort: + taker = [0, 1, 2, 3] + elif not sort and vc_sort: + taker = [0, 2, 1, 3] + else: + taker = [2, 3, 0, 1] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 1a030841ba3ab..3066825352fa7 
100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -24,8 +24,8 @@ ) -def test_tab_completion(mframe): - grp = mframe.groupby(level="second") +def test_tab_completion(multiindex_dataframe_random_data): + grp = multiindex_dataframe_random_data.groupby(level="second") results = {v for v in dir(grp) if not v.startswith("_")} expected = { "A", @@ -98,9 +98,13 @@ def test_tab_completion(mframe): assert results == expected -def test_all_methods_categorized(mframe): - grp = mframe.groupby(mframe.iloc[:, 0]) - names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns) +def test_all_methods_categorized(multiindex_dataframe_random_data): + grp = multiindex_dataframe_random_data.groupby( + multiindex_dataframe_random_data.iloc[:, 0] + ) + names = {_ for _ in dir(grp) if not _.startswith("_")} - set( + multiindex_dataframe_random_data.columns + ) new_names = set(names) new_names -= reduction_kernels new_names -= transformation_kernels diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 2f2648b9293c5..60b386adb664a 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1559,3 +1559,45 @@ def test_include_groups(include_groups): if not include_groups: expected = expected[["b"]] tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("f", [max, min, sum]) +@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key +def test_builtins_apply(keys, f): + # see gh-8155 + rs = np.random.default_rng(2) + df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"]) + df["jolie"] = rs.standard_normal(10) + + gb = df.groupby(keys) + + fname = f.__name__ + + warn = None if f is not sum else FutureWarning + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning( + warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False + ): + # Also warns on deprecation GH#53425 + result = gb.apply(f) + ngroups = len(df.drop_duplicates(subset=keys)) + + assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" + assert result.shape == (ngroups, 3), assert_msg + + npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = gb.apply(npfunc) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected2 = gb.apply(lambda x: npfunc(x)) + tm.assert_frame_equal(result, expected2) + + if f != sum: + expected = gb.agg(fname).reset_index() + expected.set_index(keys, inplace=True, drop=False) + tm.assert_frame_equal(result, expected, check_dtype=False) + + tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 939dd176ae90e..d4ccbe4c1c376 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1975,7 +1975,10 @@ def test_category_order_transformer( df = df.set_index(keys) args = get_groupby_method_args(transformation_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - op_result = getattr(gb, transformation_func)(*args) + warn = FutureWarning if transformation_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, 
match=msg): + op_result = getattr(gb, transformation_func)(*args) result = op_result.index.get_level_values("a").categories expected = Index([1, 4, 3, 2]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py index eecb82cd5050b..25534865b3486 100644 --- a/pandas/tests/groupby/test_cumulative.py +++ b/pandas/tests/groupby/test_cumulative.py @@ -289,3 +289,30 @@ def test_nullable_int_not_cast_as_float(method, dtype, val): expected = DataFrame({"b": data}, dtype=dtype) tm.assert_frame_equal(result, expected) + + +def test_cython_api2(): + # this takes the fast apply path + + # cumsum (GH5614) + df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) + expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) + result = df.groupby("A").cumsum() + tm.assert_frame_equal(result, expected) + + # GH 5755 - cumsum is a transformer and should ignore as_index + result = df.groupby("A", as_index=False).cumsum() + tm.assert_frame_equal(result, expected) + + # GH 13994 + msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").cumsum(axis=1) + expected = df.cumsum(axis=1) + tm.assert_frame_equal(result, expected) + + msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").cumprod(axis=1) + expected = df.cumprod(axis=1) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b840443aab347..0c9a7aa1b8a73 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,4 +1,3 @@ -import builtins import re import numpy as np @@ -10,73 +9,12 @@ from pandas import ( DataFrame, Index, - MultiIndex, Series, Timestamp, date_range, ) import pandas._testing as tm from pandas.tests.groupby import get_groupby_method_args -from pandas.util import _test_decorators as td - - -def test_intercept_builtin_sum(): - s = Series([1.0, 2.0, np.nan, 3.0]) - grouped = s.groupby([0, 1, 2, 2]) - - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = grouped.agg(builtins.sum) - msg = "using np.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result2 = grouped.apply(builtins.sum) - expected = grouped.sum() - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) - - -@pytest.mark.parametrize("f", [max, min, sum]) -@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key -def test_builtins_apply(keys, f): - # see gh-8155 - rs = np.random.default_rng(2) - df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"]) - df["jolie"] = rs.standard_normal(10) - - gb = df.groupby(keys) - - fname = f.__name__ - - warn = None if f is not sum else FutureWarning - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning( - warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False - ): - # Also warns on deprecation GH#53425 - result = gb.apply(f) - ngroups = len(df.drop_duplicates(subset=keys)) - - assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" - assert result.shape == (ngroups, 3), assert_msg - - npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - msg = 
"DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = gb.apply(npfunc) - tm.assert_frame_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, match=msg): - expected2 = gb.apply(lambda x: npfunc(x)) - tm.assert_frame_equal(result, expected2) - - if f != sum: - expected = gb.agg(fname).reset_index() - expected.set_index(keys, inplace=True, drop=False) - tm.assert_frame_equal(result, expected, check_dtype=False) - - tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0)) class TestNumericOnly: @@ -267,118 +205,6 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): tm.assert_index_equal(result.columns, expected_columns) -class TestGroupByNonCythonPaths: - # GH#5610 non-cython calls should not include the grouper - # Tests for code not expected to go through cython paths. - - @pytest.fixture - def df(self): - df = DataFrame( - [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], - columns=["A", "B", "C"], - ) - return df - - @pytest.fixture - def gb(self, df): - gb = df.groupby("A") - return gb - - @pytest.fixture - def gni(self, df): - gni = df.groupby("A", as_index=False) - return gni - - def test_describe(self, df, gb, gni): - # describe - expected_index = Index([1, 3], name="A") - expected_col = MultiIndex( - levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], - codes=[[0] * 8, list(range(8))], - ) - expected = DataFrame( - [ - [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - ], - index=expected_index, - columns=expected_col, - ) - result = gb.describe() - tm.assert_frame_equal(result, expected) - - expected = expected.reset_index() - result = gni.describe() - tm.assert_frame_equal(result, expected) - - -def test_cython_api2(): - # this takes the fast apply path - - # cumsum (GH5614) - df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) - expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) - result = df.groupby("A").cumsum() - tm.assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby("A", as_index=False).cumsum() - tm.assert_frame_equal(result, expected) - - # GH 13994 - msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").cumsum(axis=1) - expected = df.cumsum(axis=1) - tm.assert_frame_equal(result, expected) - - msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").cumprod(axis=1) - expected = df.cumprod(axis=1) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] -) -@pytest.mark.parametrize( - "method,data", - [ - ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), - ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), - ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), - ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), - ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), - ], -) -def test_groupby_non_arithmetic_agg_types(dtype, method, data): - # GH9311, GH6620 - df = DataFrame( - [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] - ) - - df["b"] = df.b.astype(dtype) - - if 
"args" not in data: - data["args"] = [] - - if "out_type" in data: - out_type = data["out_type"] - else: - out_type = dtype - - exp = data["df"] - df_out = DataFrame(exp) - - df_out["b"] = df_out.b.astype(out_type) - df_out.set_index("a", inplace=True) - - grpd = df.groupby("a") - t = getattr(grpd, method)(*data["args"]) - tm.assert_frame_equal(t, df_out) - - @pytest.mark.parametrize( "i", [ @@ -493,78 +319,6 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): tm.assert_equal(result, expected) -def scipy_sem(*args, **kwargs): - from scipy.stats import sem - - return sem(*args, ddof=1, **kwargs) - - -@pytest.mark.parametrize( - "op,targop", - [ - ("mean", np.mean), - ("median", np.median), - ("std", np.std), - ("var", np.var), - ("sum", np.sum), - ("prod", np.prod), - ("min", np.min), - ("max", np.max), - ("first", lambda x: x.iloc[0]), - ("last", lambda x: x.iloc[-1]), - ("count", np.size), - pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy), - ], -) -def test_ops_general(op, targop): - df = DataFrame(np.random.default_rng(2).standard_normal(1000)) - labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) - - result = getattr(df.groupby(labels), op)() - warn = None if op in ("first", "last", "count", "sem") else FutureWarning - msg = f"using DataFrameGroupBy.{op}" - with tm.assert_produces_warning(warn, match=msg): - expected = df.groupby(labels).agg(targop) - tm.assert_frame_equal(result, expected) - - -def test_series_index_name(df): - grouped = df.loc[:, ["C"]].groupby(df["A"]) - result = grouped.agg(lambda x: x.mean()) - assert result.index.name == "A" - - -@pytest.mark.parametrize( - "values", - [ - { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], - }, - {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, - ], -) -@pytest.mark.parametrize("function", ["mean", "median", "var"]) -def test_apply_to_nullable_integer_returns_float(values, function): - # https://github.com/pandas-dev/pandas/issues/32219 - output = 0.5 if function == "var" else 1.5 - arr = np.array([output] * 3, dtype=float) - idx = Index([1, 2, 3], name="a", dtype="Int64") - expected = DataFrame({"b": arr}, index=idx).astype("Float64") - - groups = DataFrame(values, dtype="Int64").groupby("a") - - result = getattr(groups, function)() - tm.assert_frame_equal(result, expected) - - result = groups.agg(function) - tm.assert_frame_equal(result, expected) - - result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "kernel, has_arg", [ @@ -627,7 +381,10 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): and numeric_only is lib.no_default ) ): - result = method(*args, **kwargs) + warn = FutureWarning if kernel == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = method(*args, **kwargs) assert "b" in result.columns elif has_arg: assert numeric_only is not True @@ -725,11 +482,18 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): msg = "cannot be performed against 'object' dtypes" else: msg = "is not supported for object dtype" - with pytest.raises(TypeError, match=msg): - method(*args) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + with pytest.raises(TypeError, match=msg): + method(*args) elif dtype is 
object: - result = method(*args) - expected = expected_method(*args) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = method(*args) + with tm.assert_produces_warning(warn, match=warn_msg): + expected = expected_method(*args) if groupby_func in obj_result: expected = expected.astype(object) tm.assert_series_equal(result, expected) @@ -781,31 +545,6 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dtype", [int, float, object]) -@pytest.mark.parametrize( - "kwargs", - [ - {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None}, - {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]}, - {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, - ], -) -def test_groupby_empty_dataset(dtype, kwargs): - # GH#41575 - df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) - df["B"] = df["B"].astype(int) - df["C"] = df["C"].astype(float) - - result = df.iloc[:0].groupby("A").describe(**kwargs) - expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:0].groupby("A").B.describe(**kwargs) - expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) - tm.assert_frame_equal(result, expected) - - def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) @@ -813,7 +552,10 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) - result = method(*args).index + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = method(*args).index expected = df.index tm.assert_index_equal(result, expected) @@ -826,62 +568,18 @@ def test_duplicate_columns(request, groupby_func, as_index): df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) args = get_groupby_method_args(groupby_func, df) gb = df.groupby("a", as_index=as_index) - result = getattr(gb, groupby_func)(*args) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = getattr(gb, groupby_func)(*args) expected_df = df.set_axis(["a", "b", "c"], axis=1) expected_args = get_groupby_method_args(groupby_func, expected_df) expected_gb = expected_df.groupby("a", as_index=as_index) - expected = getattr(expected_gb, groupby_func)(*expected_args) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + expected = getattr(expected_gb, groupby_func)(*expected_args) if groupby_func not in ("size", "ngroup", "cumcount"): expected = expected.rename(columns={"c": "b"}) tm.assert_equal(result, expected) - - -@pytest.mark.parametrize( - "op", - [ - "sum", - "prod", - "min", - "max", - "median", - "mean", - "skew", - "std", - "var", - "sem", - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -def 
test_regression_allowlist_methods(op, axis, skipna, sort): - # GH6944 - # GH 17537 - # explicitly test the allowlist methods - raw_frame = DataFrame([0]) - if axis == 0: - frame = raw_frame - msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" - else: - frame = raw_frame.T - msg = "DataFrame.groupby with axis=1 is deprecated" - - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = frame.groupby(level=0, axis=axis, sort=sort) - - if op == "skew": - # skew has skipna - result = getattr(grouped, op)(skipna=skipna) - expected = frame.groupby(level=0).apply( - lambda h: getattr(h, op)(axis=axis, skipna=skipna) - ) - if sort: - expected = expected.sort_index(axis=axis) - tm.assert_frame_equal(result, expected) - else: - result = getattr(grouped, op)() - expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis)) - if sort: - expected = expected.sort_index(axis=axis) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index de8ceb30b565b..c61d9fab0435e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -41,7 +41,7 @@ def test_repr(): assert result == expected -# TODO(CoW-warn) this should NOT warn +# TODO(CoW-warn) this should NOT warn -> inplace operator triggers it @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_groupby_std_datetimelike(warn_copy_on_write): # GH#48481 @@ -136,18 +136,27 @@ def test_basic_aggregations(dtype): grouped.aggregate(lambda x: x * 2) -def test_groupby_nonobject_dtype(mframe, df_mixed_floats): - key = mframe.index.codes[0] - grouped = mframe.groupby(key) +def test_groupby_nonobject_dtype(multiindex_dataframe_random_data): + key = multiindex_dataframe_random_data.index.codes[0] + grouped = multiindex_dataframe_random_data.groupby(key) result = grouped.sum() - expected = mframe.groupby(key.astype("O")).sum() + expected = multiindex_dataframe_random_data.groupby(key.astype("O")).sum() assert result.index.dtype == np.int8 assert expected.index.dtype == np.int64 tm.assert_frame_equal(result, expected, check_index_type=False) + +def test_groupby_nonobject_dtype_mixed(): # GH 3911, mixed frame non-conversion - df = df_mixed_floats.copy() + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), + } + ) df["value"] = range(len(df)) def max_value(group): @@ -1052,7 +1061,7 @@ def test_raise_on_nuisance_python_multiple(three_group): grouped.mean() -def test_empty_groups_corner(mframe): +def test_empty_groups_corner(multiindex_dataframe_random_data): # handle empty groups df = DataFrame( { @@ -1069,7 +1078,7 @@ def test_empty_groups_corner(mframe): expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) - grouped = mframe[3:5].groupby(level=0) + grouped = multiindex_dataframe_random_data[3:5].groupby(level=0) agged = grouped.apply(lambda x: x.mean()) agged_A = grouped["A"].apply("mean") tm.assert_series_equal(agged["A"], agged_A) @@ -1083,8 +1092,8 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(mframe): - df = mframe.T +def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data.T df["baz", "two"] = 
"peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] @@ -1103,24 +1112,24 @@ def aggfun(ser): df.groupby(keys).aggregate(aggfun) -def test_groupby_level_apply(mframe): - result = mframe.groupby(level=0).count() +def test_groupby_level_apply(multiindex_dataframe_random_data): + result = multiindex_dataframe_random_data.groupby(level=0).count() assert result.index.name == "first" - result = mframe.groupby(level=1).count() + result = multiindex_dataframe_random_data.groupby(level=1).count() assert result.index.name == "second" - result = mframe["A"].groupby(level=0).count() + result = multiindex_dataframe_random_data["A"].groupby(level=0).count() assert result.index.name == "first" -def test_groupby_level_mapper(mframe): - deleveled = mframe.reset_index() +def test_groupby_level_mapper(multiindex_dataframe_random_data): + deleveled = multiindex_dataframe_random_data.reset_index() mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1} mapper1 = {"one": 0, "two": 0, "three": 1} - result0 = mframe.groupby(mapper0, level=0).sum() - result1 = mframe.groupby(mapper1, level=1).sum() + result0 = multiindex_dataframe_random_data.groupby(mapper0, level=0).sum() + result1 = multiindex_dataframe_random_data.groupby(mapper1, level=1).sum() mapped_level0 = np.array( [mapper0.get(x) for x in deleveled["first"]], dtype=np.int64 @@ -1128,8 +1137,8 @@ def test_groupby_level_mapper(mframe): mapped_level1 = np.array( [mapper1.get(x) for x in deleveled["second"]], dtype=np.int64 ) - expected0 = mframe.groupby(mapped_level0).sum() - expected1 = mframe.groupby(mapped_level1).sum() + expected0 = multiindex_dataframe_random_data.groupby(mapped_level0).sum() + expected1 = multiindex_dataframe_random_data.groupby(mapped_level1).sum() expected0.index.name, expected1.index.name = "first", "second" tm.assert_frame_equal(result0, expected0) @@ -2370,18 +2379,32 @@ def test_group_on_empty_multiindex(transformation_func, request): args = ("ffill",) else: args = () - result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + warn = FutureWarning if transformation_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) + with tm.assert_produces_warning(warn, match=warn_msg): + expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) - result = ( - df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - ) - expected = ( - df["col_3"].groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] - ) + warn_msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = ( + df["col_3"] + .iloc[:0] + .groupby(["col_1"]) + .transform(transformation_func, *args) + ) + warn_msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + expected = ( + df["col_3"] + .groupby(["col_1"]) + .transform(transformation_func, *args) + .iloc[:0] + ) if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) @@ -2402,7 +2425,10 @@ def test_dup_labels_output_shape(groupby_func, idx): grp_by = df.groupby([0]) args = get_groupby_method_args(groupby_func, df) - result = getattr(grp_by, 
groupby_func)(*args) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = getattr(grp_by, groupby_func)(*args) assert result.shape == (1, 2) tm.assert_index_equal(result.columns, idx) @@ -3158,7 +3184,9 @@ def test_groupby_selection_other_methods(df): g_exp = df[["C"]].groupby(df["A"]) # methods which aren't just .foo() - tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) msg = "DataFrameGroupBy.dtypes is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): tm.assert_frame_equal(g.dtypes, g_exp.dtypes) @@ -3172,28 +3200,32 @@ def test_groupby_selection_other_methods(df): ) -def test_groupby_with_Time_Grouper(): - idx2 = [ - to_datetime("2016-08-31 22:08:12.000"), - to_datetime("2016-08-31 22:09:12.200"), - to_datetime("2016-08-31 22:20:12.400"), - ] +def test_groupby_with_Time_Grouper(unit): + idx2 = to_datetime( + [ + "2016-08-31 22:08:12.000", + "2016-08-31 22:09:12.200", + "2016-08-31 22:20:12.400", + ] + ).as_unit(unit) test_data = DataFrame( {"quant": [1.0, 1.0, 3.0], "quant2": [1.0, 1.0, 3.0], "time2": idx2} ) + time2 = date_range("2016-08-31 22:08:00", periods=13, freq="1min", unit=unit) expected_output = DataFrame( { - "time2": date_range("2016-08-31 22:08:00", periods=13, freq="1min"), + "time2": time2, "quant": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], "quant2": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], } ) - df = test_data.groupby(Grouper(key="time2", freq="1min")).count().reset_index() + gb = test_data.groupby(Grouper(key="time2", freq="1min")) + result = gb.count().reset_index() - tm.assert_frame_equal(df, expected_output) + tm.assert_frame_equal(result, expected_output) def test_groupby_series_with_datetimeindex_month_name(): diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 0068f642a4294..bf809bd5db437 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -36,8 +36,12 @@ def test_groupby_preserves_subclass(obj, groupby_func): args = get_groupby_method_args(groupby_func, obj) - result1 = getattr(grouped, groupby_func)(*args) - result2 = grouped.agg(groupby_func, *args) + warn = FutureWarning if groupby_func == "fillna" else None + msg = f"{type(grouped).__name__}.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + result1 = getattr(grouped, groupby_func)(*args) + with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + result2 = grouped.agg(groupby_func, *args) # Reduction or transformation kernels should preserve type slices = {"ngroup", "cumcount", "size"} diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 01768582299eb..e3cc41afa4679 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -276,19 +276,24 @@ def test_grouper_creation_bug2(self): result = g.sum() tm.assert_frame_equal(result, expected) - def test_grouper_creation_bug3(self): + def test_grouper_creation_bug3(self, unit): # GH8866 + dti = date_range("20130101", periods=2, unit=unit) + mi = MultiIndex.from_product( + [list("ab"), range(2), dti], + names=["one", "two", "three"], + ) ser = Series( np.arange(8, 
dtype="int64"), - index=MultiIndex.from_product( - [list("ab"), range(2), date_range("20130101", periods=2)], - names=["one", "two", "three"], - ), + index=mi, ) result = ser.groupby(Grouper(level="three", freq="ME")).sum() + exp_dti = pd.DatetimeIndex( + [Timestamp("2013-01-31")], freq="ME", name="three" + ).as_unit(unit) expected = Series( [28], - index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="ME", name="three"), + index=exp_dti, ) tm.assert_series_equal(result, expected) @@ -529,22 +534,24 @@ def test_multiindex_passthru(self): result = gb.first() tm.assert_frame_equal(result, df) - def test_multiindex_negative_level(self, mframe): + def test_multiindex_negative_level(self, multiindex_dataframe_random_data): # GH 13901 - result = mframe.groupby(level=-1).sum() - expected = mframe.groupby(level="second").sum() + result = multiindex_dataframe_random_data.groupby(level=-1).sum() + expected = multiindex_dataframe_random_data.groupby(level="second").sum() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=-2).sum() - expected = mframe.groupby(level="first").sum() + result = multiindex_dataframe_random_data.groupby(level=-2).sum() + expected = multiindex_dataframe_random_data.groupby(level="first").sum() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=[-2, -1]).sum() - expected = mframe.sort_index() + result = multiindex_dataframe_random_data.groupby(level=[-2, -1]).sum() + expected = multiindex_dataframe_random_data.sort_index() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=[-1, "first"]).sum() - expected = mframe.groupby(level=["second", "first"]).sum() + result = multiindex_dataframe_random_data.groupby(level=[-1, "first"]).sum() + expected = multiindex_dataframe_random_data.groupby( + level=["second", "first"] + ).sum() tm.assert_frame_equal(result, expected) def test_multifunc_select_col_integer_cols(self, df): @@ -636,9 +643,9 @@ def test_groupby_multiindex_partial_indexing_equivalence(self): tm.assert_dict_equal(expected_groups, result_groups) @pytest.mark.parametrize("sort", [True, False]) - def test_groupby_level(self, sort, mframe, df): + def test_groupby_level(self, sort, multiindex_dataframe_random_data, df): # GH 17537 - frame = mframe + frame = multiindex_dataframe_random_data deleveled = frame.reset_index() result0 = frame.groupby(level=0, sort=sort).sum() @@ -719,9 +726,9 @@ def test_groupby_level_with_nas(self, sort): expected = Series([6.0, 18.0], index=[0.0, 1.0]) tm.assert_series_equal(result, expected) - def test_groupby_args(self, mframe): + def test_groupby_args(self, multiindex_dataframe_random_data): # PR8618 and issue 8015 - frame = mframe + frame = multiindex_dataframe_random_data msg = "You have to supply one of 'by' and 'level'" with pytest.raises(TypeError, match=msg): @@ -738,14 +745,16 @@ def test_groupby_args(self, mframe): [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]], ], ) - def test_level_preserve_order(self, sort, labels, mframe): + def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_data): # GH 17537 - grouped = mframe.groupby(level=0, sort=sort) + grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) - def test_grouping_labels(self, mframe): - grouped = mframe.groupby(mframe.index.get_level_values(0)) + def test_grouping_labels(self, multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby( + 
multiindex_dataframe_random_data.index.get_level_values(0) + ) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 37bf22279b38c..3180a92be1236 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -39,8 +39,10 @@ def test_groupby_fill_duplicate_column_names(func): def test_ffill_missing_arguments(): # GH 14955 df = DataFrame({"a": [1, 2], "b": [1, 1]}) - with pytest.raises(ValueError, match="Must specify a fill"): - df.groupby("b").fillna() + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill"): + df.groupby("b").fillna() @pytest.mark.parametrize( @@ -50,7 +52,7 @@ def test_fillna_with_string_dtype(method, expected): # GH 40250 df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]}) grp = df.groupby("b") - msg = "DataFrameGroupBy.fillna with 'method' is deprecated" + msg = "DataFrameGroupBy.fillna is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): result = grp.fillna(method=method) expected = DataFrame({"a": pd.array(expected, dtype="string")}) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 0f4a73e4e2a38..2800f08b5fd90 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -193,7 +193,12 @@ def test_groupby_raises_string( ), }[groupby_func] - _call_and_check(klass, msg, how, gb, groupby_func, args) + if groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -300,6 +305,9 @@ def test_groupby_raises_datetime( if groupby_func in ["any", "all"]: warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + elif groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" else: warn_msg = "" _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @@ -495,7 +503,12 @@ def test_groupby_raises_category( ), }[groupby_func] - _call_and_check(klass, msg, how, gb, groupby_func, args) + if groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -685,7 +698,12 @@ def test_groupby_raises_category_on_category( ), }[groupby_func] - _call_and_check(klass, msg, how, gb, groupby_func, args) + if groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) def test_subsetting_columns_axis_1_raises(): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index f48926e4d0c77..30ca3110ac2fa 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -17,6 +17,7 @@ isna, ) import pandas._testing as tm +from pandas.util import _test_decorators as td 
@pytest.mark.parametrize("agg_func", ["any", "all"]) @@ -793,6 +794,23 @@ def test_empty_categorical(observed): tm.assert_series_equal(result, expected) +def test_intercept_builtin_sum(): + s = Series([1.0, 2.0, np.nan, 3.0]) + grouped = s.groupby([0, 1, 2, 2]) + + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = grouped.agg(builtins.sum) + msg = "using np.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result2 = grouped.apply(builtins.sum) + expected = grouped.sum() + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + @pytest.mark.parametrize("min_count", [0, 10]) def test_groupby_sum_mincount_boolean(min_count): b = True @@ -853,3 +871,159 @@ def test_groupby_sum_timedelta_with_nat(): res = gb["b"].sum(min_count=2) expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) tm.assert_series_equal(res, expected) + + +@pytest.mark.parametrize( + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] +) +@pytest.mark.parametrize( + "method,data", + [ + ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), + ], +) +def test_groupby_non_arithmetic_agg_types(dtype, method, data): + # GH9311, GH6620 + df = DataFrame( + [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] + ) + + df["b"] = df.b.astype(dtype) + + if "args" not in data: + data["args"] = [] + + if "out_type" in data: + out_type = data["out_type"] + else: + out_type = dtype + + exp = data["df"] + df_out = DataFrame(exp) + + df_out["b"] = df_out.b.astype(out_type) + df_out.set_index("a", inplace=True) + + grpd = df.groupby("a") + t = getattr(grpd, method)(*data["args"]) + tm.assert_frame_equal(t, df_out) + + +def scipy_sem(*args, **kwargs): + from scipy.stats import sem + + return sem(*args, ddof=1, **kwargs) + + +@pytest.mark.parametrize( + "op,targop", + [ + ("mean", np.mean), + ("median", np.median), + ("std", np.std), + ("var", np.var), + ("sum", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ("count", np.size), + pytest.param("sem", scipy_sem, marks=td.skip_if_no("scipy")), + ], +) +def test_ops_general(op, targop): + df = DataFrame(np.random.default_rng(2).standard_normal(1000)) + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + + result = getattr(df.groupby(labels), op)() + warn = None if op in ("first", "last", "count", "sem") else FutureWarning + msg = f"using DataFrameGroupBy.{op}" + with tm.assert_produces_warning(warn, match=msg): + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], + }, + {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, + ], +) +@pytest.mark.parametrize("function", ["mean", "median", "var"]) +def test_apply_to_nullable_integer_returns_float(values, function): + # https://github.com/pandas-dev/pandas/issues/32219 + output = 0.5 if function == "var" else 1.5 + arr = np.array([output] * 3, dtype=float) + idx = pd.Index([1, 2, 3], name="a", dtype="Int64") + expected = 
DataFrame({"b": arr}, index=idx).astype("Float64") + + groups = DataFrame(values, dtype="Int64").groupby("a") + + result = getattr(groups, function)() + tm.assert_frame_equal(result, expected) + + result = groups.agg(function) + tm.assert_frame_equal(result, expected) + + result = groups.agg([function]) + expected.columns = MultiIndex.from_tuples([("b", function)]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "std", + "var", + "sem", + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_regression_allowlist_methods(op, axis, skipna, sort): + # GH6944 + # GH 17537 + # explicitly test the allowlist methods + raw_frame = DataFrame([0]) + if axis == 0: + frame = raw_frame + msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" + else: + frame = raw_frame.T + msg = "DataFrame.groupby with axis=1 is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = frame.groupby(level=0, axis=axis, sort=sort) + + if op == "skew": + # skew has skipna + result = getattr(grouped, op)(skipna=skipna) + expected = frame.groupby(level=0).apply( + lambda h: getattr(h, op)(axis=axis, skipna=skipna) + ) + if sort: + expected = expected.sort_index(axis=axis) + tm.assert_frame_equal(result, expected) + else: + result = getattr(grouped, op)() + expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis)) + if sort: + expected = expected.sort_index(axis=axis) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 22614c542ed09..cf122832d86b4 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -186,8 +186,13 @@ def test_transform_axis_1(request, transformation_func): msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([0, 0, 1], axis=1) - result = gb.transform(transformation_func, *args) - expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T + warn = FutureWarning if transformation_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(transformation_func, *args) + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T if transformation_func in ["diff", "shift"]: # Result contains nans, so transpose coerces to float @@ -385,7 +390,7 @@ def test_dispatch_transform(tsframe): grouped = df.groupby(lambda x: x.month) - msg = "DataFrameGroupBy.fillna with 'method' is deprecated" + msg = "DataFrameGroupBy.fillna is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): filled = grouped.fillna(method="pad") msg = "Series.fillna with 'method' is deprecated" @@ -403,10 +408,13 @@ def test_transform_fillna_null(): "cost": (100, 200, 300, 400, 500, 600), } ) - with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): - df.groupby(["price"]).transform("fillna") - with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): - df.groupby(["price"]).fillna() + msg = "DataFrameGroupBy.fillna is deprecated" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): + df.groupby(["price"]).transform("fillna") + with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): + df.groupby(["price"]).fillna() def test_transform_transformation_func(transformation_func): @@ -437,23 +445,30 @@ def mock_op(x): test_op = lambda x: x.transform(transformation_func) mock_op = lambda x: getattr(x, transformation_func)() - msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" - groupby_msg = ( - "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated" - ) if transformation_func == "pct_change": - with tm.assert_produces_warning(FutureWarning, match=groupby_msg): - result = test_op(df.groupby("A")) + msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" + groupby_msg = ( + "The default fill_method='ffill' in DataFrameGroupBy.pct_change " + "is deprecated" + ) + warn = FutureWarning + groupby_warn = FutureWarning + elif transformation_func == "fillna": + msg = "" + groupby_msg = "DataFrameGroupBy.fillna is deprecated" + warn = None + groupby_warn = FutureWarning else: + msg = groupby_msg = "" + warn = groupby_warn = None + + with tm.assert_produces_warning(groupby_warn, match=groupby_msg): result = test_op(df.groupby("A")) # pass the group in same order as iterating `for ... in df.groupby(...)` # but reorder to match df's index since this is a transform groups = [df[["B"]].iloc[4:6], df[["B"]].iloc[6:], df[["B"]].iloc[:4]] - if transformation_func == "pct_change": - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = concat([mock_op(g) for g in groups]).sort_index() - else: + with tm.assert_produces_warning(warn, match=msg): expected = concat([mock_op(g) for g in groups]).sort_index() # sort_index does not preserve the freq expected = expected.set_axis(df.index) @@ -1527,11 +1542,19 @@ def test_null_group_str_transformer(request, dropna, transformation_func): # ngroup/cumcount always returns a Series as it counts the groups, not values expected = expected["B"].rename(None) - msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated" if transformation_func == "pct_change" and not dropna: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.transform("pct_change", *args) + warn = FutureWarning + msg = ( + "The default fill_method='ffill' in DataFrameGroupBy.pct_change " + "is deprecated" + ) + elif transformation_func == "fillna": + warn = FutureWarning + msg = "DataFrameGroupBy.fillna is deprecated" else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) @@ -1599,7 +1622,9 @@ def test_null_group_str_transformer_series(dropna, transformation_func): buffer.append(Series([np.nan], index=[3], dtype=dtype)) expected = concat(buffer) - with tm.assert_produces_warning(None): + warn = FutureWarning if transformation_func == "fillna" else None + msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) @@ -1642,8 +1667,12 @@ def test_as_index_no_change(keys, df, groupby_func): args = get_groupby_method_args(groupby_func, df) gb_as_index_true = df.groupby(keys, as_index=True) gb_as_index_false = 
df.groupby(keys, as_index=False) - result = gb_as_index_true.transform(groupby_func, *args) - expected = gb_as_index_false.transform(groupby_func, *args) + warn = FutureWarning if groupby_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb_as_index_true.transform(groupby_func, *args) + with tm.assert_produces_warning(warn, match=msg): + expected = gb_as_index_false.transform(groupby_func, *args) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 488f79eea0d11..e538ad512d691 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -12,6 +12,13 @@ from pandas.core.algorithms import safe_sort +def equal_contents(arr1, arr2) -> bool: + """ + Checks if the sets of unique elements of arr1 and arr2 are equivalent. + """ + return frozenset(arr1) == frozenset(arr2) + + class TestIndexSetOps: @pytest.mark.parametrize( "method", ["union", "intersection", "difference", "symmetric_difference"] ) @@ -71,7 +78,7 @@ def test_union_different_type_base(self, klass): result = first.union(klass(second.values)) - assert tm.equalContents(result, index) + assert equal_contents(result, index) def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 @@ -119,7 +126,7 @@ def test_intersection_different_type_base(self, klass, sort): second = index[:3] result = first.intersection(klass(second.values), sort=sort) - assert tm.equalContents(result, second) + assert equal_contents(result, second) def test_intersection_nosort(self): result = Index(["c", "b", "a"]).intersection(["b", "a"]) @@ -244,7 +251,7 @@ def test_union_name_preservation( tm.assert_index_equal(union, expected) else: expected = Index(vals, name=expected_name) - tm.equalContents(union, expected) + tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( "diff_type, expected", diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 808a1687390ff..bfb7acdcf4812 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,7 +5,6 @@ Series, array, ) -import pandas._testing as tm @pytest.fixture(params=[None, False]) @@ -40,22 +39,3 @@ def listlike_box(request): Types that may be passed as the indexer to searchsorted. """ return request.param - - -@pytest.fixture( - params=tm.ALL_REAL_NUMPY_DTYPES - + [ - "object", - "category", - "datetime64[ns]", - "timedelta64[ns]", - ] -) -def any_dtype_for_small_pos_integer_indexes(request): - """ - Dtypes that can be given to an Index with small positive integers. - - This means that for any dtype `x` in the params list, `Index([1, 2, 3], dtype=x)` is - valid and gives the correct Index (sub-)class.
- """ - return request.param diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index e527f6f28cd9d..9ee6250feeac6 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -39,7 +39,9 @@ def test_dti_astype_asobject_around_dst_transition(self, tzstr): def test_astype(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") + idx = DatetimeIndex( + ["2016-05-16", "NaT", NaT, np.nan], dtype="M8[ns]", name="idx" + ) result = idx.astype(object) expected = Index( @@ -55,6 +57,7 @@ def test_astype(self): ) tm.assert_index_equal(result, expected) + def test_astype2(self): rng = date_range("1/1/2000", periods=10, name="idx") result = rng.astype("i8") tm.assert_index_equal(result, Index(rng.asi8, name="idx")) @@ -160,7 +163,9 @@ def test_astype_str_freq_and_tz(self): def test_astype_datetime64(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") + idx = DatetimeIndex( + ["2016-05-16", "NaT", NaT, np.nan], dtype="M8[ns]", name="idx" + ) result = idx.astype("datetime64[ns]") tm.assert_index_equal(result, idx) diff --git a/pandas/tests/indexes/datetimes/methods/test_delete.py b/pandas/tests/indexes/datetimes/methods/test_delete.py index 0fdd60c14474e..2341499977f22 100644 --- a/pandas/tests/indexes/datetimes/methods/test_delete.py +++ b/pandas/tests/indexes/datetimes/methods/test_delete.py @@ -116,18 +116,17 @@ def test_delete_slice(self, unit): # TODO: belongs in Series.drop tests? @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Pacific"]) - def test_delete_slice2(self, tz): + def test_delete_slice2(self, tz, unit): + dti = date_range( + "2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz, unit=unit + ) ts = Series( 1, - index=date_range( - "2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz - ), + index=dti, ) # preserve freq result = ts.drop(ts.index[:5]).index - expected = date_range( - "2000-01-01 14:00", periods=5, freq="h", name="idx", tz=tz - ) + expected = dti[5:] tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq @@ -135,18 +134,7 @@ def test_delete_slice2(self, tz): # reset freq to None result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 11:00", - "2000-01-01 13:00", - "2000-01-01 15:00", - "2000-01-01 17:00", - ], - freq=None, - name="idx", - tz=tz, - ) + expected = dti[::2]._with_freq(None) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq diff --git a/pandas/tests/indexes/datetimes/methods/test_repeat.py b/pandas/tests/indexes/datetimes/methods/test_repeat.py index cf4749aedd5a7..92501755f8c5b 100644 --- a/pandas/tests/indexes/datetimes/methods/test_repeat.py +++ b/pandas/tests/indexes/datetimes/methods/test_repeat.py @@ -17,29 +17,29 @@ def test_repeat_range(self, tz_naive_fixture): assert result.freq is None assert len(result) == 5 * len(rng) - def test_repeat_range2(self, tz_naive_fixture): + def test_repeat_range2(self, tz_naive_fixture, unit): tz = tz_naive_fixture - index = date_range("2001-01-01", periods=2, freq="D", tz=tz) + index = date_range("2001-01-01", periods=2, freq="D", tz=tz, unit=unit) exp = DatetimeIndex( ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz - ) + ).as_unit(unit) for res in [index.repeat(2), 
np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - def test_repeat_range3(self, tz_naive_fixture): + def test_repeat_range3(self, tz_naive_fixture, unit): tz = tz_naive_fixture - index = date_range("2001-01-01", periods=2, freq="2D", tz=tz) + index = date_range("2001-01-01", periods=2, freq="2D", tz=tz, unit=unit) exp = DatetimeIndex( ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz - ) + ).as_unit(unit) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - def test_repeat_range4(self, tz_naive_fixture): + def test_repeat_range4(self, tz_naive_fixture, unit): tz = tz_naive_fixture - index = DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz) + index = DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz).as_unit(unit) exp = DatetimeIndex( [ "2001-01-01", @@ -53,17 +53,17 @@ def test_repeat_range4(self, tz_naive_fixture): "2003-01-01", ], tz=tz, - ) + ).as_unit(unit) for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) assert res.freq is None - def test_repeat(self, tz_naive_fixture): + def test_repeat(self, tz_naive_fixture, unit): tz = tz_naive_fixture reps = 2 msg = "the 'axis' parameter is not supported" - rng = date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz) + rng = date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz, unit=unit) expected_rng = DatetimeIndex( [ @@ -72,7 +72,7 @@ def test_repeat(self, tz_naive_fixture): Timestamp("2016-01-01 00:30:00", tz=tz), Timestamp("2016-01-01 00:30:00", tz=tz), ] - ) + ).as_unit(unit) res = rng.repeat(reps) tm.assert_index_equal(res, expected_rng) diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py index c50f75894d810..d8bdcc2a17685 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -20,10 +20,10 @@ class TestDatetimeIndexShift: # ------------------------------------------------------------- # DatetimeIndex.shift is used in integer addition - def test_dti_shift_tzaware(self, tz_naive_fixture): + def test_dti_shift_tzaware(self, tz_naive_fixture, unit): # GH#9903 tz = tz_naive_fixture - idx = DatetimeIndex([], name="xxx", tz=tz) + idx = DatetimeIndex([], name="xxx", tz=tz).as_unit(unit) tm.assert_index_equal(idx.shift(0, freq="h"), idx) tm.assert_index_equal(idx.shift(3, freq="h"), idx) @@ -32,30 +32,31 @@ def test_dti_shift_tzaware(self, tz_naive_fixture): name="xxx", tz=tz, freq="h", - ) + ).as_unit(unit) tm.assert_index_equal(idx.shift(0, freq="h"), idx) exp = DatetimeIndex( ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", tz=tz, freq="h", - ) + ).as_unit(unit) tm.assert_index_equal(idx.shift(3, freq="h"), exp) exp = DatetimeIndex( ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", tz=tz, freq="h", - ) + ).as_unit(unit) tm.assert_index_equal(idx.shift(-3, freq="h"), exp) - def test_dti_shift_freqs(self): + def test_dti_shift_freqs(self, unit): # test shift for DatetimeIndex and non DatetimeIndex # GH#8083 - drange = date_range("20130101", periods=5) + drange = date_range("20130101", periods=5, unit=unit) result = drange.shift(1) expected = DatetimeIndex( ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + dtype=f"M8[{unit}]", freq="D", ) tm.assert_index_equal(result, expected) @@ -63,6 +64,7 @@ def test_dti_shift_freqs(self): result = drange.shift(-1) expected = 
DatetimeIndex( ["2012-12-31", "2013-01-01", "2013-01-02", "2013-01-03", "2013-01-04"], + dtype=f"M8[{unit}]", freq="D", ) tm.assert_index_equal(result, expected) @@ -70,12 +72,13 @@ def test_dti_shift_freqs(self): result = drange.shift(3, freq="2D") expected = DatetimeIndex( ["2013-01-07", "2013-01-08", "2013-01-09", "2013-01-10", "2013-01-11"], + dtype=f"M8[{unit}]", freq="D", ) tm.assert_index_equal(result, expected) - def test_dti_shift_int(self): - rng = date_range("1/1/2000", periods=20) + def test_dti_shift_int(self, unit): + rng = date_range("1/1/2000", periods=20, unit=unit) result = rng + 5 * rng.freq expected = rng.shift(5) @@ -85,25 +88,27 @@ def test_dti_shift_int(self): expected = rng.shift(-5) tm.assert_index_equal(result, expected) - def test_dti_shift_no_freq(self): + def test_dti_shift_no_freq(self, unit): # GH#19147 - dti = DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) + dti = DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None).as_unit(unit) with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): dti.shift(2) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_shift_localized(self, tzstr): - dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") + def test_dti_shift_localized(self, tzstr, unit): + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI", unit=unit) dr_tz = dr.tz_localize(tzstr) result = dr_tz.shift(1, "10min") assert result.tz == dr_tz.tz - def test_dti_shift_across_dst(self): + def test_dti_shift_across_dst(self, unit): # GH 8616 - idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="h") - s = Series(index=idx[:-1], dtype=object) - result = s.shift(freq="h") + idx = date_range( + "2013-11-03", tz="America/Chicago", periods=7, freq="h", unit=unit + ) + ser = Series(index=idx[:-1], dtype=object) + result = ser.shift(freq="h") expected = Series(index=idx[1:], dtype=object) tm.assert_series_equal(result, expected) @@ -115,24 +120,26 @@ def test_dti_shift_across_dst(self): [1, "2014-11-14 01:00:00"], ], ) - def test_dti_shift_near_midnight(self, shift, result_time): + def test_dti_shift_near_midnight(self, shift, result_time, unit): # GH 8616 dt = datetime(2014, 11, 14, 0) dt_est = pytz.timezone("EST").localize(dt) - s = Series(data=[1], index=[dt_est]) - result = s.shift(shift, freq="h") - expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) + idx = DatetimeIndex([dt_est]).as_unit(unit) + ser = Series(data=[1], index=idx) + result = ser.shift(shift, freq="h") + exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit) + expected = Series(1, index=exp_index) tm.assert_series_equal(result, expected) - def test_shift_periods(self): + def test_shift_periods(self, unit): # GH#22458 : argument 'n' was deprecated in favor of 'periods' - idx = date_range(start=START, end=END, periods=3) + idx = date_range(start=START, end=END, periods=3, unit=unit) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) @pytest.mark.parametrize("freq", ["B", "C"]) - def test_shift_bday(self, freq): - rng = date_range(START, END, freq=freq) + def test_shift_bday(self, freq, unit): + rng = date_range(START, END, freq=freq, unit=unit) shifted = rng.shift(5) assert shifted[0] == rng[5] assert shifted.freq == rng.freq @@ -145,18 +152,18 @@ def test_shift_bday(self, freq): assert shifted[0] == rng[0] assert shifted.freq == rng.freq - def test_shift_bmonth(self): - rng = date_range(START, END, freq=pd.offsets.BMonthEnd()) + def 
test_shift_bmonth(self, unit): + rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) shifted = rng.shift(1, freq=pd.offsets.BDay()) assert shifted[0] == rng[0] + pd.offsets.BDay() - rng = date_range(START, END, freq=pd.offsets.BMonthEnd()) + rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) with tm.assert_produces_warning(pd.errors.PerformanceWarning): shifted = rng.shift(1, freq=pd.offsets.CDay()) assert shifted[0] == rng[0] + pd.offsets.CDay() - def test_shift_empty(self): + def test_shift_empty(self, unit): # GH#14811 - dti = date_range(start="2016-10-21", end="2016-10-21", freq="BME") + dti = date_range(start="2016-10-21", end="2016-10-21", freq="BME", unit=unit) result = dti.shift(1) tm.assert_index_equal(result, dti) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index af04f827d4593..aa217e895c30a 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -50,13 +50,13 @@ def test_to_period_quarterly(self, month): result = stamps.to_period(freq) tm.assert_index_equal(rng, result) - @pytest.mark.parametrize("off", ["BQ", "QS", "BQS"]) + @pytest.mark.parametrize("off", ["BQE", "QS", "BQS"]) def test_to_period_quarterlyish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() assert prng.freq == "QE-DEC" - @pytest.mark.parametrize("off", ["BY", "YS", "BYS"]) + @pytest.mark.parametrize("off", ["BYE", "YS", "BYS"]) def test_to_period_annualish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() @@ -103,13 +103,29 @@ def test_dti_to_period_2monthish(self, freq_offset, freq_period): ) def test_to_period_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): # GH#9586 - msg = f"'{freq_depr[1:]}' will be deprecated, please use '{freq[1:]}' instead." + msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." rng = date_range("01-Jan-2012", periods=8, freq=freq) prng = rng.to_period() with tm.assert_produces_warning(FutureWarning, match=msg): assert prng.freq == freq_depr + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2BQE-SEP", "2BQ-SEP"), + ("2BYE-MAR", "2BY-MAR"), + ], + ) + def test_to_period_frequency_BQ_BY_deprecated(self, freq, freq_depr): + # GH#9586 + msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." 
+ + rng = date_range("01-Jan-2012", periods=8, freq=freq) + prng = rng.to_period() + with tm.assert_produces_warning(FutureWarning, match=msg): + assert prng.freq == freq_depr + def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 rng = date_range( diff --git a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py index 9cd0d2df48ac8..ad7769c6b9671 100644 --- a/pandas/tests/indexes/datetimes/methods/test_tz_localize.py +++ b/pandas/tests/indexes/datetimes/methods/test_tz_localize.py @@ -97,9 +97,11 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): dr.tz_localize(tz) @pytest.mark.parametrize("tz", easts) - def test_dti_tz_localize_ambiguous_infer2(self, tz): + def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): # With repeated hours, we can infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz) + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz, unit=unit + ) times = [ "11/06/2011 00:00", "11/06/2011 01:00", @@ -107,11 +109,11 @@ def test_dti_tz_localize_ambiguous_infer2(self, tz): "11/06/2011 02:00", "11/06/2011 03:00", ] - di = DatetimeIndex(times) + di = DatetimeIndex(times).as_unit(unit) result = di.tz_localize(tz, ambiguous="infer") expected = dr._with_freq(None) tm.assert_index_equal(result, expected) - result2 = DatetimeIndex(times, tz=tz, ambiguous="infer") + result2 = DatetimeIndex(times, tz=tz, ambiguous="infer").as_unit(unit) tm.assert_index_equal(result2, expected) @pytest.mark.parametrize("tz", easts) @@ -269,11 +271,13 @@ def test_dti_tz_localize_ambiguous_nat(self, tz): tm.assert_numpy_array_equal(di_test.values, localized.values) @pytest.mark.parametrize("tz", easts) - def test_dti_tz_localize_ambiguous_flags(self, tz): + def test_dti_tz_localize_ambiguous_flags(self, tz, unit): # November 6, 2011, fall back, repeat 2 AM hour # Pass in flags to determine right dst transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz) + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz, unit=unit + ) times = [ "11/06/2011 00:00", "11/06/2011 01:00", @@ -283,12 +287,14 @@ def test_dti_tz_localize_ambiguous_flags(self, tz): ] # Test tz_localize - di = DatetimeIndex(times) + di = DatetimeIndex(times).as_unit(unit) is_dst = [1, 1, 0, 0, 0] localized = di.tz_localize(tz, ambiguous=is_dst) expected = dr._with_freq(None) tm.assert_index_equal(expected, localized) - tm.assert_index_equal(expected, DatetimeIndex(times, tz=tz, ambiguous=is_dst)) + + result = DatetimeIndex(times, tz=tz, ambiguous=is_dst).as_unit(unit) + tm.assert_index_equal(result, expected) localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) tm.assert_index_equal(dr, localized) @@ -297,12 +303,12 @@ def test_dti_tz_localize_ambiguous_flags(self, tz): tm.assert_index_equal(dr, localized) # Test constructor - localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) + localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst).as_unit(unit) tm.assert_index_equal(dr, localized) # Test duplicate times where inferring the dst fails times += times - di = DatetimeIndex(times) + di = DatetimeIndex(times).as_unit(unit) # When the sizes are incompatible, make sure error is raised msg = "Length of ambiguous bool-array must be the same size as vals" @@ -315,6 +321,8 @@ def test_dti_tz_localize_ambiguous_flags(self, tz): dr = dr.append(dr) tm.assert_index_equal(dr, localized) +
@pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) is_dst = np.array([1] * 10) @@ -373,15 +381,15 @@ def test_dti_tz_localize_bdate_range(self): ) @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) def test_dti_tz_localize_nonexistent_shift( - self, start_ts, tz, end_ts, shift, tz_type + self, start_ts, tz, end_ts, shift, tz_type, unit ): # GH#8917 tz = tz_type + tz if isinstance(shift, str): shift = "shift_" + shift - dti = DatetimeIndex([Timestamp(start_ts)]) + dti = DatetimeIndex([Timestamp(start_ts)]).as_unit(unit) result = dti.tz_localize(tz, nonexistent=shift) - expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz) + expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz).as_unit(unit) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("offset", [-1, 1]) diff --git a/pandas/tests/indexes/datetimes/methods/test_unique.py b/pandas/tests/indexes/datetimes/methods/test_unique.py index 5319bf59f8a64..3c419b23c749a 100644 --- a/pandas/tests/indexes/datetimes/methods/test_unique.py +++ b/pandas/tests/indexes/datetimes/methods/test_unique.py @@ -33,9 +33,10 @@ def test_index_unique(rand_series_with_duplicate_datetimeindex): datetime(2000, 1, 3), datetime(2000, 1, 4), datetime(2000, 1, 5), - ] + ], + dtype=index.dtype, ) - assert uniques.dtype == "M8[ns]" # sanity + assert uniques.dtype == index.dtype # sanity tm.assert_index_equal(uniques, expected) assert index.nunique() == 4 diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 2a1fa20dce777..35c3604de3d63 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -955,8 +955,10 @@ def test_dti_tz_constructors(self, tzstr): for other in [idx2, idx3, idx4]: tm.assert_index_equal(idx1, other) - def test_dti_construction_univalent(self): - rng = date_range("03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern") + def test_dti_construction_univalent(self, unit): + rng = date_range( + "03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern", unit=unit + ) rng2 = DatetimeIndex(data=rng, tz="US/Eastern") tm.assert_index_equal(rng, rng2) @@ -1010,11 +1012,17 @@ def test_dti_constructor_with_non_nano_dtype(self, tz): dtype = "M8[us]" if tz is not None: dtype = f"M8[us, {tz}]" - # NB: the 2500 is interpreted as nanoseconds and rounded *down* - # to 2 microseconds vals = [ts, "2999-01-02 03:04:05.678910", 2500] result = DatetimeIndex(vals, dtype=dtype) - exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals] + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concated the results. 
+ pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] exp_arr = np.array(exp_vals, dtype="M8[us]") expected = DatetimeIndex(exp_arr, dtype="M8[us]") if tz is not None: @@ -1052,6 +1060,36 @@ def test_dti_constructor_object_float_matches_float_dtype(self): dti2 = DatetimeIndex(arr2, tz="CET") tm.assert_index_equal(dti1, dti2) + @pytest.mark.parametrize("dtype", ["M8[us]", "M8[us, US/Pacific]"]) + def test_dti_constructor_with_dtype_object_int_matches_int_dtype(self, dtype): + # Going through the object path should match the non-object path + + vals1 = np.arange(5, dtype="i8") * 1000 + vals1[0] = pd.NaT.value + + vals2 = vals1.astype(np.float64) + vals2[0] = np.nan + + vals3 = vals1.astype(object) + # change lib.infer_dtype(vals3) from "integer" so we go through + # array_to_datetime in _sequence_to_dt64 + vals3[0] = pd.NaT + + vals4 = vals2.astype(object) + + res1 = DatetimeIndex(vals1, dtype=dtype) + res2 = DatetimeIndex(vals2, dtype=dtype) + res3 = DatetimeIndex(vals3, dtype=dtype) + res4 = DatetimeIndex(vals4, dtype=dtype) + + expected = DatetimeIndex(vals1.view("M8[us]")) + if res1.tz is not None: + expected = expected.tz_localize("UTC").tz_convert(res1.tz) + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + class TestTimeSeries: def test_dti_constructor_preserve_dti_freq(self): diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 3a6d1cacd0f81..dfad4d7ad01ea 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -139,18 +139,29 @@ def test_date_range_invalid_periods(self): with pytest.raises(TypeError, match=msg): date_range(start="1/1/2000", periods="foo", freq="D") - def test_date_range_float_periods(self): - # TODO: reconsider allowing this? - rng = date_range("1/1/2000", periods=10.5) + def test_date_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = date_range("1/1/2000", periods=10.5) exp = date_range("1/1/2000", periods=10) tm.assert_index_equal(rng, exp) - def test_date_range_frequency_M_deprecated(self): - depr_msg = "'M' will be deprecated, please use 'ME' instead." + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2ME", "2M"), + ("2SME", "2SM"), + ("2BQE", "2BQ"), + ("2BYE", "2BY"), + ], + ) + def test_date_range_frequency_M_SM_BQ_BY_deprecated(self, freq, freq_depr): + # GH#52064 + depr_msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." 
- expected = date_range("1/1/2000", periods=4, freq="2ME") + expected = date_range("1/1/2000", periods=4, freq=freq) with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = date_range("1/1/2000", periods=4, freq="2M") + result = date_range("1/1/2000", periods=4, freq=freq_depr) tm.assert_index_equal(result, expected) def test_date_range_tuple_freq_raises(self): @@ -172,6 +183,7 @@ def test_date_range_edges(self, freq): ) exp = DatetimeIndex( [ts + n * td for n in range(1, 5)], + dtype="M8[ns]", freq=freq, ) tm.assert_index_equal(idx, exp) @@ -182,7 +194,7 @@ def test_date_range_edges(self, freq): end=ts + td, freq=freq, ) - exp = DatetimeIndex([], freq=freq) + exp = DatetimeIndex([], dtype="M8[ns]", freq=freq) tm.assert_index_equal(idx, exp) # start matches end @@ -191,7 +203,7 @@ def test_date_range_edges(self, freq): end=ts + td, freq=freq, ) - exp = DatetimeIndex([ts + td], freq=freq) + exp = DatetimeIndex([ts + td], dtype="M8[ns]", freq=freq) tm.assert_index_equal(idx, exp) def test_date_range_near_implementation_bound(self): @@ -285,9 +297,11 @@ def test_date_range_normalize(self): rng = date_range(snap, periods=n, normalize=False, freq="2D") offset = timedelta(2) - values = DatetimeIndex([snap + i * offset for i in range(n)], freq=offset) + expected = DatetimeIndex( + [snap + i * offset for i in range(n)], dtype="M8[ns]", freq=offset + ) - tm.assert_index_equal(rng, values) + tm.assert_index_equal(rng, expected) rng = date_range("1/1/2000 08:15", periods=n, normalize=False, freq="B") the_time = time(8, 15) @@ -306,11 +320,12 @@ def test_date_range_ambiguous_arguments(self): with pytest.raises(ValueError, match=msg): date_range(start, end, periods=10, freq="s") - def test_date_range_convenience_periods(self): + def test_date_range_convenience_periods(self, unit): # GH 20808 - result = date_range("2018-04-24", "2018-04-27", periods=3) + result = date_range("2018-04-24", "2018-04-27", periods=3, unit=unit) expected = DatetimeIndex( ["2018-04-24 00:00:00", "2018-04-25 12:00:00", "2018-04-27 00:00:00"], + dtype=f"M8[{unit}]", freq=None, ) @@ -322,6 +337,7 @@ def test_date_range_convenience_periods(self): "2018-04-01 04:00:00", tz="Australia/Sydney", periods=3, + unit=unit, ) expected = DatetimeIndex( [ @@ -329,7 +345,7 @@ def test_date_range_convenience_periods(self): Timestamp("2018-04-01 02:00:00+1000", tz="Australia/Sydney"), Timestamp("2018-04-01 04:00:00+1000", tz="Australia/Sydney"), ] - ) + ).as_unit(unit) tm.assert_index_equal(result, expected) def test_date_range_index_comparison(self): @@ -432,7 +448,7 @@ def test_catch_infinite_loop(self): with pytest.raises(ValueError, match=msg): date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) - def test_construct_over_dst(self): + def test_construct_over_dst(self, unit): # GH 20854 pre_dst = Timestamp("2010-11-07 01:00:00").tz_localize( "US/Pacific", ambiguous=True @@ -445,14 +461,19 @@ def test_construct_over_dst(self): pre_dst, pst_dst, ] - expected = DatetimeIndex(expect_data, freq="h") - result = date_range(start="2010-11-7", periods=3, freq="h", tz="US/Pacific") + expected = DatetimeIndex(expect_data, freq="h").as_unit(unit) + result = date_range( + start="2010-11-7", periods=3, freq="h", tz="US/Pacific", unit=unit + ) tm.assert_index_equal(result, expected) - def test_construct_with_different_start_end_string_format(self): + def test_construct_with_different_start_end_string_format(self, unit): # GH 12064 result = date_range( - "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="h" 
+ "2013-01-01 00:00:00+09:00", + "2013/01/01 02:00:00+09:00", + freq="h", + unit=unit, ) expected = DatetimeIndex( [ @@ -461,7 +482,7 @@ def test_construct_with_different_start_end_string_format(self): Timestamp("2013-01-01 02:00:00+09:00"), ], freq="h", - ) + ).as_unit(unit) tm.assert_index_equal(result, expected) def test_error_with_zero_monthends(self): @@ -469,13 +490,15 @@ def test_error_with_zero_monthends(self): with pytest.raises(ValueError, match=msg): date_range("1/1/2000", "1/1/2001", freq=MonthEnd(0)) - def test_range_bug(self): + def test_range_bug(self, unit): # GH #770 offset = DateOffset(months=3) - result = date_range("2011-1-1", "2012-1-31", freq=offset) + result = date_range("2011-1-1", "2012-1-31", freq=offset, unit=unit) start = datetime(2011, 1, 1) - expected = DatetimeIndex([start + i * offset for i in range(5)], freq=offset) + expected = DatetimeIndex( + [start + i * offset for i in range(5)], dtype=f"M8[{unit}]", freq=offset + ) tm.assert_index_equal(result, expected) def test_range_tz_pytz(self): @@ -786,7 +809,7 @@ def test_frequencies_A_deprecated_Y_renamed(self, freq, freq_depr): # GH#9586, GH#54275 freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] - msg = f"'{freq_depr_msg}' will be deprecated, please use '{freq_msg}' instead." + msg = f"'{freq_depr_msg}' is deprecated, please use '{freq_msg}' instead." expected = date_range("1/1/2000", periods=2, freq=freq) with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1081,7 +1104,7 @@ def test_bday_near_overflow(self): # GH#24252 avoid doing unnecessary addition that _would_ overflow start = Timestamp.max.floor("D").to_pydatetime() rng = date_range(start, end=None, periods=1, freq="B") - expected = DatetimeIndex([start], freq="B") + expected = DatetimeIndex([start], freq="B").as_unit("ns") tm.assert_index_equal(rng, expected) def test_bday_overflow_error(self): @@ -1123,18 +1146,22 @@ def test_daterange_bug_456(self): result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) - def test_cdaterange(self): - result = bdate_range("2013-05-01", periods=3, freq="C") - expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-03"], freq="C") + def test_cdaterange(self, unit): + result = bdate_range("2013-05-01", periods=3, freq="C", unit=unit) + expected = DatetimeIndex( + ["2013-05-01", "2013-05-02", "2013-05-03"], dtype=f"M8[{unit}]", freq="C" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - def test_cdaterange_weekmask(self): + def test_cdaterange_weekmask(self, unit): result = bdate_range( - "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu" + "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu", unit=unit ) expected = DatetimeIndex( - ["2013-05-01", "2013-05-02", "2013-05-05"], freq=result.freq + ["2013-05-01", "2013-05-02", "2013-05-05"], + dtype=f"M8[{unit}]", + freq=result.freq, ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq @@ -1147,10 +1174,14 @@ def test_cdaterange_weekmask(self): with pytest.raises(ValueError, match=msg): bdate_range("2013-05-01", periods=3, weekmask="Sun Mon Tue Wed Thu") - def test_cdaterange_holidays(self): - result = bdate_range("2013-05-01", periods=3, freq="C", holidays=["2013-05-01"]) + def test_cdaterange_holidays(self, unit): + result = bdate_range( + "2013-05-01", periods=3, freq="C", holidays=["2013-05-01"], unit=unit + ) expected = DatetimeIndex( - ["2013-05-02", "2013-05-03", "2013-05-06"], freq=result.freq 
+ ["2013-05-02", "2013-05-03", "2013-05-06"], + dtype=f"M8[{unit}]", + freq=result.freq, ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq @@ -1163,20 +1194,24 @@ def test_cdaterange_holidays(self): with pytest.raises(ValueError, match=msg): bdate_range("2013-05-01", periods=3, holidays=["2013-05-01"]) - def test_cdaterange_weekmask_and_holidays(self): + def test_cdaterange_weekmask_and_holidays(self, unit): result = bdate_range( "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu", holidays=["2013-05-01"], + unit=unit, ) expected = DatetimeIndex( - ["2013-05-02", "2013-05-05", "2013-05-06"], freq=result.freq + ["2013-05-02", "2013-05-05", "2013-05-06"], + dtype=f"M8[{unit}]", + freq=result.freq, ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq + def test_cdaterange_holidays_weekmask_requires_freqstr(self): # raise with non-custom freq msg = ( "a custom frequency string is required when holidays or " @@ -1216,7 +1251,7 @@ def test_range_with_millisecond_resolution(self, start_end): # https://github.com/pandas-dev/pandas/issues/24110 start, end = start_end result = date_range(start=start, end=end, periods=2, inclusive="left") - expected = DatetimeIndex([start]) + expected = DatetimeIndex([start], dtype="M8[ns, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1234,7 +1269,7 @@ def test_range_with_millisecond_resolution(self, start_end): def test_range_with_timezone_and_custombusinessday(self, start, period, expected): # GH49441 result = date_range(start=start, periods=period, freq="C") - expected = DatetimeIndex(expected) + expected = DatetimeIndex(expected).as_unit("ns") tm.assert_index_equal(result, expected) @@ -1509,11 +1544,11 @@ def test_date_range_negative_freq_year_end(self, unit): def test_date_range_business_year_end_year(self, unit): # see GH#9313 - rng = date_range("1/1/2013", "7/1/2017", freq="BY", unit=unit) + rng = date_range("1/1/2013", "7/1/2017", freq="BYE", unit=unit) exp = DatetimeIndex( ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], dtype=f"M8[{unit}]", - freq="BY", + freq="BYE", ) tm.assert_index_equal(rng, exp) @@ -1601,8 +1636,8 @@ def test_date_range_semi_month_end(self, unit): datetime(2008, 12, 31), ] # ensure generating a range with DatetimeIndex gives same result - result = date_range(start=dates[0], end=dates[-1], freq="SM", unit=unit) - exp = DatetimeIndex(dates, dtype=f"M8[{unit}]", freq="SM") + result = date_range(start=dates[0], end=dates[-1], freq="SME", unit=unit) + exp = DatetimeIndex(dates, dtype=f"M8[{unit}]", freq="SME") tm.assert_index_equal(result, exp) def test_date_range_week_of_month(self, unit): @@ -1618,6 +1653,16 @@ def test_date_range_week_of_month(self, unit): ) tm.assert_index_equal(result2, expected2) + def test_date_range_week_of_month2(self, unit): + # GH#5115, GH#5348 + result = date_range("2013-1-1", periods=4, freq="WOM-1SAT", unit=unit) + expected = DatetimeIndex( + ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"], + dtype=f"M8[{unit}]", + freq="WOM-1SAT", + ) + tm.assert_index_equal(result, expected) + def test_date_range_negative_freq_month_end(self, unit): # GH#11018 rng = date_range("2011-01-31", freq="-2ME", periods=3, unit=unit) @@ -1645,7 +1690,7 @@ def test_date_range_fy5253(self, unit): "freqstr,offset", [ ("QS", offsets.QuarterBegin(startingMonth=1)), - ("BQ", offsets.BQuarterEnd(startingMonth=12)), + ("BQE", offsets.BQuarterEnd(startingMonth=12)), ("W-SUN", offsets.Week(weekday=6)), ], ) diff --git 
a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a1074a1744c7a..d2d627c27312a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -58,12 +58,6 @@ def test_week_of_month_frequency(self): expected = DatetimeIndex([d1, d3, d2]) tm.assert_index_equal(result_union, expected) - # GH 5115 - result = date_range("2013-1-1", periods=4, freq="WOM-1SAT") - dates = ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"] - expected = DatetimeIndex(dates, freq="WOM-1SAT") - tm.assert_index_equal(result, expected) - def test_append_nondatetimeindex(self): rng = date_range("1/1/2000", periods=10) idx = Index(["a", "b", "c", "d"]) @@ -167,11 +161,6 @@ def test_CBH_deprecated(self): @pytest.mark.parametrize( "freq_depr, expected_values, expected_freq", [ - ( - "2BA", - ["2020-12-31", "2022-12-30"], - "2BY-DEC", - ), ( "AS-AUG", ["2021-08-01", "2022-08-01", "2023-08-01"], @@ -184,7 +173,7 @@ def test_CBH_deprecated(self): ), ], ) - def test_AS_BA_BAS_deprecated(self, freq_depr, expected_values, expected_freq): + def test_AS_BAS_deprecated(self, freq_depr, expected_values, expected_freq): # GH#55479 freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] msg = f"'{freq_msg}' is deprecated and will be removed in a future version." @@ -201,16 +190,26 @@ def test_AS_BA_BAS_deprecated(self, freq_depr, expected_values, expected_freq): tm.assert_index_equal(result, expected) - def test_BM_deprecated(self): + @pytest.mark.parametrize( + "freq, expected_values, freq_depr", + [ + ("2BYE-MAR", ["2016-03-31"], "2BA-MAR"), + ("2BYE-JUN", ["2016-06-30"], "2BY-JUN"), + ("2BME", ["2016-02-29", "2016-04-29", "2016-06-30"], "2BM"), + ("2BQE", ["2016-03-31"], "2BQ"), + ("1BQE-MAR", ["2016-03-31", "2016-06-30"], "1BQ-MAR"), + ], + ) + def test_BM_BQ_BY_deprecated(self, freq, expected_values, freq_depr): # GH#52064 - msg = "'BM' is deprecated and will be removed in a future version." + msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." 
with tm.assert_produces_warning(FutureWarning, match=msg): - expected = date_range(start="2016-02-21", end="2016-08-21", freq="2BM") + expected = date_range(start="2016-02-21", end="2016-08-21", freq=freq_depr) result = DatetimeIndex( - ["2016-02-29", "2016-04-29", "2016-06-30"], + data=expected_values, dtype="datetime64[ns]", - freq="2BME", + freq=freq, ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index da11920ac1cba..73de33607ca0b 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from pandas._libs import index as libindex from pandas.compat.numpy import np_long import pandas as pd @@ -425,17 +426,17 @@ def test_get_loc_time_obj(self): expected = np.array([]) tm.assert_numpy_array_equal(result, expected, check_dtype=False) - def test_get_loc_time_obj2(self): + @pytest.mark.parametrize("offset", [-10, 10]) + def test_get_loc_time_obj2(self, monkeypatch, offset): # GH#8667 - - from pandas._libs.index import _SIZE_CUTOFF - - ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) + size_cutoff = 50 + n = size_cutoff + offset key = time(15, 11, 30) start = key.hour * 3600 + key.minute * 60 + key.second step = 24 * 3600 - for n in ns: + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) idx = date_range("2014-11-26", periods=n, freq="s") ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx) locs = np.arange(start, n, step, dtype=np.intp) diff --git a/pandas/tests/indexes/datetimes/test_iter.py b/pandas/tests/indexes/datetimes/test_iter.py index a1861e6e9447e..a006ed79f27ba 100644 --- a/pandas/tests/indexes/datetimes/test_iter.py +++ b/pandas/tests/indexes/datetimes/test_iter.py @@ -7,6 +7,7 @@ date_range, to_datetime, ) +from pandas.core.arrays import datetimes class TestDatetimeIndexIteration: @@ -59,13 +60,17 @@ def test_iteration_preserves_tz3(self): assert result._repr_base == expected._repr_base assert result == expected - @pytest.mark.parametrize("periods", [0, 9999, 10000, 10001]) - def test_iteration_over_chunksize(self, periods): + @pytest.mark.parametrize("offset", [-5, -1, 0, 1]) + def test_iteration_over_chunksize(self, offset, monkeypatch): # GH#21012 - - index = date_range("2000-01-01 00:00:00", periods=periods, freq="min") + chunksize = 5 + index = date_range( + "2000-01-01 00:00:00", periods=chunksize - offset, freq="min" + ) num = 0 - for stamp in index: - assert index[num] == stamp - num += 1 + with monkeypatch.context() as m: + m.setattr(datetimes, "_ITER_CHUNKSIZE", chunksize) + for stamp in index: + assert index[num] == stamp + num += 1 assert num == len(index) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 6fbfe27acc0e0..81992219d71b4 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -304,11 +304,11 @@ def test_dti_fields(self, tz): exp = dti[[0, 90, 181, 273]] tm.assert_index_equal(res, exp) res = dti[dti.is_leap_year] - exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name") + exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name").as_unit("ns") tm.assert_index_equal(res, exp) def test_dti_is_year_quarter_start(self): - dti = date_range(freq="BQ-FEB", start=datetime(1998, 1, 1), periods=4) + dti = date_range(freq="BQE-FEB", 
start=datetime(1998, 1, 1), periods=4) assert sum(dti.is_quarter_start) == 0 assert sum(dti.is_quarter_end) == 4 diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 62b5e72d5e025..78c23e47897cf 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -73,7 +73,7 @@ def test_union(self, tz, sort): expected2_notsorted = DatetimeIndex(list(other2) + list(rng2[:3])) rng3 = date_range("1/1/2000", freq="D", periods=5, tz=tz) - other3 = DatetimeIndex([], tz=tz) + other3 = DatetimeIndex([], tz=tz).as_unit("ns") expected3 = date_range("1/1/2000", freq="D", periods=5, tz=tz) expected3_notsorted = rng3 @@ -206,13 +206,13 @@ def test_intersection2(self): first = tm.makeDateIndex(10) second = first[5:] intersect = first.intersection(second) - assert tm.equalContents(intersect, second) + tm.assert_index_equal(intersect, second) # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.intersection(case) - assert tm.equalContents(result, second) + tm.assert_index_equal(result, second) third = Index(["a", "b", "c"]) result = first.intersection(third) @@ -235,7 +235,7 @@ def test_intersection(self, tz, sort): expected3 = date_range("6/1/2000", "6/20/2000", freq="D", name=None) rng4 = date_range("7/1/2000", "7/31/2000", freq="D", name="idx") - expected4 = DatetimeIndex([], freq="D", name="idx") + expected4 = DatetimeIndex([], freq="D", name="idx", dtype="M8[ns]") for rng, expected in [ (rng2, expected2), @@ -265,7 +265,7 @@ def test_intersection(self, tz, sort): # GH 7880 rng4 = date_range("7/1/2000", "7/31/2000", freq="D", tz=tz, name="idx") - expected4 = DatetimeIndex([], tz=tz, name="idx") + expected4 = DatetimeIndex([], tz=tz, name="idx").as_unit("ns") assert expected4.freq is None for rng, expected in [ @@ -536,7 +536,7 @@ def test_intersection(self): # non-overlapping the_int = rng[:10].intersection(rng[10:]) - expected = DatetimeIndex([]) + expected = DatetimeIndex([]).as_unit("ns") tm.assert_index_equal(the_int, expected) def test_intersection_bug(self): diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py deleted file mode 100644 index e0155a13481ac..0000000000000 --- a/pandas/tests/indexes/interval/test_base.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -import pytest - -from pandas import IntervalIndex -import pandas._testing as tm - - -class TestInterval: - """ - Tests specific to the shared common index tests; unrelated tests should be placed - in test_interval.py or the specific test file (e.g. 
test_astype.py) - """ - - @pytest.fixture - def simple_index(self) -> IntervalIndex: - return IntervalIndex.from_breaks(range(11), closed="right") - - @pytest.fixture - def index(self): - return tm.makeIntervalIndex(10) - - def test_take(self, closed): - index = IntervalIndex.from_breaks(range(11), closed=closed) - - result = index.take(range(10)) - tm.assert_index_equal(result, index) - - result = index.take([0, 0, 1]) - expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) - tm.assert_index_equal(result, expected) - - def test_where(self, simple_index, listlike_box): - klass = listlike_box - - idx = simple_index - cond = [True] * len(idx) - expected = idx - result = expected.where(klass(cond)) - tm.assert_index_equal(result, expected) - - cond = [False] + [True] * len(idx[1:]) - expected = IntervalIndex([np.nan] + idx[1:].tolist()) - result = idx.where(klass(cond)) - tm.assert_index_equal(result, expected) - - def test_getitem_2d_deprecated(self, simple_index): - # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable - idx = simple_index - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - idx[:, None] - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - # GH#44051 - idx[True] - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - # GH#44051 - idx[False] diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 1efe5ff980f6c..078a0e06e0ed7 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -488,6 +488,23 @@ def test_index_mixed_closed(self): tm.assert_index_equal(result, expected) +@pytest.mark.parametrize("timezone", ["UTC", "US/Pacific", "GMT"]) +def test_interval_index_subtype(timezone, inclusive_endpoints_fixture): + # GH#46999 + dates = date_range("2022", periods=3, tz=timezone) + dtype = f"interval[datetime64[ns, {timezone}], {inclusive_endpoints_fixture}]" + result = IntervalIndex.from_arrays( + ["2022-01-01", "2022-01-02"], + ["2022-01-02", "2022-01-03"], + closed=inclusive_endpoints_fixture, + dtype=dtype, + ) + expected = IntervalIndex.from_arrays( + dates[:-1], dates[1:], closed=inclusive_endpoints_fixture + ) + tm.assert_index_equal(result, expected) + + def test_dtype_closed_mismatch(): # GH#38394 closed specified in both dtype and IntervalIndex constructor diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index db8f697b95cd8..2007a793843c9 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -19,12 +19,75 @@ array, date_range, interval_range, + isna, period_range, timedelta_range, ) import pandas._testing as tm +class TestGetItem: + def test_getitem(self, closed): + idx = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) + assert idx[0] == Interval(0.0, 1.0, closed=closed) + assert idx[1] == Interval(1.0, 2.0, closed=closed) + assert isna(idx[2]) + + result = idx[0:1] + expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed) + tm.assert_index_equal(result, expected) + + result = idx[0:2] + expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed) + tm.assert_index_equal(result, expected) + + result = idx[1:3] + expected = IntervalIndex.from_arrays( + (1.0, np.nan), (2.0, np.nan), closed=closed + ) + tm.assert_index_equal(result, expected) 
+ + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = IntervalIndex.from_breaks(range(11), closed="right") + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + idx[:, None] + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[True] + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[False] + + +class TestWhere: + def test_where(self, listlike_box): + klass = listlike_box + + idx = IntervalIndex.from_breaks(range(11), closed="right") + cond = [True] * len(idx) + expected = idx + result = expected.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * len(idx[1:]) + expected = IntervalIndex([np.nan] + idx[1:].tolist()) + result = idx.where(klass(cond)) + tm.assert_index_equal(result, expected) + + +class TestTake: + def test_take(self, closed): + index = IntervalIndex.from_breaks(range(11), closed=closed) + + result = index.take(range(10)) + tm.assert_index_equal(result, index) + + result = index.take([0, 0, 1]) + expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) + tm.assert_index_equal(result, expected) + + class TestGetLoc: @pytest.mark.parametrize("side", ["right", "left", "both", "neither"]) def test_get_loc_interval(self, closed, side): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index dea40eff8d2ac..63f9b2176dddd 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -341,26 +341,6 @@ def test_is_monotonic_with_nans(self): assert not index._is_strictly_monotonic_decreasing assert not index.is_monotonic_decreasing - def test_get_item(self, closed): - i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) - assert i[0] == Interval(0.0, 1.0, closed=closed) - assert i[1] == Interval(1.0, 2.0, closed=closed) - assert isna(i[2]) - - result = i[0:1] - expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed) - tm.assert_index_equal(result, expected) - - result = i[0:2] - expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed) - tm.assert_index_equal(result, expected) - - result = i[1:3] - expected = IntervalIndex.from_arrays( - (1.0, np.nan), (2.0, np.nan), closed=closed - ) - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( "breaks", [ @@ -408,7 +388,7 @@ def test_maybe_convert_i8_nat(self, breaks): # GH 20636 index = IntervalIndex.from_breaks(breaks) - to_convert = breaks._constructor([pd.NaT] * 3) + to_convert = breaks._constructor([pd.NaT] * 3).as_unit("ns") expected = Index([np.nan] * 3, dtype=np.float64) result = index._maybe_convert_i8(to_convert) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 6c531fb0428a3..37606bda9efca 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -219,12 +219,15 @@ def test_float_subtype(self, start, end, freq): expected = "int64" if is_integer(start + end) else "float64" assert result == expected - def test_constructor_coverage(self): + def test_interval_range_fractional_period(self): # float value for periods expected = interval_range(start=0, periods=10) - result = interval_range(start=0, periods=10.5) + msg = 
"Non-integer 'periods' in pd.date_range, .* pd.interval_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = interval_range(start=0, periods=10.5) tm.assert_index_equal(result, expected) + def test_constructor_coverage(self): # equivalent timestamp-like start/end start, end = Timestamp("2017-01-01"), Timestamp("2017-01-15") expected = interval_range(start=start, end=end) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 059b0b75f4190..1b0816a9405cb 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -25,14 +25,16 @@ def test_union(self, closed, sort): expected = monotonic_index(0, 13, closed=closed) result = index[::-1].union(other, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) result = other[::-1].union(index, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) tm.assert_index_equal(index.union(index, sort=sort), index) tm.assert_index_equal(index.union(index[:1], sort=sort), index) @@ -65,14 +67,16 @@ def test_intersection(self, closed, sort): expected = monotonic_index(5, 11, closed=closed) result = index[::-1].intersection(other, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) result = other[::-1].intersection(index, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) tm.assert_index_equal(index.intersection(index, sort=sort), index) @@ -148,16 +152,18 @@ def test_symmetric_difference(self, closed, sort): index = monotonic_index(0, 11, closed=closed) result = index[1:].symmetric_difference(index[:-1], sort=sort) expected = IntervalIndex([index[0], index[-1]]) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) # GH 19101: empty result, same dtype result = index.symmetric_difference(index, sort=sort) expected = empty_index(dtype="int64", closed=closed) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) # GH 19101: empty result, different dtypes other = IntervalIndex.from_arrays( diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index 3cc4fa4713831..15062aee56e3a 100644 --- a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( Index, MultiIndex, @@ -26,52 +25,3 @@ def idx(): verify_integrity=False, ) return mi - - -@pytest.fixture -def idx_dup(): - # compare tests/indexes/multi/conftest.py - major_axis = Index(["foo", "bar", "baz", "qux"]) - minor_axis = Index(["one", "two"]) - - major_codes = np.array([0, 0, 1, 0, 1, 1]) - minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ["first", 
"second"] - mi = MultiIndex( - levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, - verify_integrity=False, - ) - return mi - - -@pytest.fixture -def index_names(): - # names that match those in the idx fixture for testing equality of - # names assigned to the idx - return ["first", "second"] - - -@pytest.fixture -def narrow_multi_index(): - """ - Return a MultiIndex that is narrower than the display (<80 characters). - """ - n = 1000 - ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) - dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) - return MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) - - -@pytest.fixture -def wide_multi_index(): - """ - Return a MultiIndex that is wider than the display (>80 characters). - """ - n = 1000 - ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) - dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) - levels = [ci, ci.codes + 9, dti, dti, dti] - names = ["a", "b", "dti_1", "dti_2", "dti_3"] - return MultiIndex.from_arrays(levels, names=names) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index ee1edaa27f804..a69248cf038f8 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -11,12 +11,31 @@ from pandas import ( NA, DatetimeIndex, + Index, MultiIndex, Series, ) import pandas._testing as tm +@pytest.fixture +def idx_dup(): + # compare tests/indexes/multi/conftest.py + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 0, 1, 1]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi + + @pytest.mark.parametrize("names", [None, ["first", "second"]]) def test_unique(names): mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names) diff --git a/pandas/tests/indexes/multi/test_formats.py b/pandas/tests/indexes/multi/test_formats.py index 1736f65e355fb..52ff3109128f2 100644 --- a/pandas/tests/indexes/multi/test_formats.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -139,8 +139,11 @@ def test_repr(self, idx): names=['first', ...], length=6)""" assert result == expected - def test_rjust(self, narrow_multi_index): - mi = narrow_multi_index + def test_rjust(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + mi = MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) result = mi[:1].__repr__() expected = """\ MultiIndex([('a', 9, '2000-01-01 00:00:00')], @@ -182,8 +185,13 @@ def test_rjust(self, narrow_multi_index): names=['a', 'b', 'dti'], length=2000)""" assert result == expected - def test_tuple_width(self, wide_multi_index): - mi = wide_multi_index + def test_tuple_width(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + levels = [ci, ci.codes + 9, dti, dti, dti] + names = ["a", "b", "dti_1", "dti_2", "dti_3"] + mi = MultiIndex.from_arrays(levels, names=names) result = mi[:1].__repr__() expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)], names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" # noqa: E501 diff --git a/pandas/tests/indexes/multi/test_get_set.py 
b/pandas/tests/indexes/multi/test_get_set.py index 0720a1e1c648c..e362fc8a05a46 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -95,8 +95,9 @@ def test_get_level_number_out_of_bounds(multiindex_dataframe_random_data): frame.index._get_level_number(-3) -def test_set_name_methods(idx, index_names): +def test_set_name_methods(idx): # so long as these are synonyms, we don't need to test set_names + index_names = ["first", "second"] assert idx.rename == idx.set_names new_names = [name + "SUFFIX" for name in index_names] ind = idx.set_names(new_names) diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 8ae643eb3626d..45f19b4d70fb9 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -83,11 +83,11 @@ def test_copy_names(): multi_idx.copy(names=[["mario"], ["luigi"]]) -def test_names(idx, index_names): +def test_names(idx): # names are assigned in setup - assert index_names == ["first", "second"] + assert idx.names == ["first", "second"] level_names = [level.name for level in idx.levels] - assert level_names == index_names + assert level_names == idx.names # setting bad names on existing index = idx diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 2b4107acee096..be266f5d8fdce 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -243,10 +243,10 @@ def test_union(idx, sort): the_union = piece1.union(piece2, sort=sort) - if sort is None: - tm.assert_index_equal(the_union, idx.sort_values()) - - assert tm.equalContents(the_union, idx) + if sort in (None, False): + tm.assert_index_equal(the_union.sort_values(), idx.sort_values()) + else: + tm.assert_index_equal(the_union, idx) # corner case, pass self or empty thing: the_union = idx.union(idx, sort=sort) @@ -258,7 +258,7 @@ def test_union(idx, sort): tuples = idx.values result = idx[:4].union(tuples[4:], sort=sort) if sort is None: - tm.equalContents(result, idx) + tm.assert_index_equal(result.sort_values(), idx.sort_values()) else: assert result.equals(idx) @@ -284,9 +284,10 @@ def test_intersection(idx, sort): the_int = piece1.intersection(piece2, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(the_int, idx[3:5]) - assert tm.equalContents(the_int, idx[3:5]) + else: + tm.assert_index_equal(the_int.sort_values(), idx[3:5]) # corner case, pass self the_int = idx.intersection(idx, sort=sort) diff --git a/pandas/tests/indexes/numeric/test_setops.py b/pandas/tests/indexes/numeric/test_setops.py index d3789f2477896..376b51dd98bb1 100644 --- a/pandas/tests/indexes/numeric/test_setops.py +++ b/pandas/tests/indexes/numeric/test_setops.py @@ -133,7 +133,10 @@ def test_symmetric_difference(self, sort): index2 = Index([2, 3, 4, 1]) result = index1.symmetric_difference(index2, sort=sort) expected = Index([5, 1]) - assert tm.equalContents(result, expected) + if sort is not None: + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result, expected.sort_values()) assert result.name is None if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..93d46ebdd0b51 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.missing import 
is_matching_na +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -144,6 +145,13 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + @pytest.mark.parametrize( + "dtype", + [ + "object", + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], + ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -167,12 +175,23 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list("bcdxy")) + def test_slice_locs_negative_step(self, in_slice, expected, dtype): + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected)) + expected = Index(list(expected), dtype=dtype) + tm.assert_index_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_slice_locs_negative_step_oob(self): + index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 977ad8b26a369..7be2602135578 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -87,7 +87,7 @@ def test_to_timestamp_quarterly_bug(self): years = np.arange(1960, 2000).repeat(4) quarters = np.tile(list(range(1, 5)), 40) - pindex = PeriodIndex(year=years, quarter=quarters) + pindex = PeriodIndex.from_fields(year=years, quarter=quarters) stamps = pindex.to_timestamp("D", "end") expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 79837af79c189..aecd3b3bace9a 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -23,18 +23,23 @@ class TestPeriodIndex: def test_keyword_mismatch(self): # GH#55961 we should get exactly one of data/ordinals/**fields per = Period("2016-01-01", "D") + depr_msg1 = "The 'ordinal' keyword in PeriodIndex is deprecated" + depr_msg2 = "Constructing PeriodIndex from fields is deprecated" err_msg1 = "Cannot pass both data and ordinal" with pytest.raises(ValueError, match=err_msg1): - PeriodIndex(data=[per], ordinal=[per.ordinal], freq=per.freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg1): + PeriodIndex(data=[per], ordinal=[per.ordinal], freq=per.freq) err_msg2 = "Cannot pass both data and fields" with pytest.raises(ValueError, match=err_msg2): - PeriodIndex(data=[per], year=[per.year], freq=per.freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg2): + PeriodIndex(data=[per], year=[per.year], freq=per.freq) err_msg3 = "Cannot pass both ordinal and fields" with pytest.raises(ValueError, match=err_msg3): - PeriodIndex(ordinal=[per.ordinal], year=[per.year], freq=per.freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg2): + PeriodIndex(ordinal=[per.ordinal], year=[per.year], freq=per.freq) def test_construction_base_constructor(self): # GH 13664 @@ -94,14 +99,18 @@ def 
test_constructor_field_arrays(self): years = np.arange(1990, 2010).repeat(4)[2:-2] quarters = np.tile(np.arange(1, 5), 20)[2:-2] - index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") + depr_msg = "Constructing PeriodIndex from fields is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") expected = period_range("1990Q3", "2009Q2", freq="Q-DEC") tm.assert_index_equal(index, expected) - index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") tm.assert_numpy_array_equal(index.asi8, index2.asi8) - index = PeriodIndex(year=years, quarter=quarters) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index = PeriodIndex(year=years, quarter=quarters) tm.assert_index_equal(index, expected) years = [2007, 2007, 2007] @@ -109,13 +118,16 @@ def test_constructor_field_arrays(self): msg = "Mismatched Period array lengths" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq="M") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(year=years, month=months, freq="M") with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq="2M") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(year=years, month=months, freq="2M") years = [2007, 2007, 2007] months = [1, 2, 3] - idx = PeriodIndex(year=years, month=months, freq="M") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + idx = PeriodIndex(year=years, month=months, freq="M") exp = period_range("2007-01", periods=3, freq="M") tm.assert_index_equal(idx, exp) @@ -145,18 +157,29 @@ def test_constructor_arrays_negative_year(self): years = np.arange(1960, 2000, dtype=np.int64).repeat(4) quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) - pindex = PeriodIndex(year=years, quarter=quarters) + msg = "Constructing PeriodIndex from fields is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + pindex = PeriodIndex(year=years, quarter=quarters) tm.assert_index_equal(pindex.year, Index(years)) tm.assert_index_equal(pindex.quarter, Index(quarters)) + alt = PeriodIndex.from_fields(year=years, quarter=quarters) + tm.assert_index_equal(alt, pindex) + def test_constructor_invalid_quarters(self): + depr_msg = "Constructing PeriodIndex from fields is deprecated" msg = "Quarter must be 1 <= q <= 4" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex( + year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC" + ) - def test_constructor_corner(self): - result = period_range("2007-01", periods=10.5, freq="M") + def test_period_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range("2007-01", periods=10.5, freq="M") exp = period_range("2007-01", periods=10, freq="M") tm.assert_index_equal(result, exp) @@ -394,7 +417,9 @@ def test_constructor_nat(self): def test_constructor_year_and_quarter(self): year = Series([2001, 2002, 2003]) quarter = year - 2000 - idx = PeriodIndex(year=year, quarter=quarter) + msg = "Constructing PeriodIndex from fields is deprecated" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + idx = PeriodIndex(year=year, quarter=quarter) strs = [f"{t[0]:d}Q{t[1]:d}" for t in zip(quarter, year)] lops = list(map(Period, strs)) p = PeriodIndex(lops) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 4af6b5ca1a6a7..4fab12f195dc0 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -117,7 +117,7 @@ def test_range_slice_outofbounds(self, make_range): idx = make_range(start="2013/10/01", freq="D", periods=10) df = DataFrame({"units": [100 + i for i in range(10)]}, index=idx) - empty = DataFrame(index=type(idx)([], freq="D"), columns=["units"]) + empty = DataFrame(index=idx[:0], columns=["units"]) empty["units"] = empty["units"].astype("int64") tm.assert_frame_equal(df["2013/09/01":"2013/09/30"], empty) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 6965aaf19f8fb..1354f33291c45 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -214,10 +214,19 @@ def test_negative_ordinals(self): Period(ordinal=-1000, freq="Y") Period(ordinal=0, freq="Y") - idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") - idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") + msg = "The 'ordinal' keyword in PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") + with tm.assert_produces_warning(FutureWarning, match=msg): + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") tm.assert_index_equal(idx1, idx2) + alt1 = PeriodIndex.from_ordinals([-1, 0, 1], freq="Y") + tm.assert_index_equal(alt1, idx1) + + alt2 = PeriodIndex.from_ordinals(np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(alt2, idx2) + def test_pindex_fieldaccessor_nat(self): idx = PeriodIndex( ["2011-01", "2011-02", "NaT", "2012-03", "2012-04"], freq="D", name="name" @@ -272,11 +281,20 @@ def test_map(self): exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) - def test_period_index_frequency_ME_error_message(self): - msg = "for Period, please use 'M' instead of 'ME'" + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2M", "2ME"), + ("2Q-MAR", "2QE-MAR"), + ("2Y-FEB", "2YE-FEB"), + ], + ) + def test_period_index_frequency_ME_error_message(self, freq, freq_depr): + # GH#52064 + msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" with pytest.raises(ValueError, match=msg): - PeriodIndex(["2020-01-01", "2020-01-02"], freq="2ME") + PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) def test_H_deprecated_from_time_series(self): # GH#52536 @@ -299,11 +317,10 @@ def test_a_deprecated_from_time_series(self, freq_depr): series = Series(1, index=index) assert isinstance(series, Series) - @pytest.mark.parametrize("freq_depr", ["2ME", "2QE", "2YE"]) - def test_period_index_frequency_error_message(self, freq_depr): + @pytest.mark.parametrize("freq_depr", ["2SME", "2CBME", "2BYE"]) + def test_period_index_frequency_invalid_freq(self, freq_depr): # GH#9586 - msg = f"for Period, please use '{freq_depr[1:-1]}' " - f"instead of '{freq_depr[1:]}'" + msg = f"Invalid frequency: {freq_depr[1:]}" with pytest.raises(ValueError, match=msg): period_range("2020-01", "2020-05", freq=freq_depr) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 
b9a5940795a5b..2fa7e8cd0d2df 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -142,9 +142,10 @@ def test_union_misc(self, sort): # not in order result = _permute(index[:-5]).union(_permute(index[10:]), sort=sort) - if sort is None: + if sort is False: + tm.assert_index_equal(result.sort_values(), index) + else: tm.assert_index_equal(result, index) - assert tm.equalContents(result, index) # cast if different frequencies index = period_range("1/1/2000", "1/20/2000", freq="D") @@ -163,9 +164,10 @@ def test_intersection(self, sort): left = _permute(index[:-5]) right = _permute(index[10:]) result = left.intersection(right, sort=sort) - if sort is None: + if sort is False: + tm.assert_index_equal(result.sort_values(), index[10:-5]) + else: tm.assert_index_equal(result, index[10:-5]) - assert tm.equalContents(result, index[10:-5]) # cast if different frequencies index = period_range("1/1/2000", "1/20/2000", freq="D") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7620bf272db11..5360f1c6ea6d9 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -462,7 +462,11 @@ def test_empty_fancy(self, index, dtype): empty_index = type(index)([], dtype=index.dtype) assert index[[]].identical(empty_index) - assert index[empty_arr].identical(empty_index) + if dtype == np.bool_: + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + assert index[empty_arr].identical(empty_index) + else: + assert index[empty_arr].identical(empty_index) @pytest.mark.parametrize( "index", @@ -488,10 +492,10 @@ def test_union_dt_as_obj(self, simple_index): first_cat = index.union(date_index) second_cat = index.union(index) - appended = np.append(index, date_index.astype("O")) + appended = Index(np.append(index, date_index.astype("O"))) - assert tm.equalContents(first_cat, appended) - assert tm.equalContents(second_cat, index) + tm.assert_index_equal(first_cat, appended) + tm.assert_index_equal(second_cat, index) tm.assert_contains_all(index, first_cat) tm.assert_contains_all(index, second_cat) tm.assert_contains_all(date_index, first_cat) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 30e3f5cee05b4..f08de8e65451c 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -272,7 +272,9 @@ def test_ensure_copied_data(self, index): if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied - result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) + depr_msg = "The 'ordinal' keyword in PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.asi8, result.asi8, check_same="same") elif isinstance(index, IntervalIndex): # checked in test_interval.py diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 1dfeb4f2c6b5b..dab2475240267 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -30,6 +30,32 @@ ) +def equal_contents(arr1, arr2) -> bool: + """ + Checks if the set of unique elements of arr1 and arr2 are equivalent. 
+ """ + return frozenset(arr1) == frozenset(arr2) + + +@pytest.fixture( + params=tm.ALL_REAL_NUMPY_DTYPES + + [ + "object", + "category", + "datetime64[ns]", + "timedelta64[ns]", + ] +) +def any_dtype_for_small_pos_integer_indexes(request): + """ + Dtypes that can be given to an Index with small positive integers. + + This means that for any dtype `x` in the params list, `Index([1, 2, 3], dtype=x)` is + valid and gives the correct Index (sub-)class. + """ + return request.param + + def test_union_same_types(index): # Union with a non-unique, non-monotonic index raises error # Only needed for bool index factory @@ -196,10 +222,10 @@ def test_intersection_base(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") - first = index[:5] - second = index[:3] + first = index[:5].unique() + second = index[:3].unique() intersect = first.intersection(second) - assert tm.equalContents(intersect, second) + tm.assert_index_equal(intersect, second) if isinstance(index.dtype, DatetimeTZDtype): # The second.values below will drop tz, so the rest of this test @@ -210,7 +236,7 @@ def test_intersection_base(self, index): cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.intersection(case) - assert tm.equalContents(result, second) + assert equal_contents(result, second) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -222,12 +248,13 @@ def test_intersection_base(self, index): ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): + index = index.unique() first = index[3:] second = index[:5] everything = index union = first.union(second) - assert tm.equalContents(union, everything) + tm.assert_index_equal(union.sort_values(), everything.sort_values()) if isinstance(index.dtype, DatetimeTZDtype): # The second.values below will drop tz, so the rest of this test @@ -238,7 +265,7 @@ def test_union_base(self, index): cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.union(case) - assert tm.equalContents(result, everything) + assert equal_contents(result, everything) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -261,13 +288,13 @@ def test_difference_base(self, sort, index): else: answer = index[4:] result = first.difference(second, sort) - assert tm.equalContents(result, answer) + assert equal_contents(result, answer) # GH#10149 cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.difference(case, sort) - assert tm.equalContents(result, answer) + assert equal_contents(result, answer) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -292,13 +319,13 @@ def test_symmetric_difference(self, index): second = index[:-1] answer = index[[0, -1]] result = first.symmetric_difference(second) - assert tm.equalContents(result, answer) + tm.assert_index_equal(result.sort_values(), answer.sort_values()) # GH#10149 cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.symmetric_difference(case) - assert tm.equalContents(result, answer) + assert equal_contents(result, answer) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -320,8 +347,9 @@ def test_corner_union(self, index_flat, fname, sname, expected_name): # Test unions with various name combinations 
# Do not test MultiIndex or repeats if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # Test copy.union(copy) first = index.copy().set_names(fname) @@ -363,8 +391,9 @@ def test_corner_union(self, index_flat, fname, sname, expected_name): ) def test_union_unequal(self, index_flat, fname, sname, expected_name): if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # test copy.union(subset) - need sort for unicode and string first = index.copy().set_names(fname) @@ -387,8 +416,9 @@ def test_corner_intersect(self, index_flat, fname, sname, expected_name): # GH#35847 # Test intersections with various name combinations if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # Test copy.intersection(copy) first = index.copy().set_names(fname) @@ -430,8 +460,9 @@ def test_corner_intersect(self, index_flat, fname, sname, expected_name): ) def test_intersect_unequal(self, index_flat, fname, sname, expected_name): if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # test copy.intersection(subset) - need sort for unicode and string first = index.copy().set_names(fname) @@ -678,9 +709,10 @@ def test_intersection(self, index, sort): first = index[:20] second = index[:10] intersect = first.intersection(second, sort=sort) - if sort is None: - tm.assert_index_equal(intersect, second.sort_values()) - assert tm.equalContents(intersect, second) + if sort in (None, False): + tm.assert_index_equal(intersect.sort_values(), second.sort_values()) + else: + tm.assert_index_equal(intersect, second) # Corner cases inter = first.intersection(first, sort=sort) @@ -743,9 +775,10 @@ def test_union(self, index, sort): everything = index[:20] union = first.union(second, sort=sort) - if sort is None: - tm.assert_index_equal(union, everything.sort_values()) - assert tm.equalContents(union, everything) + if sort in (None, False): + tm.assert_index_equal(union.sort_values(), everything.sort_values()) + else: + tm.assert_index_equal(union, everything) @pytest.mark.parametrize("klass", [np.array, Series, list]) @pytest.mark.parametrize("index", ["string"], indirect=True) @@ -757,9 +790,10 @@ def test_union_from_iterables(self, index, klass, sort): case = klass(second.values) result = first.union(case, sort=sort) - if sort is None: - tm.assert_index_equal(result, everything.sort_values()) - assert tm.equalContents(result, everything) + if sort in (None, False): + tm.assert_index_equal(result.sort_values(), everything.sort_values()) + else: + tm.assert_index_equal(result, everything) @pytest.mark.parametrize("index", ["string"], indirect=True) def test_union_identity(self, index, sort): @@ -788,7 +822,11 @@ def test_difference_name_preservation(self, index, second_name, expected, sort): second.name = second_name result = first.difference(second, sort=sort) - assert tm.equalContents(result, answer) + if sort is True: + tm.assert_index_equal(result, answer) + else: + answer.name = second_name + tm.assert_index_equal(result.sort_values(), answer.sort_values()) if expected is None: assert result.name is None @@ -871,7 +909,6 @@ def test_symmetric_difference_mi(self, 
sort): if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) @pytest.mark.parametrize( "index2,expected", @@ -893,13 +930,20 @@ def test_symmetric_difference_missing(self, index2, expected, sort): def test_symmetric_difference_non_index(self, sort): index1 = Index([1, 2, 3, 4], name="index1") index2 = np.array([2, 3, 4, 5]) - expected = Index([1, 5]) + expected = Index([1, 5], name="index1") result = index1.symmetric_difference(index2, sort=sort) - assert tm.equalContents(result, expected) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) assert result.name == "index1" result = index1.symmetric_difference(index2, result_name="new_name", sort=sort) - assert tm.equalContents(result, expected) + expected.name = "new_name" + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) assert result.name == "new_name" def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype): diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index ccbd61c4d6693..8c51f6c0a1d25 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -174,11 +174,14 @@ def test_constructor_iso(self): result = to_timedelta(durations) tm.assert_index_equal(result, expected) - def test_constructor_coverage(self): - rng = timedelta_range("1 days", periods=10.5) + def test_timedelta_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = timedelta_range("1 days", periods=10.5) exp = timedelta_range("1 days", periods=10) tm.assert_index_equal(rng, exp) + def test_constructor_coverage(self): msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): timedelta_range(start="1 days", periods="foo", freq="D") diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 727b4eee00566..fce10d9176d74 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -115,7 +115,7 @@ def test_intersection_equal(self, sort): intersect = first.intersection(second, sort=sort) if sort is None: tm.assert_index_equal(intersect, second.sort_values()) - assert tm.equalContents(intersect, second) + tm.assert_index_equal(intersect, second) # Corner cases inter = first.intersection(first, sort=sort) diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 9bf7c601e4db0..c71b077a5e242 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -34,12 +34,13 @@ def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): with tm.raises_chained_assignment_error(): zed["eyes"]["right"].fillna(value=555, inplace=True) elif warn_copy_on_write: - # TODO(CoW-warn) should warn - zed["eyes"]["right"].fillna(value=555, inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + zed["eyes"]["right"].fillna(value=555, inplace=True) else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with 
pytest.raises(SettingWithCopyError, match=msg): - zed["eyes"]["right"].fillna(value=555, inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + zed["eyes"]["right"].fillna(value=555, inplace=True) @td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 3d2ed1d168040..36cc8316ea5ff 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,7 +1,7 @@ import numpy as np import pytest -import pandas._libs.index as _index +import pandas._libs.index as libindex from pandas.errors import PerformanceWarning import pandas as pd @@ -33,20 +33,19 @@ def test_multiindex_perf_warn(self): with tm.assert_produces_warning(PerformanceWarning): df.loc[(0,)] - def test_indexing_over_hashtable_size_cutoff(self): - n = 10000 + @pytest.mark.parametrize("offset", [-5, 5]) + def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset): + size_cutoff = 20 + n = size_cutoff + offset - old_cutoff = _index._SIZE_CUTOFF - _index._SIZE_CUTOFF = 20000 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) - s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) - - # hai it works! - assert s[("a", 5)] == 5 - assert s[("a", 6)] == 6 - assert s[("a", 7)] == 7 - - _index._SIZE_CUTOFF = old_cutoff + # hai it works! + assert s[("a", 5)] == 5 + assert s[("a", 6)] == 6 + assert s[("a", 7)] == 7 def test_multi_nan_indexing(self): # GH 3588 diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 3237c8f52797a..e31b675efb69a 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -201,7 +201,9 @@ def test_multiindex_assignment(self): df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) - def test_multiindex_assignment_single_dtype(self, using_copy_on_write): + def test_multiindex_assignment_single_dtype( + self, using_copy_on_write, warn_copy_on_write + ): # GH3777 part 2b # single dtype arr = np.array([0.0, 1.0]) @@ -215,6 +217,8 @@ def test_multiindex_assignment_single_dtype(self, using_copy_on_write): view = df["c"].iloc[:2].values # arr can be losslessly cast to int, so this setitem is inplace + # INFO(CoW-warn) this does not warn because we directly took .values + # above, so no reference to a pandas object is alive for `view` df.loc[4, "c"] = arr exp = Series(arr, index=[8, 10], name="c", dtype="int64") result = df.loc[4, "c"] @@ -234,7 +238,8 @@ def test_multiindex_assignment_single_dtype(self, using_copy_on_write): tm.assert_series_equal(result, exp) # scalar ok - df.loc[4, "c"] = 10 + with tm.assert_cow_warning(warn_copy_on_write): + df.loc[4, "c"] = 10 exp = Series(10, index=[8, 10], name="c", dtype="float64") tm.assert_series_equal(df.loc[4, "c"], exp) @@ -248,7 +253,8 @@ def test_multiindex_assignment_single_dtype(self, using_copy_on_write): # But with a length-1 listlike column indexer this behaves like # `df.loc[4, "c"] = 0 - df.loc[4, ["c"]] = [0] + with tm.assert_cow_warning(warn_copy_on_write): + df.loc[4, ["c"]] = [0] assert (df.loc[4, "c"] == 0).all() def test_groupby_example(self): diff --git a/pandas/tests/indexing/test_iat.py 
b/pandas/tests/indexing/test_iat.py index b4c4b81ac9cfb..5b8c4f2d4b9b9 100644 --- a/pandas/tests/indexing/test_iat.py +++ b/pandas/tests/indexing/test_iat.py @@ -41,11 +41,10 @@ def test_iat_setitem_item_cache_cleared( # previously this iat setting would split the block and fail to clear # the item_cache. - with tm.assert_cow_warning(warn_copy_on_write and indexer_ial is tm.iloc): + with tm.assert_cow_warning(warn_copy_on_write): indexer_ial(df)[7, 0] = 9999 - # TODO(CoW-warn) should also warn for iat? - with tm.assert_cow_warning(warn_copy_on_write and indexer_ial is tm.iloc): + with tm.assert_cow_warning(warn_copy_on_write): indexer_ial(df)[7, 1] = 1234 assert df.iat[7, 1] == 1234 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 5c0c1b42ca963..cbcbf3396363a 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -424,7 +424,7 @@ def test_iloc_getitem_slice_dups(self): tm.assert_frame_equal(df.iloc[10:, :2], df2) tm.assert_frame_equal(df.iloc[10:, 2:], df1) - # TODO(CoW-warn) this should NOT warn + # TODO(CoW-warn) this should NOT warn -> Series inplace operator @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_iloc_setitem(self): df = DataFrame( @@ -850,9 +850,8 @@ def test_identity_slice_returns_new_object( # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW - # TODO(CoW-warn) this should warn - # with tm.assert_cow_warning(warn_copy_on_write): - original_df.loc[:, "a"] = [4, 4, 4] + with tm.assert_cow_warning(warn_copy_on_write): + original_df.loc[:, "a"] = [4, 4, 4] if using_copy_on_write: assert (sliced_df["a"] == [1, 2, 3]).all() else: @@ -1142,7 +1141,7 @@ def test_iloc_getitem_with_duplicates2(self): expected = df.take([0], axis=1) tm.assert_frame_equal(result, expected) - def test_iloc_interval(self): + def test_iloc_interval(self, warn_copy_on_write): # GH#17130 df = DataFrame({Interval(1, 2): [1, 2]}) @@ -1155,7 +1154,9 @@ def test_iloc_interval(self): tm.assert_series_equal(result, expected) result = df.copy() - result.iloc[:, 0] += 1 + # TODO(CoW-warn) false positive + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[:, 0] += 1 expected = DataFrame({Interval(1, 2): [2, 3]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 35c50d6c705d1..96fd3f4e6fca0 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -828,7 +828,8 @@ def test_loc_setitem_frame_mixed_labels(self): df.loc[0, [1, 2]] = [5, 6] tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_multiples(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + def test_loc_setitem_frame_multiples(self, warn_copy_on_write): # multiple setting df = DataFrame( {"A": ["foo", "bar", "baz"], "B": Series(range(3), dtype=np.int64)} @@ -1097,9 +1098,8 @@ def test_identity_slice_returns_new_object( # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW - # TODO(CoW-warn) should warn - # with tm.assert_cow_warning(warn_copy_on_write): - original_df.loc[:, "a"] = [4, 4, 4] + with tm.assert_cow_warning(warn_copy_on_write): + original_df.loc[:, "a"] = [4, 4, 4] if using_copy_on_write: assert (sliced_df["a"] == [1, 2, 3]).all() else: diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index ffc672cc748be..1251a6ae97a1c 100644 --- 
a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -3,6 +3,8 @@ in core.internals """ +import pytest + import pandas as pd import pandas._testing as tm from pandas.core import internals @@ -27,9 +29,6 @@ def test_namespace(): "ops", ] expected = [ - "Block", - "DatetimeTZBlock", - "ExtensionBlock", "make_block", "DataManager", "ArrayManager", @@ -44,6 +43,28 @@ def test_namespace(): assert set(result) == set(expected + modules) +@pytest.mark.parametrize( + "name", + [ + "NumericBlock", + "ObjectBlock", + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + ], +) +def test_deprecations(name): + # GH#55139 + msg = f"{name} is deprecated.* Use public APIs instead" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + getattr(internals, name) + + if name not in ["NumericBlock", "ObjectBlock"]: + # NumericBlock and ObjectBlock are not in the internals.api namespace + with tm.assert_produces_warning(DeprecationWarning, match=msg): + getattr(api, name) + + def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index ae79da4bbe0d3..792d1323ec730 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -991,7 +991,6 @@ def assert_slice_ok(mgr, axis, slobj): # 2D only support slice objects # boolean mask - assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index d3552ab5d39f5..ab6cacc4cc860 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -51,23 +51,7 @@ def xml_file(datapath): @pytest.fixture -def s3so(worker_id): - if is_ci_environment(): - url = "http://localhost:5000/" - else: - worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") - url = f"http://127.0.0.1:555{worker_id}/" - return {"client_kwargs": {"endpoint_url": url}} - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def monkeysession(): - with pytest.MonkeyPatch.context() as mp: - yield mp - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def s3_base(worker_id, monkeysession): +def s3_base(worker_id, monkeypatch): """ Fixture for mocking S3 interaction. 
@@ -79,8 +63,8 @@ def s3_base(worker_id, monkeysession): # temporary workaround as moto fails for botocore >= 1.11 otherwise, # see https://github.com/spulec/moto/issues/1924 & 1952 - monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): # NOT RUN on Windows/macOS/ARM, only Ubuntu @@ -93,6 +77,7 @@ def s3_base(worker_id, monkeysession): "Windows, macOS or ARM platforms" ) else: + # set in .github/workflows/unit-tests.yml yield "http://localhost:5000" else: requests = pytest.importorskip("requests") @@ -128,6 +113,11 @@ def s3_base(worker_id, monkeysession): proc.terminate() +@pytest.fixture +def s3so(s3_base): + return {"client_kwargs": {"endpoint_url": s3_base}} + + @pytest.fixture def s3_resource(s3_base): import boto3 diff --git a/pandas/tests/io/data/gbq_fake_job.txt b/pandas/tests/io/data/gbq_fake_job.txt deleted file mode 100644 index b0995222292e4..0000000000000 --- a/pandas/tests/io/data/gbq_fake_job.txt +++ /dev/null @@ -1 +0,0 @@ -{'status': {'state': 'DONE'}, 'kind': 'bigquery#job', 'statistics': {'query': {'cacheHit': True, 'totalBytesProcessed': '0'}, 'endTime': '1377668744674', 'totalBytesProcessed': '0', 'startTime': '1377668744466'}, 'jobReference': {'projectId': '57288129629', 'jobId': 'bqjob_r5f956972f0190bdf_00000140c374bf42_2'}, 'etag': '"4PTsVxg68bQkQs1RJ1Ndewqkgg4/oO4VmgFrAku4N6FWci9s7iFIftc"', 'configuration': {'query': {'createDisposition': 'CREATE_IF_NEEDED', 'query': 'SELECT * FROM [publicdata:samples.shakespeare]', 'writeDisposition': 'WRITE_TRUNCATE', 'destinationTable': {'projectId': '57288129629', 'tableId': 'anonb5ec450da88eeeb78a27784ea482ee75a146d442', 'datasetId': '_d0b4f5f0d50dc68a3eb0fa6cba66a9a8687d9253'}}}, 'id': '57288129629:bqjob_r5f956972f0190bdf_00000140c374bf42_2', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/57288129629/jobs/bqjob_r5f956972f0190bdf_00000140c374bf42_2'} \ No newline at end of file diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py deleted file mode 100644 index 15ff52d5bea48..0000000000000 --- a/pandas/tests/io/excel/conftest.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -import pandas._testing as tm - -from pandas.io.parsers import read_csv - - -@pytest.fixture -def frame(float_frame): - """ - Returns the first ten items in fixture "float_frame". - """ - return float_frame[:10] - - -@pytest.fixture -def tsframe(): - return tm.makeTimeDataFrame()[:5] - - -@pytest.fixture(params=[True, False]) -def merge_cells(request): - return request.param - - -@pytest.fixture -def df_ref(datapath): - """ - Obtain the reference data from read_csv with the Python engine. - """ - filepath = datapath("io", "data", "csv", "test1.csv") - df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") - return df_ref - - -@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"]) -def read_ext(request): - """ - Valid extensions for reading Excel files. 
- """ - return request.param diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index ecee58362f8a9..271353a173d2a 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -13,7 +13,10 @@ odf = pytest.importorskip("odf") -pytestmark = pytest.mark.parametrize("ext", [".ods"]) + +@pytest.fixture +def ext(): + return ".ods" def test_write_append_mode_raises(ext): diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 53cbd1ce3cceb..2df9ec9e53516 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -17,10 +17,13 @@ openpyxl = pytest.importorskip("openpyxl") -pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) +@pytest.fixture +def ext(): + return ".xlsx" -def test_to_excel_styleconverter(ext): + +def test_to_excel_styleconverter(): from openpyxl import styles hstyle = { diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 74fe5166df65f..abbdb77efad0e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -22,6 +22,7 @@ Index, MultiIndex, Series, + read_csv, ) import pandas._testing as tm from pandas.core.arrays import ( @@ -117,6 +118,16 @@ def read_ext(engine_and_read_ext): return read_ext +@pytest.fixture +def df_ref(datapath): + """ + Obtain the reference data from read_csv with the Python engine. + """ + filepath = datapath("io", "data", "csv", "test1.csv") + df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") + return df_ref + + def adjust_expected(expected: DataFrame, read_ext: str) -> None: expected.index.name = None diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 946c621ae2b6e..22cd0621fd4c4 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -12,6 +12,7 @@ import pytest from pandas.compat._constants import PY310 +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td import pandas as pd @@ -33,6 +34,19 @@ from pandas.io.excel._util import _writers +@pytest.fixture +def frame(float_frame): + """ + Returns the first ten items in fixture "float_frame". 
+ """ + return float_frame[:10] + + +@pytest.fixture(params=[True, False]) +def merge_cells(request): + return request.param + + @pytest.fixture def path(ext): """ @@ -443,8 +457,8 @@ def test_mixed(self, frame, path): recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) - def test_ts_frame(self, tsframe, path): - df = tsframe + def test_ts_frame(self, path): + df = tm.makeTimeDataFrame()[:5] # freq doesn't round-trip index = pd.DatetimeIndex(np.asarray(df.index), freq=None) @@ -515,8 +529,9 @@ def test_inf_roundtrip(self, path): tm.assert_frame_equal(df, recons) - def test_sheets(self, frame, tsframe, path): + def test_sheets(self, frame, path): # freq doesn't round-trip + tsframe = tm.makeTimeDataFrame()[:5] index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -632,10 +647,11 @@ def test_excel_roundtrip_indexname(self, merge_cells, path): tm.assert_frame_equal(result, df) assert result.index.name == "foo" - def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): + def test_excel_roundtrip_datetime(self, merge_cells, path): # datetime.date, not sure what to test here exactly # freq does not round-trip + tsframe = tm.makeTimeDataFrame()[:5] index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -750,8 +766,8 @@ def test_to_excel_timedelta(self, path): recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_periodindex(self, tsframe, path): - xp = tsframe.resample("ME", kind="period").mean() + def test_to_excel_periodindex(self, path): + xp = tm.makeTimeDataFrame()[:5].resample("ME", kind="period").mean() xp.to_excel(path, sheet_name="sht1") @@ -813,8 +829,9 @@ def test_to_excel_multiindex_cols(self, merge_cells, frame, path): frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): + def test_to_excel_multiindex_dates(self, merge_cells, path): # try multiindex with dates + tsframe = tm.makeTimeDataFrame()[:5] new_index = [tsframe.index, np.arange(len(tsframe.index), dtype=np.int64)] tsframe.index = MultiIndex.from_arrays(new_index) @@ -1309,7 +1326,9 @@ class TestExcelWriterEngineTests: def test_ExcelWriter_dispatch(self, klass, ext): with tm.ensure_clean(ext) as path: with ExcelWriter(path) as writer: - if ext == ".xlsx" and td.safe_import("xlsxwriter"): + if ext == ".xlsx" and bool( + import_optional_dependency("xlsxwriter", errors="ignore") + ): # xlsxwriter has preference over openpyxl if both installed assert isinstance(writer, _XlsxWriter) else: diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index c4d02d71390cc..94f6bdfaf069c 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -9,7 +9,10 @@ xlsxwriter = pytest.importorskip("xlsxwriter") -pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) + +@pytest.fixture +def ext(): + return ".xlsx" def test_column_format(ext): diff --git a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index 75e63cb1f6b54..6d581b5b92e0c 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -1,9 +1,19 @@ import numpy as np +import pytest -from pandas import DataFrame -import pandas._testing as tm +from pandas import ( + DataFrame, + reset_option, + 
set_eng_float_format, +) -import pandas.io.formats.format as fmt +from pandas.io.formats.format import EngFormatter + + +@pytest.fixture(autouse=True) +def reset_float_format(): + yield + reset_option("display.float_format") class TestEngFormatter: @@ -11,20 +21,19 @@ def test_eng_float_formatter2(self, float_frame): df = float_frame df.loc[5] = 0 - fmt.set_eng_float_format() + set_eng_float_format() repr(df) - fmt.set_eng_float_format(use_eng_prefix=True) + set_eng_float_format(use_eng_prefix=True) repr(df) - fmt.set_eng_float_format(accuracy=0) + set_eng_float_format(accuracy=0) repr(df) - tm.reset_display_options() def test_eng_float_formatter(self): df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]}) - fmt.set_eng_float_format() + set_eng_float_format() result = df.to_string() expected = ( " A\n" @@ -35,18 +44,16 @@ def test_eng_float_formatter(self): ) assert result == expected - fmt.set_eng_float_format(use_eng_prefix=True) + set_eng_float_format(use_eng_prefix=True) result = df.to_string() expected = " A\n0 1.410\n1 141.000\n2 14.100k\n3 1.410M" assert result == expected - fmt.set_eng_float_format(accuracy=0) + set_eng_float_format(accuracy=0) result = df.to_string() expected = " A\n0 1E+00\n1 141E+00\n2 14E+03\n3 1E+06" assert result == expected - tm.reset_display_options() - def compare(self, formatter, input, output): formatted_input = formatter(input) assert formatted_input == output @@ -67,7 +74,7 @@ def compare_all(self, formatter, in_out): self.compare(formatter, -input, "-" + output[1:]) def test_exponents_with_eng_prefix(self): - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + formatter = EngFormatter(accuracy=3, use_eng_prefix=True) f = np.sqrt(2) in_out = [ (f * 10**-24, " 1.414y"), @@ -125,7 +132,7 @@ def test_exponents_with_eng_prefix(self): self.compare_all(formatter, in_out) def test_exponents_without_eng_prefix(self): - formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False) + formatter = EngFormatter(accuracy=4, use_eng_prefix=False) f = np.pi in_out = [ (f * 10**-24, " 3.1416E-24"), @@ -183,7 +190,7 @@ def test_exponents_without_eng_prefix(self): self.compare_all(formatter, in_out) def test_rounding(self): - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + formatter = EngFormatter(accuracy=3, use_eng_prefix=True) in_out = [ (5.55555, " 5.556"), (55.5555, " 55.556"), @@ -194,7 +201,7 @@ def test_rounding(self): ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) in_out = [ (5.55555, " 5.6"), (55.5555, " 55.6"), @@ -205,7 +212,7 @@ def test_rounding(self): ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) + formatter = EngFormatter(accuracy=0, use_eng_prefix=True) in_out = [ (5.55555, " 6"), (55.5555, " 56"), @@ -216,14 +223,14 @@ def test_rounding(self): ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + formatter = EngFormatter(accuracy=3, use_eng_prefix=True) result = formatter(0) assert result == " 0.000" def test_nan(self): # Issue #11981 - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.nan) assert result == "NaN" @@ -235,14 +242,13 @@ def test_nan(self): } ) pt = df.pivot_table(values="a", index="b", columns="c") - fmt.set_eng_float_format(accuracy=1) + set_eng_float_format(accuracy=1) result = pt.to_string() 
assert "NaN" in result - tm.reset_display_options() def test_inf(self): # Issue #11981 - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.inf) assert result == "inf" diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b520e2463353c..edac82193d1c8 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -872,20 +872,22 @@ def test_to_string_truncate_multilevel(self): with option_context("display.max_rows", 7, "display.max_columns", 7): assert has_doubly_truncated_repr(df) - def test_truncate_with_different_dtypes(self): + @pytest.mark.parametrize("dtype", ["object", "datetime64[us]"]) + def test_truncate_with_different_dtypes(self, dtype): # 11594, 12045 # when truncated the dtypes of the splits can differ # 11594 - s = Series( + ser = Series( [datetime(2012, 1, 1)] * 10 + [datetime(1012, 1, 2)] - + [datetime(2012, 1, 3)] * 10 + + [datetime(2012, 1, 3)] * 10, + dtype=dtype, ) with option_context("display.max_rows", 8): - result = str(s) - assert "object" in result + result = str(ser) + assert dtype in result def test_truncate_with_different_dtypes2(self): # 12045 diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 8cfcc26b95b4c..a7648cf1c471a 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -914,7 +914,6 @@ def get_ipython(): repstr = df._repr_html_() assert "class" in repstr # info fallback - tm.reset_display_options() def test_repr_html(self, float_frame): df = float_frame @@ -926,16 +925,12 @@ def test_repr_html(self, float_frame): with option_context("display.notebook_repr_html", False): df._repr_html_() - tm.reset_display_options() - df = DataFrame([[1, 2], [3, 4]]) with option_context("display.show_dimensions", True): assert "2 rows" in df._repr_html_() with option_context("display.show_dimensions", False): assert "2 rows" not in df._repr_html_() - tm.reset_display_options() - def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) assert "tex2jax_ignore" not in df._repr_html_() diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 3a8dcb339b063..e607b6eb454a1 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -412,7 +412,6 @@ def test_to_string_complex_float_formatting(self): def test_to_string_format_inf(self): # GH#24861 - tm.reset_display_options() df = DataFrame( { "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], @@ -460,7 +459,6 @@ def test_to_string_int_formatting(self): assert output == expected def test_to_string_float_formatting(self): - tm.reset_display_options() with option_context( "display.precision", 5, @@ -495,7 +493,6 @@ def test_to_string_float_formatting(self): expected = " x\n0 3234.000\n1 0.253" assert df_s == expected - tm.reset_display_options() assert get_option("display.precision") == 6 df = DataFrame({"x": [1e9, 0.2512]}) @@ -516,14 +513,12 @@ def test_to_string_decimal(self): assert df.to_string(decimal=",") == expected def test_to_string_left_justify_cols(self): - tm.reset_display_options() df = DataFrame({"x": [3234, 0.253]}) df_s = df.to_string(justify="left") expected = " x \n0 3234.000\n1 0.253" assert df_s == expected def test_to_string_format_na(self): - tm.reset_display_options() df = DataFrame( { "A": [np.nan, -1, -2.1234, 3, 4], diff --git 
a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py index f3736252e850a..4e848cd48b42d 100644 --- a/pandas/tests/io/json/conftest.py +++ b/pandas/tests/io/json/conftest.py @@ -7,10 +7,3 @@ def orient(request): Fixture for orients excluding the table format. """ return request.param - - -@pytest.fixture(params=["ujson", "pyarrow"]) -def engine(request): - if request.param == "pyarrow": - pytest.importorskip("pyarrow.json") - return request.param diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index f5342e0ab1a38..d96ccb4b94cc2 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -25,6 +25,13 @@ def lines_json_df(): return df.to_json(lines=True, orient="records") +@pytest.fixture(params=["ujson", "pyarrow"]) +def engine(request): + if request.param == "pyarrow": + pytest.importorskip("pyarrow.json") + return request.param + + def test_read_jsonl(): # GH9180 result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 0f42aa81e4b37..baed74fc212e4 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -16,13 +16,11 @@ ) import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -@xfail_pyarrow # The 'chunksize' option is not supported @pytest.mark.parametrize("index_col", [0, "index"]) def test_read_chunksize_with_index(all_parsers, index_col): parser = all_parsers @@ -48,6 +46,13 @@ def test_read_chunksize_with_index(all_parsers, index_col): ) expected = expected.set_index("index") + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: + list(reader) + return + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) @@ -55,7 +60,6 @@ def test_read_chunksize_with_index(all_parsers, index_col): tm.assert_frame_equal(chunks[2], expected[4:]) -@xfail_pyarrow # AssertionError: Regex pattern did not match @pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) def test_read_chunksize_bad(all_parsers, chunksize): data = """index,A,B,C,D @@ -68,13 +72,14 @@ def test_read_chunksize_bad(all_parsers, chunksize): """ parser = all_parsers msg = r"'chunksize' must be an integer >=1" + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): with parser.read_csv(StringIO(data), chunksize=chunksize) as _: pass -@xfail_pyarrow # The 'nrows' option is not supported @pytest.mark.parametrize("chunksize", [2, 8]) def test_read_chunksize_and_nrows(all_parsers, chunksize): # see gh-15755 @@ -89,12 +94,17 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize): parser = all_parsers kwargs = {"index_col": 0, "nrows": 5} + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + expected = parser.read_csv(StringIO(data), **kwargs) with parser.read_csv(StringIO(data), chunksize=chunksize, 
**kwargs) as reader: tm.assert_frame_equal(concat(reader), expected) -@xfail_pyarrow # The 'chunksize' option is not supported def test_read_chunksize_and_nrows_changing_size(all_parsers): data = """index,A,B,C,D foo,2,3,4,5 @@ -107,6 +117,12 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): parser = all_parsers kwargs = {"index_col": 0, "nrows": 5} + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + expected = parser.read_csv(StringIO(data), **kwargs) with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) @@ -116,7 +132,6 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers): reader.get_chunk(size=3) -@xfail_pyarrow # The 'chunksize' option is not supported def test_get_chunk_passed_chunksize(all_parsers): parser = all_parsers data = """A,B,C @@ -125,6 +140,13 @@ def test_get_chunk_passed_chunksize(all_parsers): 7,8,9 1,2,3""" + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=2) as reader: + reader.get_chunk() + return + with parser.read_csv(StringIO(data), chunksize=2) as reader: result = reader.get_chunk() @@ -132,7 +154,6 @@ def test_get_chunk_passed_chunksize(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # The 'chunksize' option is not supported @pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}]) def test_read_chunksize_compat(all_parsers, kwargs): # see gh-12185 @@ -146,17 +167,35 @@ def test_read_chunksize_compat(all_parsers, kwargs): """ parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: + concat(reader) + return + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), result) + via_reader = concat(reader) + tm.assert_frame_equal(via_reader, result) -@xfail_pyarrow # The 'chunksize' option is not supported def test_read_chunksize_jagged_names(all_parsers): # see gh-23509 parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv( + StringIO(data), names=range(10), chunksize=4 + ) as reader: + concat(reader) + return + with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: result = concat(reader) tm.assert_frame_equal(result, expected) @@ -194,7 +233,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers @@ -212,17 +250,24 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): buf = StringIO(data) - df = parser.read_csv_check_warnings( - warning_type, - r"Columns \(0\) have mixed types. 
" - "Specify dtype option on import or set low_memory=False.", - buf, - ) + if parser.engine == "pyarrow": + df = parser.read_csv_check_warnings( + DeprecationWarning, + "Passing a BlockManager to DataFrame is deprecated", + buf, + check_stacklevel=False, + ) + else: + df = parser.read_csv_check_warnings( + warning_type, + r"Columns \(0\) have mixed types. " + "Specify dtype option on import or set low_memory=False.", + buf, + ) assert df.a.dtype == object -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -232,6 +277,18 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): nrows = 10 data = StringIO("foo,bar\n") + if parser.engine == "pyarrow": + msg = ( + "The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine" + ) + with pytest.raises(ValueError, match=msg): + if iterator: + with parser.read_csv(data, chunksize=nrows) as reader: + next(iter(reader)) + else: + parser.read_csv(data, nrows=nrows) + return + if iterator: with parser.read_csv(data, chunksize=nrows) as reader: result = next(iter(reader)) @@ -241,7 +298,6 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported def test_read_csv_memory_growth_chunksize(all_parsers): # see gh-24805 # @@ -254,12 +310,19 @@ def test_read_csv_memory_growth_chunksize(all_parsers): for i in range(1000): f.write(str(i) + "\n") + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(path, chunksize=20) as result: + for _ in result: + pass + return + with parser.read_csv(path, chunksize=20) as result: for _ in result: pass -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported def test_chunksize_with_usecols_second_block_shorter(all_parsers): # GH#21211 parser = all_parsers @@ -268,6 +331,18 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers): 9,10,11 """ + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + names=["a", "b"], + chunksize=2, + usecols=[0, 1], + header=None, + ) + return + result_chunks = parser.read_csv( StringIO(data), names=["a", "b"], @@ -285,7 +360,6 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers): tm.assert_frame_equal(result, expected_frames[i]) -@xfail_pyarrow # ValueError: The 'chunksize' option is not supported def test_chunksize_second_block_shorter(all_parsers): # GH#21211 parser = all_parsers @@ -295,6 +369,12 @@ def test_chunksize_second_block_shorter(all_parsers): 9,10,11 """ + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=2) + return + result_chunks = parser.read_csv(StringIO(data), chunksize=2) expected_frames = [ diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 0c28db245de31..558fdb7632102 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -399,11 +399,16 @@ def test_escapechar(all_parsers): tm.assert_index_equal(result.columns, 
Index(["SEARCH_TERM", "ACTUAL_URL"])) -@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return result = parser.read_csv(StringIO(data), sep=r"\s+") expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) @@ -582,12 +587,14 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request): if sep == r"\s+": data = data.replace(",", " ") + if parser.engine == "pyarrow": - mark = pytest.mark.xfail( - raises=ValueError, - reason="the 'pyarrow' engine does not support regex separators", - ) - request.applymarker(mark) + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines + ) + return result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) expected = DataFrame(exp_data, columns=["A", "B", "C"]) @@ -610,7 +617,6 @@ def test_whitespace_lines(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators @pytest.mark.parametrize( "data,expected", [ @@ -635,6 +641,12 @@ def test_whitespace_lines(all_parsers): def test_whitespace_regex_separator(all_parsers, data, expected): # see gh-6607 parser = all_parsers + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return + result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index a6e68cb984ef4..c374795019ff4 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -222,8 +222,10 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request): return if parser.engine == "pyarrow" and "\r" not in data: - mark = pytest.mark.xfail(reason="Mismatched exception type/message") - request.applymarker(mark) + # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1: + # ValueError: skiprows argument must be an integer when using engine='pyarrow' + # AssertionError: Regex pattern did not match. 
+ pytest.skip(reason="https://github.com/apache/arrow/issues/38676") if expected is None: with pytest.raises(ParserError, match=msg): @@ -233,7 +235,6 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -244,6 +245,12 @@ def test_temporary_file(all_parsers): new_file.flush() new_file.seek(0) + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(new_file, sep=r"\s+", header=None) + return + result = parser.read_csv(new_file, sep=r"\s+", header=None) expected = DataFrame([[0, 0]]) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index f3794c056a256..4a4ae2b259289 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -186,7 +186,8 @@ def test_error_bad_lines(all_parsers): msg = "Expected 1 fields in line 3, saw 3" if parser.engine == "pyarrow": - msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3" + # "CSV parse error: Expected 1 columns, got 3: 1,2,3" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), on_bad_lines="error") @@ -222,7 +223,8 @@ def test_read_csv_wrong_num_columns(all_parsers): msg = "Expected 6 fields in line 3, saw 7" if parser.engine == "pyarrow": - msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12" + # Expected 6 columns, got 7: 6,7,8,9,10,11,12 + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) @@ -246,10 +248,9 @@ def test_null_byte_char(request, all_parsers): tm.assert_frame_equal(out, expected) else: if parser.engine == "pyarrow": - msg = ( - "CSV parse error: Empty CSV file or block: " - "cannot infer number of columns" - ) + # CSV parse error: Empty CSV file or block: " + # cannot infer number of columns" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") else: msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): @@ -299,7 +300,8 @@ def test_bad_header_uniform_error(all_parsers): "number of columns, but 3 left to parse." ) elif parser.engine == "pyarrow": - msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" + # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index eb7835bb27372..6d5f870f07206 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -34,6 +34,7 @@ def read_csv_check_warnings( warn_msg: str, *args, raise_on_extra_warnings=True, + check_stacklevel: bool = True, **kwargs, ): # We need to check the stacklevel here instead of in the tests @@ -41,7 +42,10 @@ def read_csv_check_warnings( # should point to. 
kwargs = self.update_kwargs(kwargs) with tm.assert_produces_warning( - warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings + warn_type, + match=warn_msg, + raise_on_extra_warnings=raise_on_extra_warnings, + check_stacklevel=check_stacklevel, ): return read_csv(*args, **kwargs) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index a9540c94ce10e..500863dce84ee 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,7 +17,6 @@ import numpy as np import pytest -from pandas.compat import is_ci_environment from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -531,24 +530,6 @@ def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix): tm.assert_frame_equal(out, expected) -@pytest.mark.single_cpu -@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.") -def test_bytes_exceed_2gb(c_parser_only): - # see gh-16798 - # - # Read from a "CSV" that has a column larger than 2GB. - parser = c_parser_only - - if parser.low_memory: - pytest.skip("not a low_memory test") - - # csv takes 10 seconds to construct, spikes memory to 8GB+, the whole test - # spikes up to 10.4GB on the c_high case - csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) - df = parser.read_csv(csv) - assert not df.empty - - def test_chunk_whitespace_on_boundary(c_parser_only): # see gh-9735: this issue is C parser-specific (bug when # parsing whitespace and characters at chunk boundary) diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 1724f3390ea46..4af03e5f3b1ec 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -10,10 +10,7 @@ from pandas import DataFrame import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - -@xfail_pyarrow # ValueError: The 'comment' option is not supported @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): parser = all_parsers @@ -24,11 +21,15 @@ def test_comment(all_parsers, na_values): expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", na_values=na_values) + return result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'comment' option is not supported @pytest.mark.parametrize( "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}] ) @@ -43,15 +44,25 @@ def test_line_comment(all_parsers, read_kwargs, request): if read_kwargs.get("delim_whitespace"): data = data.replace(",", " ") elif read_kwargs.get("lineterminator"): - if parser.engine != "c": - mark = pytest.mark.xfail( - reason="Custom terminator not supported with Python engine" - ) - request.applymarker(mark) - data = data.replace("\n", read_kwargs.get("lineterminator")) read_kwargs["comment"] = "#" + if parser.engine == "pyarrow": + if "lineterminator" in read_kwargs: + msg = ( + "The 'lineterminator' option is not supported with the 'pyarrow' engine" + ) + else: + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + 
parser.read_csv(StringIO(data), **read_kwargs) + return + elif parser.engine == "python" and read_kwargs.get("lineterminator"): + msg = r"Custom line terminators not supported in python parser \(yet\)" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **read_kwargs) + return + result = parser.read_csv(StringIO(data), **read_kwargs) expected = DataFrame( @@ -60,7 +71,6 @@ def test_line_comment(all_parsers, read_kwargs, request): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'comment' option is not supported def test_comment_skiprows(all_parsers): parser = all_parsers data = """# empty @@ -75,11 +85,16 @@ def test_comment_skiprows(all_parsers): expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", skiprows=4) + return + result = parser.read_csv(StringIO(data), comment="#", skiprows=4) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'comment' option is not supported def test_comment_header(all_parsers): parser = all_parsers data = """# empty @@ -93,11 +108,15 @@ def test_comment_header(all_parsers): expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", header=1) + return result = parser.read_csv(StringIO(data), comment="#", header=1) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'comment' option is not supported def test_comment_skiprows_header(all_parsers): parser = all_parsers data = """# empty @@ -115,15 +134,28 @@ def test_comment_skiprows_header(all_parsers): expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) + return + result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'comment' option is not supported @pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"]) def test_custom_comment_char(all_parsers, comment_char): parser = all_parsers data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" + + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data.replace("#", comment_char)), comment=comment_char + ) + return result = parser.read_csv( StringIO(data.replace("#", comment_char)), comment=comment_char ) @@ -132,7 +164,6 @@ def test_custom_comment_char(all_parsers, comment_char): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'comment' option is not supported @pytest.mark.parametrize("header", ["infer", None]) def test_comment_first_line(all_parsers, header): # see gh-4623 @@ -144,11 +175,15 @@ def test_comment_first_line(all_parsers, header): else: expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' 
engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", header=header) + return result = parser.read_csv(StringIO(data), comment="#", header=header) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'comment' option is not supported def test_comment_char_in_default_value(all_parsers, request): # GH#34002 if all_parsers.engine == "c": @@ -164,6 +199,11 @@ def test_comment_char_in_default_value(all_parsers, request): "4,5#,6,10\n" "7,8,#N/A,11\n" ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", na_values="#N/A") + return result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A") expected = DataFrame( { diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index b16542bbdecec..7f3e45324dbd2 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -15,21 +15,21 @@ ) import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") - -@xfail_pyarrow # ValueError: The 'converters' option is not supported def test_converters_type_must_be_dict(all_parsers): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 """ - + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters=0) + return with pytest.raises(TypeError, match="Type converters.+"): parser.read_csv(StringIO(data), converters=0) -@xfail_pyarrow # ValueError: The 'converters' option is not supported @pytest.mark.parametrize("column", [3, "D"]) @pytest.mark.parametrize( "converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer. @@ -41,6 +41,12 @@ def test_converters(all_parsers, column, converter): b,3,4,01/02/2009 c,4,5,01/03/2009 """ + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters={column: converter}) + return + result = parser.read_csv(StringIO(data), converters={column: converter}) expected = parser.read_csv(StringIO(data)) @@ -49,13 +55,19 @@ def test_converters(all_parsers, column, converter): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'converters' option is not supported def test_converters_no_implicit_conv(all_parsers): # see gh-2184 parser = all_parsers data = """000102,1.2,A\n001245,2,B""" converters = {0: lambda x: x.strip()} + + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, converters=converters) + return + result = parser.read_csv(StringIO(data), header=None, converters=converters) # Column 0 should not be casted to numeric and should remain as object. 
@@ -63,7 +75,6 @@ def test_converters_no_implicit_conv(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'converters' option is not supported def test_converters_euro_decimal_format(all_parsers): # see gh-583 converters = {} @@ -77,6 +88,12 @@ def test_converters_euro_decimal_format(all_parsers): "Number3" ] = lambda x: float(x.replace(",", ".")) + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=";", converters=converters) + return + result = parser.read_csv(StringIO(data), sep=";", converters=converters) expected = DataFrame( [ @@ -89,7 +106,6 @@ def test_converters_euro_decimal_format(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'converters' option is not supported def test_converters_corner_with_nans(all_parsers): parser = all_parsers data = """id,score,days @@ -146,6 +162,16 @@ def convert_score(x): results = [] for day_converter in [convert_days, convert_days_sentinel]: + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + converters={"score": convert_score, "days": day_converter}, + na_values=["", None], + ) + continue + result = parser.read_csv( StringIO(data), converters={"score": convert_score, "days": day_converter}, @@ -154,16 +180,24 @@ def convert_score(x): assert pd.isna(result["days"][1]) results.append(result) - tm.assert_frame_equal(results[0], results[1]) + if parser.engine != "pyarrow": + tm.assert_frame_equal(results[0], results[1]) -@xfail_pyarrow # ValueError: The 'converters' option is not supported @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 parser = all_parsers data = "A;B\n1;2\n3;4" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep=";", index_col="A", converters={"A": conv_f} + ) + return + rs = parser.read_csv( StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) @@ -172,24 +206,42 @@ def test_converter_index_col_bug(all_parsers, conv_f): tm.assert_frame_equal(rs, xp) -@xfail_pyarrow # ValueError: The 'converters' option is not supported def test_converter_identity_object(all_parsers): # GH#40589 parser = all_parsers data = "A,B\n1,2\n3,4" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters={"A": lambda x: x}) + return + rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x}) xp = DataFrame({"A": ["1", "3"], "B": [2, 4]}) tm.assert_frame_equal(rs, xp) -@xfail_pyarrow # ValueError: The 'converters' option is not supported def test_converter_multi_index(all_parsers): # GH 42446 parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,3" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=list(range(2)), + converters={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + return + result = parser.read_csv( StringIO(data), 
header=list(range(2)), diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index fbea895435699..7a72e66996d43 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -16,7 +16,6 @@ pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @pytest.fixture @@ -33,7 +32,6 @@ def custom_dialect(): return dialect_name, dialect_kwargs -@xfail_pyarrow # ValueError: The 'dialect' option is not supported def test_dialect(all_parsers): parser = all_parsers data = """\ @@ -44,6 +42,13 @@ def test_dialect(all_parsers): dia = csv.excel() dia.quoting = csv.QUOTE_NONE + + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=dia) + return + df = parser.read_csv(StringIO(data), dialect=dia) data = """\ @@ -56,7 +61,6 @@ def test_dialect(all_parsers): tm.assert_frame_equal(df, exp) -@xfail_pyarrow # ValueError: The 'dialect' option is not supported def test_dialect_str(all_parsers): dialect_name = "mydialect" parser = all_parsers @@ -68,6 +72,12 @@ def test_dialect_str(all_parsers): exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}) with tm.with_csv_dialect(dialect_name, delimiter=":"): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=dialect_name) + return + df = parser.read_csv(StringIO(data), dialect=dialect_name) tm.assert_frame_equal(df, exp) @@ -84,7 +94,6 @@ class InvalidDialect: parser.read_csv(StringIO(data), dialect=InvalidDialect) -@xfail_pyarrow # ValueError: The 'dialect' option is not supported @pytest.mark.parametrize( "arg", [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"], @@ -114,6 +123,18 @@ def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, val kwds[arg] = "blah" with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv_check_warnings( + # No warning bc we raise + None, + "Conflicting values for", + StringIO(data), + dialect=dialect_name, + **kwds, + ) + return result = parser.read_csv_check_warnings( warning_klass, "Conflicting values for", @@ -124,7 +145,6 @@ def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, val tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'dialect' option is not supported @pytest.mark.parametrize( "kwargs,warning_klass", [ @@ -153,6 +173,18 @@ def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning data = "a:b\n1:2" with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv_check_warnings( + # no warning bc we raise + None, + "Conflicting values for 'delimiter'", + StringIO(data), + dialect=dialect_name, + **kwargs, + ) + return result = parser.read_csv_check_warnings( warning_klass, "Conflicting values for 'delimiter'", diff --git a/pandas/tests/io/parser/test_encoding.py 
b/pandas/tests/io/parser/test_encoding.py index 3580c040688d8..cbd3917ba9c04 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -23,7 +23,6 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -130,8 +129,8 @@ def _encode_data_with_bom(_data): and data == "\n1" and kwargs.get("skip_blank_lines", True) ): - # Manually xfail, since we don't have mechanism to xfail specific version - request.applymarker(pytest.mark.xfail(reason="Pyarrow can't read blank lines")) + # CSV parse error: Empty CSV file or block: cannot infer number of columns + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) tm.assert_frame_equal(result, expected) @@ -238,7 +237,6 @@ def test_parse_encoded_special_characters(encoding): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'memory_map' option is not supported @pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) def test_encoding_memory_map(all_parsers, encoding): # GH40986 @@ -252,11 +250,17 @@ def test_encoding_memory_map(all_parsers, encoding): ) with tm.ensure_clean() as file: expected.to_csv(file, index=False, encoding=encoding) + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(file, encoding=encoding, memory_map=True) + return + df = parser.read_csv(file, encoding=encoding, memory_map=True) tm.assert_frame_equal(df, expected) -@xfail_pyarrow # ValueError: The 'memory_map' option is not supported def test_chunk_splits_multibyte_char(all_parsers): """ Chunk splits a multibyte character with memory_map=True @@ -272,11 +276,17 @@ def test_chunk_splits_multibyte_char(all_parsers): df.iloc[2047] = "a" * 127 + "ą" with tm.ensure_clean("bug-gh43540.csv") as fname: df.to_csv(fname, index=False, header=False, encoding="utf-8") - dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c") + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(fname, header=None, memory_map=True) + return + + dfr = parser.read_csv(fname, header=None, memory_map=True) tm.assert_frame_equal(dfr, df) -@xfail_pyarrow # ValueError: The 'memory_map' option is not supported def test_readcsv_memmap_utf8(all_parsers): """ GH 43787 @@ -300,9 +310,14 @@ def test_readcsv_memmap_utf8(all_parsers): df = DataFrame(lines) with tm.ensure_clean("utf8test.csv") as fname: df.to_csv(fname, index=False, header=False, encoding="utf-8") - dfr = parser.read_csv( - fname, header=None, memory_map=True, engine="c", encoding="utf-8" - ) + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8") + return + + dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8") tm.assert_frame_equal(df, dfr) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index f55f8497f318c..86162bf90db8b 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -411,7 +411,7 @@ def 
test_header_names_backward_compat(all_parsers, data, header, request): parser = all_parsers if parser.engine == "pyarrow" and header is not None: - mark = pytest.mark.xfail(reason="mismatched index") + mark = pytest.mark.xfail(reason="DataFrame.columns are different") request.applymarker(mark) expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) @@ -635,7 +635,7 @@ def test_header_none_and_implicit_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got " +@skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got " def test_header_none_and_implicit_index_in_second_row(all_parsers): # GH#22144 parser = all_parsers diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 437a5fb5e9f09..ca106fa772e82 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -59,7 +59,6 @@ def test_detect_string_na(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "na_values", [ @@ -87,12 +86,26 @@ def test_detect_string_na(all_parsers): """, ], ) -def test_non_string_na_values(all_parsers, data, na_values): +def test_non_string_na_values(all_parsers, data, na_values, request): # see gh-3611: with an odd float format, we can't match # the string "999.0" exactly but still need float matching parser = all_parsers expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) + if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values): + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + elif parser.engine == "pyarrow" and "-999.000" in data: + # bc the pyarrow engine does not include the float-ified version + # of "-999" -> -999, it does not match the entry with the trailing + # zeros, so "-999.000" is not treated as null. 
+ mark = pytest.mark.xfail( + reason="pyarrow engined does not recognize equivalent floats" + ) + request.applymarker(mark) + result = parser.read_csv(StringIO(data), na_values=na_values) tm.assert_frame_equal(result, expected) @@ -145,8 +158,6 @@ def f(i, v): tm.assert_frame_equal(result, expected) -# ValueError: skiprows argument must be an integer when using engine='pyarrow' -@xfail_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -159,6 +170,12 @@ def test_custom_na_values(all_parsers, na_values): expected = DataFrame( [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "skiprows argument must be an integer when using engine='pyarrow'" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) + return + result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) tm.assert_frame_equal(result, expected) @@ -183,7 +200,6 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -191,6 +207,13 @@ def test_na_value_dict(all_parsers): foo,bar,NA bar,foo,foo""" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) + return + df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) expected = DataFrame( { @@ -235,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict @pytest.mark.parametrize( "kwargs,expected", [ @@ -281,7 +303,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected): +def test_na_values_keep_default(all_parsers, kwargs, expected, request): data = """\ A,B,C a,1,one @@ -293,6 +315,15 @@ def test_na_values_keep_default(all_parsers, kwargs, expected): g,7,seven """ parser = all_parsers + if parser.engine == "pyarrow": + if "na_values" in kwargs and isinstance(kwargs["na_values"], dict): + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + mark = pytest.mark.xfail() + request.applymarker(mark) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -323,11 +354,19 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) + return + result = parser.read_csv( StringIO(data), na_values={"b": ["2"]}, keep_default_na=False ) @@ -335,19 +374,24 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, 
expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # # Scalar values shouldn't cause the parsing to crash or fail. data = "a,b\n1,2" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) + return + df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) expected = DataFrame({"a": [1], "b": [np.nan]}) tm.assert_frame_equal(df, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -368,6 +412,17 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v } ) + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) + return + result = parser.read_csv( StringIO(data), header=None, @@ -427,7 +482,6 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "na_values,row_data", [ @@ -441,12 +495,27 @@ def test_na_values_scalar(all_parsers, na_values, row_data): names = ["a", "b"] data = "1,2\n2,1" + if parser.engine == "pyarrow" and isinstance(na_values, dict): + if isinstance(na_values, dict): + err = ValueError + msg = "The pyarrow engine doesn't support passing a dict for na_values" + else: + err = TypeError + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(err, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + elif parser.engine == "pyarrow": + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) expected = DataFrame(row_data, columns=names) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -456,25 +525,36 @@ def test_na_values_dict_aliasing(all_parsers): data = "1,2\n2,1" expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) tm.assert_frame_equal(result, expected) tm.assert_dict_equal(na_values, na_values_copy) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" parser = all_parsers na_values = {0: "foo"} + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for 
na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + result = parser.read_csv(StringIO(data), na_values=na_values) expected = DataFrame({"a": [np.nan, 1]}) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -487,9 +567,19 @@ def test_na_values_dict_col_index(all_parsers): (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])), ], ) -def test_na_values_uint64(all_parsers, data, kwargs, expected): +def test_na_values_uint64(all_parsers, data, kwargs, expected, request): # see gh-14983 parser = all_parsers + + if parser.engine == "pyarrow" and "na_values" in kwargs: + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), header=None, **kwargs) + return + elif parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="Returns float64 instead of object") + request.applymarker(mark) + result = parser.read_csv(StringIO(data), header=None, **kwargs) tm.assert_frame_equal(result, expected) @@ -566,11 +656,14 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): ) def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): parser = all_parsers - msg = ( - "(Bool column has NA values in column [0a])|" - "(cannot safely convert passed user dtype of " - "bool for object dtyped data in column 0)" + msg = "|".join( + [ + "Bool column has NA values in column [0a]", + "cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0", + ] ) + with pytest.raises(ValueError, match=msg): parser.read_csv( StringIO(data), @@ -581,6 +674,8 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): ) +# TODO: this test isn't about the na_values keyword, it is about the empty entries +# being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes def test_str_nan_dropped(all_parsers): # see gh-21131 @@ -610,12 +705,19 @@ def test_str_nan_dropped(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The pyarrow engine doesn't support passing a dict def test_nan_multi_index(all_parsers): # GH 42446 parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,inf" + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} + ) + return + result = parser.read_csv( StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} ) @@ -631,7 +733,7 @@ def test_nan_multi_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # Failed: DID NOT RAISE +@xfail_pyarrow # Failed: DID NOT RAISE ; it casts the NaN to False def test_bool_and_nan_to_bool(all_parsers): # GH#42808 parser = all_parsers diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 28e5f5ad9bb70..9351387dfc337 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -2,16 +2,13 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. 
""" -from io import ( - BytesIO, - StringIO, -) +from io import BytesIO import logging +import re import numpy as np import pytest -from pandas.compat import is_ci_environment import pandas.util._test_decorators as td from pandas import DataFrame @@ -292,39 +289,23 @@ def test_read_csv_handles_boto_s3_object( tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu - @pytest.mark.skipif( - is_ci_environment(), - reason="GH: 45651: This test can hang in our CI min_versions build", - ) def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so): # 8 MB, S3FS uses 5MB chunks - import s3fs - - df = DataFrame( - np.random.default_rng(2).standard_normal((100000, 4)), columns=list("abcd") - ) - str_buf = StringIO() - - df.to_csv(str_buf) - - buf = BytesIO(str_buf.getvalue().encode("utf-8")) - - s3_public_bucket.put_object(Key="large-file.csv", Body=buf) - - # Possibly some state leaking in between tests. - # If we don't clear this cache, we saw `GetObject operation: Forbidden`. - # Presumably the s3fs instance is being cached, with the directory listing - # from *before* we add the large-file.csv in the s3_public_bucket_with_data. - s3fs.S3FileSystem.clear_instance_cache() - - with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv( - f"s3://{s3_public_bucket.name}/large-file.csv", - nrows=5, - storage_options=s3so, - ) - # log of fetch_range (start, stop) - assert (0, 5505024) in (x.args[-2:] for x in caplog.records) + df = DataFrame(np.zeros((100000, 4)), columns=list("abcd")) + with BytesIO(df.to_csv().encode("utf-8")) as buf: + s3_public_bucket.put_object(Key="large-file.csv", Body=buf) + uri = f"{s3_public_bucket.name}/large-file.csv" + match_re = re.compile(rf"^Fetch: {uri}, 0-(?P\d+)$") + with caplog.at_level(logging.DEBUG, logger="s3fs"): + read_csv( + f"s3://{uri}", + nrows=5, + storage_options=s3so, + ) + for log in caplog.messages: + if match := re.match(match_re, log): + # Less than 8 MB + assert int(match.group("stop")) < 8000000 def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so): # GH 25945 diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 70d9171fa3c22..113402cda1b9a 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -831,9 +831,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = DatetimeIndex( - list(date_range("1/1/2009", periods=3)), name="date", freq=None - ) + index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -1736,24 +1734,19 @@ def test_parse_timezone(all_parsers): 2018-01-04 09:05:00+09:00,23400""" result = parser.read_csv(StringIO(data), parse_dates=["dt"]) - dti = DatetimeIndex( - list( - date_range( - start="2018-01-04 09:01:00", - end="2018-01-04 09:05:00", - freq="1min", - tz=timezone(timedelta(minutes=540)), - ) - ), - freq=None, - ) + dti = date_range( + start="2018-01-04 09:01:00", + end="2018-01-04 09:05:00", + freq="1min", + tz=timezone(timedelta(minutes=540)), + )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # pandas.errors.ParserError: CSV parse error +@skip_pyarrow # pandas.errors.ParserError: CSV parse error 
@pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1787,8 +1780,8 @@ def test_parse_delimited_date_swap_no_warning( expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") if parser.engine == "pyarrow": if not dayfirst: - mark = pytest.mark.xfail(reason="CSV parse error: Empty CSV file or block") - request.applymarker(mark) + # "CSV parse error: Empty CSV file or block" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") msg = "The 'dayfirst' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv( @@ -1802,7 +1795,8 @@ def test_parse_delimited_date_swap_no_warning( tm.assert_frame_equal(result, expected) -@xfail_pyarrow +# ArrowInvalid: CSV parse error: Empty CSV file or block: cannot infer number of columns +@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1887,7 +1881,8 @@ def test_hypothesis_delimited_date( assert result == expected -@xfail_pyarrow # KeyErrors +# ArrowKeyError: Column 'fdate1' in include_columns does not exist in CSV file +@skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ @@ -1977,8 +1972,6 @@ def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, co tm.assert_frame_equal(result, expected) -# ValueError: The 'thousands' option is not supported with the 'pyarrow' engine -@xfail_pyarrow def test_date_parser_usecols_thousands(all_parsers): # GH#39365 data = """A,B,C @@ -1987,12 +1980,21 @@ def test_date_parser_usecols_thousands(all_parsers): """ parser = all_parsers - warn = UserWarning + if parser.engine == "pyarrow": # DeprecationWarning for passing a Manager object - warn = (UserWarning, DeprecationWarning) + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + parse_dates=[1], + usecols=[1, 2], + thousands="-", + ) + return + result = parser.read_csv_check_warnings( - warn, + UserWarning, "Could not infer format", StringIO(data), parse_dates=[1], diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 695dc84db5e25..b099ed53b7a5f 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -178,8 +178,8 @@ def test_close_file_handle_on_invalid_usecols(all_parsers): error = ValueError if parser.engine == "pyarrow": - pyarrow = pytest.importorskip("pyarrow") - error = pyarrow.lib.ArrowKeyError + # Raises pyarrow.lib.ArrowKeyError + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with tm.ensure_clean("test.csv") as fname: Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8") diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 042c3814ef72a..52be6f302c337 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -19,8 +19,12 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +_msg_pyarrow_requires_names = ( + "The pyarrow engine does not allow 'usecols' to be integer column " + "positions. Pass a list of string column names instead." 
+) + -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 @@ -35,6 +39,10 @@ def test_usecols_with_parse_dates(all_parsers, usecols): "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) + if parser.engine == "pyarrow": + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + return result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 055be81d2996d..92db98bc4ec28 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -28,6 +28,10 @@ _msg_validate_usecols_names = ( "Usecols do not match columns, columns expected but not found: {0}" ) +_msg_pyarrow_requires_names = ( + "The pyarrow engine does not allow 'usecols' to be integer column " + "positions. Pass a list of string column names instead." +) xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -60,15 +64,16 @@ def test_usecols(all_parsers, usecols, request): 10,11,12""" parser = all_parsers if parser.engine == "pyarrow" and isinstance(usecols[0], int): - mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found") - request.applymarker(mark) + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols) + return + result = parser.read_csv(StringIO(data), usecols=usecols) expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found def test_usecols_with_names(all_parsers): data = """\ a,b,c @@ -78,6 +83,12 @@ def test_usecols_with_names(all_parsers): 10,11,12""" parser = all_parsers names = ["foo", "bar"] + + if parser.engine == "pyarrow": + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + return + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) @@ -87,7 +98,7 @@ def test_usecols_with_names(all_parsers): @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) -def test_usecols_relative_to_names(all_parsers, names, usecols, request): +def test_usecols_relative_to_names(all_parsers, names, usecols): data = """\ 1,2,3 4,5,6 @@ -95,10 +106,8 @@ def test_usecols_relative_to_names(all_parsers, names, usecols, request): 10,11,12""" parser = all_parsers if parser.engine == "pyarrow" and not isinstance(usecols[0], int): - mark = pytest.mark.xfail( - reason="ArrowKeyError: Column 'fb' in include_columns does not exist" - ) - request.applymarker(mark) + # ArrowKeyError: Column 'fb' in include_columns does not exist + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) @@ -133,7 +142,6 @@ def test_usecols_name_length_conflict(all_parsers): 10,11,12""" parser = all_parsers msg = "Number of passed names did not match 
number of header fields in the file" - with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) @@ -168,10 +176,13 @@ def test_usecols_index_col_false(all_parsers, data): def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request): # see gh-4201: test that index_col as integer reflects usecols parser = all_parsers - if parser.engine == "pyarrow" and isinstance(usecols[0], int): - mark = pytest.mark.xfail(raises=TypeError, match="expected bytes, int found") - request.applymarker(mark) data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + return + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) @@ -276,8 +287,9 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected, reques 4000,5000,6000""" if parser.engine == "pyarrow" and isinstance(usecols[0], int): - mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found") - request.applymarker(mark) + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols) + return result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) @@ -304,7 +316,6 @@ def test_np_array_usecols(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: 'function' object is not iterable @pytest.mark.parametrize( "usecols,expected", [ @@ -333,6 +344,12 @@ def test_callable_usecols(all_parsers, usecols, expected): 3.568935038,7,False,a""" parser = all_parsers + if parser.engine == "pyarrow": + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), usecols=usecols) + return + result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) @@ -438,10 +455,8 @@ def test_raises_on_usecols_names_mismatch( usecols is not None and expected is not None ): # everything but the first case - mark = pytest.mark.xfail( - reason="e.g. 
Column 'f' in include_columns does not exist in CSV file" - ) - request.applymarker(mark) + # ArrowKeyError: Column 'f' in include_columns does not exist in CSV file + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") if expected is None: with pytest.raises(ValueError, match=msg): @@ -451,19 +466,25 @@ def test_raises_on_usecols_names_mismatch( tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) -def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): +def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" names = ["A", "B", "C", "D"] parser = all_parsers + if parser.engine == "pyarrow": + if isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + return + # "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) expected = DataFrame({"A": [1, 5], "C": [3, 7]}) tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: expected bytes, int found @pytest.mark.parametrize("names", [None, ["a", "b"]]) def test_usecols_indices_out_of_bounds(all_parsers, names): # GH#25623 & GH 41130; enforced in 2.0 @@ -472,7 +493,14 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): a,b 1,2 """ - with pytest.raises(ParserError, match="Defining usecols with out-of-bounds"): + + err = ParserError + msg = "Defining usecols with out-of-bounds" + if parser.engine == "pyarrow": + err = ValueError + msg = _msg_pyarrow_requires_names + + with pytest.raises(err, match=msg): parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) @@ -482,8 +510,8 @@ def test_usecols_additional_columns(all_parsers): usecols = lambda header: header.strip() in ["a", "b", "c"] if parser.engine == "pyarrow": - msg = "'function' object is not iterable" - with pytest.raises(TypeError, match=msg): + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) return result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) @@ -496,8 +524,8 @@ def test_usecols_additional_columns_integer_columns(all_parsers): parser = all_parsers usecols = lambda header: header.strip() in ["0", "1"] if parser.engine == "pyarrow": - msg = "'function' object is not iterable" - with pytest.raises(TypeError, match=msg): + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) return result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index 24ee0d28a1887..c109b72e9c239 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -1,4 +1,3 @@ -import datetime import re import numpy as np @@ -15,6 +14,7 @@ Series, _testing as tm, concat, + date_range, ) from pandas.tests.io.pytables.common import ( _maybe_remove, @@ -279,15 +279,9 @@ def test_store_multiindex(setup_path): with ensure_clean_store(setup_path) as store: def 
make_index(names=None): - return MultiIndex.from_tuples( - [ - (datetime.datetime(2013, 12, d), s, t) - for d in range(1, 3) - for s in range(2) - for t in range(3) - ], - names=names, - ) + dti = date_range("2013-12-01", "2013-12-02") + mi = MultiIndex.from_product([dti, range(2), range(3)], names=names) + return mi # no names _maybe_remove(store, "df") @@ -306,11 +300,11 @@ def make_index(names=None): tm.assert_frame_equal(store.select("df"), df) # series - _maybe_remove(store, "s") - s = Series(np.zeros(12), index=make_index(["date", None, None])) - store.append("s", s) + _maybe_remove(store, "ser") + ser = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("ser", ser) xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) - tm.assert_series_equal(store.select("s"), xp) + tm.assert_series_equal(store.select("ser"), xp) # dup with column _maybe_remove(store, "df") diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index 9b4590ca3f01f..32af61de05ee4 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -154,7 +154,7 @@ def test_pytables_native_read(datapath): datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" ) as store: d2 = store["detector/readout"] - assert isinstance(d2, DataFrame) + assert isinstance(d2, DataFrame) @pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows") @@ -164,7 +164,7 @@ def test_pytables_native2_read(datapath): ) as store: str(store) d1 = store["detector"] - assert isinstance(d1, DataFrame) + assert isinstance(d1, DataFrame) def test_legacy_table_fixed_format_read_py2(datapath): @@ -174,28 +174,29 @@ def test_legacy_table_fixed_format_read_py2(datapath): datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" ) as store: result = store.select("df") - expected = DataFrame( - [[1, 2, 3, "D"]], - columns=["A", "B", "C", "D"], - index=Index(["ABC"], name="INDEX_NAME"), - ) - tm.assert_frame_equal(expected, result) + expected = DataFrame( + [[1, 2, 3, "D"]], + columns=["A", "B", "C", "D"], + index=Index(["ABC"], name="INDEX_NAME"), + ) + tm.assert_frame_equal(expected, result) def test_legacy_table_fixed_format_read_datetime_py2(datapath): # GH 31750 # legacy table with fixed format and datetime64 column written in Python 2 + expected = DataFrame( + [[Timestamp("2020-02-06T18:00")]], + columns=["A"], + index=Index(["date"]), + dtype="M8[ns]", + ) with ensure_clean_store( datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), mode="r", ) as store: result = store.select("df") - expected = DataFrame( - [[Timestamp("2020-02-06T18:00")]], - columns=["A"], - index=Index(["date"]), - ) - tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) def test_legacy_table_read_py2(datapath): @@ -264,7 +265,7 @@ def test_read_hdf_iterator(tmp_path, setup_path): with closing(iterator.store): assert isinstance(iterator, TableIterator) indirect = next(iterator.__iter__()) - tm.assert_frame_equal(direct, indirect) + tm.assert_frame_equal(direct, indirect) def test_read_nokey(tmp_path, setup_path): @@ -387,7 +388,7 @@ def test_read_py2_hdf_file_in_py3(datapath): mode="r", ) as store: result = store["p"] - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_read_infer_string(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py 
index aef6fc0460cd9..6284b826c3cf0 100644 --- a/pandas/tests/io/pytables/test_retain_attributes.py +++ b/pandas/tests/io/pytables/test_retain_attributes.py @@ -1,9 +1,8 @@ import pytest -from pandas._libs.tslibs import Timestamp - from pandas import ( DataFrame, + DatetimeIndex, Series, _testing as tm, date_range, @@ -18,11 +17,10 @@ pytestmark = pytest.mark.single_cpu -def test_retain_index_attributes(setup_path): +def test_retain_index_attributes(setup_path, unit): # GH 3499, losing frequency info on index recreation - df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="h"))} - ) + dti = date_range("2000-1-1", periods=3, freq="h", unit=unit) + df = DataFrame({"A": Series(range(3), index=dti)}) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "data") @@ -37,37 +35,30 @@ def test_retain_index_attributes(setup_path): getattr(result, idx), attr, None ) + dti2 = date_range("2002-1-1", periods=3, freq="D", unit=unit) # try to append a table with a different frequency with tm.assert_produces_warning(errors.AttributeConflictWarning): - df2 = DataFrame( - { - "A": Series( - range(3), index=date_range("2002-1-1", periods=3, freq="D") - ) - } - ) + df2 = DataFrame({"A": Series(range(3), index=dti2)}) store.append("data", df2) assert store.get_storer("data").info["index"]["freq"] is None # this is ok _maybe_remove(store, "df2") + dti3 = DatetimeIndex( + ["2001-01-01", "2001-01-02", "2002-01-01"], dtype=f"M8[{unit}]" + ) df2 = DataFrame( { "A": Series( range(3), - index=[ - Timestamp("20010101"), - Timestamp("20010102"), - Timestamp("20020101"), - ], + index=dti3, ) } ) store.append("df2", df2) - df3 = DataFrame( - {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} - ) + dti4 = date_range("2002-1-1", periods=3, freq="D", unit=unit) + df3 = DataFrame({"A": Series(range(3), index=dti4)}) store.append("df2", df3) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index af24a5cf7e0ab..4f908f28cb5e9 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import ( DataFrame, + DatetimeIndex, Index, Series, _testing as tm, @@ -320,7 +321,11 @@ def test_index_types(setup_path): ser = Series(values, [1, 5]) _check_roundtrip(ser, func, path=setup_path) - ser = Series(values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]) + dti = DatetimeIndex(["2012-01-01", "2012-01-02"], dtype="M8[ns]") + ser = Series(values, index=dti) + _check_roundtrip(ser, func, path=setup_path) + + ser.index = ser.index.as_unit("s") _check_roundtrip(ser, func, path=setup_path) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index b61b6f0772251..4624a48df18e3 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -542,17 +542,16 @@ def test_store_index_name(setup_path): @pytest.mark.parametrize("tz", [None, "US/Pacific"]) -@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = Index( - pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]), + idx = DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], name="cols\u05d2", ).tz_localize(tz) idx1 = ( - Index( - pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]), + DatetimeIndex( 
+ [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], name="rows\u05d0", ) .as_unit(unit) diff --git a/pandas/tests/io/pytables/test_time_series.py b/pandas/tests/io/pytables/test_time_series.py index 26492f72f192d..cfe25f03e1aab 100644 --- a/pandas/tests/io/pytables/test_time_series.py +++ b/pandas/tests/io/pytables/test_time_series.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + DatetimeIndex, Series, _testing as tm, ) @@ -13,10 +14,12 @@ pytestmark = pytest.mark.single_cpu -def test_store_datetime_fractional_secs(setup_path): +@pytest.mark.parametrize("unit", ["us", "ns"]) +def test_store_datetime_fractional_secs(setup_path, unit): + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + dti = DatetimeIndex([dt], dtype=f"M8[{unit}]") + series = Series([0], index=dti) with ensure_clean_store(setup_path) as store: - dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) - series = Series([0], [dt]) store["a"] = series assert store["a"].index[0] == dt diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 676b9374514e8..8c61830ebe038 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -148,16 +148,20 @@ def test_append_with_timezones_as_index(setup_path, gettz): tm.assert_frame_equal(result, df) -def test_roundtrip_tz_aware_index(setup_path): +def test_roundtrip_tz_aware_index(setup_path, unit): # GH 17618 - time = Timestamp("2000-01-01 01:00:00", tz="US/Eastern") - df = DataFrame(data=[0], index=[time]) + ts = Timestamp("2000-01-01 01:00:00", tz="US/Eastern") + dti = DatetimeIndex([ts]).as_unit(unit) + df = DataFrame(data=[0], index=dti) with ensure_clean_store(setup_path) as store: store.put("frame", df, format="fixed") recons = store["frame"] tm.assert_frame_equal(recons, df) - assert recons.index[0]._value == 946706400000000000 + + value = recons.index[0]._value + denom = {"ns": 1, "us": 1000, "ms": 10**6, "s": 10**9}[unit] + assert value == 946706400000000000 // denom def test_store_index_name_with_tz(setup_path): @@ -365,7 +369,7 @@ def test_py2_created_with_datetimez(datapath): # Python 3. # # GH26443 - index = [Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + index = DatetimeIndex(["2019-01-01T18:00"], dtype="M8[ns, America/New_York]") expected = DataFrame({"data": 123}, index=index) with ensure_clean_store( datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index d56139d32b1da..0ce428cef9520 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -187,7 +187,18 @@ def test_date_time(datapath): fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"] ) # GH 19732: Timestamps imported from sas will incur floating point errors - df[df.columns[3]] = df.iloc[:, 3].dt.round("us") + # 2023-11-16 we don't know the correct "expected" result bc we do not have + # access to SAS to read the sas7bdat file. We are really just testing + # that we are "close". This only seems to be an issue near the + # implementation bounds. + res = df.iloc[:, 3].dt.round("us").copy() + + # the first and last elements are near the implementation bounds, where we + # would expect floating point error to occur. 
+ res.iloc[0] -= pd.Timedelta(microseconds=1) + res.iloc[-1] += pd.Timedelta(microseconds=1) + + df["DateTimeHi"] = res tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 10e0467c5d74d..3d150aaa28eb4 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -1,13 +1,8 @@ -import os from textwrap import dedent import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, -) from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -30,8 +25,7 @@ from pandas.io.clipboard import ( CheckedCall, _stringifyText, - clipboard_get, - clipboard_set, + init_qt_clipboard, ) @@ -205,60 +199,34 @@ def test_stringify_text(text): @pytest.fixture -def mock_clipboard(monkeypatch, request): - """Fixture mocking clipboard IO. - - This mocks pandas.io.clipboard.clipboard_get and - pandas.io.clipboard.clipboard_set. - - This uses a local dict for storing data. The dictionary - key used is the test ID, available with ``request.node.name``. - - This returns the local dictionary, for direct manipulation by - tests. - """ - # our local clipboard for tests - _mock_data = {} - - def _mock_set(data): - _mock_data[request.node.name] = data - - def _mock_get(): - return _mock_data[request.node.name] - - monkeypatch.setattr("pandas.io.clipboard.clipboard_set", _mock_set) - monkeypatch.setattr("pandas.io.clipboard.clipboard_get", _mock_get) - - yield _mock_data - +def set_pyqt_clipboard(monkeypatch): + qt_cut, qt_paste = init_qt_clipboard() + with monkeypatch.context() as m: + m.setattr(pd.io.clipboard, "clipboard_set", qt_cut) + m.setattr(pd.io.clipboard, "clipboard_get", qt_paste) + yield -@pytest.mark.clipboard -def test_mock_clipboard(mock_clipboard): - import pandas.io.clipboard - pandas.io.clipboard.clipboard_set("abc") - assert "abc" in set(mock_clipboard.values()) - result = pandas.io.clipboard.clipboard_get() - assert result == "abc" +@pytest.fixture +def clipboard(qapp): + clip = qapp.clipboard() + yield clip + clip.clear() @pytest.mark.single_cpu @pytest.mark.clipboard -@pytest.mark.usefixtures("mock_clipboard") +@pytest.mark.usefixtures("set_pyqt_clipboard") +@pytest.mark.usefixtures("clipboard") class TestClipboard: - def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): - data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) - tm.assert_frame_equal(data, result) - # Test that default arguments copy as tab delimited - def test_round_trip_frame(self, df): - self.check_round_trip_frame(df) - # Test that explicit delimiters are respected - @pytest.mark.parametrize("sep", ["\t", ",", "|"]) - def test_round_trip_frame_sep(self, df, sep): - self.check_round_trip_frame(df, sep=sep) + @pytest.mark.parametrize("sep", [None, "\t", ",", "|"]) + @pytest.mark.parametrize("encoding", [None, "UTF-8", "utf-8", "utf8"]) + def test_round_trip_frame_sep(self, df, sep, encoding): + df.to_clipboard(excel=None, sep=sep, encoding=encoding) + result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) + tm.assert_frame_equal(df, result) # Test white space separator def test_round_trip_frame_string(self, df): @@ -286,22 +254,21 @@ def test_copy_delim_warning(self, df): # delimited and excel="True" @pytest.mark.parametrize("sep", ["\t", None, "default"]) @pytest.mark.parametrize("excel", [True, None, "default"]) - def test_clipboard_copy_tabs_default(self, sep, 
excel, df, request, mock_clipboard): + def test_clipboard_copy_tabs_default(self, sep, excel, df, clipboard): kwargs = build_kwargs(sep, excel) df.to_clipboard(**kwargs) - assert mock_clipboard[request.node.name] == df.to_csv(sep="\t") + assert clipboard.text() == df.to_csv(sep="\t") # Tests reading of white space separated tables @pytest.mark.parametrize("sep", [None, "default"]) - @pytest.mark.parametrize("excel", [False]) - def test_clipboard_copy_strings(self, sep, excel, df): - kwargs = build_kwargs(sep, excel) + def test_clipboard_copy_strings(self, sep, df): + kwargs = build_kwargs(sep, False) df.to_clipboard(**kwargs) result = read_clipboard(sep=r"\s+") assert result.to_string() == df.to_string() assert df.shape == result.shape - def test_read_clipboard_infer_excel(self, request, mock_clipboard): + def test_read_clipboard_infer_excel(self, clipboard): # gh-19010: avoid warnings clip_kwargs = {"engine": "python"} @@ -312,7 +279,7 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): 4\tHarry Carney """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) df = read_clipboard(**clip_kwargs) # excel data is parsed correctly @@ -326,7 +293,7 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): 3 4 """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) res = read_clipboard(**clip_kwargs) text = dedent( @@ -336,16 +303,16 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): 3 4 """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) exp = read_clipboard(**clip_kwargs) tm.assert_frame_equal(res, exp) - def test_infer_excel_with_nulls(self, request, mock_clipboard): + def test_infer_excel_with_nulls(self, clipboard): # GH41108 text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen" - mock_clipboard[request.node.name] = text + clipboard.setText(text) df = read_clipboard() df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]} @@ -376,10 +343,10 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): ), ], ) - def test_infer_excel_with_multiindex(self, request, mock_clipboard, multiindex): + def test_infer_excel_with_multiindex(self, clipboard, multiindex): # GH41108 - mock_clipboard[request.node.name] = multiindex[0] + clipboard.setText(multiindex[0]) df = read_clipboard() df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}, @@ -397,26 +364,17 @@ def test_invalid_encoding(self, df): with pytest.raises(NotImplementedError, match=msg): read_clipboard(encoding="ascii") - @pytest.mark.parametrize("enc", ["UTF-8", "utf-8", "utf8"]) - def test_round_trip_valid_encodings(self, enc, df): - self.check_round_trip_frame(df, encoding=enc) - - @pytest.mark.single_cpu @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑`...", "abcd..."]) - @pytest.mark.xfail( - (os.environ.get("DISPLAY") is None and not is_platform_mac()) - or is_ci_environment(), - reason="Cannot pass if a headless system is not put in place with Xvfb", - strict=not is_ci_environment(), # Flaky failures in the CI - ) def test_raw_roundtrip(self, data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows - clipboard_set(data) - assert data == clipboard_get() + df = DataFrame({"data": [data]}) + df.to_clipboard() + result = read_clipboard() + tm.assert_frame_equal(df, result) @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, request, mock_clipboard, string_storage, 
dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine ): # GH#50502 if string_storage == "pyarrow" or dtype_backend == "pyarrow": @@ -433,7 +391,7 @@ def test_read_clipboard_dtype_backend( text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False y,2,5.0,,,,,False,""" - mock_clipboard[request.node.name] = text + clipboard.setText(text) with pd.option_context("mode.string_storage", string_storage): result = read_clipboard(sep=",", dtype_backend=dtype_backend, engine=engine) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 018400fcfe0cf..26035d1af7f90 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -288,6 +288,8 @@ def test_read_expands_user_home_dir( ): reader(path) + # TODO(CoW-warn) avoid warnings in the stata reader code + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize( "reader, module, path", [ diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 8726d44c9c3ed..a1dec8a2d05b4 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -23,6 +23,28 @@ ) +@pytest.fixture +def fsspectest(): + pytest.importorskip("fsspec") + from fsspec import register_implementation + from fsspec.implementations.memory import MemoryFileSystem + from fsspec.registry import _registry as registry + + class TestMemoryFS(MemoryFileSystem): + protocol = "testmem" + test = [None] + + def __init__(self, **kwargs) -> None: + self.test[0] = kwargs.pop("test", None) + super().__init__(**kwargs) + + register_implementation("testmem", TestMemoryFS, clobber=True) + yield TestMemoryFS() + registry.pop("testmem", None) + TestMemoryFS.test[0] = None + TestMemoryFS.store.clear() + + @pytest.fixture def df1(): return DataFrame( diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py new file mode 100644 index 0000000000000..2ca11ad1f74e6 --- /dev/null +++ b/pandas/tests/io/test_http_headers.py @@ -0,0 +1,172 @@ +""" +Tests for the pandas custom headers in http(s) requests +""" +from functools import partial +import gzip +from io import BytesIO + +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.network, + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), +] + + +def gzip_bytes(response_bytes): + with BytesIO() as bio: + with gzip.GzipFile(fileobj=bio, mode="w") as zipper: + zipper.write(response_bytes) + return bio.getvalue() + + +def csv_responder(df): + return df.to_csv(index=False).encode("utf-8") + + +def gz_csv_responder(df): + return gzip_bytes(csv_responder(df)) + + +def json_responder(df): + return df.to_json().encode("utf-8") + + +def gz_json_responder(df): + return gzip_bytes(json_responder(df)) + + +def html_responder(df): + return df.to_html(index=False).encode("utf-8") + + +def parquetpyarrow_reponder(df): + return df.to_parquet(index=False, engine="pyarrow") + + +def parquetfastparquet_responder(df): + # the fastparquet engine doesn't like to write to a buffer + # it can do it via the open_with function being set appropriately + # however it automatically calls the close method and wipes the buffer + # so just overwrite that attribute on this instance to not do that + + # protected by an importorskip in the respective test + import fsspec + + df.to_parquet( + "memory://fastparquet_user_agent.parquet", + 
index=False, + engine="fastparquet", + compression=None, + ) + with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f: + return f.read() + + +def pickle_respnder(df): + with BytesIO() as bio: + df.to_pickle(bio) + return bio.getvalue() + + +def stata_responder(df): + with BytesIO() as bio: + df.to_stata(bio, write_index=False) + return bio.getvalue() + + +@pytest.mark.parametrize( + "responder, read_method", + [ + (csv_responder, pd.read_csv), + (json_responder, pd.read_json), + ( + html_responder, + lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], + ), + pytest.param( + parquetpyarrow_reponder, + partial(pd.read_parquet, engine="pyarrow"), + marks=td.skip_if_no("pyarrow"), + ), + pytest.param( + parquetfastparquet_responder, + partial(pd.read_parquet, engine="fastparquet"), + # TODO(ArrayManager) fastparquet + marks=[ + td.skip_if_no("fastparquet"), + td.skip_if_no("fsspec"), + td.skip_array_manager_not_yet_implemented, + ], + ), + (pickle_respnder, pd.read_pickle), + (stata_responder, pd.read_stata), + (gz_csv_responder, pd.read_csv), + (gz_json_responder, pd.read_json), + ], +) +@pytest.mark.parametrize( + "storage_options", + [ + None, + {"User-Agent": "foo"}, + {"User-Agent": "foo", "Auth": "bar"}, + ], +) +def test_request_headers(responder, read_method, httpserver, storage_options): + expected = pd.DataFrame({"a": ["b"]}) + default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"] + if "gz" in responder.__name__: + extra = {"Content-Encoding": "gzip"} + if storage_options is None: + storage_options = extra + else: + storage_options |= extra + else: + extra = None + expected_headers = set(default_headers).union( + storage_options.keys() if storage_options else [] + ) + httpserver.serve_content(content=responder(expected), headers=extra) + result = read_method(httpserver.url, storage_options=storage_options) + tm.assert_frame_equal(result, expected) + + request_headers = dict(httpserver.requests[0].headers) + for header in expected_headers: + exp = request_headers.pop(header) + if storage_options and header in storage_options: + assert exp == storage_options[header] + # No extra headers added + assert not request_headers + + +@pytest.mark.parametrize( + "engine", + [ + "pyarrow", + "fastparquet", + ], +) +def test_to_parquet_to_disk_with_storage_options(engine): + headers = { + "User-Agent": "custom", + "Auth": "other_custom", + } + + pytest.importorskip(engine) + + true_df = pd.DataFrame({"column_name": ["column_value"]}) + msg = ( + "storage_options passed with file object or non-fsspec file path|" + "storage_options passed with buffer, or non-supported URL" + ) + with pytest.raises(ValueError, match=msg): + true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 0a72fd7bbec7d..9ae2d5cb03afd 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -15,6 +15,7 @@ from pandas.compat.pyarrow import ( pa_version_under11p0, pa_version_under13p0, + pa_version_under15p0, ) import pandas as pd @@ -721,21 +722,15 @@ def test_basic_subset_columns(self, pa, df_full): def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): # GH 37105 - msg = "Mismatched null-like values nan and None found" - warn = None - if using_copy_on_write(): - warn = FutureWarning - buf_bytes = df_full.to_parquet(engine=pa) assert isinstance(buf_bytes, bytes) buf_stream = BytesIO(buf_bytes) res = read_parquet(buf_stream) - expected = 
df_full.copy(deep=False) + expected = df_full.copy() expected.loc[1, "string_with_nan"] = None - with tm.assert_produces_warning(warn, match=msg): - tm.assert_frame_equal(df_full, res) + tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns @@ -758,7 +753,10 @@ def test_unsupported_float16(self, pa): # Not able to write float 16 column using pyarrow. data = np.arange(2, 10, dtype=np.float16) df = pd.DataFrame(data=data, columns=["fp16"]) - self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + if pa_version_under15p0: + self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + else: + check_round_trip(df, pa) @pytest.mark.xfail( is_platform_windows(), @@ -767,6 +765,7 @@ def test_unsupported_float16(self, pa): "dtypes are passed to_parquet function in windows" ), ) + @pytest.mark.skipif(not pa_version_under15p0, reason="float16 works on 15") @pytest.mark.parametrize("path_type", [str, pathlib.Path]) def test_unsupported_float16_cleanup(self, pa, path_type): # #44847, #44914 diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 9ee3c09631d0e..79473895b662d 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -30,15 +30,10 @@ def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): @pytest.mark.single_cpu -def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so): +def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): # Ensure we can read from a public bucket with credentials # GH 34626 - - # temporary workaround as moto fails for botocore >= 1.11 otherwise, - # see https://github.com/spulec/moto/issues/1924 & 1952 pytest.importorskip("s3fs") - monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b274866b7c9a8..45d0a839b9e1a 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -19,6 +19,11 @@ import pytest from pandas._libs import lib +from pandas.compat import ( + pa_version_under13p0, + pa_version_under14p1, +) +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td import pandas as pd @@ -56,6 +61,11 @@ import sqlalchemy +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + @pytest.fixture def sql_strings(): return { @@ -110,8 +120,7 @@ def iris_table_metadata(): return iris -def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): - cur = conn.cursor() +def create_and_load_iris_sqlite3(conn, iris_file: Path): stmt = """CREATE TABLE iris ( "SepalLength" REAL, "SepalWidth" REAL, @@ -119,17 +128,65 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): "PetalWidth" REAL, "Name" TEXT )""" + + cur = conn.cursor() cur.execute(stmt) with iris_file.open(newline=None, encoding="utf-8") as csvfile: reader = csv.reader(csvfile) next(reader) stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" - cur.executemany(stmt, reader) + # ADBC requires explicit types - no implicit str -> float conversion + records = [] + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) + cur.close() + + conn.commit() + + +def 
create_and_load_iris_postgresql(conn, iris_file: Path): + stmt = """CREATE TABLE iris ( + "SepalLength" DOUBLE PRECISION, + "SepalWidth" DOUBLE PRECISION, + "PetalLength" DOUBLE PRECISION, + "PetalWidth" DOUBLE PRECISION, + "Name" TEXT + )""" + with conn.cursor() as cur: + cur.execute(stmt) + with iris_file.open(newline=None, encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + # ADBC requires explicit types - no implicit str -> float conversion + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) + + conn.commit() def create_and_load_iris(conn, iris_file: Path): from sqlalchemy import insert - from sqlalchemy.engine import Engine iris = iris_table_metadata() @@ -138,17 +195,10 @@ def create_and_load_iris(conn, iris_file: Path): header = next(reader) params = [dict(zip(header, row)) for row in reader] stmt = insert(iris).values(params) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) - else: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) + with conn.begin() as con: + iris.drop(con, checkfirst=True) + iris.create(bind=con) + con.execute(stmt) def create_and_load_iris_view(conn): @@ -157,17 +207,17 @@ def create_and_load_iris_view(conn): cur = conn.cursor() cur.execute(stmt) else: - from sqlalchemy import text - from sqlalchemy.engine import Engine - - stmt = text(stmt) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - conn.execute(stmt) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(stmt) + conn.commit() else: - with conn.begin(): - conn.execute(stmt) + from sqlalchemy import text + + stmt = text(stmt) + with conn.begin() as con: + con.execute(stmt) def types_table_metadata(dialect: str): @@ -201,8 +251,7 @@ def types_table_metadata(dialect: str): return types -def create_and_load_types_sqlite3(conn: sqlite3.Connection, types_data: list[dict]): - cur = conn.cursor() +def create_and_load_types_sqlite3(conn, types_data: list[dict]): stmt = """CREATE TABLE types ( "TextCol" TEXT, "DateCol" TEXT, @@ -214,13 +263,47 @@ def create_and_load_types_sqlite3(conn: sqlite3.Connection, types_data: list[dic "IntColWithNull" INTEGER, "BoolColWithNull" INTEGER )""" - cur.execute(stmt) - stmt = """ - INSERT INTO types - VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) - """ - cur.executemany(stmt, types_data) + ins_stmt = """ + INSERT INTO types + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """ + + if isinstance(conn, sqlite3.Connection): + cur = conn.cursor() + cur.execute(stmt) + cur.executemany(ins_stmt, types_data) + else: + with conn.cursor() as cur: + cur.execute(stmt) + cur.executemany(ins_stmt, types_data) + + conn.commit() + + +def create_and_load_types_postgresql(conn, types_data: list[dict]): + with conn.cursor() as cur: + stmt = """CREATE TABLE types ( + "TextCol" TEXT, + "DateCol" TIMESTAMP, + "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, + "FloatCol" DOUBLE PRECISION, + "IntCol" INTEGER, + "BoolCol" BOOLEAN, + "IntColWithNull" INTEGER, + "BoolColWithNull" BOOLEAN + )""" + cur.execute(stmt) + + stmt = """ + INSERT INTO types + VALUES($1, $2::timestamp, $3, $4, $5, $6, $7, $8, $9) + """ + + cur.executemany(stmt, types_data) + + conn.commit() def create_and_load_types(conn, types_data: list[dict], dialect: str): @@ -292,15 +375,22 @@ def check_iris_frame(frame: DataFrame): pytype = frame.dtypes.iloc[0].type row = frame.iloc[0] assert issubclass(pytype, np.floating) - tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + tm.assert_series_equal( + row, Series([5.1, 3.5, 1.4, 0.2, "Iris-setosa"], index=frame.columns, name=0) + ) assert frame.shape in ((150, 5), (8, 5)) def count_rows(conn, table_name: str): stmt = f"SELECT count(*) AS count_1 FROM {table_name}" + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") if isinstance(conn, sqlite3.Connection): cur = conn.cursor() return cur.execute(stmt).fetchone()[0] + elif adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(stmt) + return cur.fetchone()[0] else: from sqlalchemy import create_engine from sqlalchemy.engine import Engine @@ -423,9 +513,24 @@ def get_all_views(conn): c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") return [view[0] for view in c.fetchall()] else: - from sqlalchemy import inspect + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + results = [] + info = conn.adbc_get_objects().read_all().to_pylist() + for catalog in info: + catalog["catalog_name"] + for schema in catalog["catalog_db_schemas"]: + schema["db_schema_name"] + for table in schema["db_schema_tables"]: + if table["table_type"] == "view": + view_name = table["table_name"] + results.append(view_name) + + return results + else: + from sqlalchemy import inspect - return inspect(conn).get_view_names() + return inspect(conn).get_view_names() def get_all_tables(conn): @@ -433,9 +538,23 @@ def get_all_tables(conn): c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") return [table[0] for table in c.fetchall()] else: - from sqlalchemy import inspect + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + + if adbc and isinstance(conn, adbc.Connection): + results = [] + info = conn.adbc_get_objects().read_all().to_pylist() + for catalog in info: + for schema in catalog["catalog_db_schemas"]: + for table in schema["db_schema_tables"]: + if table["table_type"] == "table": + table_name = table["table_name"] + results.append(table_name) + + return results + else: + from sqlalchemy import inspect - return inspect(conn).get_table_names() + return inspect(conn).get_table_names() def drop_table( @@ -445,10 +564,16 @@ def drop_table( if isinstance(conn, sqlite3.Connection): conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") conn.commit() + else: - with conn.begin() as con: - with sql.SQLDatabase(con) 
as db: - db.drop_table(table_name) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f'DROP TABLE IF EXISTS "{table_name}"') + else: + with conn.begin() as con: + with sql.SQLDatabase(con) as db: + db.drop_table(table_name) def drop_view( @@ -461,12 +586,17 @@ def drop_view( conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") conn.commit() else: - quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( - view_name - ) - stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}") - with conn.begin() as con: - con.execute(stmt) # type: ignore[union-attr] + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f'DROP VIEW IF EXISTS "{view_name}"') + else: + quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( + view_name + ) + stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}") + with conn.begin() as con: + con.execute(stmt) # type: ignore[union-attr] @pytest.fixture @@ -552,6 +682,57 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): yield conn +@pytest.fixture +def postgresql_adbc_conn(): + pytest.importorskip("adbc_driver_postgresql") + from adbc_driver_postgresql import dbapi + + uri = "postgresql://postgres:postgres@localhost:5432/pandas" + with dbapi.connect(uri) as conn: + yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + conn.commit() + + +@pytest.fixture +def postgresql_adbc_iris(postgresql_adbc_conn, iris_path): + import adbc_driver_manager as mgr + + conn = postgresql_adbc_conn + + try: + conn.adbc_get_table_schema("iris") + except mgr.ProgrammingError: + conn.rollback() + create_and_load_iris_postgresql(conn, iris_path) + try: + conn.adbc_get_table_schema("iris_view") + except mgr.ProgrammingError: # note arrow-adbc issue 1022 + conn.rollback() + create_and_load_iris_view(conn) + yield conn + + +@pytest.fixture +def postgresql_adbc_types(postgresql_adbc_conn, types_data): + import adbc_driver_manager as mgr + + conn = postgresql_adbc_conn + + try: + conn.adbc_get_table_schema("types") + except mgr.ProgrammingError: + conn.rollback() + new_data = [tuple(entry.values()) for entry in types_data] + + create_and_load_types_postgresql(conn, new_data) + + yield conn + + @pytest.fixture def postgresql_psycopg2_conn_iris(postgresql_psycopg2_engine_iris): with postgresql_psycopg2_engine_iris.connect() as conn: @@ -633,6 +814,62 @@ def sqlite_conn_types(sqlite_engine_types): yield conn +@pytest.fixture +def sqlite_adbc_conn(): + pytest.importorskip("adbc_driver_sqlite") + from adbc_driver_sqlite import dbapi + + with tm.ensure_clean() as name: + uri = f"file:{name}" + with dbapi.connect(uri) as conn: + yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + conn.commit() + + +@pytest.fixture +def sqlite_adbc_iris(sqlite_adbc_conn, iris_path): + import adbc_driver_manager as mgr + + conn = sqlite_adbc_conn + try: + conn.adbc_get_table_schema("iris") + except mgr.ProgrammingError: + conn.rollback() + create_and_load_iris_sqlite3(conn, iris_path) + try: + conn.adbc_get_table_schema("iris_view") + except mgr.ProgrammingError: + conn.rollback() + create_and_load_iris_view(conn) + yield conn + + +@pytest.fixture +def 
sqlite_adbc_types(sqlite_adbc_conn, types_data): + import adbc_driver_manager as mgr + + conn = sqlite_adbc_conn + try: + conn.adbc_get_table_schema("types") + except mgr.ProgrammingError: + conn.rollback() + new_data = [] + for entry in types_data: + entry["BoolCol"] = int(entry["BoolCol"]) + if entry["BoolColWithNull"] is not None: + entry["BoolColWithNull"] = int(entry["BoolColWithNull"]) + new_data.append(tuple(entry.values())) + + create_and_load_types_sqlite3(conn, new_data) + conn.commit() + + yield conn + + @pytest.fixture def sqlite_buildin(): with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn: @@ -712,11 +949,31 @@ def sqlite_buildin_types(sqlite_buildin, types_data): mysql_connectable_types + postgresql_connectable_types + sqlite_connectable_types ) -all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] +adbc_connectable = [ + "sqlite_adbc_conn", + pytest.param("postgresql_adbc_conn", marks=pytest.mark.db), +] + +adbc_connectable_iris = [ + pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), + pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), +] + +adbc_connectable_types = [ + pytest.param("postgresql_adbc_types", marks=pytest.mark.db), + pytest.param("sqlite_adbc_types", marks=pytest.mark.db), +] + + +all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] + adbc_connectable -all_connectable_iris = sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] +all_connectable_iris = ( + sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] + adbc_connectable_iris +) -all_connectable_types = sqlalchemy_connectable_types + ["sqlite_buildin_types"] +all_connectable_types = ( + sqlalchemy_connectable_types + ["sqlite_buildin_types"] + adbc_connectable_types +) @pytest.mark.parametrize("conn", all_connectable) @@ -728,6 +985,13 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="postgres ADBC driver cannot insert index with null type", + strict=True, + ) + ) # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -749,8 +1013,22 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): "string": pd.array(["a"], dtype="string[pyarrow]"), } ) + + if "adbc" in conn: + if conn == "sqlite_adbc_conn": + df = df.drop(columns=["timedelta"]) + if pa_version_under14p1: + exp_warning = FutureWarning + msg = "is_sparse is deprecated" + else: + exp_warning = None + msg = "" + else: + exp_warning = UserWarning + msg = "the 'timedelta'" + conn = request.getfixturevalue(conn) - with tm.assert_produces_warning(UserWarning, match="the 'timedelta'"): + with tm.assert_produces_warning(exp_warning, match=msg, check_stacklevel=False): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) @@ -772,6 +1050,13 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): + if method == "multi" and "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'method' not implemented for ADBC drivers", strict=True + ) + ) + conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", method=method) @@ -816,6 +1101,13 @@ def 
test_read_iris_query(conn, request): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_query("SELECT * FROM iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -828,6 +1120,13 @@ def test_read_iris_query_chunksize(conn, request): @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_query_expression_with_parameter(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) from sqlalchemy import ( MetaData, @@ -849,6 +1148,14 @@ def test_read_iris_query_expression_with_parameter(conn, request): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_string_with_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) + for db, query in sql_strings["read_parameters"].items(): if db in conn: break @@ -871,6 +1178,10 @@ def test_read_iris_table(conn, request): @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_table("iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -1209,6 +1520,13 @@ def flavor(conn_name): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) query = sql_strings["read_parameters"][flavor(conn_name)] @@ -1221,6 +1539,14 @@ def test_read_sql_iris_parameter(conn, request, sql_strings): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_named_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) + conn_name = conn conn = request.getfixturevalue(conn) query = sql_strings["read_named_parameters"][flavor(conn_name)] @@ -1233,7 +1559,7 @@ def test_read_sql_iris_named_parameter(conn, request, sql_strings): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings): - if "mysql" in conn or "postgresql" in conn: + if "mysql" in conn or ("postgresql" in conn and "adbc" not in conn): request.applymarker(pytest.mark.xfail(reason="broken test")) conn_name = conn @@ -1259,6 +1585,10 @@ def test_api_read_sql_view(conn, request): @pytest.mark.parametrize("conn", all_connectable_iris) def test_api_read_sql_with_chunksize_no_result(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0' with_batch = sql.read_sql_query(query, conn, chunksize=5) @@ -1357,6 +1687,7 @@ def 
test_api_to_sql_series(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_roundtrip(conn, request, test_frame1): + conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_frame_roundtrip", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1366,6 +1697,8 @@ def test_api_roundtrip(conn, request, test_frame1): result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) # HACK! + if "adbc" in conn_name: + result = result.rename(columns={"__index_level_0__": "level_0"}) result.index = test_frame1.index result.set_index("level_0", inplace=True) result.index.astype(int) @@ -1375,6 +1708,10 @@ def test_api_roundtrip(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_roundtrip_chunksize(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_frame_roundtrip", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1398,7 +1735,8 @@ def test_api_execute_sql(conn, request): with sql.pandasSQL_builder(conn) as pandas_sql: iris_results = pandas_sql.execute("SELECT * FROM iris") row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + iris_results.close() + assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"] @pytest.mark.parametrize("conn", all_connectable_types) @@ -1496,6 +1834,19 @@ def test_api_custom_dateparsing_error( result["BoolCol"] = result["BoolCol"].astype(int) result["BoolColWithNull"] = result["BoolColWithNull"].astype(float) + if conn_name == "postgresql_adbc_types": + expected = expected.astype( + { + "IntDateCol": "int32", + "IntDateOnlyCol": "int32", + "IntCol": "int32", + } + ) + + if not pa_version_under13p0: + # TODO: is this astype safe? + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + tm.assert_frame_equal(result, expected) @@ -1517,24 +1868,60 @@ def test_api_date_and_index(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_timedelta(conn, request): # see #6921 + conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_timedelta", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: pandasSQL.drop_table("test_timedelta") df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() - with tm.assert_produces_warning(UserWarning): + + if conn_name == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite ADBC driver doesn't implement timedelta", + ) + ) + + if "adbc" in conn_name: + if pa_version_under14p1: + exp_warning = FutureWarning + else: + exp_warning = None + else: + exp_warning = UserWarning + + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): result_count = df.to_sql(name="test_timedelta", con=conn) assert result_count == 2 result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) + + if conn_name == "postgresql_adbc_conn": + # TODO: Postgres stores an INTERVAL, which ADBC reads as a Month-Day-Nano + # Interval; the default pandas type mapper maps this to a DateOffset + # but maybe we should try and restore the timedelta here? 
+ expected = Series( + [ + pd.DateOffset(months=0, days=0, microseconds=1000000, nanoseconds=0), + pd.DateOffset(months=0, days=0, microseconds=3000000, nanoseconds=0), + ], + name="foo", + ) + else: + expected = df["foo"].view("int64") + tm.assert_series_equal(result["foo"], expected) @pytest.mark.parametrize("conn", all_connectable) def test_api_complex_raises(conn, request): + conn_name = conn conn = request.getfixturevalue(conn) df = DataFrame({"a": [1 + 1j, 2j]}) - msg = "Complex datatypes not supported" + + if "adbc" in conn_name: + msg = "datatypes not supported" + else: + msg = "Complex datatypes not supported" with pytest.raises(ValueError, match=msg): assert df.to_sql("test_complex", con=conn) is None @@ -1558,6 +1945,10 @@ def test_api_complex_raises(conn, request): ], ) def test_api_to_sql_index_label(conn, request, index_name, index_label, expected): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_index_label", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1580,6 +1971,10 @@ def test_api_to_sql_index_label_multiindex(conn, request): reason="MySQL can fail using TEXT without length as key", strict=False ) ) + elif "adbc" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_index_label", conn): @@ -1702,6 +2097,13 @@ def test_api_integer_col_names(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) create_sql = sql.get_schema(test_frame1, "test", con=conn) assert "CREATE" in create_sql @@ -1710,6 +2112,13 @@ def test_api_get_schema(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_with_schema(conn, request, test_frame1): # GH28486 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi") assert "CREATE TABLE pypi." 
in create_sql @@ -1717,6 +2126,13 @@ def test_api_get_schema_with_schema(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_dtypes(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) @@ -1734,6 +2150,13 @@ def test_api_get_schema_dtypes(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_keys(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) @@ -1756,6 +2179,10 @@ def test_api_get_schema_keys(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_chunksize_read(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_chunksize", conn): @@ -1801,6 +2228,13 @@ def test_api_chunksize_read(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="categorical dtype not implemented for ADBC postgres driver", + strict=True, + ) + ) # GH8624 # test that categorical gets written correctly as dense column conn = request.getfixturevalue(conn) @@ -1859,6 +2293,10 @@ def test_api_escaped_table_name(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True) + ) conn = request.getfixturevalue(conn) if sql.has_table("test_table", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1867,7 +2305,7 @@ def test_api_read_sql_duplicate_columns(conn, request): df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) df.to_sql(name="test_table", con=conn, index=False) - result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", conn) + result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table", conn) expected = DataFrame( [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], columns=["a", "b", "a", "c"], @@ -1961,7 +2399,7 @@ def test_not_reflect_all_tables(sqlite_conn): @pytest.mark.parametrize("conn", all_connectable) def test_warning_case_insensitive_table_name(conn, request, test_frame1): conn_name = conn - if conn_name == "sqlite_buildin": + if conn_name == "sqlite_buildin" or "adbc" in conn_name: request.applymarker(pytest.mark.xfail(reason="Does not raise warning")) conn = request.getfixturevalue(conn) @@ -2249,12 +2687,15 @@ def test_roundtrip(conn, request, test_frame1): if conn == "sqlite_str": pytest.skip("sqlite_str has no inspection system") + conn_name = conn conn = request.getfixturevalue(conn) pandasSQL = pandasSQL_builder(conn) with pandasSQL.run_transaction(): assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + if "adbc" in conn_name: + result = result.rename(columns={"__index_level_0__": 
"level_0"}) result.set_index("level_0", inplace=True) # result.index.astype(int) @@ -2270,7 +2711,8 @@ def test_execute_sql(conn, request): with pandasSQL.run_transaction(): iris_results = pandasSQL.execute("SELECT * FROM iris") row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + iris_results.close() + assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"] @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) @@ -2286,7 +2728,7 @@ def test_sqlalchemy_read_table_columns(conn, request): iris_frame = sql.read_sql_table( "iris", con=conn, columns=["SepalLength", "SepalLength"] ) - tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) + tm.assert_index_equal(iris_frame.columns, Index(["SepalLength", "SepalLength__1"])) @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) @@ -2629,6 +3071,12 @@ def test_nan_string(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_save_index(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="ADBC implementation does not create index", strict=True + ) + ) conn_name = conn conn = request.getfixturevalue(conn) df = DataFrame.from_records( @@ -2667,7 +3115,7 @@ def test_transactions(conn, request): conn = request.getfixturevalue(conn) stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if conn_name != "sqlite_buildin": + if conn_name != "sqlite_buildin" and "adbc" not in conn_name: from sqlalchemy import text stmt = text(stmt) @@ -2679,11 +3127,12 @@ def test_transactions(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_transaction_rollback(conn, request): + conn_name = conn conn = request.getfixturevalue(conn) with pandasSQL_builder(conn) as pandasSQL: with pandasSQL.run_transaction() as trans: stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(pandasSQL, SQLiteDatabase): + if "adbc" in conn_name or isinstance(pandasSQL, SQLiteDatabase): trans.execute(stmt) else: from sqlalchemy import text @@ -2987,9 +3436,11 @@ class Temporary(Base): @pytest.mark.parametrize("conn", all_connectable) def test_invalid_engine(conn, request, test_frame1): - if conn == "sqlite_buildin": + if conn == "sqlite_buildin" or "adbc" in conn: request.applymarker( - pytest.mark.xfail(reason="SQLiteDatabase does not raise for bad engine") + pytest.mark.xfail( + reason="SQLiteDatabase/ADBCDatabase does not raise for bad engine" + ) ) conn = request.getfixturevalue(conn) @@ -3089,6 +3540,12 @@ def test_read_sql_dtype_backend( expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) + if "adbc" in conn_name: + # adbc does not support chunksize argument + request.applymarker( + pytest.mark.xfail(reason="adbc does not support chunksize argument") + ) + with pd.option_context("mode.string_storage", string_storage): iterator = getattr(pd, func)( f"Select * from {table}", @@ -3112,7 +3569,7 @@ def test_read_sql_dtype_backend_table( dtype_backend_data, dtype_backend_expected, ): - if "sqlite" in conn: + if "sqlite" in conn and "adbc" not in conn: request.applymarker( pytest.mark.xfail( reason=( @@ -3133,6 +3590,10 @@ def test_read_sql_dtype_backend_table( expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) + if "adbc" in conn_name: + # adbc does not support chunksize argument + return + with pd.option_context("mode.string_storage", string_storage): iterator = getattr(pd, func)( table, @@ 
-3179,7 +3640,9 @@ def dtype_backend_data() -> DataFrame: @pytest.fixture def dtype_backend_expected(): - def func(storage, dtype_backend, conn_name): + def func(storage, dtype_backend, conn_name) -> DataFrame: + string_array: StringArray | ArrowStringArray + string_array_na: StringArray | ArrowStringArray if storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) @@ -3227,6 +3690,10 @@ def func(storage, dtype_backend, conn_name): @pytest.mark.parametrize("conn", all_connectable) def test_chunksize_empty_dtypes(conn, request): # GH#50245 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) dtypes = {"a": "int64", "b": "object"} df = DataFrame(columns=["a", "b"]).astype(dtypes) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index f7f94af92743e..82e6c2964b8c5 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -32,6 +32,11 @@ read_stata, ) +# TODO(CoW-warn) avoid warnings in the stata reader code +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + @pytest.fixture def mixed_frame(): @@ -178,11 +183,13 @@ def test_read_dta2(self, datapath): path2 = datapath("io", "data", "stata", "stata2_115.dta") path3 = datapath("io", "data", "stata", "stata2_117.dta") - with tm.assert_produces_warning(UserWarning): + # TODO(CoW-warn) avoid warnings in the stata reader code + # once fixed -> remove `raise_on_extra_warnings=False` again + with tm.assert_produces_warning(UserWarning, raise_on_extra_warnings=False): parsed_114 = self.read_dta(path1) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, raise_on_extra_warnings=False): parsed_115 = self.read_dta(path2) - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning(UserWarning, raise_on_extra_warnings=False): parsed_117 = self.read_dta(path3) # FIXME: don't leave commented-out # 113 is buggy due to limits of date format support in Stata @@ -1833,15 +1840,14 @@ def test_encoding_latin1_118(self, datapath): @pytest.mark.slow def test_stata_119(self, datapath): # Gzipped since contains 32,999 variables and uncompressed is 20MiB + # Just validate that the reader reports correct number of variables + # to avoid high peak memory with gzip.open( datapath("io", "data", "stata", "stata1_119.dta.gz"), "rb" ) as gz: - df = read_stata(gz) - assert df.shape == (1, 32999) - assert df.iloc[0, 6] == "A" * 3000 - assert df.iloc[0, 7] == 3.14 - assert df.iloc[0, -1] == 1 - assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) + with StataReader(gz) as reader: + reader._ensure_open() + assert reader._nvar == 32999 @pytest.mark.parametrize("version", [118, 119, None]) def test_utf8_writer(self, version): diff --git a/pandas/tests/io/test_user_agent.py b/pandas/tests/io/test_user_agent.py deleted file mode 100644 index a892e51f2f28d..0000000000000 --- a/pandas/tests/io/test_user_agent.py +++ /dev/null @@ -1,403 +0,0 @@ -""" -Tests for the pandas custom headers in http(s) requests -""" -import gzip -import http.server -from io import BytesIO -import multiprocessing -import socket -import time -import urllib.error - -import pytest - -from pandas.compat import is_ci_environment -import pandas.util._test_decorators as td - -import pandas as pd -import pandas._testing as tm 
- -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment(), - reason="GH 45651: This test can hang in our CI min_versions build", - ), - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), -] - - -class BaseUserAgentResponder(http.server.BaseHTTPRequestHandler): - """ - Base class for setting up a server that can be set up to respond - with a particular file format with accompanying content-type headers. - The interfaces on the different io methods are different enough - that this seemed logical to do. - """ - - def start_processing_headers(self): - """ - shared logic at the start of a GET request - """ - self.send_response(200) - self.requested_from_user_agent = self.headers["User-Agent"] - response_df = pd.DataFrame( - { - "header": [self.requested_from_user_agent], - } - ) - return response_df - - def gzip_bytes(self, response_bytes): - """ - some web servers will send back gzipped files to save bandwidth - """ - with BytesIO() as bio: - with gzip.GzipFile(fileobj=bio, mode="w") as zipper: - zipper.write(response_bytes) - response_bytes = bio.getvalue() - return response_bytes - - def write_back_bytes(self, response_bytes): - """ - shared logic at the end of a GET request - """ - self.wfile.write(response_bytes) - - -class CSVUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - - self.send_header("Content-Type", "text/csv") - self.end_headers() - - response_bytes = response_df.to_csv(index=False).encode("utf-8") - self.write_back_bytes(response_bytes) - - -class GzippedCSVUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "text/csv") - self.send_header("Content-Encoding", "gzip") - self.end_headers() - - response_bytes = response_df.to_csv(index=False).encode("utf-8") - response_bytes = self.gzip_bytes(response_bytes) - - self.write_back_bytes(response_bytes) - - -class JSONUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/json") - self.end_headers() - - response_bytes = response_df.to_json().encode("utf-8") - - self.write_back_bytes(response_bytes) - - -class GzippedJSONUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/json") - self.send_header("Content-Encoding", "gzip") - self.end_headers() - - response_bytes = response_df.to_json().encode("utf-8") - response_bytes = self.gzip_bytes(response_bytes) - - self.write_back_bytes(response_bytes) - - -class HTMLUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "text/html") - self.end_headers() - - response_bytes = response_df.to_html(index=False).encode("utf-8") - - self.write_back_bytes(response_bytes) - - -class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - response_bytes = response_df.to_parquet(index=False, engine="pyarrow") - - self.write_back_bytes(response_bytes) - - -class ParquetFastParquetUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - 
self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - # the fastparquet engine doesn't like to write to a buffer - # it can do it via the open_with function being set appropriately - # however it automatically calls the close method and wipes the buffer - # so just overwrite that attribute on this instance to not do that - - # protected by an importorskip in the respective test - import fsspec - - response_df.to_parquet( - "memory://fastparquet_user_agent.parquet", - index=False, - engine="fastparquet", - compression=None, - ) - with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f: - response_bytes = f.read() - - self.write_back_bytes(response_bytes) - - -class PickleUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - bio = BytesIO() - response_df.to_pickle(bio) - response_bytes = bio.getvalue() - - self.write_back_bytes(response_bytes) - - -class StataUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - bio = BytesIO() - response_df.to_stata(bio, write_index=False) - response_bytes = bio.getvalue() - - self.write_back_bytes(response_bytes) - - -class AllHeaderCSVResponder(http.server.BaseHTTPRequestHandler): - """ - Send all request headers back for checking round trip - """ - - def do_GET(self): - response_df = pd.DataFrame(self.headers.items()) - self.send_response(200) - self.send_header("Content-Type", "text/csv") - self.end_headers() - response_bytes = response_df.to_csv(index=False).encode("utf-8") - self.wfile.write(response_bytes) - - -def wait_until_ready(func, *args, **kwargs): - def inner(*args, **kwargs): - while True: - try: - return func(*args, **kwargs) - except urllib.error.URLError: - # Connection refused as http server is starting - time.sleep(0.1) - - return inner - - -def process_server(responder, port): - with http.server.HTTPServer(("localhost", port), responder) as server: - server.handle_request() - server.server_close() - - -@pytest.fixture -def responder(request): - """ - Fixture that starts a local http server in a separate process on localhost - and returns the port. - - Running in a separate process instead of a thread to allow termination/killing - of http server upon cleanup. 
- """ - # Find an available port - with socket.socket() as sock: - sock.bind(("localhost", 0)) - port = sock.getsockname()[1] - - server_process = multiprocessing.Process( - target=process_server, args=(request.param, port) - ) - server_process.start() - yield port - server_process.join(10) - server_process.terminate() - kill_time = 5 - wait_time = 0 - while server_process.is_alive(): - if wait_time > kill_time: - server_process.kill() - break - wait_time += 0.1 - time.sleep(0.1) - server_process.close() - - -@pytest.mark.parametrize( - "responder, read_method, parquet_engine", - [ - (CSVUserAgentResponder, pd.read_csv, None), - (JSONUserAgentResponder, pd.read_json, None), - ( - HTMLUserAgentResponder, - lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], - None, - ), - (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), - pytest.param( - ParquetFastParquetUserAgentResponder, - pd.read_parquet, - "fastparquet", - # TODO(ArrayManager) fastparquet - marks=[ - td.skip_array_manager_not_yet_implemented, - ], - ), - (PickleUserAgentResponder, pd.read_pickle, None), - (StataUserAgentResponder, pd.read_stata, None), - (GzippedCSVUserAgentResponder, pd.read_csv, None), - (GzippedJSONUserAgentResponder, pd.read_json, None), - ], - indirect=["responder"], -) -def test_server_and_default_headers(responder, read_method, parquet_engine): - if parquet_engine is not None: - pytest.importorskip(parquet_engine) - if parquet_engine == "fastparquet": - pytest.importorskip("fsspec") - - read_method = wait_until_ready(read_method) - if parquet_engine is None: - df_http = read_method(f"http://localhost:{responder}") - else: - df_http = read_method(f"http://localhost:{responder}", engine=parquet_engine) - - assert not df_http.empty - - -@pytest.mark.parametrize( - "responder, read_method, parquet_engine", - [ - (CSVUserAgentResponder, pd.read_csv, None), - (JSONUserAgentResponder, pd.read_json, None), - ( - HTMLUserAgentResponder, - lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], - None, - ), - (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), - pytest.param( - ParquetFastParquetUserAgentResponder, - pd.read_parquet, - "fastparquet", - # TODO(ArrayManager) fastparquet - marks=[ - td.skip_array_manager_not_yet_implemented, - ], - ), - (PickleUserAgentResponder, pd.read_pickle, None), - (StataUserAgentResponder, pd.read_stata, None), - (GzippedCSVUserAgentResponder, pd.read_csv, None), - (GzippedJSONUserAgentResponder, pd.read_json, None), - ], - indirect=["responder"], -) -def test_server_and_custom_headers(responder, read_method, parquet_engine): - if parquet_engine is not None: - pytest.importorskip(parquet_engine) - if parquet_engine == "fastparquet": - pytest.importorskip("fsspec") - - custom_user_agent = "Super Cool One" - df_true = pd.DataFrame({"header": [custom_user_agent]}) - - read_method = wait_until_ready(read_method) - if parquet_engine is None: - df_http = read_method( - f"http://localhost:{responder}", - storage_options={"User-Agent": custom_user_agent}, - ) - else: - df_http = read_method( - f"http://localhost:{responder}", - storage_options={"User-Agent": custom_user_agent}, - engine=parquet_engine, - ) - - tm.assert_frame_equal(df_true, df_http) - - -@pytest.mark.parametrize( - "responder, read_method", - [ - (AllHeaderCSVResponder, pd.read_csv), - ], - indirect=["responder"], -) -def test_server_and_all_custom_headers(responder, read_method): - custom_user_agent = "Super Cool One" - custom_auth_token = "Super Secret One" - storage_options = { - 
"User-Agent": custom_user_agent, - "Auth": custom_auth_token, - } - read_method = wait_until_ready(read_method) - df_http = read_method( - f"http://localhost:{responder}", - storage_options=storage_options, - ) - - df_http = df_http[df_http["0"].isin(storage_options.keys())] - df_http = df_http.sort_values(["0"]).reset_index() - df_http = df_http[["0", "1"]] - - keys = list(storage_options.keys()) - df_true = pd.DataFrame({"0": keys, "1": [storage_options[k] for k in keys]}) - df_true = df_true.sort_values(["0"]) - df_true = df_true.reset_index().drop(["index"], axis=1) - - tm.assert_frame_equal(df_true, df_http) - - -@pytest.mark.parametrize( - "engine", - [ - "pyarrow", - "fastparquet", - ], -) -def test_to_parquet_to_disk_with_storage_options(engine): - headers = { - "User-Agent": "custom", - "Auth": "other_custom", - } - - pytest.importorskip(engine) - - true_df = pd.DataFrame({"column_name": ["column_value"]}) - msg = ( - "storage_options passed with file object or non-fsspec file path|" - "storage_options passed with buffer, or non-supported URL" - ) - with pytest.raises(ValueError, match=msg): - true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine) diff --git a/pandas/tests/io/xml/conftest.py b/pandas/tests/io/xml/conftest.py index c88616eb78029..aafda0ff62bbd 100644 --- a/pandas/tests/io/xml/conftest.py +++ b/pandas/tests/io/xml/conftest.py @@ -1,9 +1,11 @@ +from pathlib import Path + import pytest @pytest.fixture -def xml_data_path(tests_io_data_path, datapath): - return tests_io_data_path / "xml" +def xml_data_path(): + return Path(__file__).parent.parent / "data" / "xml" @pytest.fixture diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 2c8f4c4149528..e54764f9ac4a6 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -644,13 +644,13 @@ def test_mode(self, dtype, writable): values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 values.flags.writeable = writable - result = ht.mode(values, False) + result = ht.mode(values, False)[0] assert result == 42 def test_mode_stable(self, dtype, writable): values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable - keys = ht.mode(values, False) + keys = ht.mode(values, False)[0] tm.assert_numpy_array_equal(keys, values) @@ -658,7 +658,7 @@ def test_modes_with_nans(): # GH42688, nans aren't mangled nulls = [pd.NA, np.nan, pd.NaT, None] values = np.array([True] + nulls * 2, dtype=np.object_) - modes = ht.mode(values, False) + modes = ht.mode(values, False)[0] assert modes.size == len(nulls) @@ -724,8 +724,8 @@ def test_ismember_no(self, dtype): def test_mode(self, dtype): values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) - assert ht.mode(values, True) == 42 - assert np.isnan(ht.mode(values, False)) + assert ht.mode(values, True)[0] == 42 + assert np.isnan(ht.mode(values, False)[0]) def test_ismember_tuple_with_nans(): diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index fd5a66049bd24..69120160699c2 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -328,7 +328,7 @@ def _check_axes_shape(axes, axes_num=None, layout=None, figsize=None): ) -def _flatten_visible(axes): +def _flatten_visible(axes: Axes | Sequence[Axes]) -> Sequence[Axes]: """ Flatten axes, and filter only visible @@ -339,8 +339,8 @@ def _flatten_visible(axes): """ from pandas.plotting._matplotlib.tools import flatten_axes - axes = flatten_axes(axes) - axes 
= [ax for ax in axes if ax.get_visible()] + axes_ndarray = flatten_axes(axes) + axes = [ax for ax in axes_ndarray if ax.get_visible()] return axes diff --git a/pandas/tests/plotting/frame/test_frame_legend.py b/pandas/tests/plotting/frame/test_frame_legend.py index d2924930667b6..402a4b9531e5d 100644 --- a/pandas/tests/plotting/frame/test_frame_legend.py +++ b/pandas/tests/plotting/frame/test_frame_legend.py @@ -231,7 +231,7 @@ def test_legend_name(self): "line", "bar", "barh", - pytest.param("kde", marks=td.skip_if_no_scipy), + pytest.param("kde", marks=td.skip_if_no("scipy")), "area", "hist", ], diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index c0ad8e0c9608d..8a8643415ae12 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -80,7 +80,7 @@ def test_setting_backend_without_plot_raises(monkeypatch): assert pandas.options.plotting.backend == "matplotlib" -@td.skip_if_mpl +@td.skip_if_installed("matplotlib") def test_no_matplotlib_ok(): msg = ( 'matplotlib is required for plotting when the default backend "matplotlib" is ' diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 5b06cd1ab296a..56028249fe517 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -15,6 +15,7 @@ interval_range, period_range, plotting, + read_csv, ) import pandas._testing as tm from pandas.tests.plotting.common import ( @@ -30,7 +31,15 @@ cm = pytest.importorskip("matplotlib.cm") -@td.skip_if_mpl +@pytest.fixture +def iris(datapath) -> DataFrame: + """ + The iris dataset as a DataFrame. + """ + return read_csv(datapath("io", "data", "csv", "iris.csv")) + + +@td.skip_if_installed("matplotlib") def test_import_error_message(): # GH-19810 df = DataFrame({"A": [1, 2]}) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6719e39612f5d..f78f5c4879b9f 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -91,7 +91,7 @@ def test_plot_iseries(self, iseries): "line", "bar", "barh", - pytest.param("kde", marks=td.skip_if_no_scipy), + pytest.param("kde", marks=td.skip_if_no("scipy")), "hist", "box", ], diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index 90c2a91a22158..1033d908eb22d 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -1,5 +1,4 @@ from datetime import datetime -import warnings import numpy as np import pytest @@ -8,8 +7,6 @@ DataFrame, Series, ) -from pandas.core.indexes.datetimes import date_range -from pandas.core.indexes.period import period_range # The various methods we support downsample_methods = [ @@ -44,40 +41,6 @@ def resample_method(request): return request.param -@pytest.fixture -def simple_date_range_series(): - """ - Series with date range index and random data for test purposes. - """ - - def _simple_date_range_series(start, end, freq="D"): - rng = date_range(start, end, freq=freq) - return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - - return _simple_date_range_series - - -@pytest.fixture -def simple_period_range_series(): - """ - Series with period range index and random data for test purposes. 
- """ - - def _simple_period_range_series(start, end, freq="D"): - with warnings.catch_warnings(): - # suppress Period[B] deprecation warning - msg = "|".join(["Period with BDay freq", r"PeriodDtype\[B\] is deprecated"]) - warnings.filterwarnings( - "ignore", - msg, - category=FutureWarning, - ) - rng = period_range(start, end, freq=freq) - return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - - return _simple_period_range_series - - @pytest.fixture def _index_start(): """Fixture for parametrization of index, series and frame.""" diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 9cf4e68a9c5ec..98cd7d7ae22f6 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -53,6 +53,19 @@ def unit(request): return request.param +@pytest.fixture +def simple_date_range_series(): + """ + Series with date range index and random data for test purposes. + """ + + def _simple_date_range_series(start, end, freq="D"): + rng = date_range(start, end, freq=freq) + return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + return _simple_date_range_series + + def test_custom_grouper(index, unit): dti = index.as_unit(unit) s = Series(np.array([1] * len(dti)), index=dti, dtype="int64") @@ -1208,14 +1221,6 @@ def test_corner_cases(unit): tm.assert_index_equal(result.index, ex_index) -def test_corner_cases_period(simple_period_range_series): - # miscellaneous test coverage - len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] - # it works - result = len0pts.resample("Y-DEC").mean() - assert len(result) == 0 - - def test_corner_cases_date(simple_date_range_series, unit): # resample to periods ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") @@ -1385,7 +1390,9 @@ def test_resample_timegrouper(dates, unit): @pytest.mark.parametrize("dates", [dates1, dates2, dates3]) -def test_resample_timegrouper2(dates): +def test_resample_timegrouper2(dates, unit): + dates = DatetimeIndex(dates).as_unit(unit) + df = DataFrame({"A": dates, "B": np.arange(len(dates)), "C": np.arange(len(dates))}) result = df.set_index("A").resample("ME").count() @@ -1393,7 +1400,7 @@ def test_resample_timegrouper2(dates): ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], freq="ME", name="A", - ) + ).as_unit(unit) expected = DataFrame( {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, index=exp_idx, @@ -1859,7 +1866,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): # GH 24127 n1_ = n1 * k n2_ = n2 * k - dti = date_range("19910905 13:00", "19911005 07:00", freq=freq1).as_unit(unit) + dti = date_range("1991-09-05", "1991-09-12", freq=freq1).as_unit(unit) ser = Series(range(len(dti)), index=dti) result1 = ser.resample(str(n1_) + freq1).mean() @@ -2058,7 +2065,7 @@ def test_resample_empty_series_with_tz(): ) def test_resample_M_Q_Y_A_deprecated(freq, freq_depr): # GH#9586 - depr_msg = f"'{freq_depr[1:]}' will be deprecated, please use '{freq[1:]}' instead." + depr_msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." 
s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) expected = s.resample(freq).mean() @@ -2067,14 +2074,22 @@ def test_resample_M_Q_Y_A_deprecated(freq, freq_depr): tm.assert_series_equal(result, expected) -def test_resample_BM_deprecated(): +@pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2BME", "2BM"), + ("2BQE", "2BQ"), + ("2BQE-MAR", "2BQ-MAR"), + ], +) +def test_resample_BM_BQ_deprecated(freq, freq_depr): # GH#52064 - depr_msg = "'BM' is deprecated and will be removed in a future version." + depr_msg = f"'{freq_depr[1:]}' is deprecated, please use '{freq[1:]}' instead." s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) - expected = s.resample("2BME").mean() + expected = s.resample(freq).mean() with tm.assert_produces_warning(FutureWarning, match=depr_msg): - result = s.resample("2BM").mean() + result = s.resample(freq_depr).mean() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index f3a7c94fc9eaa..b3d346eb22ac5 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1,4 +1,5 @@ from datetime import datetime +import warnings import dateutil import numpy as np @@ -40,6 +41,27 @@ def _series_name(): return "pi" +@pytest.fixture +def simple_period_range_series(): + """ + Series with period range index and random data for test purposes. + """ + + def _simple_period_range_series(start, end, freq="D"): + with warnings.catch_warnings(): + # suppress Period[B] deprecation warning + msg = "|".join(["Period with BDay freq", r"PeriodDtype\[B\] is deprecated"]) + warnings.filterwarnings( + "ignore", + msg, + category=FutureWarning, + ) + rng = period_range(start, end, freq=freq) + return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + return _simple_period_range_series + + class TestPeriodIndex: @pytest.mark.parametrize("freq", ["2D", "1h", "2h"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) @@ -303,13 +325,17 @@ def test_with_local_timezone(self, tz): dateutil.tz.gettz("America/Los_Angeles"), ], ) - def test_resample_with_tz(self, tz): + def test_resample_with_tz(self, tz, unit): # GH 13238 - ser = Series(2, index=date_range("2017-01-01", periods=48, freq="h", tz=tz)) + dti = date_range("2017-01-01", periods=48, freq="h", tz=tz, unit=unit) + ser = Series(2, index=dti) result = ser.resample("D").mean() + exp_dti = pd.DatetimeIndex( + ["2017-01-01", "2017-01-02"], tz=tz, freq="D" + ).as_unit(unit) expected = Series( 2.0, - index=pd.DatetimeIndex(["2017-01-01", "2017-01-02"], tz=tz, freq="D"), + index=exp_dti, ) tm.assert_series_equal(result, expected) # Especially assert that the timezone is LMT for pytz @@ -478,8 +504,8 @@ def test_resample_weekly_all_na(self): expected = ts.asfreq("W-THU").ffill() tm.assert_series_equal(result, expected) - def test_resample_tz_localized(self): - dr = date_range(start="2012-4-13", end="2012-5-1") + def test_resample_tz_localized(self, unit): + dr = date_range(start="2012-4-13", end="2012-5-1", unit=unit) ts = Series(range(len(dr)), index=dr) ts_utc = ts.tz_localize("UTC") @@ -488,9 +514,7 @@ def test_resample_tz_localized(self): result = ts_local.resample("W").mean() ts_local_naive = ts_local.copy() - ts_local_naive.index = [ - x.replace(tzinfo=None) for x in ts_local_naive.index.to_pydatetime() - ] + ts_local_naive.index = ts_local_naive.index.tz_localize(None) exp = 
ts_local_naive.resample("W").mean().tz_localize("America/Los_Angeles") exp.index = pd.DatetimeIndex(exp.index, freq="W") @@ -604,8 +628,10 @@ def test_resample_with_dst_time_change(self): "2016-03-15 01:00:00-05:00", "2016-03-15 13:00:00-05:00", ] - index = pd.to_datetime(expected_index_values, utc=True).tz_convert( - "America/Chicago" + index = ( + pd.to_datetime(expected_index_values, utc=True) + .tz_convert("America/Chicago") + .as_unit(index.unit) ) index = pd.DatetimeIndex(index, freq="12h") expected = DataFrame( @@ -664,15 +690,15 @@ def test_default_left_closed_label(self, from_freq, to_freq): ) def test_all_values_single_bin(self): - # 2070 + # GH#2070 index = period_range(start="2012-01-01", end="2012-12-31", freq="M") - s = Series(np.random.default_rng(2).standard_normal(len(index)), index=index) + ser = Series(np.random.default_rng(2).standard_normal(len(index)), index=index) - result = s.resample("Y").mean() - tm.assert_almost_equal(result.iloc[0], s.mean()) + result = ser.resample("Y").mean() + tm.assert_almost_equal(result.iloc[0], ser.mean()) def test_evenly_divisible_with_no_extra_bins(self): - # 4076 + # GH#4076 # when the frequency is evenly divisible, sometimes extra bins df = DataFrame( @@ -682,7 +708,7 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample("5D").mean() expected = pd.concat([df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T expected.index = pd.DatetimeIndex( - [Timestamp("2000-1-1"), Timestamp("2000-1-6")], freq="5D" + [Timestamp("2000-1-1"), Timestamp("2000-1-6")], dtype="M8[ns]", freq="5D" ) tm.assert_frame_equal(result, expected) @@ -924,11 +950,46 @@ def test_resample_t_l_deprecated(self): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("freq_depr", ["2ME", "2QE", "2QE-FEB", "2YE", "2YE-MAR"]) -def test_resample_frequency_ME_QE_error_message(series_and_frame, freq_depr): +@pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2M", "2ME"), + ("2Q", "2QE"), + ("2Q-FEB", "2QE-FEB"), + ("2Y", "2YE"), + ("2Y-MAR", "2YE-MAR"), + ], +) +def test_resample_frequency_ME_QE_YE_error_message(series_and_frame, freq, freq_depr): + # GH#9586 + msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + + obj = series_and_frame + with pytest.raises(ValueError, match=msg): + obj.resample(freq_depr) + + +def test_corner_cases_period(simple_period_range_series): + # miscellaneous test coverage + len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] + # it works + result = len0pts.resample("Y-DEC").mean() + assert len(result) == 0 + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2BME", + "2CBME", + "2SME", + "2BQE-FEB", + "2BYE-MAR", + ], +) +def test_resample_frequency_invalid_freq(series_and_frame, freq_depr): # GH#9586 - msg = f"for Period, please use '{freq_depr[1:2]}{freq_depr[3:]}' " - f"instead of '{freq_depr[1:]}'" + msg = f"Invalid frequency: {freq_depr[1:]}" obj = series_and_frame with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 12618f42c7f07..81a054bbbc3df 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -116,7 +116,10 @@ def test_resample_group_keys(): # group_keys=True expected.index = pd.MultiIndex.from_arrays( - [pd.to_datetime(["2000-01-01", "2000-01-06"]).repeat(5), expected.index] + [ + pd.to_datetime(["2000-01-01", "2000-01-06"]).as_unit("ns").repeat(5), + expected.index, + ] ) g = df.resample("5D", 
group_keys=True) result = g.apply(lambda x: x) @@ -376,214 +379,245 @@ def test_agg_consistency_int_str_column_mix(): # `Base` test class -def test_agg(): - # test with all three Resampler apis and TimeGrouper - +@pytest.fixture +def index(): index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") index.name = "date" - df = DataFrame( + return index + + +@pytest.fixture +def df(index): + frame = DataFrame( np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index ) - df_col = df.reset_index() + return frame + + +@pytest.fixture +def df_col(df): + return df.reset_index() + + +@pytest.fixture +def df_mult(df_col, index): df_mult = df_col.copy() df_mult.index = pd.MultiIndex.from_arrays( - [range(10), df.index], names=["index", "date"] + [range(10), index], names=["index", "date"] ) - r = df.resample("2D") - cases = [ - r, - df_col.resample("2D", on="date"), - df_mult.resample("2D", level="date"), - df.groupby(pd.Grouper(freq="2D")), - ] + return df_mult + + +@pytest.fixture +def a_mean(df): + return df.resample("2D")["A"].mean() + + +@pytest.fixture +def a_std(df): + return df.resample("2D")["A"].std() + + +@pytest.fixture +def a_sum(df): + return df.resample("2D")["A"].sum() + + +@pytest.fixture +def b_mean(df): + return df.resample("2D")["B"].mean() + + +@pytest.fixture +def b_std(df): + return df.resample("2D")["B"].std() + + +@pytest.fixture +def b_sum(df): + return df.resample("2D")["B"].sum() + + +@pytest.fixture +def df_resample(df): + return df.resample("2D") + + +@pytest.fixture +def df_col_resample(df_col): + return df_col.resample("2D", on="date") + + +@pytest.fixture +def df_mult_resample(df_mult): + return df_mult.resample("2D", level="date") + + +@pytest.fixture +def df_grouper_resample(df): + return df.groupby(pd.Grouper(freq="2D")) - a_mean = r["A"].mean() - a_std = r["A"].std() - a_sum = r["A"].sum() - b_mean = r["B"].mean() - b_std = r["B"].std() - b_sum = r["B"].sum() +@pytest.fixture( + params=["df_resample", "df_col_resample", "df_mult_resample", "df_grouper_resample"] +) +def cases(request): + return request.getfixturevalue(request.param) + + +def test_agg_mixed_column_aggregation(cases, a_mean, a_std, b_mean, b_std, request): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) msg = "using SeriesGroupBy.[mean|std]" - for t in cases: - # In case 2, "date" is an index and a column, so get included in the agg - if t == cases[2]: - date_mean = t["date"].mean() - date_std = t["date"].std() - exp = pd.concat([date_mean, date_std, expected], axis=1) - exp.columns = pd.MultiIndex.from_product( - [["date", "A", "B"], ["mean", "std"]] - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate([np.mean, np.std]) - tm.assert_frame_equal(result, exp) - else: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate([np.mean, np.std]) - tm.assert_frame_equal(result, expected) + # "date" is an index and a column, so get included in the agg + if "df_mult" in request.node.callspec.id: + date_mean = cases["date"].mean() + date_std = cases["date"].std() + expected = pd.concat([date_mean, date_std, expected], axis=1) + expected.columns = pd.MultiIndex.from_product( + [["date", "A", "B"], ["mean", "std"]] + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = cases.aggregate([np.mean, np.std]) + tm.assert_frame_equal(result, expected) - expected = pd.concat([a_mean, b_std], axis=1) - for t in 
cases: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate({"A": np.mean, "B": np.std}) - tm.assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate(A=("A", np.mean), B=("B", np.std)) - tm.assert_frame_equal(result, expected, check_like=True) +@pytest.mark.parametrize( + "agg", + [ + {"func": {"A": np.mean, "B": np.std}}, + {"A": ("A", np.mean), "B": ("B", np.std)}, + {"A": NamedAgg("A", np.mean), "B": NamedAgg("B", np.std)}, + ], +) +def test_agg_both_mean_std_named_result(cases, a_mean, b_std, agg): + msg = "using SeriesGroupBy.[mean|std]" + expected = pd.concat([a_mean, b_std], axis=1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = cases.aggregate(**agg) + tm.assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std)) - tm.assert_frame_equal(result, expected, check_like=True) +def test_agg_both_mean_std_dict_of_list(cases, a_mean, a_std): expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) - for t in cases: - result = t.aggregate({"A": ["mean", "std"]}) - tm.assert_frame_equal(result, expected) + result = cases.aggregate({"A": ["mean", "std"]}) + tm.assert_frame_equal(result, expected) + +@pytest.mark.parametrize( + "agg", [{"func": ["mean", "sum"]}, {"mean": "mean", "sum": "sum"}] +) +def test_agg_both_mean_sum(cases, a_mean, a_sum, agg): expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ["mean", "sum"] - for t in cases: - result = t["A"].aggregate(["mean", "sum"]) - tm.assert_frame_equal(result, expected) + result = cases["A"].aggregate(**agg) + tm.assert_frame_equal(result, expected) - result = t["A"].aggregate(mean="mean", sum="sum") - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "agg", + [ + {"A": {"mean": "mean", "sum": "sum"}}, + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + }, + ], +) +def test_agg_dict_of_dict_specificationerror(cases, agg): msg = "nested renamer is not supported" - for t in cases: - with pytest.raises(pd.errors.SpecificationError, match=msg): - t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) + with pytest.raises(pd.errors.SpecificationError, match=msg): + cases.aggregate(agg) - expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples( - [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] - ) - for t in cases: - with pytest.raises(pd.errors.SpecificationError, match=msg): - t.aggregate( - { - "A": {"mean": "mean", "sum": "sum"}, - "B": {"mean2": "mean", "sum2": "sum"}, - } - ) +def test_agg_dict_of_lists(cases, a_mean, a_std, b_mean, b_std): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] ) - for t in cases: - result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples( - [ - ("r1", "A", "mean"), - ("r1", "A", "sum"), - ("r2", "B", "mean"), - ("r2", "B", "sum"), - ] - ) - - -def test_agg_misc(): - # test with all three Resampler apis and TimeGrouper - - index = date_range(datetime(2005, 1, 1), 
datetime(2005, 1, 10), freq="D") - index.name = "date" - df = DataFrame( - np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index - ) - df_col = df.reset_index() - df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays( - [range(10), df.index], names=["index", "date"] - ) + result = cases.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) + tm.assert_frame_equal(result, expected, check_like=True) - r = df.resample("2D") - cases = [ - r, - df_col.resample("2D", on="date"), - df_mult.resample("2D", level="date"), - df.groupby(pd.Grouper(freq="2D")), - ] +@pytest.mark.parametrize( + "agg", + [ + {"func": {"A": np.sum, "B": lambda x: np.std(x, ddof=1)}}, + {"A": ("A", np.sum), "B": ("B", lambda x: np.std(x, ddof=1))}, + {"A": NamedAgg("A", np.sum), "B": NamedAgg("B", lambda x: np.std(x, ddof=1))}, + ], +) +def test_agg_with_lambda(cases, agg): # passed lambda msg = "using SeriesGroupBy.sum" - for t in cases: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) - rcustom = t["B"].apply(lambda x: np.std(x, ddof=1)) - expected = pd.concat([r["A"].sum(), rcustom], axis=1) - tm.assert_frame_equal(result, expected, check_like=True) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.agg(A=("A", np.sum), B=("B", lambda x: np.std(x, ddof=1))) - tm.assert_frame_equal(result, expected, check_like=True) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.agg( - A=NamedAgg("A", np.sum), B=NamedAgg("B", lambda x: np.std(x, ddof=1)) - ) - tm.assert_frame_equal(result, expected, check_like=True) + rcustom = cases["B"].apply(lambda x: np.std(x, ddof=1)) + expected = pd.concat([cases["A"].sum(), rcustom], axis=1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = cases.agg(**agg) + tm.assert_frame_equal(result, expected, check_like=True) - # agg with renamers - expected = pd.concat( - [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] - ) +@pytest.mark.parametrize( + "agg", + [ + {"func": {"result1": np.sum, "result2": np.mean}}, + {"A": ("result1", np.sum), "B": ("result2", np.mean)}, + {"A": NamedAgg("result1", np.sum), "B": NamedAgg("result2", np.mean)}, + ], +) +def test_agg_no_column(cases, agg): msg = r"Column\(s\) \['result1', 'result2'\] do not exist" - for t in cases: - with pytest.raises(KeyError, match=msg): - t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean}) + with pytest.raises(KeyError, match=msg): + cases[["A", "B"]].agg(**agg) - with pytest.raises(KeyError, match=msg): - t[["A", "B"]].agg(A=("result1", np.sum), B=("result2", np.mean)) - - with pytest.raises(KeyError, match=msg): - t[["A", "B"]].agg( - A=NamedAgg("result1", np.sum), B=NamedAgg("result2", np.mean) - ) +@pytest.mark.parametrize( + "cols, agg", + [ + [None, {"A": ["sum", "std"], "B": ["mean", "std"]}], + [ + [ + "A", + "B", + ], + {"A": ["sum", "std"], "B": ["mean", "std"]}, + ], + ], +) +def test_agg_specificationerror_nested(cases, cols, agg, a_sum, a_std, b_mean, b_std): # agg with different hows - expected = pd.concat( - [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 - ) + # equivalent of using a selection list / or not + expected = pd.concat([a_sum, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] 
) - for t in cases: - result = t.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) + if cols is not None: + obj = cases[cols] + else: + obj = cases + + result = obj.agg(agg) + tm.assert_frame_equal(result, expected, check_like=True) - # equivalent of using a selection list / or not - for t in cases: - result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) +@pytest.mark.parametrize( + "agg", [{"A": ["sum", "std"]}, {"A": ["sum", "std"], "B": ["mean", "std"]}] +) +def test_agg_specificationerror_series(cases, agg): msg = "nested renamer is not supported" # series like aggs - for t in cases: - with pytest.raises(pd.errors.SpecificationError, match=msg): - t["A"].agg({"A": ["sum", "std"]}) + with pytest.raises(pd.errors.SpecificationError, match=msg): + cases["A"].agg(agg) - with pytest.raises(pd.errors.SpecificationError, match=msg): - t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) +def test_agg_specificationerror_invalid_names(cases): # errors # invalid names in the agg specification msg = r"Column\(s\) \['B'\] do not exist" - for t in cases: - with pytest.raises(KeyError, match=msg): - t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) + with pytest.raises(KeyError, match=msg): + cases[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) @pytest.mark.parametrize( diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index fd02216934576..4afd3b477c3ee 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -349,9 +349,9 @@ def weighted_quantile(series, weights, q): tm.assert_series_equal(result, expected) -def test_resample_groupby_with_label(): +def test_resample_groupby_with_label(unit): # GH 13235 - index = date_range("2000-01-01", freq="2D", periods=5) + index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -359,8 +359,9 @@ def test_resample_groupby_with_label(): mi = [ np.array([0, 0, 1, 2], dtype=np.int64), - pd.to_datetime( - np.array(["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"]) + np.array( + ["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"], + dtype=f"M8[{unit}]", ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) @@ -505,7 +506,9 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, - pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2), + pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2).as_unit( + "ns" + ), ], names=["key", "date"], ) @@ -530,19 +533,15 @@ def test_groupby_resample_with_list_of_keys(): } ) result = df.groupby("group").resample("2D", on="date")[["val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df["date"]._values[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "val": [4.0, 3.5, 6.5, 3.0], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) @@ -605,17 +604,17 @@ def test_groupby_resample_size_all_index_same(): msg = 
"DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").resample("D").size() + + mi_exp = pd.MultiIndex.from_arrays( + [ + [1, 1, 2, 2], + pd.DatetimeIndex(["2000-12-31", "2001-01-01"] * 2, dtype="M8[ns]"), + ], + names=["A", None], + ) expected = Series( 3, - index=pd.MultiIndex.from_tuples( - [ - (1, Timestamp("2000-12-31")), - (1, Timestamp("2001-01-01")), - (2, Timestamp("2000-12-31")), - (2, Timestamp("2001-01-01")), - ], - names=["A", None], - ), + index=mi_exp, ) tm.assert_series_equal(result, expected) @@ -627,25 +626,18 @@ def test_groupby_resample_on_index_with_list_of_keys(): "group": [0, 0, 0, 0, 1, 1, 1, 1], "val": [3, 1, 4, 1, 5, 9, 2, 6], }, - index=Series( - date_range(start="2016-01-01", periods=8), - name="date", - ), + index=date_range(start="2016-01-01", periods=8, name="date"), ) result = df.groupby("group").resample("2D")[["val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df.index[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "val": [2.0, 2.5, 7.0, 4.0], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) @@ -659,26 +651,19 @@ def test_groupby_resample_on_index_with_list_of_keys_multi_columns(): "second_val": [2, 7, 1, 8, 2, 8, 1, 8], "third_val": [1, 4, 1, 4, 2, 1, 3, 5], }, - index=Series( - date_range(start="2016-01-01", periods=8), - name="date", - ), + index=date_range(start="2016-01-01", periods=8, name="date"), ) result = df.groupby("group").resample("2D")[["first_val", "second_val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df.index[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "first_val": [2.0, 2.5, 7.0, 4.0], "second_val": [4.5, 4.5, 5.0, 4.5], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 41d34be79bc9c..1dc25cb9d4c1e 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -7,6 +7,7 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, Timestamp, ) @@ -138,13 +139,17 @@ def test_aggregate_normal(resample_method): normal_df["key"] = [1, 2, 3, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - datetime(2013, 1, 3), - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -193,11 +198,11 @@ def test_aggregate_nth(): ], ) def test_resample_entirely_nat_window(method, method_args, unit): - s = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4)) - result = methodcaller(method, **method_args)(s.resample("2d")) - expected = Series( - [0.0, unit], index=pd.DatetimeIndex(["2017-01-01", "2017-01-03"], freq="2D") - ) + ser = Series([0] * 2 + [np.nan] * 2, 
index=date_range("2017", periods=4)) + result = methodcaller(method, **method_args)(ser.resample("2d")) + + exp_dti = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], dtype="M8[ns]", freq="2D") + expected = Series([0.0, unit], index=exp_dti) tm.assert_series_equal(result, expected) @@ -216,13 +221,17 @@ def test_aggregate_with_nat(func, fill_value): normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - pd.NaT, - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -233,7 +242,13 @@ def test_aggregate_with_nat(func, fill_value): pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"]) expected = pd.concat([normal_result, pad]) expected = expected.sort_index() - dti = date_range(start="2013-01-01", freq="D", periods=5, name="key") + dti = date_range( + start="2013-01-01", + freq="D", + periods=5, + name="key", + unit=dt_df["key"]._values.unit, + ) expected.index = dti._with_freq(None) # TODO: is this desired? tm.assert_frame_equal(expected, dt_result) assert dt_result.index.name == "key" @@ -247,13 +262,17 @@ def test_aggregate_with_nat_size(): normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - pd.NaT, - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -265,7 +284,11 @@ def test_aggregate_with_nat_size(): expected = pd.concat([normal_result, pad]) expected = expected.sort_index() expected.index = date_range( - start="2013-01-01", freq="D", periods=5, name="key" + start="2013-01-01", + freq="D", + periods=5, + name="key", + unit=dt_df["key"]._values.unit, )._with_freq(None) tm.assert_series_equal(expected, dt_result) assert dt_result.index.name == "key" @@ -304,10 +327,11 @@ def test_repr(): ], ) def test_upsample_sum(method, method_args, expected_values): - s = Series(1, index=date_range("2017", periods=2, freq="h")) - resampled = s.resample("30min") + ser = Series(1, index=date_range("2017", periods=2, freq="h")) + resampled = ser.resample("30min") index = pd.DatetimeIndex( ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"], + dtype="M8[ns]", freq="30min", ) result = methodcaller(method, **method_args)(resampled) @@ -332,25 +356,12 @@ def test_groupby_resample_interpolate(): .interpolate(method="linear") ) - expected_ind = pd.MultiIndex.from_tuples( - [ - (50, Timestamp("2018-01-07")), - (50, Timestamp("2018-01-08")), - (50, Timestamp("2018-01-09")), - (50, Timestamp("2018-01-10")), - (50, Timestamp("2018-01-11")), - (50, Timestamp("2018-01-12")), - (50, Timestamp("2018-01-13")), - (50, Timestamp("2018-01-14")), - (50, Timestamp("2018-01-15")), - (50, Timestamp("2018-01-16")), - (50, Timestamp("2018-01-17")), - (50, Timestamp("2018-01-18")), - (50, Timestamp("2018-01-19")), - (50, Timestamp("2018-01-20")), - (50, Timestamp("2018-01-21")), - (60, Timestamp("2018-01-14")), - ], + volume = [50] * 15 
+ [60] + week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ + Timestamp("2018-01-14") + ] + expected_ind = pd.MultiIndex.from_arrays( + [volume, week_starting], names=["volume", "week_starting"], ) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index a87042180df8f..ab20e8c8f6930 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -315,7 +315,7 @@ def test_concatlike_datetimetz_short(self, tz): exp_idx = pd.DatetimeIndex( ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], tz=tz, - ) + ).as_unit("ns") exp = DataFrame(0, index=exp_idx, columns=["A", "B"]) tm.assert_frame_equal(df1._append(df2), exp) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 22e1d75255b3f..c061a393bb1a6 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -63,6 +63,7 @@ def test_concat_datetime_timezone(self): ) .tz_convert("UTC") .tz_convert("Europe/Paris") + .as_unit("ns") ) expected = DataFrame( @@ -84,7 +85,7 @@ def test_concat_datetime_timezone(self): "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", ] - ) + ).as_unit("ns") expected = DataFrame( [ @@ -197,8 +198,8 @@ def test_concat_NaT_series2(self): def test_concat_NaT_dataframes(self, tz): # GH 12396 - first = DataFrame([[pd.NaT], [pd.NaT]]) - first = first.apply(lambda x: x.dt.tz_localize(tz)) + dti = DatetimeIndex([pd.NaT, pd.NaT], tz=tz) + first = DataFrame({0: dti}) second = DataFrame( [[Timestamp("2015/01/01", tz=tz)], [Timestamp("2016/01/01", tz=tz)]], index=[2, 3], diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index c630ba6a43cb1..a38e4ffe2eaf7 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -771,13 +771,13 @@ def test_join_datetime_string(self): ], columns=["x", "y", "a"], ) - dfa["x"] = pd.to_datetime(dfa["x"]) + dfa["x"] = pd.to_datetime(dfa["x"]).astype("M8[ns]") dfb = DataFrame( [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], columns=["x", "y", "z"], index=[2, 4], ) - dfb["x"] = pd.to_datetime(dfb["x"]) + dfb["x"] = pd.to_datetime(dfb["x"]).astype("M8[ns]") result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) expected = DataFrame( [ @@ -787,6 +787,7 @@ def test_join_datetime_string(self): index=[2, 4], columns=["x", "y", "z", "a"], ) + expected["x"] = expected["x"].dt.as_unit("ns") tm.assert_frame_equal(result, expected) def test_join_with_categorical_index(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 495fb4536f62b..7538894bbf1c9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -881,9 +881,9 @@ def test_merge_on_datetime64tz_empty(self): dtz = pd.DatetimeTZDtype(tz="UTC") right = DataFrame( { - "date": [pd.Timestamp("2018", tz=dtz.tz)], + "date": DatetimeIndex(["2018"], dtype=dtz), "value": [4.0], - "date2": [pd.Timestamp("2019", tz=dtz.tz)], + "date2": DatetimeIndex(["2019"], dtype=dtz), }, columns=["date", "value", "date2"], ) @@ -1346,9 +1346,12 @@ def test_merge_two_empty_df_no_division_error(self): CategoricalIndex([1, 2, 4, None, None, None]), ), ( - DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]), DatetimeIndex( - ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT] + ["2001-01-01", 
"2002-02-02", "2003-03-03"], dtype="M8[ns]" + ), + DatetimeIndex( + ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT], + dtype="M8[ns]", ), ), *[ diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index d7aed23d63cfb..6d0a405430c9f 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3477,16 +3477,19 @@ def test_merge_asof_numeri_column_in_index_object_dtype(): merge_asof(left, right, left_on="a", right_on="a") -def test_merge_asof_array_as_on(): +def test_merge_asof_array_as_on(unit): # GH#42844 + dti = pd.DatetimeIndex( + ["2021/01/01 00:37", "2021/01/01 01:40"], dtype=f"M8[{unit}]" + ) right = pd.DataFrame( { "a": [2, 6], - "ts": [pd.Timestamp("2021/01/01 00:37"), pd.Timestamp("2021/01/01 01:40")], + "ts": dti, } ) ts_merge = pd.date_range( - start=pd.Timestamp("2021/01/01 00:00"), periods=3, freq="1h" + start=pd.Timestamp("2021/01/01 00:00"), periods=3, freq="1h", unit=unit ) left = pd.DataFrame({"b": [4, 8, 7]}) result = merge_asof( @@ -3511,7 +3514,7 @@ def test_merge_asof_array_as_on(): expected = pd.DataFrame( { "a": [2, 6], - "ts": [pd.Timestamp("2021/01/01 00:37"), pd.Timestamp("2021/01/01 01:40")], + "ts": dti, "b": [4, 8], } ) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 8c4c51289870b..f5880f58064aa 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -529,12 +529,12 @@ def test_datetime_tz_cut_mismatched_tzawareness(box): def test_datetime_tz_cut(bins, box): # see gh-19872 tz = "US/Eastern" - s = Series(date_range("20130101", periods=3, tz=tz)) + ser = Series(date_range("20130101", periods=3, tz=tz)) if not isinstance(bins, int): bins = box(bins) - result = cut(s, bins) + result = cut(ser, bins) expected = Series( IntervalIndex( [ @@ -686,10 +686,10 @@ def test_cut_unordered_with_missing_labels_raises_error(): def test_cut_unordered_with_series_labels(): # https://github.com/pandas-dev/pandas/issues/36603 - s = Series([1, 2, 3, 4, 5]) + ser = Series([1, 2, 3, 4, 5]) bins = Series([0, 2, 4, 6]) labels = Series(["a", "b", "c"]) - result = cut(s, bins=bins, labels=labels, ordered=False) + result = cut(ser, bins=bins, labels=labels, ordered=False) expected = Series(["a", "a", "b", "b", "c"], dtype="category") tm.assert_series_equal(result, expected) @@ -710,8 +710,8 @@ def test_cut_with_duplicated_index_lowest_included(): dtype="category", ).cat.as_ordered() - s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) - result = cut(s, bins=[0, 2, 4], include_lowest=True) + ser = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) + result = cut(ser, bins=[0, 2, 4], include_lowest=True) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index aecdf0a9c6975..f8ececf6c0540 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -545,40 +545,48 @@ def test_pivot_index_with_nan_dates(self, method): tm.assert_frame_equal(result, pv.T) @pytest.mark.parametrize("method", [True, False]) - def test_pivot_with_tz(self, method): + def test_pivot_with_tz(self, method, unit): # GH 5878 df = DataFrame( { - "dt1": [ - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - ], - "dt2": [ - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 2, 9, 0), - datetime(2014, 1, 2, 9, 0), - ], + "dt1": 
pd.DatetimeIndex( + [ + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + ], + dtype=f"M8[{unit}, US/Pacific]", + ), + "dt2": pd.DatetimeIndex( + [ + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 2, 9, 0), + datetime(2014, 1, 2, 9, 0), + ], + dtype=f"M8[{unit}, Asia/Tokyo]", + ), "data1": np.arange(4, dtype="int64"), "data2": np.arange(4, dtype="int64"), } ) - df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) - df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) - exp_col1 = Index(["data1", "data1", "data2", "data2"]) exp_col2 = pd.DatetimeIndex( - ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo" + ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, + name="dt2", + dtype=f"M8[{unit}, Asia/Tokyo]", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) + exp_idx = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], + name="dt1", + dtype=f"M8[{unit}, US/Pacific]", + ) expected = DataFrame( [[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.DatetimeIndex( - ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" - ), + index=exp_idx, columns=exp_col, ) @@ -590,12 +598,8 @@ def test_pivot_with_tz(self, method): expected = DataFrame( [[0, 2], [1, 3]], - index=pd.DatetimeIndex( - ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" - ), - columns=pd.DatetimeIndex( - ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" - ), + index=exp_idx, + columns=exp_col2[:2], ) if method: @@ -1534,22 +1538,29 @@ def test_pivot_timegrouper_double(self): tm.assert_frame_equal(result, expected.T) def test_pivot_datetime_tz(self): - dates1 = [ - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - ] - dates2 = [ - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - ] + dates1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + dtype="M8[ns, US/Pacific]", + name="dt1", + ) + dates2 = pd.DatetimeIndex( + [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ], + dtype="M8[ns, Asia/Tokyo]", + ) df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], @@ -1559,14 +1570,8 @@ def test_pivot_datetime_tz(self): "value2": [1, 2] * 3, } ) - df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) - df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) - exp_idx = pd.DatetimeIndex( - ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - tz="US/Pacific", - name="dt1", - ) + exp_idx = dates1[:3] exp_col1 = Index(["value1", "value1"]) exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) @@ -1580,7 +1585,7 @@ def test_pivot_datetime_tz(self): exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2) exp_col3 = pd.DatetimeIndex( ["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, - tz="Asia/Tokyo", + dtype="M8[ns, Asia/Tokyo]", name="dt2", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) @@ -1625,22 +1630,26 @@ def test_pivot_datetime_tz(self): def test_pivot_dtaccessor(self): # GH 
8103 - dates1 = [ - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - ] - dates2 = [ - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - ] + dates1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + ) + dates2 = pd.DatetimeIndex( + [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + ) df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], @@ -1650,8 +1659,6 @@ def test_pivot_dtaccessor(self): "value2": [1, 2] * 3, } ) - df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d)) - df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d)) result = pivot_table( df, index="label", columns=df["dt1"].dt.hour, values="value1" @@ -1712,39 +1719,38 @@ def test_pivot_dtaccessor(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("i", range(1, 367)) - def test_daily(self, i): + def test_daily(self): rng = date_range("1/1/2000", "12/31/2004", freq="D") - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + ts = Series(np.arange(len(rng)), index=rng) - annual = pivot_table( + result = pivot_table( DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear ) - annual.columns = annual.columns.droplevel(0) + result.columns = result.columns.droplevel(0) doy = np.asarray(ts.index.dayofyear) - subset = ts[doy == i] - subset.index = subset.index.year - - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - assert result.name == i + expected = {} + for y in ts.index.year.unique().values: + mask = ts.index.year == y + expected[y] = Series(ts.values[mask], index=doy[mask]) + expected = DataFrame(expected, dtype=float).T + tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("i", range(1, 13)) - def test_monthly(self, i): + def test_monthly(self): rng = date_range("1/1/2000", "12/31/2004", freq="ME") - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + ts = Series(np.arange(len(rng)), index=rng) - annual = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month) - annual.columns = annual.columns.droplevel(0) + result = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month) + result.columns = result.columns.droplevel(0) - month = ts.index.month - subset = ts[month == i] - subset.index = subset.index.year - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - assert result.name == i + month = np.asarray(ts.index.month) + expected = {} + for y in ts.index.year.unique().values: + mask = ts.index.year == y + expected[y] = Series(ts.values[mask], index=month[mask]) + expected = DataFrame(expected, dtype=float).T + tm.assert_frame_equal(result, expected) def test_pivot_table_with_iterator_values(self, data): # GH 12017 @@ -2664,3 +2670,18 @@ def test_pivot_table_handles_explicit_datetime_types(self): names=["a", "date"], ) tm.assert_index_equal(pivot.index, expected) + + def test_pivot_table_with_margins_and_numeric_column_names(self): + # GH#26568 + df = DataFrame([["a", "x", 1], ["a", "y", 2], ["b", "y", 3], ["b", "z", 4]]) + + result = df.pivot_table( + index=0, columns=1, values=2, aggfunc="sum", 
fill_value=0, margins=True + ) + + expected = DataFrame( + [[1, 2, 0, 3], [0, 3, 4, 7], [1, 5, 4, 10]], + columns=Index(["x", "y", "z", "All"], name=1), + index=Index(["a", "b", "All"], name=0), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_arithmetic.py b/pandas/tests/scalar/interval/test_arithmetic.py index 863446c64de42..603763227cb88 100644 --- a/pandas/tests/scalar/interval/test_arithmetic.py +++ b/pandas/tests/scalar/interval/test_arithmetic.py @@ -8,56 +8,185 @@ Timedelta, Timestamp, ) +import pandas._testing as tm -@pytest.mark.parametrize("method", ["__add__", "__sub__"]) -@pytest.mark.parametrize( - "interval", - [ - Interval(Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00")), - Interval(Timedelta(days=7), Timedelta(days=14)), - ], -) -@pytest.mark.parametrize( - "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] -) -def test_time_interval_add_subtract_timedelta(interval, delta, method): - # https://github.com/pandas-dev/pandas/issues/32023 - result = getattr(interval, method)(delta) - left = getattr(interval.left, method)(delta) - right = getattr(interval.right, method)(delta) - expected = Interval(left, right) +class TestIntervalArithmetic: + def test_interval_add(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(1, 2, closed=closed) - assert result == expected + result = interval + 1 + assert result == expected + result = 1 + interval + assert result == expected -@pytest.mark.parametrize("interval", [Interval(1, 2), Interval(1.0, 2.0)]) -@pytest.mark.parametrize( - "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] -) -def test_numeric_interval_add_timedelta_raises(interval, delta): - # https://github.com/pandas-dev/pandas/issues/32023 - msg = "|".join( + result = interval + result += 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for \+" + with pytest.raises(TypeError, match=msg): + interval + interval + + with pytest.raises(TypeError, match=msg): + interval + "foo" + + def test_interval_sub(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(-1, 0, closed=closed) + + result = interval - 1 + assert result == expected + + result = interval + result -= 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for -" + with pytest.raises(TypeError, match=msg): + interval - interval + + with pytest.raises(TypeError, match=msg): + interval - "foo" + + def test_interval_mult(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 2, closed=closed) + + result = interval * 2 + assert result == expected + + result = 2 * interval + assert result == expected + + result = interval + result *= 2 + assert result == expected + + msg = r"unsupported operand type\(s\) for \*" + with pytest.raises(TypeError, match=msg): + interval * interval + + msg = r"can\'t multiply sequence by non-int" + with pytest.raises(TypeError, match=msg): + interval * "foo" + + def test_interval_div(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 0.5, closed=closed) + + result = interval / 2.0 + assert result == expected + + result = interval + result /= 2.0 + assert result == expected + + msg = r"unsupported operand type\(s\) for /" + with pytest.raises(TypeError, match=msg): + interval / interval + + with pytest.raises(TypeError, match=msg): + interval / "foo" + + def test_interval_floordiv(self, closed): + interval = Interval(1, 2, closed=closed) + expected = 
Interval(0, 1, closed=closed) + + result = interval // 2 + assert result == expected + + result = interval + result //= 2 + assert result == expected + + msg = r"unsupported operand type\(s\) for //" + with pytest.raises(TypeError, match=msg): + interval // interval + + with pytest.raises(TypeError, match=msg): + interval // "foo" + + @pytest.mark.parametrize("method", ["__add__", "__sub__"]) + @pytest.mark.parametrize( + "interval", [ - "unsupported operand", - "cannot use operands", - "Only numeric, Timestamp and Timedelta endpoints are allowed", - ] + Interval( + Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00") + ), + Interval(Timedelta(days=7), Timedelta(days=14)), + ], ) - with pytest.raises((TypeError, ValueError), match=msg): - interval + delta + @pytest.mark.parametrize( + "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] + ) + def test_time_interval_add_subtract_timedelta(self, interval, delta, method): + # https://github.com/pandas-dev/pandas/issues/32023 + result = getattr(interval, method)(delta) + left = getattr(interval.left, method)(delta) + right = getattr(interval.right, method)(delta) + expected = Interval(left, right) + + assert result == expected + + @pytest.mark.parametrize("interval", [Interval(1, 2), Interval(1.0, 2.0)]) + @pytest.mark.parametrize( + "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] + ) + def test_numeric_interval_add_timedelta_raises(self, interval, delta): + # https://github.com/pandas-dev/pandas/issues/32023 + msg = "|".join( + [ + "unsupported operand", + "cannot use operands", + "Only numeric, Timestamp and Timedelta endpoints are allowed", + ] + ) + with pytest.raises((TypeError, ValueError), match=msg): + interval + delta + + with pytest.raises((TypeError, ValueError), match=msg): + delta + interval + + @pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) + def test_timedelta_add_timestamp_interval(self, klass): + delta = klass(0) + expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + + result = delta + expected + assert result == expected + + result = expected + delta + assert result == expected - with pytest.raises((TypeError, ValueError), match=msg): - delta + interval +class TestIntervalComparisons: + def test_interval_equal(self): + assert Interval(0, 1) == Interval(0, 1, closed="right") + assert Interval(0, 1) != Interval(0, 1, closed="left") + assert Interval(0, 1) != 0 -@pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) -def test_timedelta_add_timestamp_interval(klass): - delta = klass(0) - expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + def test_interval_comparison(self): + msg = ( + "'<' not supported between instances of " + "'pandas._libs.interval.Interval' and 'int'" + ) + with pytest.raises(TypeError, match=msg): + Interval(0, 1) < 2 - result = delta + expected - assert result == expected + assert Interval(0, 1) < Interval(1, 2) + assert Interval(0, 1) < Interval(0, 2) + assert Interval(0, 1) < Interval(0.5, 1.5) + assert Interval(0, 1) <= Interval(0, 1) + assert Interval(0, 1) > Interval(-1, 2) + assert Interval(0, 1) >= Interval(0, 1) - result = expected + delta - assert result == expected + def test_equality_comparison_broadcasts_over_array(self): + # https://github.com/pandas-dev/pandas/issues/35931 + interval = Interval(0, 1) + arr = np.array([interval, interval]) + result = interval == arr + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git 
a/pandas/tests/scalar/interval/test_constructors.py b/pandas/tests/scalar/interval/test_constructors.py new file mode 100644 index 0000000000000..a4bc00b923434 --- /dev/null +++ b/pandas/tests/scalar/interval/test_constructors.py @@ -0,0 +1,51 @@ +import pytest + +from pandas import ( + Interval, + Period, + Timestamp, +) + + +class TestIntervalConstructors: + @pytest.mark.parametrize( + "left, right", + [ + ("a", "z"), + (("a", "b"), ("c", "d")), + (list("AB"), list("ab")), + (Interval(0, 1), Interval(1, 2)), + (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), + ], + ) + def test_construct_errors(self, left, right): + # GH#23013 + msg = "Only numeric, Timestamp and Timedelta endpoints are allowed" + with pytest.raises(ValueError, match=msg): + Interval(left, right) + + def test_constructor_errors(self): + msg = "invalid option for 'closed': foo" + with pytest.raises(ValueError, match=msg): + Interval(0, 1, closed="foo") + + msg = "left side of interval must be <= right side" + with pytest.raises(ValueError, match=msg): + Interval(1, 0) + + @pytest.mark.parametrize( + "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] + ) + def test_constructor_errors_tz(self, tz_left, tz_right): + # GH#18538 + left = Timestamp("2017-01-01", tz=tz_left) + right = Timestamp("2017-01-02", tz=tz_right) + + if tz_left is None or tz_right is None: + error = TypeError + msg = "Cannot compare tz-naive and tz-aware timestamps" + else: + error = ValueError + msg = "left and right must have the same time zone" + with pytest.raises(error, match=msg): + Interval(left, right) diff --git a/pandas/tests/scalar/interval/test_contains.py b/pandas/tests/scalar/interval/test_contains.py new file mode 100644 index 0000000000000..8dfca117a658b --- /dev/null +++ b/pandas/tests/scalar/interval/test_contains.py @@ -0,0 +1,73 @@ +import pytest + +from pandas import ( + Interval, + Timedelta, + Timestamp, +) + + +class TestContains: + def test_contains(self): + interval = Interval(0, 1) + assert 0.5 in interval + assert 1 in interval + assert 0 not in interval + + interval_both = Interval(0, 1, "both") + assert 0 in interval_both + assert 1 in interval_both + + interval_neither = Interval(0, 1, closed="neither") + assert 0 not in interval_neither + assert 0.5 in interval_neither + assert 1 not in interval_neither + + def test_contains_interval(self, inclusive_endpoints_fixture): + interval1 = Interval(0, 1, "both") + interval2 = Interval(0, 1, inclusive_endpoints_fixture) + assert interval1 in interval1 + assert interval2 in interval2 + assert interval2 in interval1 + assert interval1 not in interval2 or inclusive_endpoints_fixture == "both" + + def test_contains_infinite_length(self): + interval1 = Interval(0, 1, "both") + interval2 = Interval(float("-inf"), float("inf"), "neither") + assert interval1 in interval2 + assert interval2 not in interval1 + + def test_contains_zero_length(self): + interval1 = Interval(0, 1, "both") + interval2 = Interval(-1, -1, "both") + interval3 = Interval(0.5, 0.5, "both") + assert interval2 not in interval1 + assert interval3 in interval1 + assert interval2 not in interval3 and interval3 not in interval2 + assert interval1 not in interval2 and interval1 not in interval3 + + @pytest.mark.parametrize( + "type1", + [ + (0, 1), + (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), + (Timedelta("0h"), Timedelta("1h")), + ], + ) + @pytest.mark.parametrize( + "type2", + [ + (0, 1), + (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), + (Timedelta("0h"), 
Timedelta("1h")), + ], + ) + def test_contains_mixed_types(self, type1, type2): + interval1 = Interval(*type1) + interval2 = Interval(*type2) + if type1 == type2: + assert interval1 in interval2 + else: + msg = "^'<=' not supported between instances of" + with pytest.raises(TypeError, match=msg): + interval1 in interval2 diff --git a/pandas/tests/scalar/interval/test_formats.py b/pandas/tests/scalar/interval/test_formats.py new file mode 100644 index 0000000000000..6bf7aa91df3ce --- /dev/null +++ b/pandas/tests/scalar/interval/test_formats.py @@ -0,0 +1,11 @@ +from pandas import Interval + + +def test_interval_repr(): + interval = Interval(0, 1) + assert repr(interval) == "Interval(0, 1, closed='right')" + assert str(interval) == "(0, 1]" + + interval_left = Interval(0, 1, closed="left") + assert repr(interval_left) == "Interval(0, 1, closed='left')" + assert str(interval_left) == "[0, 1)" diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index 4841c488a5768..91b31e82f9c52 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -3,12 +3,9 @@ from pandas import ( Interval, - Period, Timedelta, Timestamp, ) -import pandas._testing as tm -import pandas.core.common as com @pytest.fixture @@ -23,48 +20,6 @@ def test_properties(self, interval): assert interval.right == 1 assert interval.mid == 0.5 - def test_repr(self, interval): - assert repr(interval) == "Interval(0, 1, closed='right')" - assert str(interval) == "(0, 1]" - - interval_left = Interval(0, 1, closed="left") - assert repr(interval_left) == "Interval(0, 1, closed='left')" - assert str(interval_left) == "[0, 1)" - - def test_contains(self, interval): - assert 0.5 in interval - assert 1 in interval - assert 0 not in interval - - interval_both = Interval(0, 1, "both") - assert 0 in interval_both - assert 1 in interval_both - - interval_neither = Interval(0, 1, closed="neither") - assert 0 not in interval_neither - assert 0.5 in interval_neither - assert 1 not in interval_neither - - def test_equal(self): - assert Interval(0, 1) == Interval(0, 1, closed="right") - assert Interval(0, 1) != Interval(0, 1, closed="left") - assert Interval(0, 1) != 0 - - def test_comparison(self): - msg = ( - "'<' not supported between instances of " - "'pandas._libs.interval.Interval' and 'int'" - ) - with pytest.raises(TypeError, match=msg): - Interval(0, 1) < 2 - - assert Interval(0, 1) < Interval(1, 2) - assert Interval(0, 1) < Interval(0, 2) - assert Interval(0, 1) < Interval(0.5, 1.5) - assert Interval(0, 1) <= Interval(0, 1) - assert Interval(0, 1) > Interval(-1, 2) - assert Interval(0, 1) >= Interval(0, 1) - def test_hash(self, interval): # should not raise hash(interval) @@ -130,150 +85,3 @@ def test_is_empty(self, left, right, closed): result = iv.is_empty expected = closed != "both" assert result is expected - - @pytest.mark.parametrize( - "left, right", - [ - ("a", "z"), - (("a", "b"), ("c", "d")), - (list("AB"), list("ab")), - (Interval(0, 1), Interval(1, 2)), - (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), - ], - ) - def test_construct_errors(self, left, right): - # GH 23013 - msg = "Only numeric, Timestamp and Timedelta endpoints are allowed" - with pytest.raises(ValueError, match=msg): - Interval(left, right) - - def test_math_add(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(1, 2, closed=closed) - - result = interval + 1 - assert result == expected - - result = 1 + interval - assert 
result == expected - - result = interval - result += 1 - assert result == expected - - msg = r"unsupported operand type\(s\) for \+" - with pytest.raises(TypeError, match=msg): - interval + interval - - with pytest.raises(TypeError, match=msg): - interval + "foo" - - def test_math_sub(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(-1, 0, closed=closed) - - result = interval - 1 - assert result == expected - - result = interval - result -= 1 - assert result == expected - - msg = r"unsupported operand type\(s\) for -" - with pytest.raises(TypeError, match=msg): - interval - interval - - with pytest.raises(TypeError, match=msg): - interval - "foo" - - def test_math_mult(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(0, 2, closed=closed) - - result = interval * 2 - assert result == expected - - result = 2 * interval - assert result == expected - - result = interval - result *= 2 - assert result == expected - - msg = r"unsupported operand type\(s\) for \*" - with pytest.raises(TypeError, match=msg): - interval * interval - - msg = r"can\'t multiply sequence by non-int" - with pytest.raises(TypeError, match=msg): - interval * "foo" - - def test_math_div(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(0, 0.5, closed=closed) - - result = interval / 2.0 - assert result == expected - - result = interval - result /= 2.0 - assert result == expected - - msg = r"unsupported operand type\(s\) for /" - with pytest.raises(TypeError, match=msg): - interval / interval - - with pytest.raises(TypeError, match=msg): - interval / "foo" - - def test_math_floordiv(self, closed): - interval = Interval(1, 2, closed=closed) - expected = Interval(0, 1, closed=closed) - - result = interval // 2 - assert result == expected - - result = interval - result //= 2 - assert result == expected - - msg = r"unsupported operand type\(s\) for //" - with pytest.raises(TypeError, match=msg): - interval // interval - - with pytest.raises(TypeError, match=msg): - interval // "foo" - - def test_constructor_errors(self): - msg = "invalid option for 'closed': foo" - with pytest.raises(ValueError, match=msg): - Interval(0, 1, closed="foo") - - msg = "left side of interval must be <= right side" - with pytest.raises(ValueError, match=msg): - Interval(1, 0) - - @pytest.mark.parametrize( - "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] - ) - def test_constructor_errors_tz(self, tz_left, tz_right): - # GH 18538 - left = Timestamp("2017-01-01", tz=tz_left) - right = Timestamp("2017-01-02", tz=tz_right) - - if com.any_none(tz_left, tz_right): - error = TypeError - msg = "Cannot compare tz-naive and tz-aware timestamps" - else: - error = ValueError - msg = "left and right must have the same time zone" - with pytest.raises(error, match=msg): - Interval(left, right) - - def test_equality_comparison_broadcasts_over_array(self): - # https://github.com/pandas-dev/pandas/issues/35931 - interval = Interval(0, 1) - arr = np.array([interval, interval]) - result = interval == arr - expected = np.array([True, True]) - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_ops.py b/pandas/tests/scalar/interval/test_overlaps.py similarity index 54% rename from pandas/tests/scalar/interval/test_ops.py rename to pandas/tests/scalar/interval/test_overlaps.py index 92db6ac772830..7fcf59d7bb4af 100644 --- a/pandas/tests/scalar/interval/test_ops.py +++ b/pandas/tests/scalar/interval/test_overlaps.py @@ 
-1,4 +1,3 @@ -"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" import pytest from pandas import ( @@ -66,54 +65,3 @@ def test_overlaps_invalid_type(self, other): msg = f"`other` must be an Interval, got {type(other).__name__}" with pytest.raises(TypeError, match=msg): interval.overlaps(other) - - -class TestContains: - def test_contains_interval(self, inclusive_endpoints_fixture): - interval1 = Interval(0, 1, "both") - interval2 = Interval(0, 1, inclusive_endpoints_fixture) - assert interval1 in interval1 - assert interval2 in interval2 - assert interval2 in interval1 - assert interval1 not in interval2 or inclusive_endpoints_fixture == "both" - - def test_contains_infinite_length(self): - interval1 = Interval(0, 1, "both") - interval2 = Interval(float("-inf"), float("inf"), "neither") - assert interval1 in interval2 - assert interval2 not in interval1 - - def test_contains_zero_length(self): - interval1 = Interval(0, 1, "both") - interval2 = Interval(-1, -1, "both") - interval3 = Interval(0.5, 0.5, "both") - assert interval2 not in interval1 - assert interval3 in interval1 - assert interval2 not in interval3 and interval3 not in interval2 - assert interval1 not in interval2 and interval1 not in interval3 - - @pytest.mark.parametrize( - "type1", - [ - (0, 1), - (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), - (Timedelta("0h"), Timedelta("1h")), - ], - ) - @pytest.mark.parametrize( - "type2", - [ - (0, 1), - (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), - (Timedelta("0h"), Timedelta("1h")), - ], - ) - def test_contains_mixed_types(self, type1, type2): - interval1 = Interval(*type1) - interval2 = Interval(*type2) - if type1 == type2: - assert interval1 in interval2 - else: - msg = "^'<=' not supported between instances of" - with pytest.raises(TypeError, match=msg): - interval1 in interval2 diff --git a/pandas/tests/scalar/period/test_arithmetic.py b/pandas/tests/scalar/period/test_arithmetic.py new file mode 100644 index 0000000000000..5dc0858de466c --- /dev/null +++ b/pandas/tests/scalar/period/test_arithmetic.py @@ -0,0 +1,486 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas._libs.tslibs.period import IncompatibleFrequency + +from pandas import ( + NaT, + Period, + Timedelta, + Timestamp, + offsets, +) + + +class TestPeriodArithmetic: + def test_add_overflow_raises(self): + # GH#55503 + per = Timestamp.max.to_period("ns") + + msg = "|".join( + [ + "Python int too large to convert to C long", + # windows, 32bit linux builds + "int too big to convert", + ] + ) + with pytest.raises(OverflowError, match=msg): + per + 1 + + msg = "value too large" + with pytest.raises(OverflowError, match=msg): + per + Timedelta(1) + with pytest.raises(OverflowError, match=msg): + per + offsets.Nano(1) + + def test_period_add_integer(self): + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + assert per1 + 1 == per2 + assert 1 + per1 == per2 + + def test_period_add_invalid(self): + # GH#4731 + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + + msg = "|".join( + [ + r"unsupported operand type\(s\)", + "can only concatenate str", + "must be str, not Period", + ] + ) + with pytest.raises(TypeError, match=msg): + per1 + "str" + with pytest.raises(TypeError, match=msg): + "str" + per1 + with pytest.raises(TypeError, match=msg): + per1 + per2 + + def test_period_sub_period_annual(self): + left, right = Period("2011", 
freq="Y"), Period("2007", freq="Y") + result = left - right + assert result == 4 * right.freq + + msg = r"Input has different freq=M from Period\(freq=Y-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): + left - Period("2007-01", freq="M") + + def test_period_sub_period(self): + per1 = Period("2011-01-01", freq="D") + per2 = Period("2011-01-15", freq="D") + + off = per1.freq + assert per1 - per2 == -14 * off + assert per2 - per1 == 14 * off + + msg = r"Input has different freq=M from Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + per1 - Period("2011-02", freq="M") + + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + def test_sub_n_gt_1_ticks(self, tick_classes, n): + # GH#23878 + p1 = Period("19910905", freq=tick_classes(n)) + p2 = Period("19920406", freq=tick_classes(n)) + + expected = Period(str(p2), freq=p2.freq.base) - Period( + str(p1), freq=p1.freq.base + ) + + assert (p2 - p1) == expected + + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (offsets.YearEnd, "month"), + (offsets.QuarterEnd, "startingMonth"), + (offsets.MonthEnd, None), + (offsets.Week, "weekday"), + ], + ) + def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): + # GH#23878 + kwds = {kwd_name: 3} if kwd_name is not None else {} + p1_d = "19910905" + p2_d = "19920406" + p1 = Period(p1_d, freq=offset(n, normalize, **kwds)) + p2 = Period(p2_d, freq=offset(n, normalize, **kwds)) + + expected = Period(p2_d, freq=p2.freq.base) - Period(p1_d, freq=p1.freq.base) + + assert (p2 - p1) == expected + + def test_period_add_offset(self): + # freq is DateOffset + for freq in ["Y", "2Y", "3Y"]: + per = Period("2011", freq=freq) + exp = Period("2013", freq=freq) + assert per + offsets.YearEnd(2) == exp + assert offsets.YearEnd(2) + per == exp + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + for freq in ["M", "2M", "3M"]: + per = Period("2011-03", freq=freq) + exp = Period("2011-05", freq=freq) + assert per + offsets.MonthEnd(2) == exp + assert offsets.MonthEnd(2) + per == exp + + exp = Period("2012-03", freq=freq) + assert per + offsets.MonthEnd(12) == exp + assert offsets.MonthEnd(12) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + # freq is Tick + for freq in ["D", "2D", "3D"]: + per = Period("2011-04-01", freq=freq) + + exp = Period("2011-04-06", freq=freq) + assert per + offsets.Day(5) == exp + assert offsets.Day(5) + per == exp + + exp = Period("2011-04-02", freq=freq) + assert per + offsets.Hour(24) == exp + assert offsets.Hour(24) + per == exp + + exp = Period("2011-04-03", freq=freq) + assert per + np.timedelta64(2, "D") == exp + assert np.timedelta64(2, "D") + per == exp + + exp = Period("2011-04-02", freq=freq) + assert per + np.timedelta64(3600 * 24, "s") == exp + assert np.timedelta64(3600 * 24, "s") + per == exp + + exp = 
Period("2011-03-30", freq=freq) + assert per + timedelta(-2) == exp + assert timedelta(-2) + per == exp + + exp = Period("2011-04-03", freq=freq) + assert per + timedelta(hours=48) == exp + assert timedelta(hours=48) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + for freq in ["h", "2h", "3h"]: + per = Period("2011-04-01 09:00", freq=freq) + + exp = Period("2011-04-03 09:00", freq=freq) + assert per + offsets.Day(2) == exp + assert offsets.Day(2) + per == exp + + exp = Period("2011-04-01 12:00", freq=freq) + assert per + offsets.Hour(3) == exp + assert offsets.Hour(3) + per == exp + + msg = "cannot use operands with types" + exp = Period("2011-04-01 12:00", freq=freq) + assert per + np.timedelta64(3, "h") == exp + assert np.timedelta64(3, "h") + per == exp + + exp = Period("2011-04-01 10:00", freq=freq) + assert per + np.timedelta64(3600, "s") == exp + assert np.timedelta64(3600, "s") + per == exp + + exp = Period("2011-04-01 11:00", freq=freq) + assert per + timedelta(minutes=120) == exp + assert timedelta(minutes=120) + per == exp + + exp = Period("2011-04-05 12:00", freq=freq) + assert per + timedelta(days=4, minutes=180) == exp + assert timedelta(days=4, minutes=180) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + def test_period_sub_offset(self): + # freq is DateOffset + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for freq in ["Y", "2Y", "3Y"]: + per = Period("2011", freq=freq) + assert per - offsets.YearEnd(2) == Period("2009", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + for freq in ["M", "2M", "3M"]: + per = Period("2011-03", freq=freq) + assert per - offsets.MonthEnd(2) == Period("2011-01", freq=freq) + assert per - offsets.MonthEnd(12) == Period("2010-03", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + # freq is Tick + for freq in ["D", "2D", "3D"]: + per = Period("2011-04-01", freq=freq) + assert per - offsets.Day(5) == Period("2011-03-27", freq=freq) + assert per - offsets.Hour(24) == Period("2011-03-31", freq=freq) + assert per - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) + assert per - np.timedelta64(3600 * 24, "s") == Period( + "2011-03-31", freq=freq + ) + assert per - timedelta(-2) == Period("2011-04-03", freq=freq) + assert per - timedelta(hours=48) == Period("2011-03-30", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with 
pytest.raises(IncompatibleFrequency, match=msg): + per - off + + for freq in ["h", "2h", "3h"]: + per = Period("2011-04-01 09:00", freq=freq) + assert per - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) + assert per - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) + assert per - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) + assert per - np.timedelta64(3600, "s") == Period( + "2011-04-01 08:00", freq=freq + ) + assert per - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) + assert per - timedelta(days=4, minutes=180) == Period( + "2011-03-28 06:00", freq=freq + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_period_addsub_nat(self, freq): + # GH#13071 + per = Period("2011-01", freq=freq) + + # For subtraction, NaT is treated as another Period object + assert NaT - per is NaT + assert per - NaT is NaT + + # For addition, NaT is treated as offset-like + assert NaT + per is NaT + assert per + NaT is NaT + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) + def test_period_add_sub_td64_nat(self, unit): + # GH#47196 + per = Period("2022-06-01", "D") + nat = np.timedelta64("NaT", unit) + + assert per + nat is NaT + assert nat + per is NaT + assert per - nat is NaT + + with pytest.raises(TypeError, match="unsupported operand"): + nat - per + + def test_period_ops_offset(self): + per = Period("2011-04-01", freq="D") + result = per + offsets.Day() + exp = Period("2011-04-02", freq="D") + assert result == exp + + result = per - offsets.Day(2) + exp = Period("2011-03-30", freq="D") + assert result == exp + + msg = r"Input cannot be converted to Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + per + offsets.Hour(2) + + with pytest.raises(IncompatibleFrequency, match=msg): + per - offsets.Hour(2) + + def test_period_add_timestamp_raises(self): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + msg = r"unsupported operand type\(s\) for \+: 'Timestamp' and 'Period'" + with pytest.raises(TypeError, match=msg): + ts + per + + msg = r"unsupported operand type\(s\) for \+: 'Period' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): + per + ts + + +class TestPeriodComparisons: + def test_period_comparison_same_freq(self): + jan = Period("2000-01", "M") + feb = Period("2000-02", "M") + + assert not jan == feb + assert jan != feb + assert jan < feb + assert jan <= feb + assert not jan > feb + assert not jan >= feb + + def test_period_comparison_same_period_different_object(self): + # Separate Period objects for the same period + left = Period("2000-01", "M") + right = Period("2000-01", "M") + + assert left == right + assert left >= right + assert left <= right + assert not left < right + assert not left > right + + def test_period_comparison_mismatched_freq(self): + jan = Period("2000-01", "M") + day = Period("2012-01-01", "D") + + assert not jan == day + assert jan != day + msg = r"Input has different freq=D from Period\(freq=M\)" + with pytest.raises(IncompatibleFrequency, match=msg): + jan < day + with pytest.raises(IncompatibleFrequency, match=msg): + jan <= day + with pytest.raises(IncompatibleFrequency, match=msg): + jan > day + with pytest.raises(IncompatibleFrequency, match=msg): + jan >= day + + def 
test_period_comparison_invalid_type(self): + jan = Period("2000-01", "M") + + assert not jan == 1 + assert jan != 1 + + int_or_per = "'(Period|int)'" + msg = f"not supported between instances of {int_or_per} and {int_or_per}" + for left, right in [(jan, 1), (1, jan)]: + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + + def test_period_comparison_nat(self): + per = Period("2011-01-01", freq="D") + + ts = Timestamp("2011-01-01") + # confirm Period('NaT') work identical with Timestamp('NaT') + for left, right in [ + (NaT, per), + (per, NaT), + (NaT, ts), + (ts, NaT), + ]: + assert not left < right + assert not left > right + assert not left == right + assert left != right + assert not left <= right + assert not left >= right + + @pytest.mark.parametrize( + "zerodim_arr, expected", + ((np.array(0), False), (np.array(Period("2000-01", "M")), True)), + ) + def test_period_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): + per = Period("2000-01", "M") + + assert (per == zerodim_arr) is expected + assert (zerodim_arr == per) is expected diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 7164de0a228d9..4489c307172d7 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -1,6 +1,5 @@ import pytest -from pandas._libs.tslibs.dtypes import _period_code_map from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas.errors import OutOfBoundsDatetime @@ -828,5 +827,3 @@ def test_asfreq_MS(self): msg = "MonthBegin is not supported as period frequency" with pytest.raises(TypeError, match=msg): Period("2013-01", "MS") - - assert _period_code_map.get("MS") is None diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index f466804fe0814..3e91264fdb3b1 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -17,12 +17,8 @@ ) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import DateParseError -from pandas._libs.tslibs.period import ( - INVALID_FREQ_ERR_MSG, - IncompatibleFrequency, -) +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -import pandas as pd from pandas import ( NaT, Period, @@ -1086,62 +1082,6 @@ def test_get_period_field_array_raises_on_out_of_range(self): class TestPeriodComparisons: - def test_comparison_same_period_different_object(self): - # Separate Period objects for the same period - left = Period("2000-01", "M") - right = Period("2000-01", "M") - - assert left == right - assert left >= right - assert left <= right - assert not left < right - assert not left > right - - def test_comparison_same_freq(self): - jan = Period("2000-01", "M") - feb = Period("2000-02", "M") - - assert not jan == feb - assert jan != feb - assert jan < feb - assert jan <= feb - assert not jan > feb - assert not jan >= feb - - def test_comparison_mismatched_freq(self): - jan = Period("2000-01", "M") - day = Period("2012-01-01", "D") - - assert not jan == day - assert jan != day - msg = r"Input has different freq=D from Period\(freq=M\)" - with pytest.raises(IncompatibleFrequency, match=msg): - jan < day - with pytest.raises(IncompatibleFrequency, match=msg): - jan <= day - with pytest.raises(IncompatibleFrequency, match=msg): - jan > day - with 
pytest.raises(IncompatibleFrequency, match=msg): - jan >= day - - def test_comparison_invalid_type(self): - jan = Period("2000-01", "M") - - assert not jan == 1 - assert jan != 1 - - int_or_per = "'(Period|int)'" - msg = f"not supported between instances of {int_or_per} and {int_or_per}" - for left, right in [(jan, 1), (1, jan)]: - with pytest.raises(TypeError, match=msg): - left > right - with pytest.raises(TypeError, match=msg): - left >= right - with pytest.raises(TypeError, match=msg): - left < right - with pytest.raises(TypeError, match=msg): - left <= right - def test_sort_periods(self): jan = Period("2000-01", "M") feb = Period("2000-02", "M") @@ -1150,442 +1090,6 @@ def test_sort_periods(self): correctPeriods = [jan, feb, mar] assert sorted(periods) == correctPeriods - def test_period_cmp_nat(self): - p = Period("2011-01-01", freq="D") - - t = Timestamp("2011-01-01") - # confirm Period('NaT') work identical with Timestamp('NaT') - for left, right in [ - (NaT, p), - (p, NaT), - (NaT, t), - (t, NaT), - ]: - assert not left < right - assert not left > right - assert not left == right - assert left != right - assert not left <= right - assert not left >= right - - @pytest.mark.parametrize( - "zerodim_arr, expected", - ((np.array(0), False), (np.array(Period("2000-01", "M")), True)), - ) - def test_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): - p = Period("2000-01", "M") - - assert (p == zerodim_arr) is expected - assert (zerodim_arr == p) is expected - - -class TestArithmetic: - def test_add_overflow_raises(self): - # GH#55503 - per = Timestamp.max.to_period("ns") - - msg = "|".join( - [ - "Python int too large to convert to C long", - # windows, 32bit linux builds - "int too big to convert", - ] - ) - with pytest.raises(OverflowError, match=msg): - per + 1 - - msg = "value too large" - with pytest.raises(OverflowError, match=msg): - per + Timedelta(1) - with pytest.raises(OverflowError, match=msg): - per + offsets.Nano(1) - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) - def test_add_sub_td64_nat(self, unit): - # GH#47196 - per = Period("2022-06-01", "D") - nat = np.timedelta64("NaT", unit) - - assert per + nat is NaT - assert nat + per is NaT - assert per - nat is NaT - - with pytest.raises(TypeError, match="unsupported operand"): - nat - per - - def test_sub_delta(self): - left, right = Period("2011", freq="Y"), Period("2007", freq="Y") - result = left - right - assert result == 4 * right.freq - - msg = r"Input has different freq=M from Period\(freq=Y-DEC\)" - with pytest.raises(IncompatibleFrequency, match=msg): - left - Period("2007-01", freq="M") - - def test_add_integer(self): - per1 = Period(freq="D", year=2008, month=1, day=1) - per2 = Period(freq="D", year=2008, month=1, day=2) - assert per1 + 1 == per2 - assert 1 + per1 == per2 - - def test_add_sub_nat(self): - # GH#13071 - p = Period("2011-01", freq="M") - assert p + NaT is NaT - assert NaT + p is NaT - assert p - NaT is NaT - assert NaT - p is NaT - - def test_add_invalid(self): - # GH#4731 - per1 = Period(freq="D", year=2008, month=1, day=1) - per2 = Period(freq="D", year=2008, month=1, day=2) - - msg = "|".join( - [ - r"unsupported operand type\(s\)", - "can only concatenate str", - "must be str, not Period", - ] - ) - with pytest.raises(TypeError, match=msg): - per1 + "str" - with pytest.raises(TypeError, match=msg): - "str" + per1 - with pytest.raises(TypeError, match=msg): - per1 + per2 - - boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] - ids = ["identity", 
"Series", "Index"] - - @pytest.mark.parametrize("lbox", boxes, ids=ids) - @pytest.mark.parametrize("rbox", boxes, ids=ids) - def test_add_timestamp_raises(self, rbox, lbox): - # GH#17983 - ts = Timestamp("2017") - per = Period("2017", freq="M") - - # We may get a different message depending on which class raises - # the error. - msg = "|".join( - [ - "cannot add", - "unsupported operand", - "can only operate on a", - "incompatible type", - "ufunc add cannot use operands", - ] - ) - with pytest.raises(TypeError, match=msg): - lbox(ts) + rbox(per) - - with pytest.raises(TypeError, match=msg): - lbox(per) + rbox(ts) - - with pytest.raises(TypeError, match=msg): - lbox(per) + rbox(per) - - def test_sub(self): - per1 = Period("2011-01-01", freq="D") - per2 = Period("2011-01-15", freq="D") - - off = per1.freq - assert per1 - per2 == -14 * off - assert per2 - per1 == 14 * off - - msg = r"Input has different freq=M from Period\(freq=D\)" - with pytest.raises(IncompatibleFrequency, match=msg): - per1 - Period("2011-02", freq="M") - - @pytest.mark.parametrize("n", [1, 2, 3, 4]) - def test_sub_n_gt_1_ticks(self, tick_classes, n): - # GH 23878 - p1 = Period("19910905", freq=tick_classes(n)) - p2 = Period("19920406", freq=tick_classes(n)) - - expected = Period(str(p2), freq=p2.freq.base) - Period( - str(p1), freq=p1.freq.base - ) - - assert (p2 - p1) == expected - - @pytest.mark.parametrize("normalize", [True, False]) - @pytest.mark.parametrize("n", [1, 2, 3, 4]) - @pytest.mark.parametrize( - "offset, kwd_name", - [ - (offsets.YearEnd, "month"), - (offsets.QuarterEnd, "startingMonth"), - (offsets.MonthEnd, None), - (offsets.Week, "weekday"), - ], - ) - def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): - # GH 23878 - kwds = {kwd_name: 3} if kwd_name is not None else {} - p1_d = "19910905" - p2_d = "19920406" - p1 = Period(p1_d, freq=offset(n, normalize, **kwds)) - p2 = Period(p2_d, freq=offset(n, normalize, **kwds)) - - expected = Period(p2_d, freq=p2.freq.base) - Period(p1_d, freq=p1.freq.base) - - assert (p2 - p1) == expected - - def test_add_offset(self): - # freq is DateOffset - for freq in ["Y", "2Y", "3Y"]: - p = Period("2011", freq=freq) - exp = Period("2013", freq=freq) - assert p + offsets.YearEnd(2) == exp - assert offsets.YearEnd(2) + p == exp - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - msg = "Input has different freq|Input cannot be converted to Period" - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - for freq in ["M", "2M", "3M"]: - p = Period("2011-03", freq=freq) - exp = Period("2011-05", freq=freq) - assert p + offsets.MonthEnd(2) == exp - assert offsets.MonthEnd(2) + p == exp - - exp = Period("2012-03", freq=freq) - assert p + offsets.MonthEnd(12) == exp - assert offsets.MonthEnd(12) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("2011-04-01", freq=freq) - - exp = Period("2011-04-06", freq=freq) - assert p + offsets.Day(5) == exp - assert offsets.Day(5) + p == exp - - exp = Period("2011-04-02", freq=freq) - 
assert p + offsets.Hour(24) == exp - assert offsets.Hour(24) + p == exp - - exp = Period("2011-04-03", freq=freq) - assert p + np.timedelta64(2, "D") == exp - assert np.timedelta64(2, "D") + p == exp - - exp = Period("2011-04-02", freq=freq) - assert p + np.timedelta64(3600 * 24, "s") == exp - assert np.timedelta64(3600 * 24, "s") + p == exp - - exp = Period("2011-03-30", freq=freq) - assert p + timedelta(-2) == exp - assert timedelta(-2) + p == exp - - exp = Period("2011-04-03", freq=freq) - assert p + timedelta(hours=48) == exp - assert timedelta(hours=48) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - for freq in ["h", "2h", "3h"]: - p = Period("2011-04-01 09:00", freq=freq) - - exp = Period("2011-04-03 09:00", freq=freq) - assert p + offsets.Day(2) == exp - assert offsets.Day(2) + p == exp - - exp = Period("2011-04-01 12:00", freq=freq) - assert p + offsets.Hour(3) == exp - assert offsets.Hour(3) + p == exp - - msg = "cannot use operands with types" - exp = Period("2011-04-01 12:00", freq=freq) - assert p + np.timedelta64(3, "h") == exp - assert np.timedelta64(3, "h") + p == exp - - exp = Period("2011-04-01 10:00", freq=freq) - assert p + np.timedelta64(3600, "s") == exp - assert np.timedelta64(3600, "s") + p == exp - - exp = Period("2011-04-01 11:00", freq=freq) - assert p + timedelta(minutes=120) == exp - assert timedelta(minutes=120) + p == exp - - exp = Period("2011-04-05 12:00", freq=freq) - assert p + timedelta(days=4, minutes=180) == exp - assert timedelta(days=4, minutes=180) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - def test_sub_offset(self): - # freq is DateOffset - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for freq in ["Y", "2Y", "3Y"]: - p = Period("2011", freq=freq) - assert p - offsets.YearEnd(2) == Period("2009", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - for freq in ["M", "2M", "3M"]: - p = Period("2011-03", freq=freq) - assert p - offsets.MonthEnd(2) == Period("2011-01", freq=freq) - assert p - offsets.MonthEnd(12) == Period("2010-03", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("2011-04-01", freq=freq) - assert p - offsets.Day(5) == Period("2011-03-27", freq=freq) - assert p - offsets.Hour(24) == Period("2011-03-31", freq=freq) - assert p - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) - assert p - np.timedelta64(3600 * 24, "s") == Period("2011-03-31", freq=freq) - assert p - timedelta(-2) == Period("2011-04-03", 
freq=freq) - assert p - timedelta(hours=48) == Period("2011-03-30", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - for freq in ["h", "2h", "3h"]: - p = Period("2011-04-01 09:00", freq=freq) - assert p - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) - assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) - assert p - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) - assert p - np.timedelta64(3600, "s") == Period( - "2011-04-01 08:00", freq=freq - ) - assert p - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) - assert p - timedelta(days=4, minutes=180) == Period( - "2011-03-28 06:00", freq=freq - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) - def test_period_addsub_nat(self, freq): - per = Period("2011-01", freq=freq) - - # For subtraction, NaT is treated as another Period object - assert NaT - per is NaT - assert per - NaT is NaT - - # For addition, NaT is treated as offset-like - assert NaT + per is NaT - assert per + NaT is NaT - - def test_period_ops_offset(self): - p = Period("2011-04-01", freq="D") - result = p + offsets.Day() - exp = Period("2011-04-02", freq="D") - assert result == exp - - result = p - offsets.Day(2) - exp = Period("2011-03-30", freq="D") - assert result == exp - - msg = r"Input cannot be converted to Period\(freq=D\)" - with pytest.raises(IncompatibleFrequency, match=msg): - p + offsets.Hour(2) - - with pytest.raises(IncompatibleFrequency, match=msg): - p - offsets.Hour(2) - def test_period_immutable(): # see gh-17116 diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 2df090e5016e7..3eaf21a2eee26 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -458,6 +458,7 @@ def test_nat_arithmetic_index(op_name, value): expected = DatetimeIndex(exp_data, tz=value.tz, name=exp_name) else: expected = TimedeltaIndex(exp_data, name=exp_name) + expected = expected.as_unit(value.unit) if not isinstance(value, Index): expected = expected.array diff --git a/pandas/tests/scalar/timestamp/methods/test_replace.py b/pandas/tests/scalar/timestamp/methods/test_replace.py index 9b2b21cf7f388..8a208455edc82 100644 --- a/pandas/tests/scalar/timestamp/methods/test_replace.py +++ b/pandas/tests/scalar/timestamp/methods/test_replace.py @@ -21,7 +21,7 @@ def test_replace_out_of_pydatetime_bounds(self): # GH#50348 ts = Timestamp("2016-01-01").as_unit("ns") - msg = "Out of bounds nanosecond timestamp: 99999-01-01 00:00:00" + msg = "Out of bounds timestamp: 99999-01-01 00:00:00 with frequency 'ns'" with pytest.raises(OutOfBoundsDatetime, match=msg): ts.replace(year=99_999) diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_convert.py b/pandas/tests/scalar/timestamp/methods/test_tz_convert.py index 02c85a8f325d5..a7bb3b90805ab 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_convert.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_convert.py @@ -6,12 +6,6 @@ from pandas import Timestamp -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - class TestTimestampTZConvert: 
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) diff --git a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py index 247a583bc38f3..9df0a023730de 100644 --- a/pandas/tests/scalar/timestamp/methods/test_tz_localize.py +++ b/pandas/tests/scalar/timestamp/methods/test_tz_localize.py @@ -123,6 +123,44 @@ def test_tz_localize_nonexistent(self, stamp, tz): ts.tz_localize(tz, nonexistent="raise") assert ts.tz_localize(tz, nonexistent="NaT") is NaT + @pytest.mark.parametrize( + "stamp, tz, forward_expected, backward_expected", + [ + ( + "2015-03-29 02:00:00", + "Europe/Warsaw", + "2015-03-29 03:00:00", + "2015-03-29 01:59:59", + ), # utc+1 -> utc+2 + ( + "2023-03-12 02:00:00", + "America/Los_Angeles", + "2023-03-12 03:00:00", + "2023-03-12 01:59:59", + ), # utc-8 -> utc-7 + ( + "2023-03-26 01:00:00", + "Europe/London", + "2023-03-26 02:00:00", + "2023-03-26 00:59:59", + ), # utc+0 -> utc+1 + ( + "2023-03-26 00:00:00", + "Atlantic/Azores", + "2023-03-26 01:00:00", + "2023-03-25 23:59:59", + ), # utc-1 -> utc+0 + ], + ) + def test_tz_localize_nonexistent_shift( + self, stamp, tz, forward_expected, backward_expected + ): + ts = Timestamp(stamp) + forward_ts = ts.tz_localize(tz, nonexistent="shift_forward") + assert forward_ts == Timestamp(forward_expected, tz=tz) + backward_ts = ts.tz_localize(tz, nonexistent="shift_backward") + assert backward_ts == Timestamp(backward_expected, tz=tz) + def test_tz_localize_ambiguous_raise(self): # GH#13057 ts = Timestamp("2015-11-1 01:00") diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index b0019d2381e47..0201e5d9af2ee 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -445,6 +445,18 @@ def test_constructor_str_infer_reso(self): ts = Timestamp("300 June 1:30:01.300") assert ts.unit == "ms" + # dateutil path -> don't drop trailing zeros + ts = Timestamp("01-01-2013T00:00:00.000000000+0000") + assert ts.unit == "ns" + + ts = Timestamp("2016/01/02 03:04:05.001000 UTC") + assert ts.unit == "us" + + # higher-than-nanosecond -> we drop the trailing bits + ts = Timestamp("01-01-2013T00:00:00.000000002100+0000") + assert ts == Timestamp("01-01-2013T00:00:00.000000002+0000") + assert ts.unit == "ns" + class TestTimestampConstructors: def test_weekday_but_no_day_raises(self): @@ -814,7 +826,7 @@ def test_bounds_with_different_units(self): # With more extreme cases, we can't even fit inside second resolution info = np.iinfo(np.int64) - msg = "Out of bounds nanosecond timestamp:" + msg = "Out of bounds second timestamp:" for value in [info.min + 1, info.max]: for unit in ["D", "h", "m"]: dt64 = np.datetime64(value, unit) diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 4ae196eaed2ea..cb2a35be907cd 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -7,12 +7,6 @@ from pandas import Timestamp -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] - class TestTimestampTZOperations: # ------------------------------------------------------------------ diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 53ac7fbf40af1..618f69eb744e3 100644 --- 
a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -586,13 +586,15 @@ def test_strftime_dt64_days(self): # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) - def test_strftime_period_days(self): + def test_strftime_period_days(self, using_infer_string): period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], dtype="=U10", ) + if using_infer_string: + expected = expected.astype("string[pyarrow_numpy]") tm.assert_index_equal(result, expected) def test_strftime_dt64_microsecond_resolution(self): diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py index af6b3910baec0..3d1082c3d040b 100644 --- a/pandas/tests/series/indexing/test_delitem.py +++ b/pandas/tests/series/indexing/test_delitem.py @@ -31,19 +31,16 @@ def test_delitem(self): del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self): + def test_delitem_object_index(self, using_infer_string): # Index(dtype=object) - s = Series(1, index=["a"]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + s = Series(1, index=Index(["a"], dtype=dtype)) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=["a"])) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) def test_delitem_missing_key(self): # empty diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 479e74703bc0e..596a225c288b8 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -71,7 +71,7 @@ def test_getitem_unrecognized_scalar(self): def test_getitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" warn_msg = "Series.__getitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -363,7 +363,9 @@ def test_getitem_no_matches(self, box): key = Series(["C"], dtype=object) key = box(key) - msg = r"None of \[Index\(\['C'\], dtype='object'\)\] are in the \[index\]" + msg = ( + r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): ser[key] @@ -437,7 +439,7 @@ def test_getitem_boolean_empty(self): # GH#5877 # indexing with empty series - ser = Series(["A", "B"]) + ser = Series(["A", "B"], dtype=object) expected = Series(dtype=object, index=Index([], dtype="int64")) result = ser[Series([], dtype=object)] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 0e850c2d20e72..0c788b371a03a 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -252,7 +252,7 @@ 
def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_w assert string_series[numSlice.index[0]] == numSlice[numSlice.index[0]] assert numSlice.index[1] == string_series.index[11] - assert tm.equalContents(numSliceEnd, np.array(string_series)[-10:]) + tm.assert_numpy_array_equal(np.array(numSliceEnd), np.array(string_series)[-10:]) # Test return view. sl = string_series[10:20] @@ -295,7 +295,8 @@ def test_underlying_data_conversion(using_copy_on_write): df["val"].update(s) expected = df_original else: - df["val"].update(s) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["val"].update(s) expected = DataFrame( {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} ) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 16c127e6ece7b..168be838ec768 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -2,6 +2,7 @@ date, datetime, ) +from decimal import Decimal import numpy as np import pytest @@ -175,7 +176,8 @@ class TestSetitemScalarIndexer: def test_setitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + # string index falls back to positional + msg = "index -11|-1 is out of bounds for axis 0 with size 10" warn_msg = "Series.__setitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -500,10 +502,11 @@ def test_setitem_empty_series(self): def test_setitem_empty_series_datetimeindex_preserves_freq(self): # GH#33573 our index should retain its freq - series = Series([], DatetimeIndex([], freq="D"), dtype=object) + dti = DatetimeIndex([], freq="D", dtype="M8[ns]") + series = Series([], index=dti, dtype=object) key = Timestamp("2012-01-01") series[key] = 47 - expected = Series(47, DatetimeIndex([key], freq="D")) + expected = Series(47, DatetimeIndex([key], freq="D").as_unit("ns")) tm.assert_series_equal(series, expected) assert series.index.freq == expected.index.freq @@ -527,8 +530,12 @@ def test_setitem_empty_series_timestamp_preserves_dtype(self): Timedelta("9 days").to_pytimedelta(), ], ) - def test_append_timedelta_does_not_cast(self, td): + def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): # GH#22717 inserting a Timedelta should _not_ cast to int64 + if using_infer_string and not isinstance(td, Timedelta): + # TODO: GH#56010 + request.applymarker(pytest.mark.xfail(reason="inferred as string")) + expected = Series(["x", td], index=[0, "td"], dtype=object) ser = Series(["x"]) @@ -595,13 +602,21 @@ def test_setitem_enlarge_with_na( expected = Series(expected_values, dtype=target_dtype) tm.assert_series_equal(ser, expected) - def test_setitem_enlargement_object_none(self, nulls_fixture): + def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 ser = Series(["a", "b"]) ser[3] = nulls_fixture - expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3]) + dtype = ( + "string[pyarrow_numpy]" + if using_infer_string and not isinstance(nulls_fixture, Decimal) + else object + ) + expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3], dtype=dtype) tm.assert_series_equal(ser, expected) - assert ser[3] is nulls_fixture + if using_infer_string: + ser[3] is np.nan + else: + assert ser[3] is nulls_fixture def test_setitem_scalar_into_readonly_backing_data(): 
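For context on the `using_infer_string` branches that appear throughout these test updates: the dtype difference they exercise can be reproduced with pandas' opt-in string-inference option. A minimal sketch follows, not part of the patch itself; it assumes pandas 2.1+ with pyarrow installed, and that the `using_infer_string` fixture reflects the `future.infer_string` option (the `string[pyarrow_numpy]` dtype name is taken from the tests above).

```python
import pandas as pd

# Sketch of the behavior difference the using_infer_string branches test.
# Assumption: pandas >= 2.1 with pyarrow available, and that the fixture
# toggles the "future.infer_string" option shown here.
with pd.option_context("future.infer_string", True):
    inferred = pd.Series(["a", "b"])
    # Strings are inferred as the pyarrow-backed StringDtype
    # (storage "pyarrow_numpy") instead of object dtype.
    print(inferred.dtype)

default = pd.Series(["a", "b"])
print(default.dtype)  # object: the long-standing default inference
```

With the option enabled, expected dtypes, index dtypes, and error messages change in the ways the updated assertions above account for.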
@@ -845,20 +860,28 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val): + def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).where(~mask, val) + else: + res = Index(obj).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val): + def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).putmask(mask, val) + else: + res = Index(obj).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 7c1507ce423ad..c978481ca9988 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import is_integer import pandas as pd @@ -230,6 +232,7 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) +@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 5bcf42aad1db4..432c0eceee011 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -42,14 +42,17 @@ def test_argsort(self, datetime_series): argsorted = datetime_series.argsort() assert issubclass(argsorted.dtype.type, np.integer) + def test_argsort_dt64(self, unit): # GH#2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp(f"201301{i:02d}") for i in range(1, 6)]) - assert s.dtype == "datetime64[ns]" - shifted = s.shift(-1) - assert shifted.dtype == "datetime64[ns]" + ser = Series( + [Timestamp(f"201301{i:02d}") for i in range(1, 6)], dtype=f"M8[{unit}]" + ) + assert ser.dtype == f"datetime64[{unit}]" + shifted = ser.shift(-1) + assert shifted.dtype == f"datetime64[{unit}]" assert isna(shifted[4]) - result = s.argsort() + result = ser.argsort() expected = Series(range(5), dtype=np.intp) tm.assert_series_equal(result, expected) @@ -60,12 +63,12 @@ def test_argsort(self, datetime_series): tm.assert_series_equal(result, expected) def test_argsort_stable(self): - s = Series(np.random.default_rng(2).integers(0, 100, size=10000)) - mindexer = s.argsort(kind="mergesort") - qindexer = s.argsort() + ser = Series(np.random.default_rng(2).integers(0, 100, size=10000)) + mindexer = ser.argsort(kind="mergesort") + qindexer = ser.argsort() - mexpected = np.argsort(s.values, kind="mergesort") - qexpected = np.argsort(s.values, 
kind="quicksort") + mexpected = np.argsort(ser.values, kind="mergesort") + qexpected = np.argsort(ser.values, kind="quicksort") tm.assert_series_equal(mindexer.astype(np.intp), Series(mexpected)) tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 7d4ad99deab38..46f55fff91e41 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -25,6 +25,7 @@ Timestamp, cut, date_range, + to_datetime, ) import pandas._testing as tm @@ -75,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc") + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -114,13 +115,19 @@ def test_astype_object_to_dt64_non_nano(self, tz): dtype = "M8[us]" if tz is not None: dtype = f"M8[us, {tz}]" - # NB: the 2500 is interpreted as nanoseconds and rounded *down* - # to 2 microseconds vals = [ts, "2999-01-02 03:04:05.678910", 2500] ser = Series(vals, dtype=object) result = ser.astype(dtype) - exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals] + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concated the results. + pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] exp_arr = np.array(exp_vals, dtype="M8[us]") expected = Series(exp_arr, dtype="M8[us]") if tz is not None: @@ -163,10 +170,12 @@ def test_astype_empty_constructor_equality(self, dtype): Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]), ], ) - def test_astype_str_map(self, dtype, series): + def test_astype_str_map(self, dtype, series, using_infer_string): # see GH#4405 result = series.astype(dtype) expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -276,13 +285,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"]) + expected = Series(["2010-01-04"], dtype=object) tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"]) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -291,7 +300,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series(["1 days"]) + expected = Series(["1 days"], dtype=object) tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -338,7 +347,7 @@ def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=dtype) result = ser.astype(str) - expected = Series(["0.1"]) + expected = Series(["0.1"], dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -409,7 +418,7 @@ def test_astype_cast_object_int(self): tm.assert_series_equal(result, Series(np.arange(1, 5))) - def test_astype_unicode(self): + def 
test_astype_unicode(self, using_infer_string): # see GH#7758: A bit of magic is required to set # default encoding to utf-8 digits = string.digits @@ -426,12 +435,14 @@ def test_astype_unicode(self): item = "野菜食べないとやばい" ser = Series([item.encode()]) result = ser.astype(np.str_) - expected = Series([item]) + expected = Series([item], dtype=object) tm.assert_series_equal(result, expected) for ser in test_series: res = ser.astype(np.str_) expec = ser.map(str) + if using_infer_string: + expec = expec.astype(object) tm.assert_series_equal(res, expec) # Restore the former encoding @@ -527,12 +538,12 @@ def test_astype_categorical_to_other(self): expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object dtype to float64" + msg = r"Cannot cast object|string dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 130b77db0a1fb..5bbf35f6e39bb 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -135,11 +135,12 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) - def test_clip_with_timestamps_and_oob_datetimes(self): + @pytest.mark.parametrize("dtype", [object, "M8[us]"]) + def test_clip_with_timestamps_and_oob_datetimes(self, dtype): # GH-42794 - ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)]) + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) - expected = Series([Timestamp.min, Timestamp.max], dtype="object") + expected = Series([Timestamp.min, Timestamp.max], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 47659308cfcad..795b2eab82aca 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -53,7 +53,7 @@ def test_combine_first(self): # mixed types index = tm.makeStringIndex(20) floats = Series(np.random.default_rng(2).standard_normal(20), index=index) - strings = Series(tm.makeStringIndex(10), index=index[::2]) + strings = Series(tm.makeStringIndex(10), index=index[::2], dtype=object) combined = strings.combine_first(floats) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 0cd39140938c4..b0a920ba02cad 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -151,13 +151,13 @@ {}, ), ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), "datetime64[ns]", np.dtype("datetime64[ns]"), {}, ), ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), object, np.dtype("datetime64[ns]"), {("infer_objects", False): np.dtype("object")}, @@ -186,6 +186,7 @@ def test_convert_dtypes( self, 
test_cases, params, + using_infer_string, ): data, maindtype, expected_default, expected_other = test_cases if ( @@ -219,6 +220,16 @@ def test_convert_dtypes( for spec, dtype in expected_other.items(): if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): expected_dtype = dtype + if ( + using_infer_string + and expected_default == "string" + and expected_dtype == object + and params[0] + and not params[1] + ): + # If we would convert with convert strings then infer_objects converts + # with the option + expected_dtype = "string[pyarrow_numpy]" expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 426885bf41a1f..d854f0b787759 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -205,7 +205,7 @@ def test_interpolate_from_derivatives(self): [ {}, pytest.param( - {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy + {"method": "polynomial", "order": 1}, marks=td.skip_if_no("scipy") ), ], ) @@ -253,7 +253,7 @@ def test_interpolate_non_ts(self): [ {}, pytest.param( - {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy + {"method": "polynomial", "order": 1}, marks=td.skip_if_no("scipy") ), ], ) @@ -628,7 +628,7 @@ def test_interp_all_good(self): tm.assert_series_equal(result, s) @pytest.mark.parametrize( - "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] ) def test_interp_multiIndex(self, check_scipy): idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 6d78ecd61cdcb..f86f6069a2ef3 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -83,7 +83,7 @@ def func(x): tm.assert_series_equal(result, expected) -def test_map_series_stringdtype(any_string_dtype): +def test_map_series_stringdtype(any_string_dtype, using_infer_string): # map test on StringDType, GH#40823 ser1 = Series( data=["cat", "dog", "rabbit"], @@ -98,6 +98,8 @@ def test_map_series_stringdtype(any_string_dtype): item = np.nan expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) + if using_infer_string and any_string_dtype == "object": + expected = expected.astype("string[pyarrow_numpy]") tm.assert_series_equal(result, expected) @@ -106,7 +108,7 @@ def test_map_series_stringdtype(any_string_dtype): "data, expected_dtype", [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], ) -def test_map_categorical_with_nan_values(data, expected_dtype): +def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -114,6 +116,8 @@ def func(val): s = Series(data, dtype="category") result = s.map(func, na_action="ignore") + if using_infer_string and expected_dtype == object: + expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -133,11 +137,15 @@ def test_map_empty_integer_series_with_datetime_index(): @pytest.mark.parametrize("func", [str, lambda x: str(x)]) -def test_map_simple_str_callables_same_as_astype(string_series, func): +def test_map_simple_str_callables_same_as_astype( + string_series, func, 
using_infer_string +): # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype(str) + expected = string_series.astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) tm.assert_series_equal(result, expected) @@ -461,7 +469,7 @@ def test_map_box_period(): @pytest.mark.parametrize("na_action", [None, "ignore"]) -def test_map_categorical(na_action): +def test_map_categorical(na_action, using_infer_string): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) s = Series(values, name="XX", index=list("abcdefg")) @@ -474,7 +482,7 @@ def test_map_categorical(na_action): result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object + assert result.dtype == object if not using_infer_string else "string" @pytest.mark.parametrize( @@ -536,12 +544,14 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp): +def test_map_missing_mixed(vals, mapping, exp, using_infer_string): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) - - tm.assert_series_equal(result, Series(exp)) + exp = Series(exp) + if using_infer_string and mapping == {np.nan: "not NaN"}: + exp.iloc[-1] = np.nan + tm.assert_series_equal(result, exp) def test_map_scalar_on_date_time_index_aware_series(): diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 0923a2d42ce10..6f0c8d751a92a 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -22,6 +24,9 @@ import pandas._testing as tm +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" +) def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 83adff08b758e..119654bd19b3f 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -8,6 +8,7 @@ Index, MultiIndex, Series, + array, ) import pandas._testing as tm @@ -45,22 +46,28 @@ def test_rename_by_series(self): expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") tm.assert_series_equal(result, expected) - def test_rename_set_name(self): + def test_rename_set_name(self, using_infer_string): ser = Series(range(4), index=list("abcd")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: result = ser.rename(name) assert result.name == name - tm.assert_numpy_array_equal(result.index.values, ser.index.values) + if using_infer_string: + tm.assert_extension_array_equal(result.index.values, ser.index.values) + else: + tm.assert_numpy_array_equal(result.index.values, ser.index.values) assert ser.name is None - def test_rename_set_name_inplace(self): + def test_rename_set_name_inplace(self, using_infer_string): ser = Series(range(3), index=list("abc")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: ser.rename(name, inplace=True) assert ser.name == name - exp = np.array(["a", "b", "c"], dtype=np.object_) - tm.assert_numpy_array_equal(ser.index.values, exp) + if using_infer_string: 
+ exp = array(exp, dtype="string[pyarrow_numpy]") + tm.assert_extension_array_equal(ser.index.values, exp) + else: + tm.assert_numpy_array_equal(ser.index.values, exp) def test_rename_axis_supported(self): # Supporting axis for compatibility, detailed in GH-18589 diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index f08966c3816c0..fe0f79b766f72 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -389,6 +391,7 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -719,6 +722,7 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 @@ -748,10 +752,12 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self): + def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - df["Test"] = df["Test"].replace([True], [np.nan]) + warn = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warn, match="Downcasting"): + df["Test"] = df["Test"].replace([True], [np.nan]) expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index db36221d8f510..9e6b4ce0df1d6 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -136,8 +136,16 @@ def test_reset_index_drop_errors(self): with pytest.raises(KeyError, match="not found"): s.reset_index("wrong", drop=True) - def test_reset_index_with_drop(self, series_with_multilevel_index): - ser = series_with_multilevel_index + def test_reset_index_with_drop(self): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + data = np.random.default_rng(2).standard_normal(8) + ser = Series(data, index=index) + ser.iloc[3] = np.nan deleveled = ser.reset_index() assert isinstance(deleveled, DataFrame) @@ -166,12 +174,20 @@ def test_reset_index_inplace_and_drop_ignore_name(self): ), ], ) -def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_series_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes + exp = "string" if using_infer_string else object expected = Series( - {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object} + { + "level_0": 
np.int64, + "level_1": np.float64, + "level_2": exp if dtype == object else dtype, + 0: object, + } ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index c5d4694f8bdbd..d6817aa179b7b 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -317,3 +317,21 @@ def test_sort_values_key_type(self): result = s.sort_index(key=lambda x: x.month_name()) expected = s.iloc[[2, 1, 0]] tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [ + [True, False], + [False, True], + ], + ) + def test_sort_index_multi_already_monotonic(self, ascending): + # GH 56049 + mi = MultiIndex.from_product([[1, 2], [3, 4]]) + ser = Series(range(len(mi)), index=mi) + result = ser.sort_index(ascending=ascending) + if ascending == [True, False]: + expected = ser.take([1, 0, 3, 2]) + elif ascending == [False, True]: + expected = ser.take([2, 3, 0, 1]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 76ca05a60eb7a..1c17013d621c7 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -165,7 +165,7 @@ def test_to_csv_compression(self, s, encoding, compression): pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), ) - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) @@ -175,6 +175,8 @@ def test_to_csv_interval_index(self): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = s.copy() - expected.index = expected.index.astype(str) - + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index c38b2400f0f4e..3f18ae6c13880 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -34,10 +34,12 @@ def test_update(self, using_copy_on_write): df["c"].update(Series(["foo"], index=[0])) expected = df_orig else: - df["c"].update(Series(["foo"], index=[0])) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["c"].update(Series(["foo"], index=[0])) expected = DataFrame( [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] ) + expected["c"] = expected["c"].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index b835be6d8e501..6471cd71f0860 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -204,9 +204,9 @@ def test_series_integer_mod(self, index): s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string formatting" + msg = "not all arguments converted during string formatting|mod not" - with pytest.raises(TypeError, match=msg): + with pytest.raises((TypeError, NotImplementedError), match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -491,14 +491,27 @@ def test_ser_cmp_result_names(self, names, comparison_op): result = op(ser, cidx) assert result.name == names[2] - def 
test_comparisons(self): + def test_comparisons(self, using_infer_string): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! exp = Series([False, False, False]) - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + if using_infer_string: + import pyarrow as pa + + msg = "has no kernel" + # TODO(3.0) GH56008 + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s == s2 + with tm.assert_produces_warning( + DeprecationWarning, match="comparison", check_stacklevel=False + ): + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s2 == s + else: + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons @@ -649,9 +662,9 @@ def test_comparison_operators_with_nas(self, comparison_op): def test_ne(self): ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) - expected = [True, True, False, True, True] - assert tm.equalContents(ts.index != 5, expected) - assert tm.equalContents(~(ts.index == 5), expected) + expected = np.array([True, True, False, True, True]) + tm.assert_numpy_array_equal(ts.index != 5, expected) + tm.assert_numpy_array_equal(~(ts.index == 5), expected) @pytest.mark.parametrize( "left, right", diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 195c50969ffae..84c612c43da29 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -152,7 +152,7 @@ def test_scalar_extension_dtype(self, ea_scalar_and_dtype): assert ser.dtype == ea_dtype tm.assert_series_equal(ser, expected) - def test_constructor(self, datetime_series): + def test_constructor(self, datetime_series, using_infer_string): empty_series = Series() assert datetime_series.index._is_all_dates @@ -160,13 +160,13 @@ def test_constructor(self, datetime_series): derived = Series(datetime_series) assert derived.index._is_all_dates - assert tm.equalContents(derived.index, datetime_series.index) + tm.assert_index_equal(derived.index, datetime_series.index) # Ensure new index is not created assert id(datetime_series.index) == id(derived.index) # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ + assert mixed.dtype == np.object_ if not using_infer_string else "string" assert np.isnan(mixed[1]) assert not empty_series.index._is_all_dates @@ -197,7 +197,7 @@ def test_constructor_index_ndim_gt_1_raises(self): Series([1, 3, 2], index=df) @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) - def test_constructor_empty(self, input_class): + def test_constructor_empty(self, input_class, using_infer_string): empty = Series() empty2 = Series(input_class()) @@ -228,7 +228,10 @@ def test_constructor_empty(self, input_class): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) - empty2 = Series("", index=range(3)) + if using_infer_string: + empty2 = Series("", index=range(3), dtype=object) + else: + empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) @@ -1072,24 +1075,24 @@ def test_constructor_dtype_datetime64_5(self): def test_constructor_dtype_datetime64_4(self): # non-convertible - s = Series([1479596223000, -1479590, NaT]) - assert s.dtype == "object" - assert s[2] is NaT - assert "NaT" in str(s) + ser = Series([1479596223000, -1479590, NaT]) + assert 
ser.dtype == "object" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_dtype_datetime64_3(self): # if we passed a NaT it remains - s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert s.dtype == "object" - assert s[2] is NaT - assert "NaT" in str(s) + ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) + assert ser.dtype == "object" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains - s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert s.dtype == "object" - assert s[2] is np.nan - assert "NaN" in str(s) + ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) + assert ser.dtype == "object" + assert ser[2] is np.nan + assert "NaN" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1135,13 +1138,14 @@ def test_constructor_with_datetime_tz(self): assert "datetime64[ns, US/Eastern]" in str(result) assert "NaT" in str(result) - # long str - t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) - assert "datetime64[ns, US/Eastern]" in str(t) - result = DatetimeIndex(s, freq="infer") tm.assert_index_equal(result, dr) + def test_constructor_with_datetime_tz5(self): + # long str + ser = Series(date_range("20130101", periods=1000, tz="US/Eastern")) + assert "datetime64[ns, US/Eastern]" in str(ser) + def test_constructor_with_datetime_tz4(self): # inference s = Series( @@ -1165,9 +1169,10 @@ def test_constructor_with_datetime_tz3(self): def test_constructor_with_datetime_tz2(self): # with all NaT - s = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") - expected = Series(DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) - tm.assert_series_equal(s, expected) + ser = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + dti = DatetimeIndex(["NaT", "NaT"], tz="US/Eastern").as_unit("ns") + expected = Series(dti) + tm.assert_series_equal(ser, expected) def test_constructor_no_partial_datetime_casting(self): # GH#40111 @@ -1439,7 +1444,7 @@ def test_constructor_dict_of_tuples(self): # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") - def test_fromDict(self): + def test_fromDict(self, using_infer_string): data = {"a": 0, "b": 1, "c": 2, "d": 3} series = Series(data) @@ -1451,19 +1456,19 @@ def test_fromDict(self): data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ + assert series.dtype == np.object_ if not using_infer_string else "string" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) assert series.dtype == np.float64 - def test_fromValue(self, datetime_series): + def test_fromValue(self, datetime_series, using_infer_string): nans = Series(np.nan, index=datetime_series.index, dtype=np.float64) assert nans.dtype == np.float64 assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ + assert strings.dtype == np.object_ if not using_infer_string else "string" assert len(strings) == len(datetime_series) d = datetime.now() @@ -2144,16 +2149,14 @@ def test_series_constructor_datetimelike_index_coercion(self): # to DatetimeIndex GH#39307, GH#23598 assert not isinstance(ser.index, DatetimeIndex) - def test_series_constructor_infer_multiindex(self): - index_lists = [["a", "a", "b", "b"], ["x", "y", "x", "y"]] - - multi = Series(1.0, index=[np.array(x) for x in index_lists]) - assert 
isinstance(multi.index, MultiIndex) - - multi = Series(1.0, index=index_lists) - assert isinstance(multi.index, MultiIndex) + @pytest.mark.parametrize("container", [None, np.array, Series, Index]) + @pytest.mark.parametrize("data", [1.0, range(4)]) + def test_series_constructor_infer_multiindex(self, container, data): + indexes = [["a", "a", "b", "b"], ["x", "y", "x", "y"]] + if container is not None: + indexes = [container(ind) for ind in indexes] - multi = Series(range(4), index=index_lists) + multi = Series(data, index=indexes) assert isinstance(multi.index, MultiIndex) diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 25b34351627a1..040b1186980b2 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -142,6 +144,9 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" + ) def test_newline(self): ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) assert "\t" not in repr(ser) @@ -301,7 +306,7 @@ def __repr__(self) -> str: repr(ser) str(ser) - def test_categorical_repr(self): + def test_categorical_repr(self, using_infer_string): a = Series(Categorical([1, 2, 3, 4])) exp = ( "0 1\n1 2\n2 3\n3 4\n" @@ -311,22 +316,38 @@ def test_categorical_repr(self): assert exp == a.__str__() a = Series(Categorical(["a", "b"] * 25)) - exp = ( - "0 a\n1 b\n" - " ..\n" - "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" - ) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, string): [a, b]" + ) + else: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" + ) with option_context("display.max_rows", 5): assert exp == repr(a) levs = list("abcdefghijklmnopqrstuvwxyz") a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) - exp = ( - "0 a\n1 b\n" - "dtype: category\n" - "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 'w' < 'x' < 'y' < 'z']" - ) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, string): [a < b < c < d ... w < x < y < z]" + ) + else: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 
" + "'w' < 'x' < 'y' < 'z']" + ) assert exp == a.__str__() def test_categorical_series_repr(self): diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 2146e154dc7fa..166f52181fed4 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -146,7 +146,7 @@ def test_logical_operators_int_dtype_with_bool(self): expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - def test_logical_operators_int_dtype_with_object(self): + def test_logical_operators_int_dtype_with_object(self, using_infer_string): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -155,8 +155,14 @@ def test_logical_operators_int_dtype_with_object(self): tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + s_0123 & s_abNd + else: + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -354,7 +360,7 @@ def test_reverse_ops_with_index(self, op, expected): result = op(ser, idx) tm.assert_series_equal(result, expected) - def test_logical_ops_label_based(self): + def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based @@ -422,7 +428,17 @@ def test_logical_ops_label_based(self): tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - result = a[a | e] + warn = FutureWarning if using_infer_string else None + if using_infer_string: + import pyarrow as pa + + with tm.assert_produces_warning(warn, match="Operation between non"): + with pytest.raises( + pa.lib.ArrowNotImplementedError, match="has no kernel" + ): + result = a[a | e] + else: + result = a[a | e] tm.assert_series_equal(result, a[a]) # vs scalars diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index fbdf843a998bb..4bbbcf3bf54c2 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -29,6 +29,28 @@ def test_mode_extension_dtype(as_period): tm.assert_series_equal(res, ser) +def test_mode_nullable_dtype(any_numeric_ea_dtype): + # GH#55340 + ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=any_numeric_ea_dtype) + result = ser.mode(dropna=False) + expected = Series([2, 3, pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser[-1] = pd.NA + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=False) + expected = Series([pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + def test_reductions_td64_with_nat(): # GH#8617 ser = Series([0, pd.NaT], dtype="m8[ns]") @@ -131,17 +153,22 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -def test_mean_with_convertible_string_raises(using_array_manager): +def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric" + if 
using_infer_string: + msg = "does not support" + with pytest.raises(TypeError, match=msg): + ser.sum() + else: + assert ser.sum() == "12" + msg = "Could not convert string '12' to numeric|does not support" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric" + msg = r"Could not convert \['12'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() @@ -152,29 +179,30 @@ def test_mean_dont_convert_j_to_complex(using_array_manager): if using_array_manager: msg = "Could not convert string 'J' to numeric" else: - msg = r"Could not convert \['J'\] to numeric" + msg = r"Could not convert \['J'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric" + msg = "Could not convert string 'J' to numeric|does not support" with pytest.raises(TypeError, match=msg): df["db"].mean() + msg = "Could not convert string 'J' to numeric|ufunc 'divide'" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot convert \['1' '2' '3'\] to numeric" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 868b5f1283128..036e4de20ba53 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -1,4 +1,3 @@ -import numpy as np import pytest from pandas import Series @@ -131,53 +130,3 @@ def any_string_method(request): ... method(*args, **kwargs) """ return request.param - - -# subset of the full set from pandas/conftest.py -_any_allowed_skipna_inferred_dtype = [ - ("string", ["a", np.nan, "c"]), - ("bytes", [b"a", np.nan, b"c"]), - ("empty", [np.nan, np.nan, np.nan]), - ("empty", []), - ("mixed-integer", ["a", np.nan, 2]), -] -ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id - - -@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) -def any_allowed_skipna_inferred_dtype(request): - """ - Fixture for all (inferred) dtypes allowed in StringMethods.__init__ - - The covered (inferred) types are: - * 'string' - * 'empty' - * 'bytes' - * 'mixed' - * 'mixed-integer' - - Returns - ------- - inferred_dtype : str - The string for the inferred dtype from _libs.lib.infer_dtype - values : np.ndarray - An array of object dtype that will be inferred to have - `inferred_dtype` - - Examples - -------- - >>> from pandas._libs import lib - >>> - >>> def test_something(any_allowed_skipna_inferred_dtype): - ... inferred_dtype, values = any_allowed_skipna_inferred_dtype - ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype - ... - ... # constructor for .str-accessor will also pass - ... 
Series(values).str - """ - inferred_dtype, values = request.param - values = np.array(values, dtype=object) # object dtype to avoid casting - - # correctness of inference tested in tests/dtypes/test_inference.py - return inferred_dtype, values diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 0d2f220e70c56..2914b22a52e94 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import ( @@ -9,6 +10,55 @@ ) from pandas.core.strings.accessor import StringMethods +# subset of the full set from pandas/conftest.py +_any_allowed_skipna_inferred_dtype = [ + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), +] +ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id + + +@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) +def any_allowed_skipna_inferred_dtype(request): + """ + Fixture for all (inferred) dtypes allowed in StringMethods.__init__ + + The covered (inferred) types are: + * 'string' + * 'empty' + * 'bytes' + * 'mixed' + * 'mixed-integer' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> from pandas._libs import lib + >>> + >>> def test_something(any_allowed_skipna_inferred_dtype): + ... inferred_dtype, values = any_allowed_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... Series(values).str + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + def test_api(any_string_dtype): # GH 6106, GH 9322 diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index b8319e90e09a8..9ad9b1eca41d9 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -360,7 +360,7 @@ def test_extract_dataframe_capture_groups_index(index, any_string_dtype): data = ["A1", "B2", "C"] if len(index) < len(data): - pytest.skip("Index too short") + pytest.skip(f"Index needs more than {len(data)} values") index = index[: len(data)] s = Series(data, index=index, dtype=any_string_dtype) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 97119127b1665..a6e63dfd5f409 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -536,6 +536,25 @@ def test_factorize_mixed_values(self, data, expected_codes, expected_uniques): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_index_equal(uniques, expected_uniques) + def test_factorize_interval_non_nano(self, unit): + # GH#56099 + left = DatetimeIndex(["2016-01-01", np.nan, "2015-10-11"]).as_unit(unit) + right = DatetimeIndex(["2016-01-02", np.nan, "2015-10-15"]).as_unit(unit) + idx = IntervalIndex.from_arrays(left, right) + codes, cats = idx.factorize() + assert cats.dtype == f"interval[datetime64[{unit}], right]" + + ts = Timestamp(0).as_unit(unit) + idx2 = IntervalIndex.from_arrays(left - ts, right - ts) + codes2, cats2 = idx2.factorize() + assert cats2.dtype == f"interval[timedelta64[{unit}], right]" + + idx3 = 
IntervalIndex.from_arrays( + left.tz_localize("US/Pacific"), right.tz_localize("US/Pacific") + ) + codes3, cats3 = idx3.factorize() + assert cats3.dtype == f"interval[datetime64[{unit}, US/Pacific], right]" + class TestUnique: def test_ints(self): @@ -1227,9 +1246,10 @@ def test_value_counts_nat(self): result_td = algos.value_counts(td) tm.assert_series_equal(result_td, exp_td) - def test_value_counts_datetime_outofbounds(self): + @pytest.mark.parametrize("dtype", [object, "M8[us]"]) + def test_value_counts_datetime_outofbounds(self, dtype): # GH 13663 - s = Series( + ser = Series( [ datetime(3000, 1, 1), datetime(5000, 1, 1), @@ -1237,13 +1257,14 @@ def test_value_counts_datetime_outofbounds(self): datetime(6000, 1, 1), datetime(3000, 1, 1), datetime(3000, 1, 1), - ] + ], + dtype=dtype, ) - res = s.value_counts() + res = ser.value_counts() exp_index = Index( [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], - dtype=object, + dtype=dtype, ) exp = Series([3, 2, 1], index=exp_index, name="count") tm.assert_series_equal(res, exp) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index fe5bd33e15f15..c2e39dc38d5ff 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -337,11 +337,9 @@ def test_dataframe_consortium() -> None: expected_1 = ["a", "b"] assert result_1 == expected_1 - ser = Series([1, 2, 3]) + ser = Series([1, 2, 3], name="a") col = ser.__column_consortium_standard__() - result_2 = col.get_value(1) - expected_2 = 2 - assert result_2 == expected_2 + assert col.name == "a" def test_xarray_coerce_unit(): diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 2d21d1200acac..046a4749925d8 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -5,11 +5,6 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_windows, -) - from pandas import ( NA, DataFrame, @@ -409,11 +404,6 @@ def test_codes(self, verify, codes, exp_codes): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) - @pytest.mark.skipif( - is_platform_windows() and is_ci_environment(), - reason="In CI environment can crash thread with: " - "Windows fatal exception: access violation", - ) def test_codes_out_of_bound(self): values = np.array([3, 1, 2, 0, 4]) expected = np.array([0, 1, 2, 3, 4]) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 3d3ca99f4d26a..503032293dc81 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1143,7 +1143,7 @@ def test_to_datetime_dt64s_out_of_ns_bounds(self, cache, dt, errors): def test_to_datetime_dt64d_out_of_bounds(self, cache): dt64 = np.datetime64(np.iinfo(np.int64).max, "D") - msg = "Out of bounds nanosecond timestamp" + msg = "Out of bounds second timestamp: 25252734927768524-07-27" with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(dt64) with pytest.raises(OutOfBoundsDatetime, match=msg): @@ -1585,28 +1585,23 @@ def test_convert_object_to_datetime_with_cache( @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( - ("input", "expected"), - ( - ( - Series([NaT] * 20 + [None] * 20, dtype="object"), - Series([NaT] * 40, dtype="datetime64[ns]"), - ), - ( - Series([NaT] * 60 + [None] * 60, dtype="object"), - Series([NaT] * 120, dtype="datetime64[ns]"), - ), - (Series([None] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([None] * 
60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([""] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([""] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([pd.NA] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([pd.NA] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([np.nan] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([np.nan] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - ), + "input", + [ + Series([NaT] * 20 + [None] * 20, dtype="object"), + Series([NaT] * 60 + [None] * 60, dtype="object"), + Series([None] * 20), + Series([None] * 60), + Series([""] * 20), + Series([""] * 60), + Series([pd.NA] * 20), + Series([pd.NA] * 60), + Series([np.nan] * 20), + Series([np.nan] * 60), + ], ) - def test_to_datetime_converts_null_like_to_nat(self, cache, input, expected): + def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 + expected = Series([NaT] * len(input), dtype="M8[ns]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1864,16 +1859,14 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request): result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache) tm.assert_index_equal(result, expected) - # TODO: this should also work - if isinstance(item, float): - request.applymarker( - pytest.mark.xfail( - reason=f"{type(item).__name__} in np.array should work" - ) - ) result = to_datetime(np.array([item]), unit=unit, cache=cache) tm.assert_index_equal(result, expected) + # with a nan! + result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache) + assert result.isna()[1] + tm.assert_index_equal(result[:1], expected) + @pytest.mark.parametrize("unit", ["Y", "M"]) def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 @@ -1883,6 +1876,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") + with pytest.raises(ValueError, match=msg): + to_datetime(np.array([1.5]), unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): to_datetime(["1.5"], unit=unit, errors="raise") @@ -1948,7 +1943,7 @@ def test_unit_array_mixed_nans_large_int(self, cache): tm.assert_index_equal(result, expected) result = to_datetime(values, errors="coerce", unit="s", cache=cache) - expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"]) + expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[ns]") tm.assert_index_equal(result, expected) msg = "cannot convert input 1420043460000000000000000 with the unit 's'" @@ -2030,10 +2025,14 @@ def test_unit_mixed(self, cache, arr): def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors # but no premature rounding - result = to_datetime(1434743731.8770001, unit="s", cache=cache) - expected = Timestamp("2015-06-19 19:55:31.877000192") + value = 1434743731.8770001 + result = to_datetime(value, unit="s", cache=cache) + expected = Timestamp("2015-06-19 19:55:31.877000093") assert result == expected + alt = Timestamp(value, unit="s") + assert alt == result + def test_unit_ignore_keeps_name(self, cache): # GH 21697 expected = Index([15e9] * 2, name="name") diff --git a/pandas/tests/tools/test_to_timedelta.py 
b/pandas/tests/tools/test_to_timedelta.py index c4c9b41c218a0..e588bc83b0de8 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -232,6 +233,7 @@ def test_to_timedelta_on_missing_values_list(self, val): actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") + @pytest.mark.xfail(not IS64, reason="Floating point error") def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 arr = np.arange(0, 1, 1e-6)[-10:] diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 96b71ef4fe662..16b7190753ee2 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -3,12 +3,8 @@ from pandas._libs.tslibs import ( Period, - Resolution, to_offset, ) -from pandas._libs.tslibs.dtypes import _attrname_to_abbrevs - -import pandas._testing as tm @pytest.mark.parametrize( @@ -24,32 +20,6 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): assert result_code == exp_code -@pytest.mark.parametrize( - "freqstr,expected", - [ - ("Y", "year"), - ("Q", "quarter"), - ("M", "month"), - ("D", "day"), - ("h", "hour"), - ("min", "minute"), - ("s", "second"), - ("ms", "millisecond"), - ("us", "microsecond"), - ("ns", "nanosecond"), - ], -) -def test_get_attrname_from_abbrev(freqstr, expected): - assert Resolution.get_reso_from_freqstr(freqstr).attrname == expected - - -@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"]) -def test_get_freq_roundtrip2(freq): - obj = Resolution.get_reso_from_freqstr(freq) - result = _attrname_to_abbrevs[obj.attrname] - assert freq == result - - @pytest.mark.parametrize( "args,expected", [ @@ -97,12 +67,3 @@ def test_compatibility(freqstr, expected): ts_np = np.datetime64("2021-01-01T08:00:00.00") do = to_offset(freqstr) assert ts_np + do == np.datetime64(expected) - - -@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) -def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): - # GH#52536 - msg = f"'{freq}' is deprecated and will be removed in a future version." 
- - with tm.assert_produces_warning(FutureWarning, match=msg): - Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 19c6425c12ce1..ee8f161858b2b 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -52,7 +52,7 @@ def base_delta_code_pair(request): freqs = ( [f"QE-{month}" for month in MONTHS] - + [f"{annual}-{month}" for annual in ["YE", "BY"] for month in MONTHS] + + [f"{annual}-{month}" for annual in ["YE", "BYE"] for month in MONTHS] + ["ME", "BME", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py index c9c4d6c456c53..2fc846353dcb5 100644 --- a/pandas/tests/tseries/offsets/conftest.py +++ b/pandas/tests/tseries/offsets/conftest.py @@ -3,35 +3,6 @@ import pytest from pandas._libs.tslibs import Timestamp -from pandas._libs.tslibs.offsets import MonthOffset - -from pandas.tseries import offsets - - -@pytest.fixture( - params=[ - getattr(offsets, o) for o in offsets.__all__ if o not in ("Tick", "BaseOffset") - ] -) -def offset_types(request): - """ - Fixture for all the datetime offsets available for a time series. - """ - return request.param - - -@pytest.fixture( - params=[ - getattr(offsets, o) - for o in offsets.__all__ - if issubclass(getattr(offsets, o), MonthOffset) and o != "MonthOffset" - ] -) -def month_classes(request): - """ - Fixture for month based datetime offsets available for a time series. - """ - return request.param @pytest.fixture diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 6d5b762b4f15a..97566750a5225 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -102,6 +102,33 @@ def _create_offset(klass, value=1, normalize=False): return klass +@pytest.fixture( + params=[ + getattr(offsets, o) + for o in offsets.__all__ + if issubclass(getattr(offsets, o), liboffsets.MonthOffset) + and o != "MonthOffset" + ] +) +def month_classes(request): + """ + Fixture for month based datetime offsets available for a time series. + """ + return request.param + + +@pytest.fixture( + params=[ + getattr(offsets, o) for o in offsets.__all__ if o not in ("Tick", "BaseOffset") + ] +) +def offset_types(request): + """ + Fixture for all the datetime offsets available for a time series. 
+ """ + return request.param + + @pytest.fixture def dt(): return Timestamp(datetime(2008, 1, 2)) @@ -229,18 +256,9 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=Fals assert result == expected # see gh-14101 - exp_warning = None ts = Timestamp(dt) + Nano(5) - - if ( - type(offset_s).__name__ == "DateOffset" - and (funcname in ["apply", "_apply"] or normalize) - and ts.nanosecond > 0 - ): - exp_warning = UserWarning - # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning): + with tm.assert_produces_warning(None): result = func(ts) assert isinstance(result, Timestamp) @@ -274,18 +292,9 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=Fals assert result == expected_localize # see gh-14101 - exp_warning = None ts = Timestamp(dt, tz=tz) + Nano(5) - - if ( - type(offset_s).__name__ == "DateOffset" - and (funcname in ["apply", "_apply"] or normalize) - and ts.nanosecond > 0 - ): - exp_warning = UserWarning - # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning): + with tm.assert_produces_warning(None): result = func(ts) assert isinstance(result, Timestamp) if normalize is False: @@ -838,7 +847,7 @@ def test_rule_code(self): "NOV", "DEC", ] - base_lst = ["YE", "YS", "BY", "BYS", "QE", "QS", "BQ", "BQS"] + base_lst = ["YE", "YS", "BYE", "BYS", "QE", "QS", "BQE", "BQS"] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -857,7 +866,7 @@ def test_freq_offsets(): class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! - month_prefixes = ["YE", "YS", "BY", "BYS", "QE", "BQ", "BQS", "QS"] + month_prefixes = ["YE", "YS", "BYE", "BYS", "QE", "BQE", "BQS", "QS"] names = [ prefix + "-" + month for prefix in month_prefixes @@ -1010,6 +1019,14 @@ def test_dateoffset_add_sub_timestamp_with_nano(): result = offset + ts assert result == expected + offset2 = DateOffset(minutes=2, nanoseconds=9, hour=1) + assert offset2._use_relativedelta + with tm.assert_produces_warning(None): + # no warning about Discarding nonzero nanoseconds + result2 = ts + offset2 + expected2 = Timestamp("1970-01-01 01:02:00.000000013") + assert result2 == expected2 + @pytest.mark.parametrize( "attribute", @@ -1122,7 +1139,7 @@ def test_is_yqm_start_end(): bm = to_offset("BME") qfeb = to_offset("QE-FEB") qsfeb = to_offset("QS-FEB") - bq = to_offset("BQ") + bq = to_offset("BQE") bqs_apr = to_offset("BQS-APR") as_nov = to_offset("YS-NOV") diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 86560969d10ce..632d3b4cc3c84 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -15,6 +15,7 @@ tslib, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm @@ -22,6 +23,90 @@ creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value +class TestArrayToDatetimeResolutionInference: + # TODO: tests that include tzs, ints + + def test_infer_all_nat(self): + arr = np.array([NaT, np.nan], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + assert result.dtype == "M8[s]" + + def test_infer_homogeoneous_datetimes(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + arr = np.array([dt, dt, dt], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = 
np.array([dt, dt, dt], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_date_objects(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + dt2 = dt.date() + arr = np.array([None, dt2, dt2, dt2], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), dt2, dt2, dt2], dtype="M8[s]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_dt64(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + dt64 = np.datetime64(dt, "ms") + arr = np.array([None, dt64, dt64, dt64], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), dt64, dt64, dt64], dtype="M8[ms]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_timestamps(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + ts = Timestamp(dt).as_unit("ns") + arr = np.array([None, ts, ts, ts], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT")] + [ts.asm8] * 3, dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_datetimes_strings(self): + item = "2023-10-27 18:03:05.678000" + arr = np.array([None, item, item, item], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), item, item, item], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_heterogeneous(self): + dtstr = "2023-10-27 18:03:05.678000" + + arr = np.array([dtstr, dtstr[:-3], dtstr[:-7], None], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array(arr, dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + result, tz = tslib.array_to_datetime(arr[::-1], creso=creso_infer) + assert tz is None + tm.assert_numpy_array_equal(result, expected[::-1]) + + @pytest.mark.parametrize( + "item", [float("nan"), NaT.value, float(NaT.value), "NaT", ""] + ) + def test_infer_with_nat_int_float_str(self, item): + # floats/ints get inferred to nanos *unless* they are NaN/iNaT, + # similar NaT string gets treated like NaT scalar (ignored for resolution) + dt = datetime(2023, 11, 15, 15, 5, 6) + + arr = np.array([dt, item], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([dt, np.datetime64("NaT")], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + result2, tz2 = tslib.array_to_datetime(arr[::-1], creso=creso_infer) + assert tz2 is None + tm.assert_numpy_array_equal(result2, expected[::-1]) + + class TestArrayToDatetimeWithTZResolutionInference: def test_array_to_datetime_with_tz_resolution(self): tz = tzoffset("custom", 3600) @@ -41,11 +126,11 @@ def test_array_to_datetime_with_tz_resolution_all_nat(self): tz = tzoffset("custom", 3600) vals = np.array(["NaT"], dtype=object) res = tslib.array_to_datetime_with_tz(vals, tz, False, False, creso_infer) - assert res.dtype == "M8[ns]" + assert res.dtype == "M8[s]" vals2 = np.array([NaT, NaT], dtype=object) res2 = tslib.array_to_datetime_with_tz(vals2, tz, False, False, creso_infer) - assert res2.dtype == "M8[ns]" + assert res2.dtype == "M8[s]" @pytest.mark.parametrize( @@ -162,7 +247,7 @@ def test_coerce_outside_ns_bounds(invalid_date, errors): if errors == "raise": msg = 
"^Out of bounds nanosecond timestamp: .*, at position 0$" - with pytest.raises(ValueError, match=msg): + with pytest.raises(OutOfBoundsDatetime, match=msg): tslib.array_to_datetime(**kwargs) else: # coerce. result, _ = tslib.array_to_datetime(**kwargs) diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py index 7b2268f16a85f..690962f1daa5e 100644 --- a/pandas/tests/tslibs/test_resolution.py +++ b/pandas/tests/tslibs/test_resolution.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pytz from pandas._libs.tslibs import ( @@ -7,6 +8,8 @@ ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +import pandas._testing as tm + def test_get_resolution_nano(): # don't return the fallback RESO_DAY @@ -22,3 +25,33 @@ def test_get_resolution_non_nano_data(): res = get_resolution(arr, pytz.UTC, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US + + +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("Y", "year"), + ("Q", "quarter"), + ("M", "month"), + ("D", "day"), + ("h", "hour"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), + ("ns", "nanosecond"), + ], +) +def test_get_attrname_from_abbrev(freqstr, expected): + reso = Resolution.get_reso_from_freqstr(freqstr) + assert reso.attr_abbrev == freqstr + assert reso.attrname == expected + + +@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) +def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): + # GH#52536 + msg = f"'{freq}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + Resolution.get_reso_from_freqstr(freq) diff --git a/pandas/tests/tslibs/test_strptime.py b/pandas/tests/tslibs/test_strptime.py index ce45bdd10b8e8..d726006b03f6d 100644 --- a/pandas/tests/tslibs/test_strptime.py +++ b/pandas/tests/tslibs/test_strptime.py @@ -9,13 +9,26 @@ from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.strptime import array_strptime -from pandas import Timestamp +from pandas import ( + NaT, + Timestamp, +) import pandas._testing as tm creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value class TestArrayStrptimeResolutionInference: + def test_array_strptime_resolution_all_nat(self): + arr = np.array([NaT, np.nan], dtype=object) + + fmt = "%Y-%m-%d %H:%M:%S" + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + assert res.dtype == "M8[s]" + + res, _ = array_strptime(arr, fmt=fmt, utc=True, creso=creso_infer) + assert res.dtype == "M8[s]" + @pytest.mark.parametrize("tz", [None, timezone.utc]) def test_array_strptime_resolution_inference_homogeneous_strings(self, tz): dt = datetime(2016, 1, 2, 3, 4, 5, 678900, tzinfo=tz) diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 3fb205f21a857..ef68408305232 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -26,8 +26,8 @@ ("1s0.25ms", offsets.Micro(1000250)), ("1s0.25ms", offsets.Micro(1000250)), ("2800ns", offsets.Nano(2800)), - ("2SM", offsets.SemiMonthEnd(2)), - ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), + ("2SME", offsets.SemiMonthEnd(2)), + ("2SME-16", offsets.SemiMonthEnd(2, day_of_month=16)), ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), ("2SMS-15", offsets.SemiMonthBegin(2)), ], @@ -38,7 +38,7 @@ def test_to_offset(freq_input, expected): @pytest.mark.parametrize( - "freqstr,expected", [("-1s", -1), ("-2SM", -2), ("-1SMS", -1), ("-5min10s", 
-310)] + "freqstr,expected", [("-1s", -1), ("-2SME", -2), ("-1SMS", -1), ("-5min10s", -310)] ) def test_to_offset_negative(freqstr, expected): result = to_offset(freqstr) @@ -66,12 +66,12 @@ def test_to_offset_negative(freqstr, expected): "+d", "-m", # Invalid shortcut anchors. - "SM-0", - "SM-28", - "SM-29", - "SM-FOO", + "SME-0", + "SME-28", + "SME-29", + "SME-FOO", "BSM", - "SM--1", + "SME--1", "SMS-1", "SMS-28", "SMS-30", @@ -161,10 +161,10 @@ def test_to_offset_pd_timedelta(kwargs, expected): ("QE", offsets.QuarterEnd(startingMonth=12)), ("QE-DEC", offsets.QuarterEnd(startingMonth=12)), ("QE-MAY", offsets.QuarterEnd(startingMonth=5)), - ("SM", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-15", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), - ("SM-27", offsets.SemiMonthEnd(day_of_month=27)), + ("SME", offsets.SemiMonthEnd(day_of_month=15)), + ("SME-15", offsets.SemiMonthEnd(day_of_month=15)), + ("SME-1", offsets.SemiMonthEnd(day_of_month=1)), + ("SME-27", offsets.SemiMonthEnd(day_of_month=27)), ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), ], diff --git a/pandas/tests/util/test_safe_import.py b/pandas/tests/util/test_safe_import.py deleted file mode 100644 index bd07bea934ed3..0000000000000 --- a/pandas/tests/util/test_safe_import.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -import types - -import pytest - -import pandas.util._test_decorators as td - - -@pytest.mark.parametrize("name", ["foo", "hello123"]) -def test_safe_import_non_existent(name): - assert not td.safe_import(name) - - -def test_safe_import_exists(): - assert td.safe_import("pandas") - - -@pytest.mark.parametrize("min_version,valid", [("0.0.0", True), ("99.99.99", False)]) -def test_safe_import_versions(min_version, valid): - result = td.safe_import("pandas", min_version=min_version) - result = result if valid else not result - assert result - - -@pytest.mark.parametrize( - "min_version,valid", [(None, False), ("1.0", True), ("2.0", False)] -) -def test_safe_import_dummy(monkeypatch, min_version, valid): - mod_name = "hello123" - - mod = types.ModuleType(mod_name) - mod.__version__ = "1.5" - - if min_version is not None: - monkeypatch.setitem(sys.modules, mod_name, mod) - - result = td.safe_import(mod_name, min_version=min_version) - result = result if valid else not result - assert result diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 4e4eca6e772e7..136f81632cb0a 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -301,8 +301,9 @@ def test_center_reindex_series(raw, series): tm.assert_series_equal(series_xp, series_rs) -def test_center_reindex_frame(raw, frame): +def test_center_reindex_frame(raw): # shifter index + frame = DataFrame(range(100), index=date_range("2020-01-01", freq="D", periods=100)) s = [f"x{x:d}" for x in range(12)] minp = 10 diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index c5c395414b450..427780db79783 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -102,12 +102,38 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): tm.assert_frame_equal(result, expected) -def test_ewma_with_times_variable_spacing(tz_aware_fixture): +@pytest.mark.parametrize( + "unit", + [ + pytest.param( + "s", + marks=pytest.mark.xfail( + reason="ExponentialMovingWindow constructor raises on non-nano" + ), + ), + pytest.param( + "ms", + 
marks=pytest.mark.xfail( + reason="ExponentialMovingWindow constructor raises on non-nano" + ), + ), + pytest.param( + "us", + marks=pytest.mark.xfail( + reason="ExponentialMovingWindow constructor raises on non-nano" + ), + ), + "ns", + ], +) +def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): tz = tz_aware_fixture halflife = "23 days" - times = DatetimeIndex( - ["2020-01-01", "2020-01-10T00:04:05", "2020-02-23T05:00:23"] - ).tz_localize(tz) + times = ( + DatetimeIndex(["2020-01-01", "2020-01-10T00:04:05", "2020-02-23T05:00:23"]) + .tz_localize(tz) + .as_unit(unit) + ) data = np.arange(3) df = DataFrame(data) result = df.ewm(halflife=halflife, times=times).mean() diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index a23c91df5eef6..4fd33d54ef846 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -3,6 +3,7 @@ from pandas import ( DataFrame, + DatetimeIndex, Index, MultiIndex, Series, @@ -373,24 +374,11 @@ def test_groupby_rolling_center_on(self): .rolling(6, on="Date", center=True, min_periods=1) .value.mean() ) + mi = MultiIndex.from_arrays([df["gb"], df["Date"]], names=["gb", "Date"]) expected = Series( [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 7.0, 7.5, 7.5, 7.5], name="value", - index=MultiIndex.from_tuples( - ( - ("group_1", Timestamp("2020-01-01")), - ("group_1", Timestamp("2020-01-02")), - ("group_1", Timestamp("2020-01-03")), - ("group_1", Timestamp("2020-01-04")), - ("group_1", Timestamp("2020-01-05")), - ("group_1", Timestamp("2020-01-06")), - ("group_2", Timestamp("2020-01-07")), - ("group_2", Timestamp("2020-01-08")), - ("group_2", Timestamp("2020-01-09")), - ("group_2", Timestamp("2020-01-10")), - ), - names=["gb", "Date"], - ), + index=mi, ) tm.assert_series_equal(result, expected) @@ -625,14 +613,14 @@ def test_groupby_rolling_no_sort(self): expected = expected.drop(columns="foo") tm.assert_frame_equal(result, expected) - def test_groupby_rolling_count_closed_on(self): + def test_groupby_rolling_count_closed_on(self, unit): # GH 35869 df = DataFrame( { "column1": range(6), "column2": range(6), "group": 3 * ["A", "B"], - "date": date_range(end="20190101", periods=6), + "date": date_range(end="20190101", periods=6, unit=unit), } ) result = ( @@ -640,20 +628,28 @@ def test_groupby_rolling_count_closed_on(self): .rolling("3d", on="date", closed="left")["column1"] .count() ) + dti = DatetimeIndex( + [ + "2018-12-27", + "2018-12-29", + "2018-12-31", + "2018-12-28", + "2018-12-30", + "2019-01-01", + ], + dtype=f"M8[{unit}]", + ) + mi = MultiIndex.from_arrays( + [ + ["A", "A", "A", "B", "B", "B"], + dti, + ], + names=["group", "date"], + ) expected = Series( [np.nan, 1.0, 1.0, np.nan, 1.0, 1.0], name="column1", - index=MultiIndex.from_tuples( - [ - ("A", Timestamp("2018-12-27")), - ("A", Timestamp("2018-12-29")), - ("A", Timestamp("2018-12-31")), - ("B", Timestamp("2018-12-28")), - ("B", Timestamp("2018-12-30")), - ("B", Timestamp("2019-01-01")), - ], - names=["group", "date"], - ), + index=mi, ) tm.assert_series_equal(result, expected) @@ -885,7 +881,7 @@ def test_groupby_level(self): ], ], ) - def test_as_index_false(self, by, expected_data): + def test_as_index_false(self, by, expected_data, unit): # GH 39433 data = [ ["A", "2018-01-01", 100.0], @@ -894,7 +890,7 @@ def test_as_index_false(self, by, expected_data): ["B", "2018-01-02", 250.0], ] df = DataFrame(data, columns=["id", "date", "num"]) - df["date"] = to_datetime(df["date"]) + df["date"] = df["date"].astype(f"M8[{unit}]") df = 
df.set_index(["date"]) gp_by = [getattr(df, attr) for attr in by] @@ -908,6 +904,8 @@ def test_as_index_false(self, by, expected_data): expected, index=df.index, ) + if "date" in expected_data: + expected["date"] = expected["date"].astype(f"M8[{unit}]") tm.assert_frame_equal(result, expected) def test_nan_and_zero_endpoints(self, any_int_numpy_dtype): diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index f5ef6a00e0b32..b1cc7ec186f19 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -1,11 +1,6 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -17,15 +12,7 @@ ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu @pytest.fixture(params=["single", "table"]) diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index 8c4fb1fe6872b..14d3a39107bc4 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -1,27 +1,13 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) - from pandas import ( DataFrame, Series, ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu pytest.importorskip("numba") diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py index 51f801ab3761b..5906ff52db098 100644 --- a/pandas/tests/window/test_rolling_functions.py +++ b/pandas/tests/window/test_rolling_functions.py @@ -346,7 +346,7 @@ def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value): lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), pytest.param( lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - marks=td.skip_if_no_scipy, + marks=td.skip_if_no("scipy"), ), ], ) @@ -508,7 +508,7 @@ def test_rolling_min_max_numeric_types(data_type): lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), pytest.param( lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - marks=td.skip_if_no_scipy, + marks=td.skip_if_no("scipy"), ), ], ) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 8460ef5826e34..4a1a668426b36 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -59,7 +59,7 @@ # -------------------------------------------------------------------- # Offset related functions -_need_suffix = ["QS", "BQ", "BQS", "YS", "BY", "BYS"] +_need_suffix = ["QS", "BQE", "BQS", "YS", "BYE", "BYS"] for _prefix in _need_suffix: for _m in MONTHS: @@ -345,7 +345,7 @@ def _get_annual_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "YS", "bs": "BYS", "ce": "YE", "be": "BY"}.get(pos_check) + return {"cs": "YS", "bs": "BYS", "ce": "YE", "be": "BYE"}.get(pos_check) def _get_quarterly_rule(self) -> 
str | None: if len(self.mdiffs) > 1: @@ -359,7 +359,7 @@ def _get_quarterly_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "QS", "bs": "BQS", "ce": "QE", "be": "BQ"}.get(pos_check) + return {"cs": "QS", "bs": "BQS", "ce": "QE", "be": "BQE"}.get(pos_check) def _get_monthly_rule(self) -> str | None: if len(self.mdiffs) > 1: diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index e71b4881a58a4..2c1912bce856d 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -2,8 +2,8 @@ This module provides decorator functions which can be applied to test objects in order to skip those objects when certain conditions occur. A sample use case is to detect if the platform is missing ``matplotlib``. If so, any test objects -which require ``matplotlib`` and decorated with ``@td.skip_if_no_mpl`` will be -skipped by ``pytest`` during the execution of the test suite. +which require ``matplotlib`` and decorated with ``@td.skip_if_no("matplotlib")`` +will be skipped by ``pytest`` during the execution of the test suite. To illustrate, after importing this module: @@ -11,13 +11,13 @@ The decorators can be applied to classes: -@td.skip_if_some_reason +@td.skip_if_no("package") class Foo: ... Or individual functions: -@td.skip_if_some_reason +@td.skip_if_no("package") def test_foo(): ... @@ -44,59 +44,7 @@ def test_foo(): IS64, is_platform_windows, ) - -from pandas.core.computation.expressions import ( - NUMEXPR_INSTALLED, - USE_NUMEXPR, -) -from pandas.util.version import Version - - -def safe_import(mod_name: str, min_version: str | None = None): - """ - Parameters - ---------- - mod_name : str - Name of the module to be imported - min_version : str, default None - Minimum required version of the specified mod_name - - Returns - ------- - object - The imported module if successful, or False - """ - try: - mod = __import__(mod_name) - except ImportError: - return False - - if not min_version: - return mod - else: - import sys - - version = getattr(sys.modules[mod_name], "__version__") - if version and Version(version) >= Version(min_version): - return mod - - return False - - -def _skip_if_not_us_locale() -> bool: - lang, _ = locale.getlocale() - if lang != "en_US": - return True - return False - - -def _skip_if_no_scipy() -> bool: - return not ( - safe_import("scipy.stats") - and safe_import("scipy.sparse") - and safe_import("scipy.interpolate") - and safe_import("scipy.signal") - ) +from pandas.compat._optional import import_optional_dependency def skip_if_installed(package: str) -> pytest.MarkDecorator: @@ -115,7 +63,8 @@ def skip_if_installed(package: str) -> pytest.MarkDecorator: parametrization mark. """ return pytest.mark.skipif( - safe_import(package), reason=f"Skipping because {package} is installed." 
+ bool(import_optional_dependency(package, errors="ignore")), + reason=f"Skipping because {package} is installed.", ) @@ -154,25 +103,20 @@ def skip_if_no(package: str, min_version: str | None = None) -> pytest.MarkDecor if min_version: msg += f" satisfying a min_version of {min_version}" return pytest.mark.skipif( - not safe_import(package, min_version=min_version), reason=msg + not bool( + import_optional_dependency( + package, errors="ignore", min_version=min_version + ) + ), + reason=msg, ) -skip_if_mpl = pytest.mark.skipif( - bool(safe_import("matplotlib")), reason="matplotlib is present" -) skip_if_32bit = pytest.mark.skipif(not IS64, reason="skipping for 32 bit") skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows") skip_if_not_us_locale = pytest.mark.skipif( - _skip_if_not_us_locale(), - reason=f"Specific locale is set {locale.getlocale()[0]}", -) -skip_if_no_scipy = pytest.mark.skipif( - _skip_if_no_scipy(), reason="Missing SciPy requirement" -) -skip_if_no_ne = pytest.mark.skipif( - not USE_NUMEXPR, - reason=f"numexpr enabled->{USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}", + locale.getlocale()[0] != "en_US", + reason=f"Set local {locale.getlocale()[0]} is not en_US", ) diff --git a/pyproject.toml b/pyproject.toml index 26d52d97b0934..65c647e439cb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "meson-python==0.13.1", "meson==1.2.1", "wheel", - "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json + "Cython==3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json # Any NumPy version should be fine for compiling. Users are unlikely # to get a NumPy<1.25 so the result will be compatible with all relevant # NumPy versions (if not it is presumably compatible with their version). 
@@ -76,17 +76,19 @@ hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/i
         #'blosc>=1.20.1',
         'tables>=3.8.0']
 spss = ['pyreadstat>=1.2.0']
-postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6']
+postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.8.0']
 mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2']
-sql-other = ['SQLAlchemy>=2.0.0']
+sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.8.0', 'adbc-driver-sqlite>=0.8.0']
 html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2']
 xml = ['lxml>=4.9.2']
 plot = ['matplotlib>=3.6.3']
 output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0']
-clipboard = ['PyQt5>=5.15.8', 'qtpy>=2.3.0']
+clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0']
 compression = ['zstandard>=0.19.0']
 consortium-standard = ['dataframe-api-compat>=0.1.7']
-all = ['beautifulsoup4>=4.11.2',
+all = ['adbc-driver-postgresql>=0.8.0',
+       'adbc-driver-sqlite>=0.8.0',
+       'beautifulsoup4>=4.11.2',
        # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
        #'blosc>=1.21.3',
        'bottleneck>=1.3.6',
@@ -107,7 +109,7 @@ all = ['beautifulsoup4>=4.11.2',
        'psycopg2>=2.9.6',
        'pyarrow>=10.0.1',
        'pymysql>=1.0.2',
-       'PyQt5>=5.15.8',
+       'PyQt5>=5.15.9',
        'pyreadstat>=1.2.0',
        'pytest>=7.3.2',
        'pytest-xdist>=2.2.0',
@@ -486,6 +488,10 @@ filterwarnings = [
   "error:::pandas",
   "error::ResourceWarning",
   "error::pytest.PytestUnraisableExceptionWarning",
+  # TODO(PY311-minimum): Specify EncodingWarning
+  # Ignore 3rd party EncodingWarning but raise on pandas'
+  "ignore:.*encoding.* argument not specified",
+  "error:.*encoding.* argument not specified::pandas",
   "ignore:.*ssl.SSLSocket:pytest.PytestUnraisableExceptionWarning",
   "ignore:.*ssl.SSLSocket:ResourceWarning",
   # GH 44844: Can remove once minimum matplotlib version >= 3.7
diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json
index c059b9c589ecd..e155b34053069 100644
--- a/pyright_reportGeneralTypeIssues.json
+++ b/pyright_reportGeneralTypeIssues.json
@@ -99,6 +99,9 @@
         "pandas/io/sql.py",
         "pandas/io/stata.py",
         "pandas/plotting/_matplotlib/boxplot.py",
+        "pandas/plotting/_matplotlib/core.py",
+        "pandas/plotting/_matplotlib/timeseries.py",
+        "pandas/plotting/_matplotlib/tools.py",
         "pandas/tseries/frequencies.py",
         "pandas/tseries/holiday.py",
     ],
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 490e170299030..fa3ef1c214a14 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,12 +3,14 @@

 pip
 versioneer[toml]
-cython==0.29.33
+cython==3.0.5
 meson[ninja]==1.2.1
 meson-python==0.13.1
 pytest>=7.3.2
 pytest-cov
 pytest-xdist>=2.2.0
+pytest-qt>=4.2.0
+PyQt5>=5.15.9
 coverage
 python-dateutil
 numpy<2
@@ -24,7 +26,7 @@ gcsfs>=2022.11.0
 ipython
 jinja2>=3.1.2
 lxml>=4.9.2
-matplotlib>=3.6.3, <3.8
+matplotlib>=3.6.3
 numba>=0.56.4
 numexpr>=2.8.4
 openpyxl>=3.1.0
@@ -81,6 +83,8 @@ feedparser
 pyyaml
 requests
 pygments
+adbc-driver-postgresql>=0.8.0
+adbc-driver-sqlite>=0.8.0
 dataframe-api-compat>=0.1.7
 sphinx-toggleprompt
 typing_extensions; python_version<"3.11"
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
index 49fecca253ba4..5fcf09cd073fe 100755
--- a/scripts/generate_pip_deps_from_conda.py
+++ b/scripts/generate_pip_deps_from_conda.py
@@ -25,12 +25,13 @@

 EXCLUDE = {"python", "c-compiler", "cxx-compiler"}
 REMAP_VERSION = {"tzdata": "2022.7"}
-RENAME = {
+CONDA_TO_PIP = {
     "pytables": "tables",
     "psycopg2": "psycopg2-binary",
     "dask-core": "dask",
     "seaborn-base": "seaborn",
     "sqlalchemy": "SQLAlchemy",
+    "pyqt": "PyQt5",
 }


@@ -40,7 +41,7 @@ def conda_package_to_pip(package: str):

     In most cases they are the same, those are the exceptions:
     - Packages that should be excluded (in `EXCLUDE`)
-    - Packages that should be renamed (in `RENAME`)
+    - Packages that should be renamed (in `CONDA_TO_PIP`)
     - A package requiring a specific version, in conda is defined with a single
       equal (e.g. ``pandas=1.0``) and in pip with two (e.g. ``pandas==1.0``)
     """
@@ -53,14 +54,14 @@ def conda_package_to_pip(package: str):
                 return
             if pkg in REMAP_VERSION:
                 return "".join((pkg, compare, REMAP_VERSION[pkg]))
-            if pkg in RENAME:
-                return "".join((RENAME[pkg], compare, version))
+            if pkg in CONDA_TO_PIP:
+                return "".join((CONDA_TO_PIP[pkg], compare, version))

     if package in EXCLUDE:
         return

-    if package in RENAME:
-        return RENAME[package]
+    if package in CONDA_TO_PIP:
+        return CONDA_TO_PIP[package]

     return package
diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py
index dedcdb5532593..35cbbef08124e 100644
--- a/scripts/run_stubtest.py
+++ b/scripts/run_stubtest.py
@@ -47,6 +47,8 @@
     # stubtest might be too sensitive
     "pandas._libs.lib.NoDefault",
     "pandas._libs.lib._NoDefault.no_default",
+    # stubtest/Cython is not recognizing the default value for the dtype parameter
+    "pandas._libs.lib.map_infer_mask",
     # internal type alias (should probably be private)
     "pandas._libs.lib.ndarray_obj_2d",
     # runtime argument "owner" has a default value but stub argument does not
diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py
index d5c491f6ebc12..7dd3e96e6ec18 100755
--- a/scripts/validate_min_versions_in_sync.py
+++ b/scripts/validate_min_versions_in_sync.py
@@ -26,7 +26,7 @@

     from typing import Any

-from scripts.generate_pip_deps_from_conda import RENAME
+from scripts.generate_pip_deps_from_conda import CONDA_TO_PIP

 DOC_PATH = pathlib.Path("doc/source/getting_started/install.rst").resolve()
 CI_PATH = next(
@@ -36,7 +36,7 @@
 SETUP_PATH = pathlib.Path("pyproject.toml").resolve()
 YAML_PATH = pathlib.Path("ci/deps")
 ENV_PATH = pathlib.Path("environment.yml")
-EXCLUDE_DEPS = {"tzdata", "blosc", "pandas-gbq"}
+EXCLUDE_DEPS = {"tzdata", "blosc", "pandas-gbq", "pyqt", "pyqt5"}
 EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"])
 # pandas package is not available
 # in pre-commit environment
@@ -169,8 +169,8 @@ def pin_min_versions_to_yaml_file(
         old_dep = yaml_package
         if yaml_versions is not None:
             old_dep = old_dep + ", ".join(yaml_versions)
-        if RENAME.get(yaml_package, yaml_package) in toml_map:
-            min_dep = toml_map[RENAME.get(yaml_package, yaml_package)]
+        if CONDA_TO_PIP.get(yaml_package, yaml_package) in toml_map:
+            min_dep = toml_map[CONDA_TO_PIP.get(yaml_package, yaml_package)]
         elif yaml_package in toml_map:
             min_dep = toml_map[yaml_package]
         else:
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index 6e6251425928d..5bde4c21cfab5 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -50,6 +50,7 @@
     "_global_config",
     "_chained_assignment_msg",
     "_chained_assignment_method_msg",
+    "_chained_assignment_warning_method_msg",
     "_version_meson",
     # The numba extensions need this to mock the iloc object
     "_iLocIndexer",
diff --git a/setup.py b/setup.py
index db3717efb738d..d6fb4b775a351 100755
--- a/setup.py
+++ b/setup.py
@@ -37,7 +37,7 @@ def is_platform_mac():


 # note: sync with pyproject.toml, environment.yml and asv.conf.json
-min_cython_ver = "0.29.33"
+min_cython_ver = "3.0.5"
 try:
     from Cython import (