diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index e156b1e0aeca2..33b6e7a8c2340 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -88,7 +88,6 @@ jobs: name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: PATTERN: ${{ matrix.pattern }} - EXTRA_APT: ${{ matrix.extra_apt || '' }} LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} @@ -96,6 +95,8 @@ jobs: TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: 'auto' PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + # Clipboard tests + QT_QPA_PLATFORM: offscreen concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }} @@ -145,8 +146,8 @@ jobs: fetch-depth: 0 - name: Extra installs - # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }} + if: ${{ matrix.extra_apt }} - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d70d9d0aa5227..6682a60f42997 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -245,7 +245,8 @@ def time_extract_single_group(self, dtype, expand): class Dummies(Dtypes): def setup(self, dtype): super().setup(dtype) - self.s = self.s.str.join("|") + N = len(self.s) // 5 + self.s = self.s[:N].str.join("|") def time_get_dummies(self, dtype): self.s.str.get_dummies("|") diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 94652e8586d77..7d08f2df5a8ca 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -42,6 +43,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -56,5 +58,6 @@ dependencies: - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.8 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index bf47bfe3a83ec..7a2bbe442cde7 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -16,6 +16,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -43,6 +44,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -70,6 +72,7 @@ dependencies: - pyyaml - py - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - - pyqt5>=5.15.8 - tzdata>=2022.7 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index bd9e2059ef477..681345a34513d 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies 
@@ -38,6 +39,7 @@ dependencies: - numexpr>=2.8.4 - odfpy>=1.4.1 - qtpy>=2.3.0 + - pyqt>=5.15.9 - openpyxl>=3.1.0 - psycopg2>=2.9.6 - pyarrow>=10.0.1 @@ -56,5 +58,6 @@ dependencies: - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.8 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index aa8597978ecf7..2e3b03f63eb99 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -17,6 +17,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -44,6 +45,7 @@ dependencies: - psycopg2=2.9.6 - pyarrow=10.0.1 - pymysql=1.0.2 + - pyqt=5.15.9 - pyreadstat=1.2.0 - pytables=3.8.0 - python-calamine=0.1.6 @@ -58,6 +60,7 @@ dependencies: - zstandard=0.19.0 - pip: + - adbc-driver-postgresql==0.8.0 + - adbc-driver-sqlite==0.8.0 - dataframe-api-compat==0.1.7 - - pyqt5==5.15.8 - tzdata==2022.7 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index cf4087a3e4670..d3e545b636de9 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -42,6 +43,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -56,5 +58,6 @@ dependencies: - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.8 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index abe6145d077ed..0098406037876 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies @@ -42,6 +43,7 @@ dependencies: - psycopg2>=2.9.6 - pyarrow>=10.0.1 - pymysql>=1.0.2 + - pyqt>=5.15.9 - pyreadstat>=1.2.0 - pytables>=3.8.0 - python-calamine>=0.1.6 @@ -54,3 +56,6 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=3.0.5 - zstandard>=0.19.0 + - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 0cc1fe2629e46..7fc42f6021f00 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -86,7 +86,7 @@ Before we begin, please: Option 1: using mamba (recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Install `mamba `_ +* Install `mamba `_ * Make sure your mamba is up to date (``mamba update mamba``) .. code-block:: none diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 2af66da6b8baf..c048c71613b57 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -335,7 +335,7 @@ lxml 4.9.2 xml XML parser for read SQL databases ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. 
+Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``.

 ========================= ================== =============== =============================================================
 Dependency                Minimum Version    pip extra       Notes
 ========================= ================== =============== =============================================================
 SQLAlchemy                2.0.0              postgresql,     SQL support for databases other than sqlite
                                              mysql,
                                              sql-other
 psycopg2                  2.9.6              postgresql      PostgreSQL engine for sqlalchemy
 pymysql                   1.0.2              mysql           MySQL engine for sqlalchemy
+adbc-driver-postgresql    0.8.0              postgresql      ADBC Driver for PostgreSQL
+adbc-driver-sqlite        0.8.0              sql-other       ADBC Driver for SQLite
 ========================= ================== =============== =============================================================

 Other data sources
 ^^^^^^^^^^^^^^^^^^

@@ -395,7 +397,7 @@ Installable with ``pip install "pandas[clipboard]"``.

 ========================= ================== =============== =============================================================
 Dependency                Minimum Version    pip extra       Notes
 ========================= ================== =============== =============================================================
-PyQt4/PyQt5               5.15.8             clipboard       Clipboard I/O
+PyQt4/PyQt5               5.15.9             clipboard       Clipboard I/O
 qtpy                      2.3.0              clipboard       Clipboard I/O
 ========================= ================== =============== =============================================================

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 73ec09cdd12f1..21e07e8d00ad6 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5565,9 +5565,23 @@ SQL queries
 -----------

 The :mod:`pandas.io.sql` module provides a collection of query wrappers to both
-facilitate data retrieval and to reduce dependency on DB-specific API. Database abstraction
-is provided by SQLAlchemy if installed. In addition you will need a driver library for
-your database. Examples of such drivers are `psycopg2 `__
+facilitate data retrieval and to reduce dependency on DB-specific APIs.
+
+Where available, users may first want to opt for `Apache Arrow ADBC
+`_ drivers. These drivers
+should provide the best performance, null handling, and type detection.
+
+  .. versionadded:: 2.2.0
+
+     Added native support for ADBC drivers
+
+For a full list of ADBC drivers and their development status, see the `ADBC Driver
+Implementation Status `_
+documentation.
+
+Where an ADBC driver is not available or may be missing functionality,
+users should opt for installing SQLAlchemy alongside their database driver library.
+Examples of such drivers are `psycopg2 `__
 for PostgreSQL or `pymysql `__ for MySQL.
 For `SQLite `__ this is
 included in Python's standard library by default.

@@ -5600,6 +5614,18 @@ In the following example, we use the `SQlite
 engine. You can use a temporary SQLite database where data are stored in "memory".

+To connect using an ADBC driver, you will want to install the ``adbc_driver_sqlite`` package using your
+package manager. Once installed, you can use the DBAPI interface provided by the ADBC driver
+to connect to your database.
+
+.. code-block:: python
+
+   import adbc_driver_sqlite.dbapi as sqlite_dbapi
+
+   # Create the connection
+   with sqlite_dbapi.connect("sqlite:///:memory:") as conn:
+       df = pd.read_sql_table("data", conn)
+
 To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine
 object from database URI. You only need to create the engine once per database you are
 connecting to.
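As a minimal sketch of that SQLAlchemy pattern (the in-memory SQLite URI and the table name
``data`` here are illustrative assumptions, not part of the patch):

.. code-block:: python

   import pandas as pd
   from sqlalchemy import create_engine

   # Create the engine once and reuse it for subsequent reads and writes
   engine = create_engine("sqlite:///:memory:")

   pd.DataFrame({"a": [1, 2, 3]}).to_sql("data", engine, index=False)
   df = pd.read_sql_table("data", engine)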
@@ -5675,9 +5701,74 @@ writes ``data`` to the database in batches of 1000 rows at a time:

 SQL data types
 ++++++++++++++

-:func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate
-SQL data type based on the dtype of the data. When you have columns of dtype
-``object``, pandas will try to infer the data type.
+Ensuring consistent data type management across SQL databases is challenging.
+Not every SQL database offers the same types, and even when they do, the
+implementation of a given type can vary in ways that have subtle effects on how
+types can be preserved.
+
+For the best odds at preserving database types, users are advised to use
+ADBC drivers when available. The Arrow type system offers a wider array of
+types that more closely match database types than the historical pandas/NumPy
+type system. To illustrate, note this (non-exhaustive) listing of types
+available in different databases and pandas backends:
+
++-----------------+-----------------------+----------------+---------+
+|numpy/pandas     |arrow                  |postgres        |sqlite   |
++=================+=======================+================+=========+
+|int16/Int16      |int16                  |SMALLINT        |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int32/Int32      |int32                  |INTEGER         |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int64/Int64      |int64                  |BIGINT          |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|float32          |float32                |REAL            |REAL     |
++-----------------+-----------------------+----------------+---------+
+|float64          |float64                |DOUBLE PRECISION|REAL     |
++-----------------+-----------------------+----------------+---------+
+|object           |string                 |TEXT            |TEXT     |
++-----------------+-----------------------+----------------+---------+
+|bool             |``bool_``              |BOOLEAN         |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns]   |timestamp(us)          |TIMESTAMP       |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns,tz]|timestamp(us,tz)       |TIMESTAMPTZ     |         |
++-----------------+-----------------------+----------------+---------+
+|                 |date32                 |DATE            |         |
++-----------------+-----------------------+----------------+---------+
+|                 |month_day_nano_interval|INTERVAL        |         |
++-----------------+-----------------------+----------------+---------+
+|                 |binary                 |BINARY          |BLOB     |
++-----------------+-----------------------+----------------+---------+
+|                 |decimal128             |DECIMAL [#f1]_  |         |
++-----------------+-----------------------+----------------+---------+
+|                 |list                   |ARRAY [#f1]_    |         |
++-----------------+-----------------------+----------------+---------+
+|                 |struct                 |COMPOSITE TYPE  |         |
+|                 |                       | [#f1]_         |         |
++-----------------+-----------------------+----------------+---------+
+
+.. rubric:: Footnotes
+
+.. [#f1] Not implemented as of writing, but theoretically possible
+
+If you are interested in preserving database types as best as possible
+throughout the lifecycle of your DataFrame, you are encouraged to
+leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql`:
+
+.. code-block:: ipython
+
+   # for roundtripping
+   with pg_dbapi.connect(uri) as conn:
+       df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")
+
+This will prevent your data from being converted to the traditional pandas/NumPy
+type system, which often converts SQL types in ways that make them impossible to
+round-trip.
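A minimal, self-contained illustration of the difference, with ``convert_dtypes``
standing in for a database read (the frame itself is made up):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"a": [1, None], "b": ["x", None]})

   # NumPy-backed: "a" is widened to float64 (NaN for the null), "b" is object
   df.dtypes

   # pyarrow-backed: a nullable integer and a dedicated string type are kept
   df.convert_dtypes(dtype_backend="pyarrow").dtypes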
+
+In case an ADBC driver is not available, :func:`~pandas.DataFrame.to_sql`
+will try to map your data to an appropriate SQL data type based on the dtype of
+the data. When you have columns of dtype ``object``, pandas will try to infer
+the data type.

 You can always override the default type by specifying the desired SQL type of
 any of the columns by using the ``dtype`` argument. This argument needs a

@@ -5696,7 +5787,9 @@ default ``Text`` type for string columns:

    Due to the limited support for timedelta's in the different database
    flavors, columns with type ``timedelta64`` will be written as integer
-   values as nanoseconds to the database and a warning will be raised.
+   values as nanoseconds to the database and a warning will be raised. The only
+   exception to this is when using the ADBC PostgreSQL driver, in which case a
+   timedelta will be written to the database as an ``INTERVAL``.

 .. note::

@@ -5711,7 +5804,7 @@ default ``Text`` type for string columns:

 Datetime data types
 '''''''''''''''''''

-Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing
+Using ADBC or SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing
 datetime data that is timezone naive or timezone aware. However, the resulting
 data stored in the database ultimately depends on the supported data type
 for datetime data of the database system being used.

@@ -5802,7 +5895,7 @@ table name and optionally a subset of columns to read.

 .. note::

    In order to use :func:`~pandas.read_sql_table`, you **must** have the
-   SQLAlchemy optional dependency installed.
+   ADBC driver or SQLAlchemy optional dependency installed.

 .. ipython:: python

@@ -5810,7 +5903,8 @@ table name and optionally a subset of columns to read.

 .. note::

-   Note that pandas infers column dtypes from query outputs, and not by looking
+   ADBC drivers will map database types directly back to arrow types. For other drivers
+   note that pandas infers column dtypes from query outputs, and not by looking
    up data types in the physical database schema. For example, assume ``userid``
    is an integer column in a table. Then, intuitively, ``select userid ...`` will
    return integer-valued series, while ``select cast(userid as text) ...`` will

diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
index 2a04adf2ac7f7..77ce303dc1bfe 100644
--- a/doc/source/whatsnew/v2.1.4.rst
+++ b/doc/source/whatsnew/v2.1.4.rst
@@ -22,9 +22,10 @@ Fixed regressions

 Bug fixes
 ~~~~~~~~~
 - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`)
+- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative step size (:issue:`55832`)
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
 - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
--
+- Fixed bug in :meth:`Series.str.translate` losing object dtype when the string option is set (:issue:`56152`)

 .. ---------------------------------------------------------------------------
 .. _whatsnew_214.other:

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 3795784c64e75..f548b4a722c41 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -89,6 +89,97 @@ a Series. (:issue:`55323`)
     )
     series.list[0]

+.. _whatsnew_220.enhancements.adbc_support:
+
+ADBC Driver support in to_sql and read_sql
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`read_sql` and :meth:`~DataFrame.to_sql` now work with `Apache Arrow ADBC
+`_ drivers. Compared to
+traditional drivers used via SQLAlchemy, ADBC drivers should provide
+significant performance improvements, better type support, and cleaner
+nullability handling.
+
+.. code-block:: ipython
+
+   import adbc_driver_postgresql.dbapi as pg_dbapi
+
+   df = pd.DataFrame(
+       [
+           [1, 2, 3],
+           [4, 5, 6],
+       ],
+       columns=['a', 'b', 'c']
+   )
+   uri = "postgresql://postgres:postgres@localhost/postgres"
+   with pg_dbapi.connect(uri) as conn:
+       df.to_sql("pandas_table", conn, index=False)
+
+   # for roundtripping
+   with pg_dbapi.connect(uri) as conn:
+       df2 = pd.read_sql("pandas_table", conn)
+
+The Arrow type system offers a wider array of types that can more closely match
+what databases like PostgreSQL can offer. To illustrate, note this (non-exhaustive)
+listing of types available in different databases and pandas backends:
+
++-----------------+-----------------------+----------------+---------+
+|numpy/pandas     |arrow                  |postgres        |sqlite   |
++=================+=======================+================+=========+
+|int16/Int16      |int16                  |SMALLINT        |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int32/Int32      |int32                  |INTEGER         |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int64/Int64      |int64                  |BIGINT          |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|float32          |float32                |REAL            |REAL     |
++-----------------+-----------------------+----------------+---------+
+|float64          |float64                |DOUBLE PRECISION|REAL     |
++-----------------+-----------------------+----------------+---------+
+|object           |string                 |TEXT            |TEXT     |
++-----------------+-----------------------+----------------+---------+
+|bool             |``bool_``              |BOOLEAN         |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns]   |timestamp(us)          |TIMESTAMP       |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns,tz]|timestamp(us,tz)       |TIMESTAMPTZ     |         |
++-----------------+-----------------------+----------------+---------+
+|                 |date32                 |DATE            |         |
++-----------------+-----------------------+----------------+---------+
+|                 |month_day_nano_interval|INTERVAL        |         |
++-----------------+-----------------------+----------------+---------+
+|                 |binary                 |BINARY          |BLOB     |
++-----------------+-----------------------+----------------+---------+
+|                 |decimal128             |DECIMAL [#f1]_  |         |
++-----------------+-----------------------+----------------+---------+
+|                 |list                   |ARRAY [#f1]_    |         |
++-----------------+-----------------------+----------------+---------+
+|                 |struct                 |COMPOSITE TYPE  |         |
+|                 |                       | [#f1]_         |         |
++-----------------+-----------------------+----------------+---------+
+
+.. rubric:: Footnotes
+
+.. [#f1] Not implemented as of writing, but theoretically possible
+
+If you are interested in preserving database types as best as possible
+throughout the lifecycle of your DataFrame, you are encouraged to
+leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql`:
+
+.. code-block:: ipython
+
+   # for roundtripping
+   with pg_dbapi.connect(uri) as conn:
+       df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")
+
+This will prevent your data from being converted to the traditional pandas/NumPy
+type system, which often converts SQL types in ways that make them impossible to
+round-trip.
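The cleaner nullability handling can be previewed without a database at all;
``pd.ArrowDtype`` is existing pandas API, and the values here are made up:

.. code-block:: python

   import pandas as pd
   import pyarrow as pa

   # A nullable 16-bit integer keeps its width and gets a real missing-value
   # marker, instead of being widened to float64 with NaN
   ser = pd.Series([1, None], dtype=pd.ArrowDtype(pa.int16()))
   ser.dtype  # int16[pyarrow]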
+ +For a full list of ADBC drivers and their development status, see the `ADBC Driver +Implementation Status `_ +documentation. + .. _whatsnew_220.enhancements.other: Other enhancements @@ -303,9 +394,12 @@ Other Deprecations - Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) - Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) +- Deprecated the :class:`.BaseGrouper` attributes ``group_keys_seq`` and ``reconstructed_codes``; these will be removed in a future version of pandas (:issue:`56148`) +- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`) - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`) - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) - Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`) +- Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) @@ -327,8 +421,10 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` when indexing with a :class:`MultiIndex` (:issue:`56062`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement in :meth:`Index.sort_values` when index is already sorted (:issue:`56128`) - Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`) - Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) +- Performance improvement in :meth:`Series.str.get_dummies` when dtype is ``"string[pyarrow]"`` or ``"string[pyarrow_numpy]"`` (:issue:`56110`) - Performance improvement in :meth:`Series.str` methods (:issue:`55736`) - Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`) - Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`) @@ -356,9 +452,11 @@ Datetimelike - Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) - Bug in :func:`to_datetime` and 
:class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
 - Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`)
+- Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`)
 - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
 - Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
 - Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
+- Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`)
 - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
 - Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. ``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`)
 - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`)
@@ -370,6 +468,7 @@ Datetimelike
 - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
 - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
 - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
+- Bug in the results of :func:`pd.to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`)
 -

 Timedelta

@@ -448,8 +547,8 @@ Period

 Plotting
 ^^^^^^^^
 - Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`)
+- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`)
 - Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`)
--

 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^

@@ -497,6 +596,7 @@ Other
 - Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. (:issue:`55683`)
 - Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`)
 - Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`)
+- Bug in the error message when assigning an empty dataframe to a column (:issue:`55956`)

 ..
***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index f87338e54f563..4aa8073e93f6d 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,8 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 + - pytest-qt>=4.2.0 + - pyqt>=5.15.9 - coverage # required dependencies @@ -113,6 +115,8 @@ dependencies: - pygments # Code highlighting - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2 - typing_extensions; python_version<"3.11" diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 22ffec51c82f1..89f101964aff7 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -1280,13 +1280,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, NPY_DATETIMEUNIT dateUnit = NPY_FR_ns; if (PyTypeNum_ISDATETIME(type_num)) { is_datetimelike = 1; - PyArray_VectorUnaryFunc *castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", - enc->npyType); - } - castfunc(dataptr, &i8date, 1, NULL, NULL); + i8date = *(npy_int64 *)dataptr; dateUnit = get_datetime_metadata_from_dtype(dtype).base; } else if (PyDate_Check(item) || PyDelta_Check(item)) { is_datetimelike = 1; @@ -1425,13 +1419,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (PyTypeNum_ISDATETIME(enc->npyType)) { int64_t longVal; - PyArray_VectorUnaryFunc *castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", - enc->npyType); - } - castfunc(enc->npyValue, &longVal, 1, NULL, NULL); + + longVal = *(npy_int64 *)enc->npyValue; if (longVal == get_nat()) { tc->type = JT_NULL; } else { diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8736b9d5ec8d6..da4eeaeb3b692 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -642,11 +642,16 @@ cpdef array_to_datetime( utc=utc, creso=state.creso, ) - - # Otherwise we can use the single reso that we encountered and avoid - # a second pass. - abbrev = npy_unit_to_abbrev(state.creso) - result = iresult.view(f"M8[{abbrev}]").reshape(result.shape) + elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = iresult.view("M8[s]").reshape(result.shape) + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. + abbrev = npy_unit_to_abbrev(state.creso) + result = iresult.view(f"M8[{abbrev}]").reshape(result.shape) return result, tz_out @@ -823,14 +828,16 @@ def array_to_datetime_with_tz( # We encountered mismatched resolutions, need to re-parse with # the correct one. return array_to_datetime_with_tz(values, tz=tz, creso=creso) - - # Otherwise we can use the single reso that we encountered and avoid - # a second pass. - abbrev = npy_unit_to_abbrev(creso) - result = result.view(f"M8[{abbrev}]") - elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: - # We didn't find any non-NaT to infer from, default to "ns" - result = result.view("M8[ns]") + elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. 
we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = result.view("M8[s]") + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. + abbrev = npy_unit_to_abbrev(creso) + result = result.view(f"M8[{abbrev}]") else: abbrev = npy_unit_to_abbrev(creso) result = result.view(f"M8[{abbrev}]") diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 7ffd630d6d8e1..6b9f41b1bb06f 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1 -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=* ) diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index a5f5d04efeb76..cfe39fe2964cb 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -8,8 +8,5 @@ import numpy as np DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype -def precision_from_unit( - in_reso: int, - out_reso: int = ..., -) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... +def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ... diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 222ff2cde0ede..5ad9a648c52a2 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,8 +1,11 @@ +cimport cython + import numpy as np cimport numpy as cnp from libc.math cimport log10 from numpy cimport ( + float64_t, int32_t, int64_t, ) @@ -37,6 +40,7 @@ from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, NPY_FR_us, + astype_overflowsafe, check_dts_bounds, convert_reso, dts_to_iso_string, @@ -74,6 +78,7 @@ from pandas._libs.tslibs.tzconversion cimport ( from pandas._libs.tslibs.util cimport ( is_float_object, is_integer_object, + is_nan, ) # ---------------------------------------------------------------------- @@ -86,6 +91,78 @@ TD64NS_DTYPE = np.dtype("m8[ns]") # ---------------------------------------------------------------------- # Unit Conversion Helpers +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.overflowcheck(True) +def cast_from_unit_vectorized( + ndarray values, + str unit, +): + """ + Vectorized analogue to cast_from_unit. + """ + cdef: + int64_t m + int p + NPY_DATETIMEUNIT in_reso, out_reso + Py_ssize_t i + + assert values.dtype.kind == "f" + + if unit in "YM": + if not (((values % 1) == 0) | np.isnan(values)).all(): + # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01, + # but not clear what 2.5 "M" corresponds to, so we will + # disallow that case. + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + + # GH#47266 go through np.datetime64 to avoid weird results e.g. 
with "Y" + # and 150 we'd get 2120-01-01 09:00:00 + values = values.astype(f"M8[{unit}]") + dtype = np.dtype("M8[ns]") + return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8") + + in_reso = abbrev_to_npy_unit(unit) + out_reso = abbrev_to_npy_unit("ns") + m, p = precision_from_unit(in_reso, out_reso) + + cdef: + ndarray[int64_t] base, out + ndarray[float64_t] frac + tuple shape = (values).shape + + out = np.empty(shape, dtype="i8") + base = np.empty(shape, dtype="i8") + frac = np.empty(shape, dtype="f8") + + for i in range(len(values)): + if is_nan(values[i]): + base[i] = NPY_NAT + else: + base[i] = values[i] + frac[i] = values[i] - base[i] + + if p: + frac = np.round(frac, p) + + try: + for i in range(len(values)): + if base[i] == NPY_NAT: + out[i] = NPY_NAT + else: + out[i] = (base[i] * m) + (frac[i] * m) + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err + return out + + cdef int64_t cast_from_unit( object ts, str unit, @@ -155,7 +232,7 @@ cdef int64_t cast_from_unit( ) from err -cpdef (int64_t, int) precision_from_unit( +cdef (int64_t, int) precision_from_unit( NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns, ): diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index a726c735bf9a1..ff4fb4d635d17 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -746,7 +746,31 @@ cdef ndarray[int64_t] _ceil_int64(const int64_t[:] values, int64_t unit): cdef ndarray[int64_t] _rounddown_int64(values, int64_t unit): - return _ceil_int64(values - unit // 2, unit) + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] result = np.empty(n, dtype="i8") + int64_t res, value, remainder, half + + half = unit // 2 + + with cython.overflowcheck(True): + for i in range(n): + value = values[i] + + if value == NPY_NAT: + res = NPY_NAT + else: + # This adjustment is the only difference between rounddown_int64 + # and _ceil_int64 + value = value - half + remainder = value % unit + if remainder == 0: + res = value + else: + res = value + (unit - remainder) + + result[i] = res + return result cdef ndarray[int64_t] _roundup_int64(values, int64_t unit): diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index aecc8cf681cf8..7eb8dc0813868 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -33,6 +33,7 @@ class ApplyTypeError(TypeError): ... class BaseOffset: n: int + normalize: bool def __init__(self, n: int = ..., normalize: bool = ...) -> None: ... def __eq__(self, other) -> bool: ... def __ne__(self, other) -> bool: ... @@ -85,7 +86,7 @@ class BaseOffset: @property def freqstr(self) -> str: ... def _apply(self, other): ... - def _apply_array(self, dtarr) -> None: ... + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: ... def rollback(self, dt: datetime) -> datetime: ... def rollforward(self, dt: datetime) -> datetime: ... def is_on_offset(self, dt: datetime) -> bool: ... 
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 01962c47ff31c..47e507a0150e2 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -110,33 +110,6 @@ cdef bint _is_normalized(datetime dt): return True -def apply_wrapper_core(func, self, other) -> ndarray: - result = func(self, other) - result = np.asarray(result) - - if self.normalize: - # TODO: Avoid circular/runtime import - from .vectorized import normalize_i8_timestamps - reso = get_unit_from_dtype(other.dtype) - result = normalize_i8_timestamps(result.view("i8"), None, reso=reso) - - return result - - -def apply_array_wraps(func): - # Note: normally we would use `@functools.wraps(func)`, but this does - # not play nicely with cython class methods - def wrapper(self, other) -> np.ndarray: - # other is a DatetimeArray - result = apply_wrapper_core(func, self, other) - return result - - # do @functools.wraps(func) manually since it doesn't work on cdef funcs - wrapper.__name__ = func.__name__ - wrapper.__doc__ = func.__doc__ - return wrapper - - def apply_wraps(func): # Note: normally we would use `@functools.wraps(func)`, but this does # not play nicely with cython class methods @@ -644,8 +617,9 @@ cdef class BaseOffset: def _apply(self, other): raise NotImplementedError("implemented by subclasses") - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: + # NB: _apply_array does not handle respecting `self.normalize`, the + # caller (DatetimeArray) handles that in post-processing. raise NotImplementedError( f"DateOffset subclass {type(self).__name__} " "does not have a vectorized implementation" @@ -1399,8 +1373,7 @@ cdef class RelativeDeltaOffset(BaseOffset): "applied vectorized" ) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: reso = get_unit_from_dtype(dtarr.dtype) dt64other = np.asarray(dtarr) @@ -1814,8 +1787,7 @@ cdef class BusinessDay(BusinessMixin): days = n + 2 return days - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: i8other = dtarr.view("i8") reso = get_unit_from_dtype(dtarr.dtype) res = self._shift_bdays(i8other, reso=reso) @@ -2361,8 +2333,7 @@ cdef class YearOffset(SingleConstructorOffset): months = years * 12 + (self.month - other.month) return shift_month(other, months, self._day_opt) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: reso = get_unit_from_dtype(dtarr.dtype) shifted = shift_quarters( dtarr.view("i8"), self.n, self.month, self._day_opt, modby=12, reso=reso @@ -2613,8 +2584,7 @@ cdef class QuarterOffset(SingleConstructorOffset): months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: reso = get_unit_from_dtype(dtarr.dtype) shifted = shift_quarters( dtarr.view("i8"), @@ -2798,8 +2768,7 @@ cdef class MonthOffset(SingleConstructorOffset): n = roll_convention(other.day, self.n, compare_day) return shift_month(other, n, self._day_opt) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: reso = get_unit_from_dtype(dtarr.dtype) shifted = shift_months(dtarr.view("i8"), self.n, self._day_opt, reso=reso) return shifted @@ -3029,10 +2998,9 @@ cdef class SemiMonthOffset(SingleConstructorOffset): return 
shift_month(other, months, to_day) - @apply_array_wraps @cython.wraparound(False) @cython.boundscheck(False) - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: cdef: ndarray i8other = dtarr.view("i8") Py_ssize_t i, count = dtarr.size @@ -3254,8 +3222,7 @@ cdef class Week(SingleConstructorOffset): return other + timedelta(weeks=k) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: if self.weekday is None: td = timedelta(days=7 * self.n) unit = np.datetime_data(dtarr.dtype)[0] @@ -4735,6 +4702,9 @@ cpdef to_offset(freq, bint is_period=False): f"for Period, please use \'Y{name[2:]}\' " f"instead of \'{name}\'" ) + if (name.startswith("B") or + name.startswith("S") or name.startswith("C")): + raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) else: raise ValueError( f"for Period, please use " diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d8926d14ae7e5..5616bc17045ad 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -47,7 +47,10 @@ from numpy cimport ( ) from pandas._libs.missing cimport checknull_with_nat_and_na -from pandas._libs.tslibs.conversion cimport get_datetime64_nanos +from pandas._libs.tslibs.conversion cimport ( + get_datetime64_nanos, + parse_pydatetime, +) from pandas._libs.tslibs.dtypes cimport ( get_supported_reso, npy_unit_to_abbrev, @@ -55,6 +58,7 @@ from pandas._libs.tslibs.dtypes cimport ( ) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, + c_NaT as NaT, c_nat_strings as nat_strings, ) from pandas._libs.tslibs.np_datetime cimport ( @@ -65,7 +69,6 @@ from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, npy_datetimestruct_to_datetime, pydate_to_dt64, - pydatetime_to_dt64, string_to_dts, ) @@ -82,6 +85,8 @@ from pandas._libs.util cimport ( from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single + cnp.import_array() @@ -314,11 +319,13 @@ def array_strptime( Py_ssize_t i, n = len(values) npy_datetimestruct dts int64_t[::1] iresult - object[::1] result_timezone object val, tz + bint seen_datetime_offset = False bint is_raise = errors=="raise" bint is_ignore = errors=="ignore" bint is_coerce = errors=="coerce" + bint is_same_offsets + set out_tzoffset_vals = set() tzinfo tz_out = None bint iso_format = format_is_iso(fmt) NPY_DATETIMEUNIT out_bestunit, item_reso @@ -338,7 +345,6 @@ def array_strptime( abbrev = npy_unit_to_abbrev(creso) result = np.empty(n, dtype=f"M8[{abbrev}]") iresult = result.view("i8") - result_timezone = np.empty(n, dtype="object") dts.us = dts.ps = dts.as = 0 @@ -361,16 +367,10 @@ def array_strptime( if infer_reso: creso = state.creso tz_out = state.process_datetime(val, tz_out, utc) - if isinstance(val, _Timestamp): - val = (<_Timestamp>val)._as_creso(creso) - iresult[i] = val.tz_localize(None)._value - else: - iresult[i] = pydatetime_to_dt64( - val.replace(tzinfo=None), &dts, reso=creso - ) - result_timezone[i] = val.tzinfo + iresult[i] = parse_pydatetime(val, &dts, state.creso) continue elif PyDate_Check(val): + state.found_other = True item_reso = NPY_DATETIMEUNIT.NPY_FR_s state.update_creso(item_reso) if infer_reso: @@ -378,6 +378,7 @@ def array_strptime( iresult[i] = pydate_to_dt64(val, &dts, reso=creso) continue elif cnp.is_datetime64_object(val): + state.found_other = True item_reso = get_supported_reso(get_datetime64_unit(val)) state.update_creso(item_reso) if infer_reso: @@ -418,13 
+419,17 @@ def array_strptime( f"Out of bounds {attrname} timestamp: {val}" ) from err if out_local == 1: - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects + nsecs = out_tzoffset * 60 + out_tzoffset_vals.add(nsecs) + seen_datetime_offset = True tz = timezone(timedelta(minutes=out_tzoffset)) - result_timezone[i] = tz - out_local = 0 - out_tzoffset = 0 + value = tz_localize_to_utc_single( + value, tz, ambiguous="raise", nonexistent=None, creso=creso + ) + else: + tz = None + out_tzoffset_vals.add("naive") + state.found_naive_str = True iresult[i] = value continue @@ -450,6 +455,7 @@ def array_strptime( state.update_creso(item_reso) if infer_reso: creso = state.creso + try: iresult[i] = npy_datetimestruct_to_datetime(creso, &dts) except OverflowError as err: @@ -457,7 +463,26 @@ def array_strptime( raise OutOfBoundsDatetime( f"Out of bounds {attrname} timestamp: {val}" ) from err - result_timezone[i] = tz + + if tz is not None: + ival = iresult[i] + iresult[i] = tz_localize_to_utc_single( + ival, tz, ambiguous="raise", nonexistent=None, creso=creso + ) + nsecs = (ival - iresult[i]) + if creso == NPY_FR_ns: + nsecs = nsecs // 10**9 + elif creso == NPY_DATETIMEUNIT.NPY_FR_us: + nsecs = nsecs // 10**6 + elif creso == NPY_DATETIMEUNIT.NPY_FR_ms: + nsecs = nsecs // 10**3 + + out_tzoffset_vals.add(nsecs) + seen_datetime_offset = True + else: + state.found_naive_str = True + tz = None + out_tzoffset_vals.add("naive") except (ValueError, OutOfBoundsDatetime) as ex: ex.args = ( @@ -474,7 +499,37 @@ def array_strptime( continue elif is_raise: raise - return values, [] + return values, None + + if seen_datetime_offset and not utc: + is_same_offsets = len(out_tzoffset_vals) == 1 + if not is_same_offsets or (state.found_naive or state.found_other): + result2 = _array_strptime_object_fallback( + values, fmt=fmt, exact=exact, errors=errors, utc=utc + ) + return result2, None + elif tz_out is not None: + # GH#55693 + tz_offset = out_tzoffset_vals.pop() + tz_out2 = timezone(timedelta(seconds=tz_offset)) + if not tz_compare(tz_out, tz_out2): + # e.g. test_to_datetime_mixed_offsets_with_utc_false_deprecated + result2 = _array_strptime_object_fallback( + values, fmt=fmt, exact=exact, errors=errors, utc=utc + ) + return result2, None + # e.g. test_guess_datetime_format_with_parseable_formats + else: + # e.g. test_to_datetime_iso8601_with_timezone_valid + tz_offset = out_tzoffset_vals.pop() + tz_out = timezone(timedelta(seconds=tz_offset)) + elif not utc: + if tz_out and (state.found_other or state.found_naive_str): + # found_other indicates a tz-naive int, float, dt64, or date + result2 = _array_strptime_object_fallback( + values, fmt=fmt, exact=exact, errors=errors, utc=utc + ) + return result2, None if infer_reso: if state.creso_ever_changed: @@ -488,12 +543,17 @@ def array_strptime( utc=utc, creso=state.creso, ) - - # Otherwise we can use the single reso that we encountered and avoid - # a second pass. - abbrev = npy_unit_to_abbrev(state.creso) - result = iresult.base.view(f"M8[{abbrev}]") - return result, result_timezone.base + elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = iresult.base.view("M8[s]") + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. 
+ abbrev = npy_unit_to_abbrev(state.creso) + result = iresult.base.view(f"M8[{abbrev}]") + return result, tz_out cdef tzinfo _parse_with_format( @@ -633,7 +693,7 @@ cdef tzinfo _parse_with_format( item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns # Pad to always return nanoseconds s += "0" * (9 - len(s)) - us = long(s) + us = int(s) ns = us % 1000 us = us // 1000 elif parse_code == 11: @@ -731,6 +791,157 @@ cdef tzinfo _parse_with_format( return tz +def _array_strptime_object_fallback( + ndarray[object] values, + str fmt, + bint exact=True, + errors="raise", + bint utc=False, +): + + cdef: + Py_ssize_t i, n = len(values) + npy_datetimestruct dts + int64_t iresult + object val + tzinfo tz + bint is_raise = errors=="raise" + bint is_ignore = errors=="ignore" + bint is_coerce = errors=="coerce" + bint iso_format = format_is_iso(fmt) + NPY_DATETIMEUNIT creso, out_bestunit, item_reso + int out_local = 0, out_tzoffset = 0 + bint string_to_dts_succeeded = 0 + + assert is_raise or is_ignore or is_coerce + + item_reso = NPY_DATETIMEUNIT.NPY_FR_GENERIC + format_regex, locale_time = _get_format_regex(fmt) + + result = np.empty(n, dtype=object) + + dts.us = dts.ps = dts.as = 0 + + for i in range(n): + val = values[i] + try: + if isinstance(val, str): + if len(val) == 0 or val in nat_strings: + result[i] = NaT + continue + elif checknull_with_nat_and_na(val): + result[i] = NaT + continue + elif PyDateTime_Check(val): + result[i] = Timestamp(val) + continue + elif PyDate_Check(val): + result[i] = Timestamp(val) + continue + elif cnp.is_datetime64_object(val): + result[i] = Timestamp(val) + continue + elif ( + (is_integer_object(val) or is_float_object(val)) + and (val != val or val == NPY_NAT) + ): + result[i] = NaT + continue + else: + val = str(val) + + if fmt == "ISO8601": + string_to_dts_succeeded = not string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, None, False + ) + elif iso_format: + string_to_dts_succeeded = not string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, fmt, exact + ) + if string_to_dts_succeeded: + # No error reported by string_to_dts, pick back up + # where we left off + creso = get_supported_reso(out_bestunit) + try: + value = npy_datetimestruct_to_datetime(creso, &dts) + except OverflowError as err: + raise OutOfBoundsDatetime( + f"Out of bounds nanosecond timestamp: {val}" + ) from err + if out_local == 1: + tz = timezone(timedelta(minutes=out_tzoffset)) + value = tz_localize_to_utc_single( + value, tz, ambiguous="raise", nonexistent=None, creso=creso + ) + else: + tz = None + ts = Timestamp._from_value_and_reso(value, creso, tz) + result[i] = ts + continue + + if parse_today_now(val, &iresult, utc, NPY_FR_ns): + result[i] = Timestamp(val) + continue + + # Some ISO formats can't be parsed by string_to_dts + # For example, 6-digit YYYYMD. So, if there's an error, and a format + # was specified, then try the string-matching code below. If the format + # specified was 'ISO8601', then we need to error, because + # only string_to_dts handles mixed ISO8601 formats. 
+ if not string_to_dts_succeeded and fmt == "ISO8601": + raise ValueError(f"Time data {val} is not ISO8601 format") + + tz = _parse_with_format( + val, fmt, exact, format_regex, locale_time, &dts, &item_reso + ) + try: + iresult = npy_datetimestruct_to_datetime(item_reso, &dts) + except OverflowError as err: + raise OutOfBoundsDatetime( + f"Out of bounds nanosecond timestamp: {val}" + ) from err + if tz is not None: + iresult = tz_localize_to_utc_single( + iresult, tz, ambiguous="raise", nonexistent=None, creso=item_reso + ) + ts = Timestamp._from_value_and_reso(iresult, item_reso, tz) + result[i] = ts + + except (ValueError, OutOfBoundsDatetime) as ex: + ex.args = ( + f"{str(ex)}, at position {i}. You might want to try:\n" + " - passing `format` if your strings have a consistent format;\n" + " - passing `format='ISO8601'` if your strings are " + "all ISO8601 but not necessarily in exactly the same format;\n" + " - passing `format='mixed'`, and the format will be " + "inferred for each element individually. " + "You might want to use `dayfirst` alongside this.", + ) + if is_coerce: + result[i] = NaT + continue + elif is_raise: + raise + return values + + import warnings + + from pandas.util._exceptions import find_stack_level + warnings.warn( + "In a future version of pandas, parsing datetimes with mixed time " + "zones will raise an error unless `utc=True`. Please specify `utc=True` " + "to opt in to the new behaviour and silence this warning. " + "To create a `Series` with mixed offsets and `object` dtype, " + "please use `apply` and `datetime.datetime.strptime`", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return result + + class TimeRE(_TimeRE): """ Handle conversion from format directives to regexes. diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 832919db442d4..a74fb2bf48bc4 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -43,7 +43,6 @@ DataFrame, DatetimeIndex, Index, - IntervalIndex, MultiIndex, RangeIndex, Series, @@ -106,8 +105,6 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: - from collections.abc import Iterable - from pandas._typing import ( Dtype, Frequency, @@ -291,24 +288,10 @@ comparison_dunder_methods = ["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"] -def reset_display_options() -> None: - """ - Reset the display options for printing and representing objects. - """ - pd.reset_option("^display.", silent=True) - - # ----------------------------------------------------------------------------- # Comparators -def equalContents(arr1, arr2) -> bool: - """ - Checks if the set of unique elements of arr1 and arr2 are equivalent. - """ - return frozenset(arr1) == frozenset(arr2) - - def box_expected(expected, box_cls, transpose: bool = True): """ Helper function to wrap the expected output of a test in a given box_class. 
@@ -402,12 +385,6 @@ def makeCategoricalIndex( ) -def makeIntervalIndex(k: int = 10, name=None, **kwargs) -> IntervalIndex: - """make a length k IntervalIndex""" - x = np.linspace(0, 100, num=(k + 1)) - return IntervalIndex.from_breaks(x, name=name, **kwargs) - - def makeBoolIndex(k: int = 10, name=None) -> Index: if k == 1: return Index([True], name=name) @@ -479,62 +456,6 @@ def makePeriodIndex(k: int = 10, name=None, **kwargs) -> PeriodIndex: return pi -def makeMultiIndex(k: int = 10, names=None, **kwargs): - N = (k // 2) + 1 - rng = range(N) - mi = MultiIndex.from_product([("foo", "bar"), rng], names=names, **kwargs) - assert len(mi) >= k # GH#38795 - return mi[:k] - - -def index_subclass_makers_generator(): - make_index_funcs = [ - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - makeMultiIndex, - ] - yield from make_index_funcs - - -def all_timeseries_index_generator(k: int = 10) -> Iterable[Index]: - """ - Generator which can be iterated over to get instances of all the classes - which represent time-series. - - Parameters - ---------- - k: length of each of the index instances - """ - make_index_funcs: list[Callable[..., Index]] = [ - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - ] - for make_index_func in make_index_funcs: - yield make_index_func(k=k) - - -# make series -def make_rand_series(name=None, dtype=np.float64) -> Series: - index = makeStringIndex(_N) - data = np.random.default_rng(2).standard_normal(_N) - with np.errstate(invalid="ignore"): - data = data.astype(dtype, copy=False) - return Series(data, index=index, name=name) - - -def makeFloatSeries(name=None) -> Series: - return make_rand_series(name=name) - - -def makeStringSeries(name=None) -> Series: - return make_rand_series(name=name) - - def makeObjectSeries(name=None) -> Series: data = makeStringIndex(_N) data = Index(data, dtype=object) @@ -560,24 +481,10 @@ def makeTimeSeries(nper=None, freq: Frequency = "B", name=None) -> Series: ) -def makePeriodSeries(nper=None, name=None) -> Series: - if nper is None: - nper = _N - return Series( - np.random.default_rng(2).standard_normal(nper), - index=makePeriodIndex(nper), - name=name, - ) - - def getTimeSeriesData(nper=None, freq: Frequency = "B") -> dict[str, Series]: return {c: makeTimeSeries(nper, freq) for c in getCols(_K)} -def getPeriodData(nper=None) -> dict[str, Series]: - return {c: makePeriodSeries(nper) for c in getCols(_K)} - - # make frame def makeTimeDataFrame(nper=None, freq: Frequency = "B") -> DataFrame: data = getTimeSeriesData(nper, freq) @@ -606,11 +513,6 @@ def makeMixedDataFrame() -> DataFrame: return DataFrame(getMixedTypeDict()[1]) -def makePeriodFrame(nper=None) -> DataFrame: - data = getPeriodData(nper) - return DataFrame(data) - - def makeCustomIndex( nentries, nlevels, @@ -1094,7 +996,6 @@ def shares_memory(left, right) -> bool: "ALL_INT_NUMPY_DTYPES", "ALL_NUMPY_DTYPES", "ALL_REAL_NUMPY_DTYPES", - "all_timeseries_index_generator", "assert_almost_equal", "assert_attr_equal", "assert_categorical_equal", @@ -1131,7 +1032,6 @@ def shares_memory(left, right) -> bool: "EMPTY_STRING_PATTERN", "ENDIAN", "ensure_clean", - "equalContents", "external_error_raised", "FLOAT_EA_DTYPES", "FLOAT_NUMPY_DTYPES", @@ -1144,12 +1044,10 @@ def shares_memory(left, right) -> bool: "get_finest_unit", "get_obj", "get_op_from_name", - "getPeriodData", "getSeriesData", "getTimeSeriesData", "iat", "iloc", - "index_subclass_makers_generator", "loc", "makeBoolIndex", 
"makeCategoricalIndex", @@ -1158,20 +1056,13 @@ def shares_memory(left, right) -> bool: "makeDataFrame", "makeDateIndex", "makeFloatIndex", - "makeFloatSeries", - "makeIntervalIndex", "makeIntIndex", "makeMixedDataFrame", - "makeMultiIndex", "makeNumericIndex", "makeObjectSeries", - "makePeriodFrame", "makePeriodIndex", - "makePeriodSeries", - "make_rand_series", "makeRangeIndex", "makeStringIndex", - "makeStringSeries", "makeTimeDataFrame", "makeTimedeltaIndex", "makeTimeSeries", @@ -1182,7 +1073,6 @@ def shares_memory(left, right) -> bool: "NULL_OBJECTS", "OBJECT_DTYPES", "raise_assert_detail", - "reset_display_options", "raises_chained_assignment_error", "round_trip_localpath", "round_trip_pathlib", diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 4b087c75f5d74..f11dc11f6ac0d 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -14,6 +14,8 @@ ) import warnings +from pandas.compat import PY311 + if TYPE_CHECKING: from collections.abc import ( Generator, @@ -180,6 +182,11 @@ def _assert_caught_no_extra_warnings( # due to these open files. if any("matplotlib" in mod for mod in sys.modules): continue + if PY311 and actual_warning.category == EncodingWarning: + # EncodingWarnings are checked in the CI + # pyproject.toml errors on EncodingWarnings in pandas + # Ignore EncodingWarnings from other libraries + continue extra_warnings.append( ( actual_warning.category.__name__, diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c45c2ff6eb6df..1354c0a6db7c5 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,6 +15,8 @@ # Update install.rst & setup.cfg when updating versions! VERSIONS = { + "adbc-driver-postgresql": "0.8.0", + "adbc-driver-sqlite": "0.8.0", "bs4": "4.11.2", "blosc": "1.21.3", "bottleneck": "1.3.6", @@ -50,7 +52,7 @@ "zstandard": "0.19.0", "tzdata": "2022.7", "qtpy": "2.3.0", - "pyqt5": "5.15.8", + "pyqt5": "5.15.9", } # A mapping from import name to package name (on PyPI) for packages where diff --git a/pandas/conftest.py b/pandas/conftest.py index b24606f8007d2..350871c3085c1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -61,6 +61,7 @@ from pandas import ( DataFrame, Interval, + IntervalIndex, Period, Series, Timedelta, @@ -630,7 +631,7 @@ def _create_mi_with_dt64tz_level(): "complex64": tm.makeNumericIndex(100, dtype="float64").astype("complex64"), "complex128": tm.makeNumericIndex(100, dtype="float64").astype("complex128"), "categorical": tm.makeCategoricalIndex(100), - "interval": tm.makeIntervalIndex(100), + "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=101)), "empty": Index([]), "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), @@ -728,9 +729,11 @@ def string_series() -> Series: """ Fixture for Series of floats with Index of unique strings """ - s = tm.makeStringSeries() - s.name = "series" - return s + return Series( + np.arange(30, dtype=np.float64) * 1.1, + index=Index([f"i_{i}" for i in range(30)], dtype=object), + name="series", + ) @pytest.fixture @@ -775,7 +778,9 @@ def series_with_simple_index(index) -> Series: _narrow_series = { - f"{dtype.__name__}-series": tm.make_rand_series(name="a", dtype=dtype) + f"{dtype.__name__}-series": Series( + range(30), index=[f"i-{i}" for i in range(30)], name="a", dtype=dtype + ) for dtype in tm.NARROW_NP_DTYPES } diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b38a27a9f6d0a..82de8ae96160f 100644 
--- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -932,6 +932,16 @@ def value_counts_internal( idx = Index(keys) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) + elif idx.dtype != keys.dtype: + warnings.warn( + # GH#56161 + "The behavior of value_counts with object-dtype is deprecated. " + "In a future version, this will *not* perform dtype inference " + "on the resulting index. To retain the old behavior, use " + "`result.index = result.index.infer_objects()`", + FutureWarning, + stacklevel=find_stack_level(), + ) idx.name = index_name result = Series(counts, index=idx, name=name, copy=False) @@ -1712,8 +1722,16 @@ def union_with_duplicates( """ from pandas import Series - l_count = value_counts_internal(lvals, dropna=False) - r_count = value_counts_internal(rvals, dropna=False) + with warnings.catch_warnings(): + # filter warning from object dtype inference; we will end up discarding + # the index here, so the deprecation does not affect the end result here. + warnings.filterwarnings( + "ignore", + "The behavior of value_counts with object-dtype is deprecated", + category=FutureWarning, + ) + l_count = value_counts_internal(lvals, dropna=False) + r_count = value_counts_internal(rvals, dropna=False) l_count, r_count = l_count.align(r_count, fill_value=0) final_count = np.maximum(l_count.values, r_count.values) final_count = Series(final_count, index=l_count.index, dtype="int", copy=False) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4bcc03643dac8..d162b66e5d369 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -553,6 +553,18 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. + if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 + if item.start == item.stop: + pass + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): + item = slice(item.start, None, item.step) + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f0b9219682350..a88f40013b3f6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2483,11 +2483,6 @@ def _validate_inferred_freq( Returns ------- freq : DateOffset or None - - Notes - ----- - We assume at this point that `maybe_infer_freq` has been called, so - `freq` is either a DateOffset object or None. """ if inferred_freq is not None: if freq is not None and freq != inferred_freq: @@ -2502,34 +2497,6 @@ def _validate_inferred_freq( return freq -def maybe_infer_freq(freq) -> tuple[BaseOffset | None, bool]: - """ - Comparing a DateOffset to the string "infer" raises, so we need to - be careful about comparisons. Make a dummy variable `freq_infer` to - signify the case where the given freq is "infer" and set freq to None - to avoid comparison trouble later on. - - Parameters - ---------- - freq : {DateOffset, None, str} - - Returns - ------- - freq : {DateOffset, None} - freq_infer : bool - Whether we should inherit the freq of passed data. 
- """ - freq_infer = False - if not isinstance(freq, BaseOffset): - # if a passed freq is None, don't infer automatically - if freq != "infer": - freq = to_offset(freq) - else: - freq_infer = True - freq = None - return freq, freq_infer - - def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: """ Return the unit str corresponding to the dtype's resolution. diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b76ad5268e76e..de5832ba31b70 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -90,6 +90,9 @@ from pandas.core.arrays import PeriodArray +_ITER_CHUNKSIZE = 10_000 + + def tz_to_dtype( tz: tzinfo | None, unit: str = "ns" ) -> np.dtype[np.datetime64] | DatetimeTZDtype: @@ -654,7 +657,7 @@ def __iter__(self) -> Iterator: # convert in chunks of 10k for efficiency data = self.asi8 length = len(self) - chunksize = 10000 + chunksize = _ITER_CHUNKSIZE chunks = (length // chunksize) + 1 for i in range(chunks): @@ -787,7 +790,7 @@ def _assert_tzawareness_compat(self, other) -> None: # ----------------------------------------------------------------- # Arithmetic Methods - def _add_offset(self, offset) -> Self: + def _add_offset(self, offset: BaseOffset) -> Self: assert not isinstance(offset, Tick) if self.tz is not None: @@ -796,24 +799,31 @@ def _add_offset(self, offset) -> Self: values = self try: - result = offset._apply_array(values) - if result.dtype.kind == "i": - result = result.view(values.dtype) + res_values = offset._apply_array(values._ndarray) + if res_values.dtype.kind == "i": + # error: Argument 1 to "view" of "ndarray" has incompatible type + # "dtype[datetime64] | DatetimeTZDtype"; expected + # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]" + res_values = res_values.view(values.dtype) # type: ignore[arg-type] except NotImplementedError: warnings.warn( "Non-vectorized DateOffset being applied to Series or DatetimeIndex.", PerformanceWarning, stacklevel=find_stack_level(), ) - result = self.astype("O") + offset + res_values = self.astype("O") + offset # TODO(GH#55564): as_unit will be unnecessary - result = type(self)._from_sequence(result).as_unit(self.unit) + result = type(self)._from_sequence(res_values).as_unit(self.unit) if not len(self): # GH#30336 _from_sequence won't be able to infer self.tz return result.tz_localize(self.tz) else: - result = type(self)._simple_new(result, dtype=result.dtype) + result = type(self)._simple_new(res_values, dtype=res_values.dtype) + if offset.normalize: + result = result.normalize() + result._freq = None + if self.tz is not None: result = result.tz_localize(self.tz) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2f1363f180d08..126484ed4a2a0 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -12,6 +12,7 @@ Union, overload, ) +import warnings import numpy as np @@ -1226,7 +1227,16 @@ def value_counts(self, dropna: bool = True) -> Series: Series.value_counts """ # TODO: implement this is a non-naive way! 
- return value_counts(np.asarray(self), dropna=dropna) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "The behavior of value_counts with object-dtype is deprecated", + category=FutureWarning, + ) + result = value_counts(np.asarray(self), dropna=dropna) + # Once the deprecation is enforced, we will need to do + # `result.index = result.index.astype(self.dtype)` + return result # --------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1c2ecd4684e2f..96ebb4901f797 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -527,6 +527,13 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return self._convert_int_dtype(result) + def _str_get_dummies(self, sep: str = "|"): + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + if len(labels) == 0: + return np.empty(shape=(0, 0), dtype=np.int64), labels + dummies = np.vstack(dummies_pa.to_numpy()) + return dummies.astype(np.int64, copy=False), labels + def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 226e2568fdbf8..f55d3de8878ad 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -6,7 +6,6 @@ TYPE_CHECKING, cast, ) -import warnings import numpy as np @@ -27,8 +26,7 @@ npy_unit_to_abbrev, periods_per_second, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.fields import ( get_timedelta_days, get_timedelta_field, @@ -1059,23 +1057,10 @@ def sequence_to_td64ns( data = data._data else: mask = np.isnan(data) - # The next few lines are effectively a vectorized 'cast_from_unit' - m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns")) - with warnings.catch_warnings(): - # Suppress RuntimeWarning about All-NaN slice - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - base = data.astype(np.int64) - frac = data - base - if p: - frac = np.round(frac, p) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") + + data = cast_from_unit_vectorized(data, unit or "ns") data[mask] = iNaT + data = data.view("m8[ns]") copy = False elif lib.is_np_dtype(data.dtype, "m"): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0ad652bd5aa64..792fa4fdc24a5 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -361,15 +361,11 @@ def trans(x): # if we don't have any elements, just astype it return trans(result).astype(dtype) - # do a test on the first element, if it fails then we are done - r = result.ravel() - arr = np.array([r[0]]) - - if isna(arr).any(): - # if we have any nulls, then we are done - return result - - elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)): + if isinstance(result, np.ndarray): + element = result.item(0) + else: + element = result.iloc[0] + if not isinstance(element, (np.integer, np.floating, int, float, bool)): # a comparable, e.g. 
a Decimal may slip in here return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9f16340d98e22..5d05983529fba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4375,11 +4375,15 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: return self.isetitem(locs, value) - if len(value.columns) != 1: + if len(value.columns) > 1: raise ValueError( "Cannot set a DataFrame with multiple columns to the single " f"column {key}" ) + elif len(value.columns) == 0: + raise ValueError( + f"Cannot set a DataFrame without columns to the column {key}" + ) self[key] = value[value.columns[0]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2f53ff68fc3e0..66113fc54c169 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6451,6 +6451,18 @@ def astype( Return a copy when ``copy=True`` (be very careful setting ``copy=False`` as changes to values then may propagate to other pandas objects). + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` errors : {'raise', 'ignore'}, default 'raise' Control raising of exceptions on invalid data for provided dtype. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1fb412e17c4ba..55ab4c2113fd7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -819,9 +819,9 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - codes = self.grouper.reconstructed_codes + codes = self.grouper._reconstructed_codes codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + levels = [ping._group_index for ping in self.grouper.groupings] + [lev] if dropna: mask = codes[-1] != -1 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2d530b64e104b..7d284db4eba2c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2820,7 +2820,7 @@ def _value_counts( and not grouping._observed for grouping in groupings ): - levels_list = [ping.result_index for ping in groupings] + levels_list = [ping._result_index for ping in groupings] multi_index = MultiIndex.from_product( levels_list, names=[ping.name for ping in groupings] ) @@ -5573,7 +5573,7 @@ def _reindex_output( ): return output - levels_list = [ping.group_index for ping in groupings] + levels_list = [ping._group_index for ping in groupings] names = self.grouper.names if qs is not None: # error: Argument 1 to "append" of "list" has incompatible type @@ -5795,7 +5795,7 @@ def _idxmax_idxmin( ping._passed_categorical for ping in self.grouper.groupings ): expected_len = np.prod( - [len(ping.group_index) for ping in self.grouper.groupings] + [len(ping._group_index) for ping in self.grouper.groupings] ) if len(self.grouper.groupings) == 1: result_len = len(self.grouper.groupings[0].grouping_vector.unique()) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fd0479e17d2bd..fc914831b7a72 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -523,7 +523,6 @@ 
class Grouping: """ _codes: npt.NDArray[np.signedinteger] | None = None - _group_index: Index | None = None _all_grouper: Categorical | None _orig_cats: Index | None _index: Index @@ -679,7 +678,7 @@ def _ilevel(self) -> int | None: @property def ngroups(self) -> int: - return len(self.group_index) + return len(self._group_index) @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @@ -695,34 +694,58 @@ def codes(self) -> npt.NDArray[np.signedinteger]: return self._codes_and_uniques[0] @cache_readonly - def group_arraylike(self) -> ArrayLike: + def _group_arraylike(self) -> ArrayLike: """ Analogous to result_index, but holding an ArrayLike to ensure we can retain ExtensionDtypes. """ if self._all_grouper is not None: # retain dtype for categories, including unobserved ones - return self.result_index._values + return self._result_index._values elif self._passed_categorical: - return self.group_index._values + return self._group_index._values return self._codes_and_uniques[1] + @property + def group_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can retain ExtensionDtypes. + """ + warnings.warn( + "group_arraylike is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._group_arraylike + @cache_readonly - def result_index(self) -> Index: + def _result_index(self) -> Index: # result_index retains dtype for categories, including unobserved ones, # which group_index does not if self._all_grouper is not None: - group_idx = self.group_index + group_idx = self._group_index assert isinstance(group_idx, CategoricalIndex) cats = self._orig_cats # set_categories is dynamically added return group_idx.set_categories(cats) # type: ignore[attr-defined] - return self.group_index + return self._group_index + + @property + def result_index(self) -> Index: + warnings.warn( + "result_index is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._result_index @cache_readonly - def group_index(self) -> Index: + def _group_index(self) -> Index: codes, uniques = self._codes_and_uniques if not self._dropna and self._passed_categorical: assert isinstance(uniques, Categorical) @@ -744,6 +767,16 @@ def group_index(self) -> Index: ) return Index._with_infer(uniques, name=self.name) + @property + def group_index(self) -> Index: + warnings.warn( + "group_index is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._group_index + @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques: ArrayLike @@ -809,7 +842,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: - cats = Categorical.from_codes(self.codes, self.group_index, validate=False) + cats = Categorical.from_codes(self.codes, self._group_index, validate=False) return self._index.groupby(cats) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 466bbac641077..f3579e6c13a19 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -15,6 +15,7 @@ Generic, final, ) +import warnings import numpy as np @@ -32,6 +33,7 @@ ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from 
pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( @@ -616,7 +618,7 @@ def get_iterator( for each group """ splitter = self._get_splitter(data, axis=axis) - keys = self.group_keys_seq + keys = self._group_keys_seq yield from zip(keys, splitter) @final @@ -638,7 +640,7 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter: @final @cache_readonly - def group_keys_seq(self): + def _group_keys_seq(self): if len(self.groupings) == 1: return self.levels[0] else: @@ -647,6 +649,16 @@ def group_keys_seq(self): # provide "flattened" iterator for multi-group setting return get_flattened_list(ids, ngroups, self.levels, self.codes) + @property + def group_keys_seq(self): + warnings.warn( + "group_keys_seq is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._group_keys_seq + @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" @@ -654,7 +666,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: # This shows unused categories in indices GH#38642 return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] + keys = [ping._group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) @final @@ -691,7 +703,7 @@ def codes(self) -> list[npt.NDArray[np.signedinteger]]: @property def levels(self) -> list[Index]: - return [ping.group_index for ping in self.groupings] + return [ping._group_index for ping in self.groupings] @property def names(self) -> list[Hashable]: @@ -766,7 +778,7 @@ def _get_compressed_codes( # FIXME: compress_group_index's second return value is int64, not intp ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) + return ping.codes, np.arange(len(ping._group_index), dtype=np.intp) @final @cache_readonly @@ -774,18 +786,28 @@ def ngroups(self) -> int: return len(self.result_index) @property - def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: + def _reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: codes = self.codes ids, obs_ids, _ = self.group_info return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + @property + def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]: + warnings.warn( + "reconstructed_codes is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._reconstructed_codes + @cache_readonly def result_index(self) -> Index: if len(self.groupings) == 1: - return self.groupings[0].result_index.rename(self.names[0]) + return self.groupings[0]._result_index.rename(self.names[0]) - codes = self.reconstructed_codes - levels = [ping.result_index for ping in self.groupings] + codes = self._reconstructed_codes + levels = [ping._result_index for ping in self.groupings] return MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names ) @@ -795,12 +817,12 @@ def get_group_levels(self) -> list[ArrayLike]: # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: - return [self.groupings[0].group_arraylike] + return [self.groupings[0]._group_arraylike] name_list = [] - for ping, codes in zip(self.groupings, self.reconstructed_codes): + for ping, 
codes in zip(self.groupings, self._reconstructed_codes): codes = ensure_platform_int(codes) - levels = ping.group_arraylike.take(codes) + levels = ping._group_arraylike.take(codes) name_list.append(levels) @@ -907,7 +929,7 @@ def apply_groupwise( ) -> tuple[list, bool]: mutated = False splitter = self._get_splitter(data, axis=axis) - group_keys = self.group_keys_seq + group_keys = self._group_keys_seq result_values = [] # This calls DataSplitter.__iter__ @@ -1087,7 +1109,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: ) @cache_readonly - def reconstructed_codes(self) -> list[np.ndarray]: + def _reconstructed_codes(self) -> list[np.ndarray]: # get unique result indices, and prepend 0 as groupby starts from the first return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index cbf7dc5ba67d2..ac4d2976593a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5825,6 +5825,19 @@ def sort_values( >>> idx.sort_values(ascending=False, return_indexer=True) (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ + if key is None and ( + self.is_monotonic_increasing or self.is_monotonic_decreasing + ): + reverse = ascending != self.is_monotonic_increasing + sorted_index = self[::-1] if reverse else self.copy() + if return_indexer: + indexer = np.arange(len(self), dtype=np.intp) + if reverse: + indexer = indexer[::-1] + return sorted_index, indexer + else: + return sorted_index + # GH 35584. Sort missing values according to na_position kwarg # ignore na_position for MultiIndex if not isinstance(self, ABCMultiIndex): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index a3e6c50b21642..264ca8aa11495 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -518,7 +518,7 @@ def shift(self, periods: int = 1, freq=None) -> Self: # appropriate timezone from `start` and `end`, so tz does not need # to be passed explicitly. 
result = self._data._generate_range( - start=start, end=end, periods=None, freq=self.freq + start=start, end=end, periods=None, freq=self.freq, unit=self.unit ) return type(self)._simple_new(result, name=self.name) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 9d8ef5b0a69cd..d33e704d24c9f 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -22,7 +22,6 @@ ) from pandas.core.dtypes.generic import ABCSeries -from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import TimedeltaArray import pandas.core.common as com from pandas.core.indexes.base import ( @@ -338,7 +337,7 @@ def timedelta_range( if freq is None and com.any_none(periods, start, end): freq = "D" - freq, _ = dtl.maybe_infer_freq(freq) + freq = to_offset(freq) tdarr = TimedeltaArray._generate_range( start, end, periods, freq, closed=closed, unit=unit ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 58b904fd31b6a..9fa6e9973291d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -259,6 +259,7 @@ def _wrap_result( fill_value=np.nan, returns_string: bool = True, returns_bool: bool = False, + dtype=None, ): from pandas import ( Index, @@ -379,29 +380,29 @@ def cons_row(x): out = out.get_level_values(0) return out else: - return Index(result, name=name) + return Index(result, name=name, dtype=dtype) else: index = self._orig.index # This is a mess. - dtype: DtypeObj | str | None + _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) if self._is_string: if is_bool_dtype(vdtype): - dtype = result.dtype + _dtype = result.dtype elif returns_string: - dtype = self._orig.dtype + _dtype = self._orig.dtype else: - dtype = vdtype - else: - dtype = vdtype + _dtype = vdtype + elif vdtype is not None: + _dtype = vdtype if expand: cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) + result = cons(result, columns=name, index=index, dtype=_dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) + result = cons(result, name=name, index=index, dtype=_dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -2317,7 +2318,8 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result) + dtype = object if self._data.dtype == "object" else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ea5e6e46f58ec..149bc2d932f0e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -26,12 +26,10 @@ Timestamp, astype_overflowsafe, get_unit_from_dtype, - iNaT, is_supported_unit, timezones as libtimezones, ) -from pandas._libs.tslibs.conversion import precision_from_unit -from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -314,56 +312,6 @@ def _convert_and_box_cache( return _box_as_indexlike(result._values, utc=False, name=name) -def _return_parsed_timezone_results( - result: np.ndarray, timezones, utc: bool, name: str -) -> 
Index: - """ - Return results from array_strptime if a %z or %Z directive was passed. - - Parameters - ---------- - result : ndarray[int64] - int64 date representations of the dates - timezones : ndarray - pytz timezone objects - utc : bool - Whether to convert/localize timestamps to UTC. - name : string, default None - Name for a DatetimeIndex - - Returns - ------- - tz_result : Index-like of parsed dates with timezone - """ - tz_results = np.empty(len(result), dtype=object) - non_na_timezones = set() - for zone in unique(timezones): - # GH#50168 looping over unique timezones is faster than - # operating pointwise. - mask = timezones == zone - dta = DatetimeArray(result[mask]).tz_localize(zone) - if utc: - if dta.tzinfo is None: - dta = dta.tz_localize("utc") - else: - dta = dta.tz_convert("utc") - else: - if not dta.isna().all(): - non_na_timezones.add(zone) - tz_results[mask] = dta - if len(non_na_timezones) > 1: - warnings.warn( - "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise an error unless `utc=True`. Please specify `utc=True` " - "to opt in to the new behaviour and silence this warning. " - "To create a `Series` with mixed offsets and `object` dtype, " - "please use `apply` and `datetime.datetime.strptime`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return Index(tz_results, name=name) - - def _convert_listlike_datetimes( arg, format: str | None, @@ -497,7 +445,8 @@ def _convert_listlike_datetimes( if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC - dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed)) + out_unit = np.datetime_data(result.dtype)[0] + dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit)) dt64_values = result.view(f"M8[{dtype.unit}]") dta = DatetimeArray._simple_new(dt64_values, dtype=dtype) return DatetimeIndex._simple_new(dta, name=name) @@ -516,11 +465,17 @@ def _array_strptime_with_fallback( """ Call array_strptime, with fallback behavior depending on 'errors'. 
""" - result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) - if any(tz is not None for tz in timezones): - return _return_parsed_timezone_results(result, timezones, utc, name) - - return _box_as_indexlike(result, utc=utc, name=name) + result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + if tz_out is not None: + dtype = DatetimeTZDtype(tz=tz_out) + dta = DatetimeArray._simple_new(result, dtype=dtype) + if utc: + dta = dta.tz_convert("UTC") + return Index(dta, name=name) + elif result.dtype != object and utc: + res = Index(result, dtype="M8[ns, UTC]", name=name) + return res + return Index(result, dtype=result.dtype, name=name) def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: @@ -551,23 +506,19 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: tz_parsed = None elif arg.dtype.kind == "f": - mult, _ = precision_from_unit(abbrev_to_npy_unit(unit)) - - mask = np.isnan(arg) | (arg == iNaT) - fvalues = (arg * mult).astype("f8", copy=False) - fvalues[mask] = 0 - - if (fvalues < Timestamp.min._value).any() or ( - fvalues > Timestamp.max._value - ).any(): - if errors != "raise": - arg = arg.astype(object) - return _to_datetime_with_unit(arg, unit, name, utc, errors) - raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - - arr = fvalues.astype("M8[ns]", copy=False) - arr[mask] = np.datetime64("NaT", "ns") - + with np.errstate(over="raise"): + try: + arr = cast_from_unit_vectorized(arg, unit=unit) + except OutOfBoundsDatetime: + if errors != "raise": + return _to_datetime_with_unit( + arg.astype(object), unit, name, utc, errors + ) + raise OutOfBoundsDatetime( + f"cannot convert input with unit '{unit}'" + ) + + arr = arr.view("M8[ns]") tz_parsed = None else: arg = arg.astype(object, copy=False) @@ -848,8 +799,8 @@ def to_datetime( to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date string), origin is set to Timestamp identified by origin. - - If a float or integer, origin is the millisecond difference - relative to 1970-01-01. + - If a float or integer, origin is the difference + (in units determined by the ``unit`` argument) relative to 1970-01-01. cache : bool, default True If :const:`True`, use a cache of unique, converted dates to apply the datetime conversion. 
        May produce significant speed-up when parsing
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index d854ea09a5335..d4b6602d9f0eb 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -32,6 +32,8 @@
 import numpy as np
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
@@ -45,7 +47,10 @@
     is_dict_like,
     is_list_like,
 )
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    DatetimeTZDtype,
+)
 from pandas.core.dtypes.missing import isna
 
 from pandas import get_option
@@ -56,6 +61,7 @@
 from pandas.core.arrays import ArrowExtensionArray
 from pandas.core.base import PandasObject
 import pandas.core.common as com
+from pandas.core.common import maybe_make_list
 from pandas.core.internals.construction import convert_object_array
 from pandas.core.tools.datetimes import to_datetime
 
@@ -186,7 +192,7 @@ def _wrap_result(
     dtype: DtypeArg | None = None,
     dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
 ):
-    """Wrap result set of query in a DataFrame."""
+    """Wrap result set of a SQLAlchemy query in a DataFrame."""
     frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend)
 
     if dtype:
@@ -200,6 +206,26 @@ def _wrap_result(
     return frame
 
 
+def _wrap_result_adbc(
+    df: DataFrame,
+    *,
+    index_col=None,
+    parse_dates=None,
+    dtype: DtypeArg | None = None,
+    dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+) -> DataFrame:
+    """Wrap result set of an ADBC query in a DataFrame."""
+    if dtype:
+        df = df.astype(dtype)
+
+    df = _parse_date_columns(df, parse_dates)
+
+    if index_col is not None:
+        df = df.set_index(index_col)
+
+    return df
+
+
 def execute(sql, con, params=None):
     """
     Execute the given SQL query using the provided connection object.
@@ -559,11 +585,12 @@
     ----------
     sql : str or SQLAlchemy Selectable (select or text object)
         SQL query to be executed or a table name.
-    con : SQLAlchemy connectable, str, or sqlite3 connection
+    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
+        ADBC provides high performance I/O with native type support, where available.
         Using SQLAlchemy makes it possible to use any DB supported by that
         library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
-        for engine disposal and connection closure for the SQLAlchemy connectable; str
-        connections are closed automatically. See
+        for engine disposal and connection closure for the ADBC connection and
+        SQLAlchemy connectable; str connections are closed automatically. See
         `here `_.
     index_col : str or list of str, optional, default: None
         Column(s) to set as index(MultiIndex).
@@ -648,6 +675,17 @@
        int_column date_column
     0           0  2012-11-10
     1           1  2010-11-12
+
+    .. versionadded:: 2.2.0
+
+    pandas now supports reading via ADBC drivers
+
+    >>> from adbc_driver_postgresql import dbapi
+    >>> with dbapi.connect('postgres:///db_name') as conn:  # doctest:+SKIP
+    ...     pd.read_sql('SELECT int_column FROM test_data', conn)
+       int_column
+    0           0
+    1           1
     """
     check_dtype_backend(dtype_backend)
 
@@ -719,8 +757,9 @@ def to_sql(
     frame : DataFrame, Series
     name : str
         Name of SQL table.
-    con : SQLAlchemy connectable(engine/connection) or database string URI
+    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
         or sqlite3 DBAPI2 connection
+        ADBC provides high performance I/O with native type support, where available.
         Using SQLAlchemy makes it possible to use any DB supported by that
         library. If a DBAPI2 object, only sqlite3 is supported.
@@ -775,7 +814,8 @@
     Notes
     -----
     The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor``
-    or SQLAlchemy connectable. The returned value may not reflect the exact number of written
+    or SQLAlchemy connectable. If using ADBC the returned rows are the result
+    of ``Cursor.adbc_ingest``. The returned value may not reflect the exact number of written
     rows as stipulated in the
     `sqlite3 `__ or
     `SQLAlchemy `__
@@ -814,7 +854,8 @@ def has_table(table_name: str, con, schema: str | None = None) -> bool:
     ----------
     table_name: string
         Name of SQL table.
-    con: SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection
+    con: ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
+        ADBC provides high performance I/O with native type support, where available.
         Using SQLAlchemy makes it possible to use any DB supported by that
         library.
         If a DBAPI2 object, only sqlite3 is supported.
@@ -856,6 +897,10 @@ def pandasSQL_builder(
     if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)):
         return SQLDatabase(con, schema, need_transaction)
 
+    adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+    if adbc and isinstance(con, adbc.Connection):
+        return ADBCDatabase(con)
+
     warnings.warn(
         "pandas only supports SQLAlchemy connectable (engine/connection) or "
         "database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 "
@@ -2034,6 +2079,356 @@ def _create_sql_schema(
 
 
 # ---- SQL without SQLAlchemy ---
+
+
+class ADBCDatabase(PandasSQL):
+    """
+    This class enables conversion between DataFrame and SQL databases
+    using ADBC to handle DataBase abstraction.
+
+    Parameters
+    ----------
+    con : adbc_driver_manager.dbapi.Connection
+    """
+
+    def __init__(self, con) -> None:
+        self.con = con
+
+    @contextmanager
+    def run_transaction(self):
+        with self.con.cursor() as cur:
+            try:
+                yield cur
+            except Exception:
+                self.con.rollback()
+                raise
+            self.con.commit()
+
+    def execute(self, sql: str | Select | TextClause, params=None):
+        if not isinstance(sql, str):
+            raise TypeError("Query must be a string unless using sqlalchemy.")
+        args = [] if params is None else [params]
+        cur = self.con.cursor()
+        try:
+            cur.execute(sql, *args)
+            return cur
+        except Exception as exc:
+            try:
+                self.con.rollback()
+            except Exception as inner_exc:  # pragma: no cover
+                ex = DatabaseError(
+                    f"Execution failed on sql: {sql}\n{exc}\nunable to rollback"
+                )
+                raise ex from inner_exc
+
+            ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
+            raise ex from exc
+
+    def read_table(
+        self,
+        table_name: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        columns=None,
+        schema: str | None = None,
+        chunksize: int | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        """
+        Read SQL database table into a DataFrame.
+
+        Parameters
+        ----------
+        table_name : str
+            Name of SQL table in database.
+        coerce_float : bool, default True
+            Raises NotImplementedError
+        parse_dates : list or dict, default: None
+            - List of column names to parse as dates.
+            - Dict of ``{column_name: format string}`` where format string is
+              strftime compatible in case of parsing string times, or is one of
+              (D, s, ns, ms, us) in case of parsing integer timestamps.
+            - Dict of ``{column_name: arg}``, where the arg corresponds
+              to the keyword arguments of :func:`pandas.to_datetime`.
+              Especially useful with databases without native Datetime support,
+              such as SQLite.
+        columns : list, default: None
+            List of column names to select from SQL table.
+        schema : string, default None
+            Name of SQL schema in database to query (if database flavor
+            supports this). If specified, this overwrites the default
+            schema of the SQL database object.
+        chunksize : int, default None
+            Raises NotImplementedError
+        dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+            Back-end data type applied to the resultant :class:`DataFrame`
+            (still experimental). Behaviour is as follows:
+
+            * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+              (default).
+            * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
+              DataFrame.
+
+            .. versionadded:: 2.0
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        pandas.read_sql_table
+        SQLDatabase.read_query
+
+        """
+        if coerce_float is not True:
+            raise NotImplementedError(
+                "'coerce_float' is not implemented for ADBC drivers"
+            )
+        if chunksize:
+            raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
+
+        if columns:
+            if index_col:
+                index_select = maybe_make_list(index_col)
+            else:
+                index_select = []
+            to_select = index_select + columns
+            select_list = ", ".join(f'"{x}"' for x in to_select)
+        else:
+            select_list = "*"
+        if schema:
+            stmt = f"SELECT {select_list} FROM {schema}.{table_name}"
+        else:
+            stmt = f"SELECT {select_list} FROM {table_name}"
+
+        mapping: type[ArrowDtype] | None | Callable
+        if dtype_backend == "pyarrow":
+            mapping = ArrowDtype
+        elif dtype_backend == "numpy_nullable":
+            from pandas.io._util import _arrow_dtype_mapping
+
+            mapping = _arrow_dtype_mapping().get
+        elif using_pyarrow_string_dtype():
+            from pandas.io._util import arrow_string_types_mapper
+
+            mapping = arrow_string_types_mapper()
+        else:
+            mapping = None
+
+        with self.con.cursor() as cur:
+            cur.execute(stmt)
+            df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping)
+
+        return _wrap_result_adbc(
+            df,
+            index_col=index_col,
+            parse_dates=parse_dates,
+        )
+
+    def read_query(
+        self,
+        sql: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        params=None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        """
+        Read SQL query into a DataFrame.
+
+        Parameters
+        ----------
+        sql : str
+            SQL query to be executed.
+        index_col : string, optional, default: None
+            Column name to use as index for the returned DataFrame object.
+        coerce_float : bool, default True
+            Raises NotImplementedError
+        params : list, tuple or dict, optional, default: None
+            Raises NotImplementedError
+        parse_dates : list or dict, default: None
+            - List of column names to parse as dates.
+            - Dict of ``{column_name: format string}`` where format string is
+              strftime compatible in case of parsing string times, or is one of
+              (D, s, ns, ms, us) in case of parsing integer timestamps.
+            - Dict of ``{column_name: arg dict}``, where the arg dict
+              corresponds to the keyword arguments of
+              :func:`pandas.to_datetime` Especially useful with databases
+              without native Datetime support, such as SQLite.
+        chunksize : int, default None
+            Raises NotImplementedError
+        dtype : Type name or dict of columns
+            Data type for data or columns. E.g. np.float64 or
+            {'a': np.float64, 'b': np.int32, 'c': 'Int64'}
+
+            .. versionadded:: 1.3.0
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        read_sql_table : Read SQL database table into a DataFrame.
+        read_sql
+
+        """
+        if coerce_float is not True:
+            raise NotImplementedError(
+                "'coerce_float' is not implemented for ADBC drivers"
+            )
+        if params:
+            raise NotImplementedError("'params' is not implemented for ADBC drivers")
+        if chunksize:
+            raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
+
+        mapping: type[ArrowDtype] | None | Callable
+        if dtype_backend == "pyarrow":
+            mapping = ArrowDtype
+        elif dtype_backend == "numpy_nullable":
+            from pandas.io._util import _arrow_dtype_mapping
+
+            mapping = _arrow_dtype_mapping().get
+        else:
+            mapping = None
+
+        with self.con.cursor() as cur:
+            cur.execute(sql)
+            df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping)
+
+        return _wrap_result_adbc(
+            df,
+            index_col=index_col,
+            parse_dates=parse_dates,
+            dtype=dtype,
+        )
+
+    read_sql = read_query
+
+    def to_sql(
+        self,
+        frame,
+        name: str,
+        if_exists: Literal["fail", "replace", "append"] = "fail",
+        index: bool = True,
+        index_label=None,
+        schema: str | None = None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        method: Literal["multi"] | Callable | None = None,
+        engine: str = "auto",
+        **engine_kwargs,
+    ) -> int | None:
+        """
+        Write records stored in a DataFrame to a SQL database.
+
+        Parameters
+        ----------
+        frame : DataFrame
+        name : string
+            Name of SQL table.
+        if_exists : {'fail', 'replace', 'append'}, default 'fail'
+            - fail: If table exists, do nothing.
+            - replace: If table exists, drop it, recreate it, and insert data.
+            - append: If table exists, insert data. Create if does not exist.
+        index : boolean, default True
+            Write DataFrame index as a column.
+        index_label : string or sequence, default None
+            Raises NotImplementedError
+        schema : string, default None
+            Name of SQL schema in database to write to (if database flavor
+            supports this). If specified, this overwrites the default
+            schema of the SQLDatabase object.
+        chunksize : int, default None
+            Raises NotImplementedError
+        dtype : single type or dict of column name to SQL type, default None
+            Raises NotImplementedError
+        method : {None, 'multi', callable}, default None
+            Raises NotImplementedError
+        engine : {'auto', 'sqlalchemy'}, default 'auto'
+            Raises NotImplementedError if not set to 'auto'
+        """
+        if index_label:
+            raise NotImplementedError(
+                "'index_label' is not implemented for ADBC drivers"
+            )
+        if chunksize:
+            raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
+        if dtype:
+            raise NotImplementedError("'dtype' is not implemented for ADBC drivers")
+        if method:
+            raise NotImplementedError("'method' is not implemented for ADBC drivers")
+        if engine != "auto":
+            raise NotImplementedError(
+                "engine != 'auto' not implemented for ADBC drivers"
+            )
+
+        if schema:
+            table_name = f"{schema}.{name}"
+        else:
+            table_name = name
+
+        # pandas if_exists="append" will still create the
+        # table if it does not exist; ADBC is more explicit with append/create
+        # as applicable modes, so the semantics get blurred across
+        # the libraries
+        mode = "create"
+        if self.has_table(name, schema):
+            if if_exists == "fail":
+                raise ValueError(f"Table '{table_name}' already exists.")
+            elif if_exists == "replace":
+                with self.con.cursor() as cur:
+                    cur.execute(f"DROP TABLE {table_name}")
+            elif if_exists == "append":
+                mode = "append"
+
+        import pyarrow as pa
+
+        try:
+            tbl = pa.Table.from_pandas(frame, preserve_index=index)
+        except pa.ArrowNotImplementedError as exc:
+            raise ValueError("datatypes not supported") from exc
+
+        with self.con.cursor() as cur:
+            total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode)
+
+        self.con.commit()
+        return total_inserted
+
+    def has_table(self, name: str, schema: str | None = None) -> bool:
+        meta = self.con.adbc_get_objects(
+            db_schema_filter=schema, table_name_filter=name
+        ).read_all()
+
+        for catalog_schema in meta["catalog_db_schemas"].to_pylist():
+            if not catalog_schema:
+                continue
+            for schema_record in catalog_schema:
+                if not schema_record:
+                    continue
+
+                for table_record in schema_record["db_schema_tables"]:
+                    if table_record["table_name"] == name:
+                        return True
+
+        return False
+
+    def _create_sql_schema(
+        self,
+        frame: DataFrame,
+        table_name: str,
+        keys: list[str] | None = None,
+        dtype: DtypeArg | None = None,
+        schema: str | None = None,
+    ):
+        raise NotImplementedError("not implemented for adbc")
+
+
 # sqlite-specific sql strings and handler class
 # dictionary used for readability purposes
 _SQL_TYPES = {
@@ -2507,9 +2902,10 @@ def get_schema(
         name of SQL table
     keys : string or sequence, default: None
         columns to use a primary key
-    con: an open SQL database connection object or a SQLAlchemy connectable
+    con: ADBC Connection, SQLAlchemy connectable, sqlite3 connection, default: None
+        ADBC provides high performance I/O with native type support, where available.
         Using SQLAlchemy makes it possible to use any DB supported by that
-        library, default: None
+        library
         If a DBAPI2 object, only sqlite3 is supported.
     dtype : dict of column name to SQL type, default None
        Optional specifying the datatype for columns.
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index 7096c1281a1b6..99314e60b7c00 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -688,7 +688,7 @@ def _compute_plot_data(self):
         # GH 18755, include object and category type for scatter plot
         if self._kind == "scatter":
-            include_type.extend(["object", "category"])
+            include_type.extend(["object", "category", "string"])
 
         numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type)
 
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
index 9f5157181843e..9f8611dd4b08b 100644
--- a/pandas/tests/apply/test_invalid_arg.py
+++ b/pandas/tests/apply/test_invalid_arg.py
@@ -338,11 +338,8 @@ def test_transform_wont_agg_series(string_series, func):
     # we are trying to transform with an aggregator
     msg = "Function did not transform"
 
-    warn = RuntimeWarning if func[0] == "sqrt" else None
-    warn_msg = "invalid value encountered in sqrt"
     with pytest.raises(ValueError, match=msg):
-        with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
-            string_series.transform(func)
+        string_series.transform(func)
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
index a63bfbf1835a9..9014ba4b6093e 100644
--- a/pandas/tests/arithmetic/test_datetime64.py
+++ b/pandas/tests/arithmetic/test_datetime64.py
@@ -905,7 +905,7 @@ def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture):
         dti = date_range("1994-04-01", periods=9, tz=tz, freq="QS")
         other = np.timedelta64("NaT")
 
-        expected = DatetimeIndex(["NaT"] * 9, tz=tz)
+        expected = DatetimeIndex(["NaT"] * 9, tz=tz).as_unit("ns")
 
         obj = tm.box_expected(dti, box_with_array)
         expected = tm.box_expected(expected, box_with_array)
@@ -1590,13 +1590,13 @@ class TestDatetime64OverflowHandling:
     def test_dt64_overflow_masking(self, box_with_array):
         # GH#25317
-        left = Series([Timestamp("1969-12-31")])
+        left = Series([Timestamp("1969-12-31")], dtype="M8[ns]")
         right = Series([NaT])
 
         left = tm.box_expected(left, box_with_array)
         right = tm.box_expected(right, box_with_array)
 
-        expected = TimedeltaIndex([NaT])
+        expected = TimedeltaIndex([NaT], dtype="m8[ns]")
         expected = tm.box_expected(expected, box_with_array)
 
         result = left - right
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
index 5ffbf1a38e845..33953900c2006 100644
--- a/pandas/tests/arithmetic/test_object.py
+++ b/pandas/tests/arithmetic/test_object.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 import pandas as pd
 from pandas import (
     Series,
@@ -172,7 +174,12 @@ def test_objarr_add_invalid(self, op, box_with_array):
         obj_ser = tm.box_expected(obj_ser, box)
 
         msg = "|".join(
-            ["can only concatenate str", "unsupported operand type", "must be str"]
+            [
+                "can only concatenate str",
+                "unsupported operand type",
+                "must be str",
+                "has no kernel",
+            ]
         )
         with pytest.raises(Exception, match=msg):
             op(obj_ser, 1)
@@ -290,6 +297,7 @@ def test_iadd_string(self):
         index += "_x"
         assert "a_x" in index
 
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work")
     def test_add(self):
         index = tm.makeStringIndex(100)
         expected = pd.Index(index.values * 2)
@@ -304,17 +312,24 @@ def test_add(self):
         expected = pd.Index(["1a", "1b", "1c"])
         tm.assert_index_equal("1" + index, expected)
 
-    def test_sub_fail(self):
+    def test_sub_fail(self, using_infer_string):
         index = tm.makeStringIndex(100)
-        msg = "unsupported operand type|Cannot broadcast"
-        with pytest.raises(TypeError, match=msg):
+        if using_infer_string:
+            import pyarrow as pa
+
+            err = pa.lib.ArrowNotImplementedError
+            msg = "has no kernel"
+        else:
+            err = TypeError
+            msg = "unsupported operand type|Cannot broadcast"
+        with pytest.raises(err, match=msg):
             index - "a"
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(err, match=msg):
             index - index
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(err, match=msg):
             index - index.tolist()
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(err, match=msg):
             index.tolist() - index
 
     def test_sub_object(self):
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
index 9f0e99b829739..e88bde07aee90 100644
--- a/pandas/tests/arithmetic/test_timedelta64.py
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -1039,7 +1039,7 @@ def test_td64arr_add_datetime64_nat(self, box_with_array):
         other = np.datetime64("NaT")
 
         tdi = timedelta_range("1 day", periods=3)
-        expected = DatetimeIndex(["NaT", "NaT", "NaT"])
+        expected = DatetimeIndex(["NaT", "NaT", "NaT"], dtype="M8[ns]")
 
         tdser = tm.box_expected(tdi, box_with_array)
         expected = tm.box_expected(expected, box_with_array)
diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py
index 4e954891c2d98..f3ac60f672ee1 100644
--- a/pandas/tests/base/test_constructors.py
+++ b/pandas/tests/base/test_constructors.py
@@ -138,7 +138,9 @@ class TestConstruction:
             "object-string",
         ],
     )
-    def test_constructor_datetime_outofbound(self, a, constructor):
+    def test_constructor_datetime_outofbound(
+        self, a, constructor, request, using_infer_string
+    ):
         # GH-26853 (+ bug GH-26206 out of bound non-ns unit)
 
         # No dtype specified (dtype inference)
@@ -150,7 +152,10 @@ def test_constructor_datetime_outofbound(self, a, constructor):
             assert result.dtype == "M8[s]"
         else:
             result = constructor(a)
-            assert result.dtype == "object"
+            if using_infer_string and "object-string" in request.node.callspec.id:
+                assert result.dtype == "string"
+            else:
+                assert result.dtype == "object"
             tm.assert_numpy_array_equal(result.to_numpy(), a)
 
         # Explicit dtype specified
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index 2fc6e786e3198..4f3e4d3365179 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -20,6 +20,7 @@
     SparseArray,
     TimedeltaArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 
 class TestToIterable:
@@ -215,7 +216,9 @@ def test_iter_box_period(self):
         ),
     ],
 )
-def test_values_consistent(arr, expected_type, dtype):
+def test_values_consistent(arr, expected_type, dtype, using_infer_string):
+    if using_infer_string and dtype == "object":
+        expected_type = ArrowStringArrayNumpySemantics
    l_values = Series(arr)._values
    r_values = pd.Index(arr)._values
    assert type(l_values) is expected_type
@@ -358,17 +361,23 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
 @pytest.mark.parametrize(
     "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
 )
-def test_to_numpy_copy(arr, as_series):
+def test_to_numpy_copy(arr, as_series, using_infer_string):
     obj = pd.Index(arr, copy=False)
     if as_series:
         obj = Series(obj.values, copy=False)
 
     # no copy by default
     result = obj.to_numpy()
-    assert np.shares_memory(arr, result) is True
+    if using_infer_string and arr.dtype == object:
+        assert np.shares_memory(arr, result) is False
+    else:
+        assert np.shares_memory(arr, result) is True
 
     result = obj.to_numpy(copy=False)
-    assert np.shares_memory(arr, result) is True
+    if using_infer_string and arr.dtype == object:
+        assert np.shares_memory(arr, result) is False
+    else:
+        assert np.shares_memory(arr, result) is True
 
     # copy=True
     result = obj.to_numpy(copy=True)
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index c6fd4955d2d63..65e234e799353 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.compat import PYPY
 
 from pandas.core.dtypes.common import (
@@ -80,7 +82,10 @@ def test_ndarray_compat_properties(index_or_series_obj):
     assert Series([1]).item() == 1
 
 
-@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
+@pytest.mark.skipif(
+    PYPY or using_pyarrow_string_dtype(),
+    reason="not relevant for PyPy doesn't work properly for arrow strings",
+)
 def test_memory_usage(index_or_series_memory_obj):
     obj = index_or_series_memory_obj
     # Clear index caches so that len(obj) == 0 report 0 memory usage
@@ -127,7 +132,7 @@ def test_memory_usage_components_series(series_with_simple_index):
 
 @pytest.mark.parametrize("dtype", tm.NARROW_NP_DTYPES)
 def test_memory_usage_components_narrow_series(dtype):
-    series = tm.make_rand_series(name="a", dtype=dtype)
+    series = Series(range(5), dtype=dtype, index=[f"i-{i}" for i in range(5)], name="a")
     total_usage = series.memory_usage(index=True)
     non_index_usage = series.memory_usage(index=False)
     index_usage = series.index.memory_usage()
@@ -175,7 +180,9 @@ def test_access_by_position(index_flat):
     assert index[-1] == index[size - 1]
 
     msg = f"index {size} is out of bounds for axis 0 with size {size}"
-    if is_dtype_equal(index.dtype, "string[pyarrow]"):
+    if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
+        index.dtype, "string[pyarrow_numpy]"
+    ):
         msg = "index out of bounds"
     with pytest.raises(IndexError, match=msg):
         index[size]
diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py
index 4c845d8f24d01..d3fe144f70cfc 100644
--- a/pandas/tests/base/test_unique.py
+++ b/pandas/tests/base/test_unique.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.tests.base.common import allow_na_ops
@@ -98,6 +100,7 @@ def test_nunique_null(null_obj, index_or_series_obj):
 
 
 @pytest.mark.single_cpu
+@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails")
 def test_unique_bad_unicode(index_or_series):
     # regression test for #34550
     uval = "\ud83d"  # smiley emoji
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index c42d064c476bb..2729666398877 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -14,6 +14,7 @@
     Series,
     Timedelta,
     TimedeltaIndex,
+    array,
 )
 import pandas._testing as tm
 from pandas.tests.base.common import allow_na_ops
@@ -113,7 +114,7 @@ def test_value_counts_null(null_obj, index_or_series_obj):
         tm.assert_series_equal(result, expected)
 
 
-def test_value_counts_inferred(index_or_series):
+def test_value_counts_inferred(index_or_series, using_infer_string):
     klass = index_or_series
     s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
     s = klass(s_values)
@@ -125,7 +126,9 @@ def test_value_counts_inferred(index_or_series):
         tm.assert_index_equal(s.unique(), exp)
     else:
         exp = np.unique(np.array(s_values, dtype=np.object_))
-        tm.assert_numpy_array_equal(s.unique(), exp)
+        if using_infer_string:
+            exp = array(exp)
+        tm.assert_equal(s.unique(), exp)
     assert s.nunique() == 4
 
     # don't sort, have to sort after the fact as not sorting is
@@ -147,7 +150,7 @@ def test_value_counts_inferred(index_or_series):
     tm.assert_series_equal(hist, expected)
 
 
-def test_value_counts_bins(index_or_series):
+def test_value_counts_bins(index_or_series, using_infer_string):
     klass = index_or_series
     s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
     s = klass(s_values)
@@ -201,7 +204,9 @@ def test_value_counts_bins(index_or_series):
         tm.assert_index_equal(s.unique(), exp)
     else:
         exp = np.array(["a", "b", np.nan, "d"], dtype=object)
-        tm.assert_numpy_array_equal(s.unique(), exp)
+        if using_infer_string:
+            exp = array(exp)
+        tm.assert_equal(s.unique(), exp)
     assert s.nunique() == 3
 
     s = klass({}) if klass is dict else klass({}, dtype=object)
@@ -246,7 +251,7 @@ def test_value_counts_datetime64(index_or_series, unit):
     expected_s = Series([3, 2, 1], index=idx, name="count")
     tm.assert_series_equal(s.value_counts(), expected_s)
 
-    expected = pd.array(
+    expected = array(
         np.array(
             ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
             dtype=f"datetime64[{unit}]",
@@ -336,3 +341,16 @@ def test_value_counts_with_nan(dropna, index_or_series):
     else:
         expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
     tm.assert_series_equal(res, expected)
+
+
+def test_value_counts_object_inference_deprecated():
+    # GH#56161
+    dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
+
+    idx = dti.astype(object)
+    msg = "The behavior of value_counts with object-dtype is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        res = idx.value_counts()
+
+    exp = dti.value_counts()
+    tm.assert_series_equal(res, exp)
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index fe49446424de1..45215cd3b5e96 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -1832,7 +1832,7 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser):
     ],
 )
 def test_equals_various(other):
-    df = DataFrame({"A": ["a", "b", "c"]})
+    df = DataFrame({"A": ["a", "b", "c"]}, dtype=object)
     result = df.eval(f"A == {other}")
     expected = Series([False, False, False], name="A")
     if USE_NUMEXPR:
diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py
index 10085ddde5c8f..ab468c81124bc 100644
--- a/pandas/tests/dtypes/cast/test_construct_ndarray.py
+++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 import pandas._testing as tm
 from pandas.core.construction import sanitize_array
 
@@ -15,9 +16,14 @@
         ([1, 2, None], np.dtype("str"), np.array(["1", "2", None])),
     ],
 )
-def test_construct_1d_ndarray_preserving_na(values, dtype, expected):
+def test_construct_1d_ndarray_preserving_na(
+    values, dtype, expected, using_infer_string
+):
     result = sanitize_array(values, index=None, dtype=dtype)
-    tm.assert_numpy_array_equal(result, expected)
+    if using_infer_string and expected.dtype == object and dtype is None:
+        tm.assert_extension_array_equal(result, pd.array(expected))
+    else:
+        tm.assert_numpy_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"])
diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py
index 50eaa1f4d8713..679031a625c2d 100644
--- a/pandas/tests/dtypes/cast/test_infer_dtype.py
+++ b/pandas/tests/dtypes/cast/test_infer_dtype.py
@@ -159,8 +159,10 @@ def test_infer_dtype_from_scalar_errors():
         (Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"),
     ],
 )
-def test_infer_dtype_from_scalar(value, expected):
+def test_infer_dtype_from_scalar(value, expected, using_infer_string):
     dtype, _ = infer_dtype_from_scalar(value)
+    if using_infer_string and value == "foo":
+        expected = "string"
     assert is_dtype_equal(dtype, expected)
 
     with pytest.raises(TypeError, match="must be list-like"):
@@ -189,8 +191,14 @@ def test_infer_dtype_from_scalar(value, expected):
         ),
     ],
 )
-def test_infer_dtype_from_array(arr, expected):
+def test_infer_dtype_from_array(arr, expected, using_infer_string):
     dtype, _ = infer_dtype_from_array(arr)
+    if (
+        using_infer_string
+        and isinstance(arr, Series)
+        and arr.tolist() == ["a", "b", "c"]
+    ):
+        expected = "string"
     assert is_dtype_equal(dtype, expected)
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index d946b8da01fad..42043f95a7ace 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -676,9 +676,9 @@ def test_is_complex_dtype():
         (np.dtype("float64"), np.dtype("float64")),
         (str, np.dtype(str)),
         (pd.Series([1, 2], dtype=np.dtype("int16")), np.dtype("int16")),
-        (pd.Series(["a", "b"]), np.dtype(object)),
+        (pd.Series(["a", "b"], dtype=object), np.dtype(object)),
         (pd.Index([1, 2]), np.dtype("int64")),
-        (pd.Index(["a", "b"]), np.dtype(object)),
+        (pd.Index(["a", "b"], dtype=object), np.dtype(object)),
         ("category", "category"),
         (pd.Categorical(["a", "b"]).dtype, CategoricalDtype(["a", "b"])),
         (pd.Categorical(["a", "b"]), CategoricalDtype(["a", "b"])),
@@ -727,9 +727,9 @@ def test_get_dtype_fails(input_param, expected_error_message):
         (np.dtype("float64"), np.float64),
         (str, np.dtype(str).type),
         (pd.Series([1, 2], dtype=np.dtype("int16")), np.int16),
-        (pd.Series(["a", "b"]), np.object_),
+        (pd.Series(["a", "b"], dtype=object), np.object_),
         (pd.Index([1, 2], dtype="int64"), np.int64),
-        (pd.Index(["a", "b"]), np.object_),
+        (pd.Index(["a", "b"], dtype=object), np.object_),
         ("category", CategoricalDtypeType),
         (pd.Categorical(["a", "b"]).dtype, CategoricalDtypeType),
         (pd.Categorical(["a", "b"]), CategoricalDtypeType),
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index 04b3feabb9854..4e8f375b31674 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -200,7 +200,8 @@ def test_is_boolean(self, categories, expected):
 
     def test_dtype_specific_categorical_dtype(self):
         expected = "datetime64[ns]"
-        result = str(Categorical(DatetimeIndex([])).categories.dtype)
+        dti = DatetimeIndex([], dtype=expected)
+        result = str(Categorical(dti).categories.dtype)
         assert result == expected
 
     def test_not_string(self):
@@ -1054,13 +1055,14 @@ def test_from_categorical_dtype_both(self):
         )
         assert result == CategoricalDtype([1, 2], ordered=False)
 
-    def test_str_vs_repr(self, ordered):
+    def test_str_vs_repr(self, ordered, using_infer_string):
         c1 = CategoricalDtype(["a", "b"], ordered=ordered)
         assert str(c1) == "category"
 
         # Py2 will have unicode prefixes
+        dtype = "string" if using_infer_string else "object"
         pat = (
             r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, "
-            r"categories_dtype=object\)"
+            rf"categories_dtype={dtype}\)"
         )
         assert re.match(pat.format(ordered=ordered), repr(c1))
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index 1cc1e2725b9c7..88cec50c08aba 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -40,6 +40,7 @@
     Series,
     TimedeltaIndex,
     date_range,
+    period_range,
 )
 import pandas._testing as tm
 
@@ -77,11 +78,9 @@ def test_notna_notnull(notna_f):
 @pytest.mark.parametrize(
     "ser",
     [
-        tm.makeFloatSeries(),
-        tm.makeStringSeries(),
         tm.makeObjectSeries(),
         tm.makeTimeSeries(),
-        tm.makePeriodSeries(),
+        Series(range(5), period_range("2020-01-01", periods=5)),
     ],
 )
 def test_null_check_is_series(null_func, ser):
@@ -124,15 +123,25 @@ def test_isna_isnull(self, isna_f):
 
     @pytest.mark.parametrize("isna_f", [isna, isnull])
     @pytest.mark.parametrize(
-        "df",
+        "data",
         [
-            tm.makeTimeDataFrame(),
-            tm.makePeriodFrame(),
-            tm.makeMixedDataFrame(),
+            np.arange(4, dtype=float),
+            [0.0, 1.0, 0.0, 1.0],
+            Series(list("abcd"), dtype=object),
+            date_range("2020-01-01", periods=4),
         ],
     )
-    def test_isna_isnull_frame(self, isna_f, df):
+    @pytest.mark.parametrize(
+        "index",
+        [
+            date_range("2020-01-01", periods=4),
+            range(4),
+            period_range("2020-01-01", periods=4),
+        ],
+    )
+    def test_isna_isnull_frame(self, isna_f, data, index):
         # frame
+        df = pd.DataFrame(data, index=index)
         result = isna_f(df)
         expected = df.apply(isna_f)
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index b2d94ff5ffbd1..135a86cad1395 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -48,7 +48,7 @@ def test_getitem(self, float_frame):
         # Column access
         for _, series in sl.items():
             assert len(series.index) == 20
-            assert tm.equalContents(series.index, sl.index)
+            tm.assert_index_equal(series.index, sl.index)
 
         for key, _ in float_frame._series.items():
             assert float_frame[key] is not None
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index bd916f230bead..0130820fc3de6 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -93,6 +93,11 @@ def test_setitem_error_msmgs(self):
         with pytest.raises(ValueError, match=msg):
             df["gr"] = df.groupby(["b", "c"]).count()
 
+        # GH 55956, specific message for zero columns
+        msg = "Cannot set a DataFrame without columns to the column gr"
+        with pytest.raises(ValueError, match=msg):
+            df["gr"] = DataFrame()
+
     def test_setitem_benchmark(self):
         # from the vb_suite/frame_methods/frame_insert_columns
         N = 10
@@ -1349,7 +1354,8 @@ def test_setitem_frame_dup_cols_dtype(self):
 
     def test_frame_setitem_empty_dataframe(self):
         # GH#28871
-        df = DataFrame({"date": [datetime(2000, 1, 1)]}).set_index("date")
+        dti = DatetimeIndex(["2000-01-01"], dtype="M8[ns]", name="date")
+        df = DataFrame({"date": dti}).set_index("date")
         df = df[0:0].copy()
 
         df["3010"] = None
@@ -1358,6 +1364,6 @@ def test_frame_setitem_empty_dataframe(self):
         expected = DataFrame(
             [],
             columns=["3010", "2010"],
-            index=Index([], dtype="datetime64[ns]", name="date"),
+            index=dti[:0],
         )
         tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index 1779f703dd2b7..0335279b3a123 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -37,7 +37,7 @@ def test_combine_first(self, float_frame):
         combined = head.combine_first(tail)
         reordered_frame = float_frame.reindex(combined.index)
         tm.assert_frame_equal(combined, reordered_frame)
-        assert tm.equalContents(combined.columns, float_frame.columns)
+        tm.assert_index_equal(combined.columns, float_frame.columns)
         tm.assert_series_equal(combined["A"], reordered_frame["A"])
 
         # same index
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index dcec68ab3530d..787b77a5c725a 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -623,23 +623,23 @@ def test_quantile_nan(self, interp_method, request, using_array_manager):
         exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
         tm.assert_frame_equal(res, exp)
 
-    def test_quantile_nat(self, interp_method, request, using_array_manager):
+    def test_quantile_nat(self, interp_method, request, using_array_manager, unit):
         interpolation, method = interp_method
         if method == "table" and using_array_manager:
             request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
 
         # full NaT column
-        df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]})
+        df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]")
 
         res = df.quantile(
             0.5, numeric_only=False, interpolation=interpolation, method=method
         )
-        exp = Series([pd.NaT], index=["a"], name=0.5)
+        exp = Series([pd.NaT], index=["a"], name=0.5, dtype=f"M8[{unit}]")
         tm.assert_series_equal(res, exp)
 
         res = df.quantile(
             [0.5], numeric_only=False, interpolation=interpolation, method=method
         )
-        exp = DataFrame({"a": [pd.NaT]}, index=[0.5])
+        exp = DataFrame({"a": [pd.NaT]}, index=[0.5], dtype=f"M8[{unit}]")
         tm.assert_frame_equal(res, exp)
 
         # mixed non-null / full null column
@@ -651,20 +651,29 @@ def test_quantile_nat(self, interp_method, request, using_array_manager):
                     Timestamp("2012-01-03"),
                 ],
                 "b": [pd.NaT, pd.NaT, pd.NaT],
-            }
+            },
+            dtype=f"M8[{unit}]",
         )
 
         res = df.quantile(
             0.5, numeric_only=False, interpolation=interpolation, method=method
         )
-        exp = Series([Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5)
+        exp = Series(
+            [Timestamp("2012-01-02"), pd.NaT],
+            index=["a", "b"],
+            name=0.5,
+            dtype=f"M8[{unit}]",
+        )
         tm.assert_series_equal(res, exp)
 
         res = df.quantile(
             [0.5], numeric_only=False, interpolation=interpolation, method=method
         )
         exp = DataFrame(
-            [[Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"]
+            [[Timestamp("2012-01-02"), pd.NaT]],
+            index=[0.5],
+            columns=["a", "b"],
+            dtype=f"M8[{unit}]",
         )
         tm.assert_frame_equal(res, exp)
diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
index b57b4f4422888..b6a6334b89fc1 100644
--- a/pandas/tests/frame/methods/test_reindex.py
+++ b/pandas/tests/frame/methods/test_reindex.py
@@ -624,7 +624,7 @@ def test_reindex(self, float_frame, using_copy_on_write):
             assert np.isnan(val)
 
         for col, series in newFrame.items():
-            assert tm.equalContents(series.index, newFrame.index)
+            tm.assert_index_equal(series.index, newFrame.index)
 
         emptyFrame = float_frame.reindex(Index([]))
         assert len(emptyFrame.index) == 0
@@ -642,7 +642,7 @@ def test_reindex(self, float_frame, using_copy_on_write):
             assert np.isnan(val)
 
         for col, series in nonContigFrame.items():
-            assert tm.equalContents(series.index, nonContigFrame.index)
+            tm.assert_index_equal(series.index, nonContigFrame.index)
 
         # corner cases
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
index 98edd44e9f700..b21aa2d687682 100644
--- a/pandas/tests/frame/methods/test_shift.py
+++ b/pandas/tests/frame/methods/test_shift.py
@@ -241,7 +241,9 @@ def test_shift_by_offset(self, datetime_frame, frame_or_series):
 
     def test_shift_with_periodindex(self, frame_or_series):
         # Shifting with PeriodIndex
-        ps = tm.makePeriodFrame()
+        ps = DataFrame(
+            np.arange(4, dtype=float), index=pd.period_range("2020-01-01", periods=4)
+        )
         ps = tm.get_obj(ps, frame_or_series)
 
         shifted = ps.shift(1)
@@ -492,7 +494,7 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self):
         tm.assert_frame_equal(result, expected)
 
     def test_period_index_frame_shift_with_freq(self, frame_or_series):
-        ps = tm.makePeriodFrame()
+        ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4))
         ps = tm.get_obj(ps, frame_or_series)
 
         shifted = ps.shift(1, freq="infer")
@@ -529,7 +531,7 @@ def test_datetime_frame_shift_with_freq(self, datetime_frame, frame_or_series):
         tm.assert_equal(unshifted, inferred_ts)
 
     def test_period_index_frame_shift_with_freq_error(self, frame_or_series):
-        ps = tm.makePeriodFrame()
+        ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4))
         ps = tm.get_obj(ps, frame_or_series)
         msg = "Given freq M does not match PeriodIndex freq D"
         with pytest.raises(ValueError, match=msg):
diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py
index f96f4e0558fa6..93a2f8d80019c 100644
--- a/pandas/tests/frame/methods/test_transpose.py
+++ b/pandas/tests/frame/methods/test_transpose.py
@@ -29,7 +29,8 @@ def test_transpose_td64_intervals(self):
 
     def test_transpose_empty_preserves_datetimeindex(self):
         # GH#41382
-        df = DataFrame(index=DatetimeIndex([]))
+        dti = DatetimeIndex([], dtype="M8[ns]")
+        df = DataFrame(index=dti)
 
         expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None)
diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py
index 7374a8ea6aa77..a1c23ff05f3e1 100644
--- a/pandas/tests/frame/test_iteration.py
+++ b/pandas/tests/frame/test_iteration.py
@@ -40,7 +40,7 @@ def test_items_names(self, float_string_frame):
             assert v.name == k
 
     def test_iter(self, float_frame):
-        assert tm.equalContents(list(float_frame), float_frame.columns)
+        assert list(float_frame) == list(float_frame.columns)
 
     def test_iterrows(self, float_frame, float_string_frame):
         for k, v in float_frame.iterrows():
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 20ad93e6dce4d..0d71fb0926df9 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -598,7 +598,7 @@ def test_sem(self, datetime_frame):
                 "C": [1.0],
                 "D": ["a"],
                 "E": Categorical(["a"], categories=["a"]),
-                "F": to_datetime(["2000-1-2"]),
+                "F": pd.DatetimeIndex(["2000-01-02"], dtype="M8[ns]"),
                 "G": to_timedelta(["1 days"]),
             },
         ),
@@ -610,7 +610,7 @@ def test_sem(self, datetime_frame):
                 "C": [np.nan],
                 "D": np.array([np.nan], dtype=object),
                 "E": Categorical([np.nan], categories=["a"]),
-                "F": [pd.NaT],
+                "F": pd.DatetimeIndex([pd.NaT], dtype="M8[ns]"),
                 "G": to_timedelta([pd.NaT]),
             },
         ),
@@ -621,7 +621,9 @@ def test_sem(self, datetime_frame):
                 "I": [8, 9, np.nan, np.nan],
                 "J": [1, np.nan, np.nan, np.nan],
                 "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]),
-                "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]),
+                "L": pd.DatetimeIndex(
+                    ["2000-01-02", "NaT", "NaT", "NaT"], dtype="M8[ns]"
+                ),
                 "M": to_timedelta(["1 days", "nan", "nan", "nan"]),
                 "N": [0, 1, 2, 3],
             },
@@ -633,7 +635,9 @@ def test_sem(self, datetime_frame):
                 "I": [8, 9, np.nan, np.nan],
                 "J": [1, np.nan, np.nan, np.nan],
                 "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]),
-                "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
+                "L": pd.DatetimeIndex(
+                    ["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"
+                ),
                 "M": to_timedelta(["nan", "1 days", "nan", "nan"]),
                 "N": [0, 1, 2, 3],
             },
@@ -648,13 +652,17 @@ def test_mode_dropna(self, dropna, expected):
                 "C": [1, np.nan, np.nan, np.nan],
                 "D": [np.nan, np.nan, "a", np.nan],
                 "E": Categorical([np.nan, np.nan, "a", np.nan]),
-                "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
+                "F": pd.DatetimeIndex(
+                    ["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"
+                ),
                 "G": to_timedelta(["1 days", "nan", "nan", "nan"]),
                 "H": [8, 8, 9, 9],
                 "I": [9, 9, 8, 8],
                 "J": [1, 1, np.nan, np.nan],
                 "K": Categorical(["a", np.nan, "a", np.nan]),
-                "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]),
+                "L": pd.DatetimeIndex(
+                    ["2000-01-02", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"
+                ),
                 "M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
                 "N": np.arange(4, dtype="int64"),
             }
diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py
index f750074a36e91..98962b3003b6d 100644
--- a/pandas/tests/frame/test_repr.py
+++ b/pandas/tests/frame/test_repr.py
@@ -24,8 +24,6 @@
 )
 import pandas._testing as tm
 
-import pandas.io.formats.format as fmt
-
 
 class TestDataFrameRepr:
     def test_repr_should_return_str(self):
@@ -220,16 +218,14 @@ def test_repr_unsortable(self):
     def test_repr_float_frame_options(self, float_frame):
         repr(float_frame)
 
-        fmt.set_option("display.precision", 3)
-        repr(float_frame)
+        with option_context("display.precision", 3):
+            repr(float_frame)
 
-        fmt.set_option("display.max_rows", 10, "display.max_columns", 2)
-        repr(float_frame)
-
-        fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000)
-        repr(float_frame)
+        with option_context("display.max_rows", 10, "display.max_columns", 2):
+            repr(float_frame)
 
-        tm.reset_display_options()
+        with option_context("display.max_rows", 1000, "display.max_columns", 1000):
+            repr(float_frame)
 
     def test_repr_unicode(self):
         uval = "\u03c3\u03c3\u03c3\u03c3"
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
index 87beab04bc586..1f08b9d5c35b8 100644
--- a/pandas/tests/generic/test_generic.py
+++ b/pandas/tests/generic/test_generic.py
@@ -316,7 +316,11 @@ class TestNDFrame:
     # tests that don't fit elsewhere
 
     @pytest.mark.parametrize(
-        "ser", [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]
+        "ser",
+        [
+            Series(range(10), dtype=np.float64),
+            Series([str(i) for i in range(10)], dtype=object),
+        ],
    )
    def test_squeeze_series_noop(self, ser):
        # noop
@@ -360,14 +364,18 @@ def test_squeeze_axis_len_3(self):
         tm.assert_frame_equal(df.squeeze(axis=0), df)
 
     def test_numpy_squeeze(self):
-        s = tm.makeFloatSeries()
+        s = Series(range(2), dtype=np.float64)
         tm.assert_series_equal(np.squeeze(s), s)
 
         df = tm.makeTimeDataFrame().reindex(columns=["A"])
         tm.assert_series_equal(np.squeeze(df), df["A"])
 
     @pytest.mark.parametrize(
-        "ser", [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]
+        "ser",
+        [
+            Series(range(10), dtype=np.float64),
+            Series([str(i) for i in range(10)], dtype=object),
+        ],
    )
    def test_transpose_series(self, ser):
        # calls implementation in pandas/core/base.py
@@ -393,7 +401,11 @@ def test_numpy_transpose(self, frame_or_series):
             np.transpose(obj, axes=1)
 
     @pytest.mark.parametrize(
-        "ser", [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]
+        "ser",
+        [
+            Series(range(10), dtype=np.float64),
+            Series([str(i) for i in range(10)], dtype=object),
+        ],
    )
    def test_take_series(self, ser):
        indices = [1, 5, -2, 6, 3, -1]
diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py
index d6eacf4f9079b..5fb432f849643 100644
--- a/pandas/tests/generic/test_to_xarray.py
+++ b/pandas/tests/generic/test_to_xarray.py
@@ -29,7 +29,7 @@ def df(self):
             }
         )
 
-    def test_to_xarray_index_types(self, index_flat, df):
+    def test_to_xarray_index_types(self, index_flat, df, using_infer_string):
         index = index_flat
         # MultiIndex is tested in test_to_xarray_with_multiindex
         if len(index) == 0:
@@ -51,7 +51,9 @@ def test_to_xarray_index_types(self, index_flat, df):
         # datetimes w/tz are preserved
         # column names are lost
         expected = df.copy()
-        expected["f"] = expected["f"].astype(object)
+        expected["f"] = expected["f"].astype(
+            object if not using_infer_string else "string[pyarrow_numpy]"
+        )
         expected.columns.name = None
         tm.assert_frame_equal(result.to_dataframe(), expected)
 
@@ -63,7 +65,7 @@ def test_to_xarray_empty(self, df):
         assert result.dims["foo"] == 0
         assert isinstance(result, Dataset)
 
-    def test_to_xarray_with_multiindex(self, df):
+    def test_to_xarray_with_multiindex(self, df, using_infer_string):
         from xarray import Dataset
 
         # MultiIndex
@@ -78,7 +80,9 @@ def test_to_xarray_with_multiindex(self, df):
         result = result.to_dataframe()
         expected = df.copy()
-        expected["f"] = expected["f"].astype(object)
+        expected["f"] = expected["f"].astype(
+            object if not using_infer_string else "string[pyarrow_numpy]"
+        )
         expected.columns.name = None
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/methods/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py
index f2d40867af03a..0ce6a6462a5d8 100644
--- a/pandas/tests/groupby/methods/test_groupby_shift_diff.py
+++ b/pandas/tests/groupby/methods/test_groupby_shift_diff.py
@@ -117,10 +117,11 @@ def test_group_diff_real_frame(any_real_numpy_dtype):
         [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
     ],
 )
-def test_group_diff_datetimelike(data):
+def test_group_diff_datetimelike(data, unit):
     df = DataFrame({"a": [1, 2, 2], "b": data})
+    df["b"] = df["b"].dt.as_unit(unit)
     result = df.groupby("a")["b"].diff()
-    expected = Series([NaT, NaT, Timedelta("1 days")], name="b")
+    expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index c61d9fab0435e..5b17484de9c93 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -3303,3 +3303,13 @@ def test_groupby_ffill_with_duplicated_index():
     result = df.groupby(level=0).ffill()
     expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
     tm.assert_frame_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("attr", ["group_keys_seq", "reconstructed_codes"])
+def test_depr_grouper_attrs(attr):
+    # GH#56148
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+    gb = df.groupby("a")
+    msg = f"{attr} is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        getattr(gb.grouper, attr)
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index e3cc41afa4679..3c1a35c984031 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -1211,3 +1211,13 @@ def test_grouper_groups():
     msg = "Grouper.indexer is deprecated"
     with tm.assert_produces_warning(FutureWarning, match=msg):
         grper.indexer
+
+
+@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"])
+def test_depr_grouping_attrs(attr):
+    # GH#56148
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+    gb = df.groupby("a")
+    msg = f"{attr} is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        getattr(gb.grouper.groupings[0], attr)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index be02c7f79ba01..b79bed32e7fa4 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -67,7 +67,9 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
     gb = df.groupby(tdg)
 
     # check we're testing the case we're interested in
-    assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
+    msg = "group_keys_seq is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
 
     return gb
 
@@ -98,11 +100,17 @@ def test_groupby_with_timegrouper(self):
         for df in [df_original, df_reordered]:
             df = df.set_index(["Date"])
 
+            exp_dti = date_range(
+                "20130901",
+                "20131205",
+                freq="5D",
+                name="Date",
+                inclusive="left",
+                unit=df.index.unit,
+            )
             expected = DataFrame(
                 {"Buyer": 0, "Quantity": 0},
-                index=date_range(
-                    "20130901", "20131205", freq="5D", name="Date", inclusive="left"
-                ),
+                index=exp_dti,
             )
             # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
             expected = expected.astype({"Buyer": object})
@@ -514,6 +522,7 @@ def test_groupby_groups_datetimeindex(self):
         groups = grouped.groups
         assert isinstance(next(iter(groups.keys())), datetime)
 
+    def test_groupby_groups_datetimeindex2(self):
         # GH#11442
         index = date_range("2015/01/01", periods=5, name="date")
         df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
@@ -876,7 +885,9 @@ def test_groupby_apply_timegrouper_with_nat_dict_returns(
 
         res = gb["Quantity"].apply(lambda x: {"foo": len(x)})
 
-        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
+        df = gb.obj
+        unit = df["Date"]._values.unit
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
         mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
         expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
         tm.assert_series_equal(res, expected)
@@ -890,7 +901,9 @@ def test_groupby_apply_timegrouper_with_nat_scalar_returns(
 
         res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)
 
-        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
+        df = gb.obj
+        unit = df["Date"]._values.unit
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
         expected = Series(
             [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
             index=dti._with_freq(None),
@@ -919,9 +932,10 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
         with tm.assert_produces_warning(FutureWarning, match=msg):
             res = gb.apply(lambda x: x["Quantity"] * 2)
 
+        dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date")
         expected = DataFrame(
             [[36, 6, 6, 10, 2]],
-            index=Index([Timestamp("2013-12-31")], name="Date"),
+            index=dti,
             columns=Index([0, 1, 5, 2, 3], name="Quantity"),
         )
         tm.assert_frame_equal(res, expected)
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index cf122832d86b4..a6f160d92fb66 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -115,12 +115,15 @@ def test_transform_fast2():
     )
     result = df.groupby("grouping").transform("first")
 
-    dates = [
-        Timestamp("2014-1-1"),
-        Timestamp("2014-1-2"),
-        Timestamp("2014-1-2"),
-        Timestamp("2014-1-4"),
-    ]
+    dates = pd.Index(
+        [
+            Timestamp("2014-1-1"),
+            Timestamp("2014-1-2"),
+            Timestamp("2014-1-2"),
+            Timestamp("2014-1-4"),
+        ],
+        dtype="M8[ns]",
+    )
     expected = DataFrame(
         {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]},
         columns=["f", "i", "d"],
@@ -532,7 +535,7 @@ def test_series_fast_transform_date():
         Timestamp("2014-1-2"),
         Timestamp("2014-1-4"),
     ]
-    expected = Series(dates, name="d")
+    expected = Series(dates, name="d", dtype="M8[ns]")
     tm.assert_series_equal(result, expected)
 
 
@@ -1204,7 +1207,9 @@ def test_groupby_transform_with_datetimes(func, values):
 
     result = stocks.groupby(stocks["week_id"])["price"].transform(func)
 
-    expected = Series(data=pd.to_datetime(values), index=dates, name="price")
+    expected = Series(
+        data=pd.to_datetime(values).as_unit("ns"), index=dates, name="price"
+    )
 
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
index 488f79eea0d11..e538ad512d691 100644
--- a/pandas/tests/indexes/base_class/test_setops.py
+++ b/pandas/tests/indexes/base_class/test_setops.py
@@ -12,6 +12,13 @@
 from pandas.core.algorithms import safe_sort
 
 
+def equal_contents(arr1, arr2) -> bool:
+    """
+    Checks if the set of unique elements of arr1 and arr2 are equivalent.
+    """
+    return frozenset(arr1) == frozenset(arr2)
+
+
 class TestIndexSetOps:
     @pytest.mark.parametrize(
         "method", ["union", "intersection", "difference", "symmetric_difference"]
@@ -71,7 +78,7 @@ def test_union_different_type_base(self, klass):
 
         result = first.union(klass(second.values))
 
-        assert tm.equalContents(result, index)
+        assert equal_contents(result, index)
 
     def test_union_sort_other_incomparable(self):
         # https://github.com/pandas-dev/pandas/issues/24959
@@ -119,7 +126,7 @@ def test_intersection_different_type_base(self, klass, sort):
         second = index[:3]
 
         result = first.intersection(klass(second.values), sort=sort)
-        assert tm.equalContents(result, second)
+        assert equal_contents(result, second)
 
     def test_intersection_nosort(self):
         result = Index(["c", "b", "a"]).intersection(["b", "a"])
@@ -244,7 +251,7 @@ def test_union_name_preservation(
             tm.assert_index_equal(union, expected)
         else:
             expected = Index(vals, name=expected_name)
-            tm.equalContents(union, expected)
+            tm.assert_index_equal(union.sort_values(), expected.sort_values())
 
     @pytest.mark.parametrize(
         "diff_type, expected",
diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py
index 9ee6250feeac6..c0bc6601769b1 100644
--- a/pandas/tests/indexes/datetimes/methods/test_astype.py
+++ b/pandas/tests/indexes/datetimes/methods/test_astype.py
@@ -292,7 +292,7 @@ def test_integer_index_astype_datetime(self, tz, dtype):
         # GH 20997, 20964, 24559
         val = [Timestamp("2018-01-01", tz=tz).as_unit("ns")._value]
         result = Index(val, name="idx").astype(dtype)
-        expected = DatetimeIndex(["2018-01-01"], tz=tz, name="idx")
+        expected = DatetimeIndex(["2018-01-01"], tz=tz, name="idx").as_unit("ns")
         tm.assert_index_equal(result, expected)
 
     def test_dti_astype_period(self):
@@ -312,8 +312,9 @@ class TestAstype:
     def test_astype_category(self, tz):
         obj = date_range("2000", periods=2, tz=tz, name="idx")
         result = obj.astype("category")
+        dti = DatetimeIndex(["2000-01-01", "2000-01-02"], tz=tz).as_unit("ns")
         expected = pd.CategoricalIndex(
-            [Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)],
+            dti,
             name="idx",
         )
         tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py b/pandas/tests/indexes/datetimes/methods/test_shift.py
index c50f75894d810..d8bdcc2a17685 100644
--- a/pandas/tests/indexes/datetimes/methods/test_shift.py
+++ b/pandas/tests/indexes/datetimes/methods/test_shift.py
@@ -20,10 +20,10 @@ class TestDatetimeIndexShift:
     # -------------------------------------------------------------
    # DatetimeIndex.shift is used in integer addition

-    def test_dti_shift_tzaware(self, tz_naive_fixture):
+    def test_dti_shift_tzaware(self, tz_naive_fixture, unit):
        # GH#9903
        tz = tz_naive_fixture
-        idx = DatetimeIndex([], name="xxx", tz=tz)
+        idx = DatetimeIndex([], name="xxx", tz=tz).as_unit(unit)
        tm.assert_index_equal(idx.shift(0, freq="h"), idx)
        tm.assert_index_equal(idx.shift(3, freq="h"), idx)

@@ -32,30 +32,31 @@ def test_dti_shift_tzaware(self, tz_naive_fixture):
             name="xxx",
             tz=tz,
             freq="h",
-        )
+        ).as_unit(unit)
         tm.assert_index_equal(idx.shift(0, freq="h"), idx)
         exp = DatetimeIndex(
             ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"],
             name="xxx",
             tz=tz,
             freq="h",
-        )
+        ).as_unit(unit)
         tm.assert_index_equal(idx.shift(3, freq="h"), exp)
         exp = DatetimeIndex(
             ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"],
             name="xxx",
             tz=tz,
             freq="h",
-        )
+        ).as_unit(unit)
         tm.assert_index_equal(idx.shift(-3, freq="h"), exp)
 
-    def test_dti_shift_freqs(self):
+    def test_dti_shift_freqs(self, unit):
         # test shift for DatetimeIndex and non DatetimeIndex
         # GH#8083
-        drange = date_range("20130101", periods=5)
+        drange = date_range("20130101", periods=5, unit=unit)
         result = drange.shift(1)
         expected = DatetimeIndex(
             ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"],
+            dtype=f"M8[{unit}]",
             freq="D",
         )
         tm.assert_index_equal(result, expected)
@@ -63,6 +64,7 @@ def test_dti_shift_freqs(self):
         result = drange.shift(-1)
         expected = DatetimeIndex(
             ["2012-12-31", "2013-01-01", "2013-01-02", "2013-01-03", "2013-01-04"],
+            dtype=f"M8[{unit}]",
             freq="D",
         )
         tm.assert_index_equal(result, expected)
@@ -70,12 +72,13 @@ def test_dti_shift_freqs(self):
         result = drange.shift(3, freq="2D")
         expected = DatetimeIndex(
             ["2013-01-07", "2013-01-08", "2013-01-09", "2013-01-10", "2013-01-11"],
+            dtype=f"M8[{unit}]",
             freq="D",
         )
         tm.assert_index_equal(result, expected)
 
-    def test_dti_shift_int(self):
-        rng = date_range("1/1/2000", periods=20)
+    def test_dti_shift_int(self, unit):
+        rng = date_range("1/1/2000", periods=20, unit=unit)
 
         result = rng + 5 * rng.freq
         expected = rng.shift(5)
@@ -85,25 +88,27 @@ def test_dti_shift_int(self):
         expected = rng.shift(-5)
         tm.assert_index_equal(result, expected)
 
-    def test_dti_shift_no_freq(self):
+    def test_dti_shift_no_freq(self, unit):
         # GH#19147
-        dti = DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None)
+        dti = DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None).as_unit(unit)
         with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"):
             dti.shift(2)
 
     @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
-    def test_dti_shift_localized(self, tzstr):
-        dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
+    def test_dti_shift_localized(self, tzstr, unit):
+        dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI", unit=unit)
         dr_tz = dr.tz_localize(tzstr)
 
         result = dr_tz.shift(1, "10min")
         assert result.tz == dr_tz.tz
 
-    def test_dti_shift_across_dst(self):
+    def test_dti_shift_across_dst(self, unit):
         # GH 8616
-        idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="h")
-        s = Series(index=idx[:-1], dtype=object)
-        result = s.shift(freq="h")
+        idx = date_range(
+            "2013-11-03", tz="America/Chicago", periods=7, freq="h", unit=unit
+        )
+        ser = Series(index=idx[:-1], dtype=object)
+        result = ser.shift(freq="h")
         expected = Series(index=idx[1:], dtype=object)
         tm.assert_series_equal(result, expected)
 
@@ -115,24 +120,26 @@ def test_dti_shift_across_dst(self):
             [1, "2014-11-14 01:00:00"],
         ],
     )
-    def test_dti_shift_near_midnight(self, shift, result_time):
+    def test_dti_shift_near_midnight(self, shift, result_time, unit):
         # GH 8616
         dt = datetime(2014, 11, 14, 0)
         dt_est = pytz.timezone("EST").localize(dt)
-        s = Series(data=[1], index=[dt_est])
-        result = s.shift(shift, freq="h")
-        expected = Series(1, index=DatetimeIndex([result_time], tz="EST"))
+        idx = DatetimeIndex([dt_est]).as_unit(unit)
+        ser = Series(data=[1], index=idx)
+        result = ser.shift(shift, freq="h")
+        exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit)
+        expected = Series(1, index=exp_index)
         tm.assert_series_equal(result, expected)
 
-    def test_shift_periods(self):
+    def test_shift_periods(self, unit):
         # GH#22458 : argument 'n' was deprecated in favor of 'periods'
-        idx = date_range(start=START, end=END, periods=3)
+        idx = date_range(start=START, end=END, periods=3, unit=unit)
         tm.assert_index_equal(idx.shift(periods=0), idx)
         tm.assert_index_equal(idx.shift(0), idx)
 
     @pytest.mark.parametrize("freq", ["B", "C"])
-    def test_shift_bday(self, freq):
-        rng = date_range(START, END, freq=freq)
+    def test_shift_bday(self, freq, unit):
+        rng = date_range(START, END, freq=freq, unit=unit)
         shifted = rng.shift(5)
         assert shifted[0] == rng[5]
         assert shifted.freq == rng.freq
@@ -145,18 +152,18 @@ def test_shift_bday(self, freq):
         assert shifted[0] == rng[0]
         assert shifted.freq == rng.freq
 
-    def test_shift_bmonth(self):
-        rng = date_range(START, END, freq=pd.offsets.BMonthEnd())
+    def test_shift_bmonth(self, unit):
+        rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit)
         shifted = rng.shift(1, freq=pd.offsets.BDay())
         assert shifted[0] == rng[0] + pd.offsets.BDay()
 
-        rng = date_range(START, END, freq=pd.offsets.BMonthEnd())
+        rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit)
         with tm.assert_produces_warning(pd.errors.PerformanceWarning):
             shifted = rng.shift(1, freq=pd.offsets.CDay())
             assert shifted[0] == rng[0] + pd.offsets.CDay()
 
-    def test_shift_empty(self):
+    def test_shift_empty(self, unit):
         # GH#14811
-        dti = date_range(start="2016-10-21", end="2016-10-21", freq="BME")
+        dti = date_range(start="2016-10-21", end="2016-10-21", freq="BME", unit=unit)
         result = dti.shift(1)
         tm.assert_index_equal(result, dti)
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index cd7a5996984de..dfad4d7ad01ea 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -183,6 +183,7 @@ def test_date_range_edges(self, freq):
         )
         exp = DatetimeIndex(
             [ts + n * td for n in range(1, 5)],
+            dtype="M8[ns]",
             freq=freq,
         )
         tm.assert_index_equal(idx, exp)
@@ -193,7 +194,7 @@ def test_date_range_edges(self, freq):
             end=ts + td,
             freq=freq,
         )
-        exp = DatetimeIndex([], freq=freq)
+        exp = DatetimeIndex([], dtype="M8[ns]", freq=freq)
         tm.assert_index_equal(idx, exp)
 
         # start matches end
@@ -202,7 +203,7 @@ def test_date_range_edges(self, freq):
             end=ts + td,
             freq=freq,
         )
-        exp = DatetimeIndex([ts + td], freq=freq)
+        exp = DatetimeIndex([ts + td], dtype="M8[ns]", freq=freq)
         tm.assert_index_equal(idx, exp)
 
     def test_date_range_near_implementation_bound(self):
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index 73de33607ca0b..b7932715c3ac7 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -35,46 +35,43 @@ def test_getitem_slice_keeps_name(self):
         dr = date_range(st, et, freq="h", name="timebucket")
         assert dr[1:].name == dr.name
 
-    def test_getitem(self):
-        idx1 = date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
-        idx2 = date_range(
-            "2011-01-01", "2011-01-31", freq="D", tz="Asia/Tokyo", name="idx"
-        )
+    @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"])
+    def test_getitem(self, tz):
+        idx = date_range("2011-01-01", "2011-01-31", freq="D", tz=tz, name="idx")
 
-        for idx in [idx1, idx2]:
-            result = idx[0]
-            assert result == Timestamp("2011-01-01", tz=idx.tz)
+        result = idx[0]
+        assert result == Timestamp("2011-01-01", tz=idx.tz)
 
-            result = idx[0:5]
-            expected = date_range(
-                "2011-01-01", "2011-01-05", freq="D", tz=idx.tz, name="idx"
-            )
-            tm.assert_index_equal(result, expected)
-            assert result.freq == expected.freq
+        result = idx[0:5]
+        expected = date_range(
+            "2011-01-01", "2011-01-05", freq="D", tz=idx.tz, name="idx"
+        )
+        tm.assert_index_equal(result, expected)
+        assert result.freq == expected.freq
 
-            result = idx[0:10:2]
-            expected = date_range(
-                "2011-01-01", "2011-01-09", freq="2D", tz=idx.tz, name="idx"
-            )
-            tm.assert_index_equal(result, expected)
-            assert result.freq == expected.freq
+        result = idx[0:10:2]
+        expected = date_range(
+            "2011-01-01", "2011-01-09", freq="2D", tz=idx.tz, name="idx"
+        )
+        tm.assert_index_equal(result, expected)
+        assert result.freq == expected.freq
 
-            result = idx[-20:-5:3]
-            expected = date_range(
-                "2011-01-12", "2011-01-24", freq="3D", tz=idx.tz, name="idx"
-            )
-            tm.assert_index_equal(result, expected)
-            assert result.freq == expected.freq
+        result = idx[-20:-5:3]
+        expected = date_range(
+            "2011-01-12", "2011-01-24", freq="3D", tz=idx.tz, name="idx"
+        )
+        tm.assert_index_equal(result, expected)
+        assert result.freq == expected.freq
 
-            result = idx[4::-1]
-            expected = DatetimeIndex(
-                ["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"],
-                freq="-1D",
-                tz=idx.tz,
-                name="idx",
-            )
-            tm.assert_index_equal(result, expected)
-            assert result.freq == expected.freq
+        result = idx[4::-1]
+        expected = DatetimeIndex(
+            ["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"],
+            dtype=idx.dtype,
+            freq="-1D",
+            name="idx",
+        )
+        tm.assert_index_equal(result, expected)
+        assert result.freq == expected.freq
 
     @pytest.mark.parametrize("freq", ["B", "C"])
     def test_dti_business_getitem(self, freq):
@@ -264,8 +261,8 @@ def test_take(self):
         result = idx.take([3, 2, 5])
         expected = DatetimeIndex(
             ["2011-01-04", "2011-01-03", "2011-01-06"],
+            dtype=idx.dtype,
             freq=None,
-            tz=idx.tz,
             name="idx",
         )
         tm.assert_index_equal(result, expected)
@@ -274,6 +271,7 @@ def test_take(self):
         result = idx.take([-3, 2, 5])
         expected = DatetimeIndex(
             ["2011-01-29", "2011-01-03", "2011-01-06"],
+            dtype=idx.dtype,
             freq=None,
             tz=idx.tz,
             name="idx",
@@ -314,7 +312,7 @@ def test_take2(self, tz):
             tz=tz,
             name="idx",
         )
-        expected = DatetimeIndex(dates, freq=None, name="idx", tz=tz)
+        expected = DatetimeIndex(dates, freq=None, name="idx", dtype=idx.dtype)
 
         taken1 = idx.take([5, 6, 8, 12])
         taken2 = idx[[5, 6, 8, 12]]
diff --git a/pandas/tests/indexes/datetimes/test_iter.py b/pandas/tests/indexes/datetimes/test_iter.py
index a1861e6e9447e..a006ed79f27ba 100644
--- a/pandas/tests/indexes/datetimes/test_iter.py
+++ b/pandas/tests/indexes/datetimes/test_iter.py
@@ -7,6 +7,7 @@
     date_range,
     to_datetime,
 )
+from pandas.core.arrays import datetimes
 
 
 class TestDatetimeIndexIteration:
@@ -59,13 +60,17 @@ def test_iteration_preserves_tz3(self):
             assert result._repr_base == expected._repr_base
             assert result == expected
 
-    @pytest.mark.parametrize("periods", [0, 9999, 10000, 10001])
-    def test_iteration_over_chunksize(self, periods):
+    @pytest.mark.parametrize("offset", [-5, -1, 0, 1])
+    def test_iteration_over_chunksize(self, offset, monkeypatch):
         # GH#21012
-
-        index = date_range("2000-01-01 00:00:00", periods=periods, freq="min")
+        chunksize = 5
+        index = date_range(
+            "2000-01-01 00:00:00", periods=chunksize - offset, freq="min"
+        )
         num = 0
-        for stamp in index:
-            assert index[num] == stamp
-            num += 1
+        with monkeypatch.context() as m:
+            m.setattr(datetimes, "_ITER_CHUNKSIZE", chunksize)
+            for stamp in index:
+                assert index[num] == stamp
+                num += 1
         assert num == len(index)
diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py
index c361c15c79758..81992219d71b4 100644
--- a/pandas/tests/indexes/datetimes/test_scalar_compat.py
+++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py
@@ -304,7 +304,7 @@ def test_dti_fields(self, tz):
         exp = dti[[0, 90, 181, 273]]
         tm.assert_index_equal(res, exp)
         res = dti[dti.is_leap_year]
-        exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name")
+        exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name").as_unit("ns")
         tm.assert_index_equal(res, exp)
 
     def test_dti_is_year_quarter_start(self):
diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py
index 62b5e72d5e025..993f88db38ea6 100644
--- a/pandas/tests/indexes/datetimes/test_setops.py
+++ b/pandas/tests/indexes/datetimes/test_setops.py
@@ -73,7 +73,7 @@ def test_union(self, tz, sort):
         expected2_notsorted = DatetimeIndex(list(other2) + list(rng2[:3]))
 
         rng3 = date_range("1/1/2000", freq="D", periods=5, tz=tz)
-        other3 = DatetimeIndex([], tz=tz)
+        other3 = DatetimeIndex([], tz=tz).as_unit("ns")
         expected3 = date_range("1/1/2000", freq="D", periods=5, tz=tz)
         expected3_notsorted = rng3
 
@@ -206,13 +206,13 @@ def test_intersection2(self):
         first = tm.makeDateIndex(10)
         second = first[5:]
         intersect = first.intersection(second)
-        assert tm.equalContents(intersect, second)
+        tm.assert_index_equal(intersect, second)
 
         # GH 10149
         cases = [klass(second.values) for klass in [np.array, Series, list]]
         for case in cases:
             result = first.intersection(case)
-            assert tm.equalContents(result, second)
+            tm.assert_index_equal(result, second)
 
         third = Index(["a", "b", "c"])
         result = first.intersection(third)
@@ -235,7 +235,7 @@ def test_intersection(self, tz, sort):
         expected3 = date_range("6/1/2000", "6/20/2000", freq="D", name=None)
 
         rng4 = date_range("7/1/2000", "7/31/2000", freq="D", name="idx")
-        expected4 = DatetimeIndex([], freq="D", name="idx")
+        expected4 = DatetimeIndex([], freq="D", name="idx", dtype="M8[ns]")
 
         for rng, expected in [
             (rng2, expected2),
@@ -249,23 +249,27 @@ def test_intersection(self, tz, sort):
         # non-monotonic
         base = DatetimeIndex(
             ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], tz=tz, name="idx"
-        )
+        ).as_unit("ns")
 
         rng2 = DatetimeIndex(
             ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], tz=tz, name="idx"
-        )
-        expected2 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name="idx")
+        ).as_unit("ns")
+        expected2 = DatetimeIndex(
+            ["2011-01-04", "2011-01-02"], tz=tz, name="idx"
+        ).as_unit("ns")
 
         rng3 = DatetimeIndex(
             ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"],
             tz=tz,
             name="other",
-        )
-        expected3 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name=None)
+        ).as_unit("ns")
+        expected3 = DatetimeIndex(
+            ["2011-01-04", "2011-01-02"], tz=tz, name=None
+        ).as_unit("ns")
 
         # GH 7880
         rng4 = date_range("7/1/2000", "7/31/2000", freq="D", tz=tz, name="idx")
-        expected4 = DatetimeIndex([], tz=tz, name="idx")
+        expected4 = DatetimeIndex([], tz=tz, name="idx").as_unit("ns")
         assert expected4.freq is None
 
         for rng, expected in [
@@ -350,7 +354,7 @@ def test_difference_freq(self, sort):
         index = date_range("20160920", "20160925", freq="D")
 
         other = date_range("20160921", "20160924", freq="D")
-        expected = DatetimeIndex(["20160920", "20160925"], freq=None)
+        expected = DatetimeIndex(["20160920", "20160925"], dtype="M8[ns]", freq=None)
         idx_diff = index.difference(other, sort)
         tm.assert_index_equal(idx_diff, expected)
         tm.assert_attr_equal("freq", idx_diff, expected)
@@ -359,7 +363,7 @@ def test_difference_freq(self, sort):
         # subset of the original range
         other = date_range("20160922", "20160925", freq="D")
         idx_diff = index.difference(other, sort)
-        expected = DatetimeIndex(["20160920", "20160921"], freq="D")
+        expected = DatetimeIndex(["20160920", "20160921"], dtype="M8[ns]", freq="D")
         tm.assert_index_equal(idx_diff, expected)
         tm.assert_attr_equal("freq", idx_diff, expected)
 
@@ -536,7 +540,7 @@ def test_intersection(self):
 
         # non-overlapping
         the_int = rng[:10].intersection(rng[10:])
-        expected = DatetimeIndex([])
+        expected = DatetimeIndex([]).as_unit("ns")
         tm.assert_index_equal(the_int, expected)
 
     def test_intersection_bug(self):
diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py
index 379c727f4ed0f..daa5b346eb4ec 100644
--- a/pandas/tests/indexes/datetimes/test_timezones.py
+++ b/pandas/tests/indexes/datetimes/test_timezones.py
@@ -92,7 +92,7 @@ def test_drop_dst_boundary(self):
                 "201710290245",
                 "201710290300",
             ],
-            tz=tz,
+            dtype="M8[ns, Europe/Brussels]",
             freq=freq,
             ambiguous=[
                 True,
@@ -112,10 +112,14 @@ def test_drop_dst_boundary(self):
         result = index.drop(index[0])
         tm.assert_index_equal(result, expected)
 
-    def test_date_range_localize(self):
-        rng = date_range("3/11/2012 03:00", periods=15, freq="h", tz="US/Eastern")
-        rng2 = DatetimeIndex(["3/11/2012 03:00", "3/11/2012 04:00"], tz="US/Eastern")
-        rng3 = date_range("3/11/2012 03:00", periods=15, freq="h")
+    def test_date_range_localize(self, unit):
+        rng = date_range(
+            "3/11/2012 03:00", periods=15, freq="h", tz="US/Eastern", unit=unit
+        )
+        rng2 = DatetimeIndex(
+            ["3/11/2012 03:00", "3/11/2012 04:00"], dtype=f"M8[{unit}, US/Eastern]"
+        )
+        rng3 = date_range("3/11/2012 03:00", periods=15, freq="h", unit=unit)
         rng3 = rng3.tz_localize("US/Eastern")
 
         tm.assert_index_equal(rng._with_freq(None), rng3)
@@ -129,10 +133,15 @@ def test_date_range_localize(self):
             assert val == exp  # same UTC value
         tm.assert_index_equal(rng[:2], rng2)
 
+    def test_date_range_localize2(self, unit):
         # Right before the DST transition
-        rng = date_range("3/11/2012 00:00", periods=2, freq="h", tz="US/Eastern")
+        rng = date_range(
+            "3/11/2012 00:00", periods=2, freq="h", tz="US/Eastern", unit=unit
+        )
         rng2 = DatetimeIndex(
-            ["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern", freq="h"
+            ["3/11/2012 00:00", "3/11/2012 01:00"],
+            dtype=f"M8[{unit}, US/Eastern]",
+            freq="h",
         )
         tm.assert_index_equal(rng, rng2)
         exp = Timestamp("3/11/2012 00:00", tz="US/Eastern")
@@ -142,7 +151,9 @@ def test_date_range_localize(self):
         assert exp.hour == 1
         assert rng[1] == exp
 
-        rng = date_range("3/11/2012 00:00", periods=10, freq="h", tz="US/Eastern")
+        rng = date_range(
+            "3/11/2012 00:00", periods=10, freq="h", tz="US/Eastern", unit=unit
+        )
         assert rng[2].hour == 3
 
     def test_timestamp_equality_different_timezones(self):
@@ -231,10 +242,10 @@ def test_dti_convert_tz_aware_datetime_datetime(self, tz):
         dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]
 
         dates_aware = [conversion.localize_pydatetime(x, tz) for x in dates]
-        result = DatetimeIndex(dates_aware)
+        result = DatetimeIndex(dates_aware).as_unit("ns")
         assert timezones.tz_compare(result.tz, tz)
 
-        converted = to_datetime(dates_aware, utc=True)
+        converted = to_datetime(dates_aware, utc=True).as_unit("ns")
         ex_vals = np.array([Timestamp(x).as_unit("ns")._value for x in dates_aware])
         tm.assert_numpy_array_equal(converted.asi8, ex_vals)
         assert converted.tz is timezone.utc
diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py
index 2007a793843c9..fd03047b2c127 100644
--- a/pandas/tests/indexes/interval/test_indexing.py
+++ b/pandas/tests/indexes/interval/test_indexing.py
@@ -363,15 +363,18 @@ def test_get_indexer_categorical_with_nans(self):
 
     def test_get_indexer_datetime(self):
         ii = IntervalIndex.from_breaks(date_range("2018-01-01", periods=4))
-        result = ii.get_indexer(DatetimeIndex(["2018-01-02"]))
+        # TODO: with mismatched resolution get_indexer currently raises;
+        #  this should probably coerce?
+        target = DatetimeIndex(["2018-01-02"], dtype="M8[ns]")
+        result = ii.get_indexer(target)
         expected = np.array([0], dtype=np.intp)
         tm.assert_numpy_array_equal(result, expected)
 
-        result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str))
+        result = ii.get_indexer(target.astype(str))
         tm.assert_numpy_array_equal(result, expected)
 
         # https://github.com/pandas-dev/pandas/issues/47772
-        result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8)
+        result = ii.get_indexer(target.asi8)
         expected = np.array([-1], dtype=np.intp)
         tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py
index e19b1700236f5..63f9b2176dddd 100644
--- a/pandas/tests/indexes/interval/test_interval.py
+++ b/pandas/tests/indexes/interval/test_interval.py
@@ -388,7 +388,7 @@ def test_maybe_convert_i8_nat(self, breaks):
         # GH 20636
         index = IntervalIndex.from_breaks(breaks)
 
-        to_convert = breaks._constructor([pd.NaT] * 3)
+        to_convert = breaks._constructor([pd.NaT] * 3).as_unit("ns")
         expected = Index([np.nan] * 3, dtype=np.float64)
         result = index._maybe_convert_i8(to_convert)
         tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py
index 37606bda9efca..d4d4a09c44d13 100644
--- a/pandas/tests/indexes/interval/test_interval_range.py
+++ b/pandas/tests/indexes/interval/test_interval_range.py
@@ -184,6 +184,9 @@ def test_no_invalid_float_truncation(self, start, end, freq):
     def test_linspace_dst_transition(self, start, mid, end):
         # GH 20976: linspace behavior defined from start/end/periods
         # accounts for the hour gained/lost during DST transition
+        start = start.as_unit("ns")
+        mid = mid.as_unit("ns")
+        end = end.as_unit("ns")
         result = interval_range(start=start, end=end, periods=2)
         expected = IntervalIndex.from_breaks([start, mid, end])
         tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py
index 059b0b75f4190..1b0816a9405cb 100644
--- a/pandas/tests/indexes/interval/test_setops.py
+++ b/pandas/tests/indexes/interval/test_setops.py
@@ -25,14 +25,16 @@ def test_union(self, closed, sort):
         expected = monotonic_index(0, 13, closed=closed)
 
         result = index[::-1].union(other, sort=sort)
-        if sort is None:
+        if sort in (None, True):
             tm.assert_index_equal(result, expected)
-        assert tm.equalContents(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)
 
         result = other[::-1].union(index, sort=sort)
-        if sort is None:
+        if sort in (None, True):
             tm.assert_index_equal(result, expected)
-        assert tm.equalContents(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)
 
         tm.assert_index_equal(index.union(index, sort=sort), index)
         tm.assert_index_equal(index.union(index[:1], sort=sort), index)
@@ -65,14 +67,16 @@ def test_intersection(self, closed, sort):
         expected = monotonic_index(5, 11, closed=closed)
 
         result = index[::-1].intersection(other, sort=sort)
-        if sort is None:
+        if sort in (None, True):
             tm.assert_index_equal(result, expected)
-        assert tm.equalContents(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)
 
         result = other[::-1].intersection(index, sort=sort)
-        if sort is None:
+        if sort in (None, True):
             tm.assert_index_equal(result, expected)
-        assert tm.equalContents(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)
         tm.assert_index_equal(index.intersection(index, sort=sort), index)
@@ -148,16 +152,18 @@ def test_symmetric_difference(self, closed, sort):
         index = monotonic_index(0, 11, closed=closed)
         result = index[1:].symmetric_difference(index[:-1], sort=sort)
         expected = IntervalIndex([index[0], index[-1]])
-        if sort is None:
+        if sort in (None, True):
             tm.assert_index_equal(result, expected)
-        assert tm.equalContents(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)

         # GH 19101: empty result, same dtype
         result = index.symmetric_difference(index, sort=sort)
         expected = empty_index(dtype="int64", closed=closed)
-        if sort is None:
+        if sort in (None, True):
             tm.assert_index_equal(result, expected)
-        assert tm.equalContents(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)

         # GH 19101: empty result, different dtypes
         other = IntervalIndex.from_arrays(
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index 2b4107acee096..be266f5d8fdce 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -243,10 +243,10 @@ def test_union(idx, sort):

     the_union = piece1.union(piece2, sort=sort)

-    if sort is None:
-        tm.assert_index_equal(the_union, idx.sort_values())
-
-    assert tm.equalContents(the_union, idx)
+    if sort in (None, False):
+        tm.assert_index_equal(the_union.sort_values(), idx.sort_values())
+    else:
+        tm.assert_index_equal(the_union, idx)

     # corner case, pass self or empty thing:
     the_union = idx.union(idx, sort=sort)
@@ -258,7 +258,7 @@ def test_union(idx, sort):
     tuples = idx.values
     result = idx[:4].union(tuples[4:], sort=sort)
     if sort is None:
-        tm.equalContents(result, idx)
+        tm.assert_index_equal(result.sort_values(), idx.sort_values())
     else:
         assert result.equals(idx)

@@ -284,9 +284,10 @@ def test_intersection(idx, sort):

     the_int = piece1.intersection(piece2, sort=sort)

-    if sort is None:
+    if sort in (None, True):
         tm.assert_index_equal(the_int, idx[3:5])
-        assert tm.equalContents(the_int, idx[3:5])
+    else:
+        tm.assert_index_equal(the_int.sort_values(), idx[3:5])

     # corner case, pass self
     the_int = idx.intersection(idx, sort=sort)
diff --git a/pandas/tests/indexes/numeric/test_setops.py b/pandas/tests/indexes/numeric/test_setops.py
index d3789f2477896..376b51dd98bb1 100644
--- a/pandas/tests/indexes/numeric/test_setops.py
+++ b/pandas/tests/indexes/numeric/test_setops.py
@@ -133,7 +133,10 @@ def test_symmetric_difference(self, sort):
         index2 = Index([2, 3, 4, 1])
         result = index1.symmetric_difference(index2, sort=sort)
         expected = Index([5, 1])
-        assert tm.equalContents(result, expected)
+        if sort is not None:
+            tm.assert_index_equal(result, expected)
+        else:
+            tm.assert_index_equal(result, expected.sort_values())
         assert result.name is None
         if sort is None:
             expected = expected.sort_values()
diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
index 87d3afc77d556..93d46ebdd0b51 100644
--- a/pandas/tests/indexes/object/test_indexing.py
+++ b/pandas/tests/indexes/object/test_indexing.py
@@ -4,6 +4,7 @@
 import pytest

 from pandas._libs.missing import is_matching_na
+import pandas.util._test_decorators as td

 import pandas as pd
 from pandas import Index
@@ -144,6 +145,13 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):


 class TestSliceLocs:
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "object",
+            pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
+        ],
+    )
     @pytest.mark.parametrize(
         "in_slice,expected",
         [
@@ -167,12 +175,23 @@ class TestSliceLocs:
             (pd.IndexSlice["m":"m":-1], ""),  # type: ignore[misc]
         ],
     )
-    def test_slice_locs_negative_step(self, in_slice, expected):
-        index = Index(list("bcdxy"))
+    def test_slice_locs_negative_step(self, in_slice, expected, dtype):
+        index = Index(list("bcdxy"), dtype=dtype)

         s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step)
         result = index[s_start : s_stop : in_slice.step]
-        expected = Index(list(expected))
+        expected = Index(list(expected), dtype=dtype)
+        tm.assert_index_equal(result, expected)
+
+    @td.skip_if_no("pyarrow")
+    def test_slice_locs_negative_step_oob(self):
+        index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]")
+
+        result = index[-10:5:1]
+        tm.assert_index_equal(result, index)
+
+        result = index[4:-10:-1]
+        expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]")
         tm.assert_index_equal(result, expected)

     def test_slice_locs_dup(self):
diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py
index 7be2602135578..3867f9e3245dc 100644
--- a/pandas/tests/indexes/period/methods/test_to_timestamp.py
+++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py
@@ -58,7 +58,9 @@ def test_to_timestamp_pi_nat(self):

         result = index.to_timestamp("D")
         expected = DatetimeIndex(
-            [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx"
+            [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)],
+            dtype="M8[ns]",
+            name="idx",
         )
         tm.assert_index_equal(result, expected)
         assert result.name == "idx"
@@ -98,11 +100,15 @@ def test_to_timestamp_pi_mult(self):
         idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx")

         result = idx.to_timestamp()
-        expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx")
+        expected = DatetimeIndex(
+            ["2011-01-01", "NaT", "2011-02-01"], dtype="M8[ns]", name="idx"
+        )
         tm.assert_index_equal(result, expected)

         result = idx.to_timestamp(how="E")
-        expected = DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx")
+        expected = DatetimeIndex(
+            ["2011-02-28", "NaT", "2011-03-31"], dtype="M8[ns]", name="idx"
+        )
         expected = expected + Timedelta(1, "D") - Timedelta(1, "ns")
         tm.assert_index_equal(result, expected)

@@ -110,18 +116,22 @@ def test_to_timestamp_pi_combined(self):
         idx = period_range(start="2011", periods=2, freq="1D1h", name="idx")

         result = idx.to_timestamp()
-        expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx")
+        expected = DatetimeIndex(
+            ["2011-01-01 00:00", "2011-01-02 01:00"], dtype="M8[ns]", name="idx"
+        )
         tm.assert_index_equal(result, expected)

         result = idx.to_timestamp(how="E")
         expected = DatetimeIndex(
-            ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx"
+            ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx", dtype="M8[ns]"
         )
         expected = expected + Timedelta(1, "s") - Timedelta(1, "ns")
         tm.assert_index_equal(result, expected)

         result = idx.to_timestamp(how="E", freq="h")
-        expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx")
+        expected = DatetimeIndex(
+            ["2011-01-02 00:00", "2011-01-03 01:00"], dtype="M8[ns]", name="idx"
+        )
         expected = expected + Timedelta(1, "h") - Timedelta(1, "ns")
         tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py
index 4af6b5ca1a6a7..4fab12f195dc0 100644
--- a/pandas/tests/indexes/period/test_partial_slicing.py
+++ b/pandas/tests/indexes/period/test_partial_slicing.py
@@ -117,7 +117,7 @@ def test_range_slice_outofbounds(self, make_range):
         idx = make_range(start="2013/10/01", freq="D", periods=10)

         df = DataFrame({"units": [100 + i for i in range(10)]}, index=idx)
-        empty = DataFrame(index=type(idx)([], freq="D"), columns=["units"])
+        empty = DataFrame(index=idx[:0], columns=["units"])
         empty["units"] = empty["units"].astype("int64")

         tm.assert_frame_equal(df["2013/09/01":"2013/09/30"], empty)
diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
index 7b7baab112264..1354f33291c45 100644
--- a/pandas/tests/indexes/period/test_period.py
+++ b/pandas/tests/indexes/period/test_period.py
@@ -285,10 +285,11 @@ def test_map(self):
         "freq,freq_depr",
         [
             ("2M", "2ME"),
-            ("2SM", "2SME"),
+            ("2Q-MAR", "2QE-MAR"),
+            ("2Y-FEB", "2YE-FEB"),
         ],
     )
-    def test_period_index_frequency_ME_SME_error_message(self, freq, freq_depr):
+    def test_period_index_frequency_ME_error_message(self, freq, freq_depr):
         # GH#52064
         msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'"

@@ -316,11 +317,10 @@ def test_a_deprecated_from_time_series(self, freq_depr):
         series = Series(1, index=index)
         assert isinstance(series, Series)

-    @pytest.mark.parametrize("freq_depr", ["2ME", "2QE", "2BQE", "2YE"])
-    def test_period_index_frequency_error_message(self, freq_depr):
+    @pytest.mark.parametrize("freq_depr", ["2SME", "2CBME", "2BYE"])
+    def test_period_index_frequency_invalid_freq(self, freq_depr):
         # GH#9586
-        msg = f"for Period, please use '{freq_depr[1:-1]}' "
-        f"instead of '{freq_depr[1:]}'"
+        msg = f"Invalid frequency: {freq_depr[1:]}"

         with pytest.raises(ValueError, match=msg):
             period_range("2020-01", "2020-05", freq=freq_depr)
diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py
index b9a5940795a5b..2fa7e8cd0d2df 100644
--- a/pandas/tests/indexes/period/test_setops.py
+++ b/pandas/tests/indexes/period/test_setops.py
@@ -142,9 +142,10 @@ def test_union_misc(self, sort):

         # not in order
         result = _permute(index[:-5]).union(_permute(index[10:]), sort=sort)
-        if sort is None:
+        if sort is False:
+            tm.assert_index_equal(result.sort_values(), index)
+        else:
             tm.assert_index_equal(result, index)
-        assert tm.equalContents(result, index)

         # cast if different frequencies
         index = period_range("1/1/2000", "1/20/2000", freq="D")
@@ -163,9 +164,10 @@ def test_intersection(self, sort):
         left = _permute(index[:-5])
         right = _permute(index[10:])
         result = left.intersection(right, sort=sort)
-        if sort is None:
+        if sort is False:
+            tm.assert_index_equal(result.sort_values(), index[10:-5])
+        else:
             tm.assert_index_equal(result, index[10:-5])
-        assert tm.equalContents(result, index[10:-5])

         # cast if different frequencies
         index = period_range("1/1/2000", "1/20/2000", freq="D")
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index dc624f0271a73..8e11fc28cc387 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from datetime import datetime
+from functools import partial
 import math
 import operator
 import re
@@ -492,10 +493,10 @@ def test_union_dt_as_obj(self, simple_index):
         first_cat = index.union(date_index)
         second_cat = index.union(index)

-        appended = np.append(index, date_index.astype("O"))
+        appended = Index(np.append(index, date_index.astype("O")))

-        assert tm.equalContents(first_cat, appended)
-        assert tm.equalContents(second_cat, index)
+        tm.assert_index_equal(first_cat, appended)
+        tm.assert_index_equal(second_cat, index)
         tm.assert_contains_all(index, first_cat)
         tm.assert_contains_all(index, second_cat)
         tm.assert_contains_all(date_index, first_cat)
@@ -1596,11 +1597,23 @@ def test_generated_op_names(opname, index):
     assert method.__name__ == opname


-@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator())
-def test_index_subclass_constructor_wrong_kwargs(index_maker):
+@pytest.mark.parametrize(
+    "klass",
+    [
+        partial(CategoricalIndex, data=[1]),
+        partial(DatetimeIndex, data=["2020-01-01"]),
+        partial(PeriodIndex, data=["2020-01-01"]),
+        partial(TimedeltaIndex, data=["1 day"]),
+        partial(RangeIndex, data=range(1)),
+        partial(IntervalIndex, data=[pd.Interval(0, 1)]),
+        partial(Index, data=["a"], dtype=object),
+        partial(MultiIndex, levels=[1], codes=[0]),
+    ],
+)
+def test_index_subclass_constructor_wrong_kwargs(klass):
     # GH #19348
     with pytest.raises(TypeError, match="unexpected keyword argument"):
-        index_maker(foo="bar")
+        klass(foo="bar")


 def test_deprecated_fastpath():
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index 1f328c06b483b..dab2475240267 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -30,6 +30,13 @@
 )


+def equal_contents(arr1, arr2) -> bool:
+    """
+    Checks if the set of unique elements of arr1 and arr2 are equivalent.
+    """
+    return frozenset(arr1) == frozenset(arr2)
+
+
 @pytest.fixture(
     params=tm.ALL_REAL_NUMPY_DTYPES
     + [
@@ -215,10 +222,10 @@ def test_intersection_base(self, index):
         if isinstance(index, CategoricalIndex):
             pytest.skip(f"Not relevant for {type(index).__name__}")

-        first = index[:5]
-        second = index[:3]
+        first = index[:5].unique()
+        second = index[:3].unique()
         intersect = first.intersection(second)
-        assert tm.equalContents(intersect, second)
+        tm.assert_index_equal(intersect, second)

         if isinstance(index.dtype, DatetimeTZDtype):
             # The second.values below will drop tz, so the rest of this test
@@ -229,7 +236,7 @@ def test_intersection_base(self, index):
         cases = [second.to_numpy(), second.to_series(), second.to_list()]
         for case in cases:
             result = first.intersection(case)
-            assert tm.equalContents(result, second)
+            assert equal_contents(result, second)

         if isinstance(index, MultiIndex):
             msg = "other must be a MultiIndex or a list of tuples"
@@ -241,12 +248,13 @@
     )
     @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
     def test_union_base(self, index):
+        index = index.unique()
         first = index[3:]
         second = index[:5]
         everything = index

         union = first.union(second)
-        assert tm.equalContents(union, everything)
+        tm.assert_index_equal(union.sort_values(), everything.sort_values())

         if isinstance(index.dtype, DatetimeTZDtype):
             # The second.values below will drop tz, so the rest of this test
@@ -257,7 +265,7 @@ def test_union_base(self, index):
         cases = [second.to_numpy(), second.to_series(), second.to_list()]
         for case in cases:
             result = first.union(case)
-            assert tm.equalContents(result, everything)
+            assert equal_contents(result, everything)

         if isinstance(index, MultiIndex):
             msg = "other must be a MultiIndex or a list of tuples"
@@ -280,13 +288,13 @@ def test_difference_base(self, sort, index):
         else:
             answer = index[4:]
         result = first.difference(second, sort)
-        assert tm.equalContents(result, answer)
+        assert equal_contents(result, answer)

         # GH#10149
         cases = [second.to_numpy(), second.to_series(), second.to_list()]
         for case in cases:
             result = first.difference(case, sort)
-            assert tm.equalContents(result, answer)
+            assert equal_contents(result, answer)

         if isinstance(index, MultiIndex):
             msg = "other must be a MultiIndex or a list of tuples"
@@ -311,13 +319,13 @@ def test_symmetric_difference(self, index):
         second = index[:-1]
         answer = index[[0, -1]]
         result = first.symmetric_difference(second)
-        assert tm.equalContents(result, answer)
+        tm.assert_index_equal(result.sort_values(), answer.sort_values())

         # GH#10149
         cases = [second.to_numpy(), second.to_series(), second.to_list()]
         for case in cases:
             result = first.symmetric_difference(case)
-            assert tm.equalContents(result, answer)
+            assert equal_contents(result, answer)

         if isinstance(index, MultiIndex):
             msg = "other must be a MultiIndex or a list of tuples"
@@ -701,9 +709,10 @@ def test_intersection(self, index, sort):
         first = index[:20]
         second = index[:10]
         intersect = first.intersection(second, sort=sort)
-        if sort is None:
-            tm.assert_index_equal(intersect, second.sort_values())
-        assert tm.equalContents(intersect, second)
+        if sort in (None, False):
+            tm.assert_index_equal(intersect.sort_values(), second.sort_values())
+        else:
+            tm.assert_index_equal(intersect, second)

         # Corner cases
         inter = first.intersection(first, sort=sort)
@@ -766,9 +775,10 @@ def test_union(self, index, sort):
         everything = index[:20]

         union = first.union(second, sort=sort)
-        if sort is None:
-            tm.assert_index_equal(union, everything.sort_values())
-        assert tm.equalContents(union, everything)
+        if sort in (None, False):
+            tm.assert_index_equal(union.sort_values(), everything.sort_values())
+        else:
+            tm.assert_index_equal(union, everything)

     @pytest.mark.parametrize("klass", [np.array, Series, list])
     @pytest.mark.parametrize("index", ["string"], indirect=True)
@@ -780,9 +790,10 @@ def test_union_from_iterables(self, index, klass, sort):
         case = klass(second.values)

         result = first.union(case, sort=sort)
-        if sort is None:
-            tm.assert_index_equal(result, everything.sort_values())
-        assert tm.equalContents(result, everything)
+        if sort in (None, False):
+            tm.assert_index_equal(result.sort_values(), everything.sort_values())
+        else:
+            tm.assert_index_equal(result, everything)

     @pytest.mark.parametrize("index", ["string"], indirect=True)
     def test_union_identity(self, index, sort):
@@ -811,7 +822,11 @@ def test_difference_name_preservation(self, index, second_name, expected, sort):
         second.name = second_name
         result = first.difference(second, sort=sort)

-        assert tm.equalContents(result, answer)
+        if sort is True:
+            tm.assert_index_equal(result, answer)
+        else:
+            answer.name = second_name
+            tm.assert_index_equal(result.sort_values(), answer.sort_values())

         if expected is None:
             assert result.name is None
@@ -894,7 +909,6 @@ def test_symmetric_difference_mi(self, sort):
         if sort is None:
             expected = expected.sort_values()
         tm.assert_index_equal(result, expected)
-        assert tm.equalContents(result, expected)

     @pytest.mark.parametrize(
         "index2,expected",
@@ -916,13 +930,20 @@ def test_symmetric_difference_missing(self, index2, expected, sort):
     def test_symmetric_difference_non_index(self, sort):
         index1 = Index([1, 2, 3, 4], name="index1")
         index2 = np.array([2, 3, 4, 5])
-        expected = Index([1, 5])
+        expected = Index([1, 5], name="index1")
         result = index1.symmetric_difference(index2, sort=sort)
-        assert tm.equalContents(result, expected)
+        if sort in (None, True):
+            tm.assert_index_equal(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)
         assert result.name == "index1"

         result = index1.symmetric_difference(index2, result_name="new_name", sort=sort)
-        assert tm.equalContents(result, expected)
+        expected.name = "new_name"
+        if sort in (None, True):
+            tm.assert_index_equal(result, expected)
+        else:
+            tm.assert_index_equal(result.sort_values(), expected)
         assert result.name == "new_name"

     def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype):
diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py
index 727b4eee00566..fce10d9176d74 100644
--- a/pandas/tests/indexes/timedeltas/test_setops.py
+++ b/pandas/tests/indexes/timedeltas/test_setops.py
@@ -115,7 +115,7 @@ def test_intersection_equal(self, sort):
         intersect = first.intersection(second, sort=sort)
         if sort is None:
             tm.assert_index_equal(intersect, second.sort_values())
-        assert tm.equalContents(intersect, second)
+        tm.assert_index_equal(intersect, second)

         # Corner cases
         inter = first.intersection(first, sort=sort)
diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py
index 9cf11b4602cb2..7be3d8c657766 100644
--- a/pandas/tests/indexing/multiindex/test_partial.py
+++ b/pandas/tests/indexing/multiindex/test_partial.py
@@ -5,9 +5,9 @@

 from pandas import (
     DataFrame,
+    DatetimeIndex,
     MultiIndex,
     date_range,
-    to_datetime,
 )
 import pandas._testing as tm

@@ -219,7 +219,11 @@ def test_setitem_multiple_partial(self, multiindex_dataframe_random_data):
     @pytest.mark.parametrize(
         "indexer, exp_idx, exp_values",
         [
-            (slice("2019-2", None), [to_datetime("2019-02-01")], [2, 3]),
+            (
+                slice("2019-2", None),
+                DatetimeIndex(["2019-02-01"], dtype="M8[ns]"),
+                [2, 3],
+            ),
             (
                 slice(None, "2019-2"),
                 date_range("2019", periods=2, freq="MS"),
diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py
index e31b675efb69a..9870868a3e1e9 100644
--- a/pandas/tests/indexing/multiindex/test_setitem.py
+++ b/pandas/tests/indexing/multiindex/test_setitem.py
@@ -9,7 +9,6 @@
     DataFrame,
     MultiIndex,
     Series,
-    Timestamp,
     date_range,
     isna,
     notna,
@@ -91,11 +90,11 @@ def test_setitem_multiindex3(self):
             np.random.default_rng(2).random((12, 4)), index=idx, columns=cols
         )

-        subidx = MultiIndex.from_tuples(
-            [("A", Timestamp("2015-01-01")), ("A", Timestamp("2015-02-01"))]
+        subidx = MultiIndex.from_arrays(
+            [["A", "A"], date_range("2015-01-01", "2015-02-01", freq="MS")]
         )
-        subcols = MultiIndex.from_tuples(
-            [("foo", Timestamp("2016-01-01")), ("foo", Timestamp("2016-02-01"))]
+        subcols = MultiIndex.from_arrays(
+            [["foo", "foo"], date_range("2016-01-01", "2016-02-01", freq="MS")]
         )

         vals = DataFrame(
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 792d1323ec730..a9dbdb21f59fb 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -591,7 +591,7 @@ def test_astype(self, t):
         else:
             assert tmgr.iget(3).dtype.type == t

-    def test_convert(self):
+    def test_convert(self, using_infer_string):
         def _compare(old_mgr, new_mgr):
             """compare the blocks, numeric compare ==, object don't"""
             old_blocks = set(old_mgr.blocks)
@@ -626,9 +626,10 @@ def _compare(old_mgr, new_mgr):
         mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
         mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
         new_mgr = mgr.convert(copy=True)
-        assert new_mgr.iget(0).dtype == np.object_
-        assert new_mgr.iget(1).dtype == np.object_
-        assert new_mgr.iget(2).dtype == np.object_
+        dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_
+        assert new_mgr.iget(0).dtype == dtype
+        assert new_mgr.iget(1).dtype == dtype
+        assert new_mgr.iget(2).dtype == dtype
         assert new_mgr.iget(3).dtype == np.int64
         assert new_mgr.iget(4).dtype == np.float64

@@ -639,9 +640,9 @@ def _compare(old_mgr, new_mgr):
         mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
         mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
         new_mgr = mgr.convert(copy=True)
-        assert new_mgr.iget(0).dtype == np.object_
-        assert new_mgr.iget(1).dtype == np.object_
-        assert new_mgr.iget(2).dtype == np.object_
+        assert new_mgr.iget(0).dtype == dtype
+        assert new_mgr.iget(1).dtype == dtype
+        assert new_mgr.iget(2).dtype == dtype
         assert new_mgr.iget(3).dtype == np.int32
         assert new_mgr.iget(4).dtype == np.bool_
         assert new_mgr.iget(5).dtype.type, np.datetime64
diff --git a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py
index 75e63cb1f6b54..6d581b5b92e0c 100644
--- a/pandas/tests/io/formats/test_eng_formatting.py
+++ b/pandas/tests/io/formats/test_eng_formatting.py
@@ -1,9 +1,19 @@
 import numpy as np
+import pytest

-from pandas import DataFrame
-import pandas._testing as tm
+from pandas import (
+    DataFrame,
+    reset_option,
+    set_eng_float_format,
+)

-import pandas.io.formats.format as fmt
+from pandas.io.formats.format import EngFormatter
+
+
+@pytest.fixture(autouse=True)
+def reset_float_format():
+    yield
+    reset_option("display.float_format")


 class TestEngFormatter:
@@ -11,20 +21,19 @@ def test_eng_float_formatter2(self, float_frame):
         df = float_frame
         df.loc[5] = 0

-        fmt.set_eng_float_format()
+        set_eng_float_format()
         repr(df)

-        fmt.set_eng_float_format(use_eng_prefix=True)
+        set_eng_float_format(use_eng_prefix=True)
         repr(df)

-        fmt.set_eng_float_format(accuracy=0)
+        set_eng_float_format(accuracy=0)
         repr(df)
-        tm.reset_display_options()

     def test_eng_float_formatter(self):
         df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]})

-        fmt.set_eng_float_format()
+        set_eng_float_format()
         result = df.to_string()
         expected = (
             "             A\n"
@@ -35,18 +44,16 @@ def test_eng_float_formatter(self):
         )
         assert result == expected

-        fmt.set_eng_float_format(use_eng_prefix=True)
+        set_eng_float_format(use_eng_prefix=True)
         result = df.to_string()
         expected = "         A\n0    1.410\n1  141.000\n2  14.100k\n3   1.410M"
         assert result == expected

-        fmt.set_eng_float_format(accuracy=0)
+        set_eng_float_format(accuracy=0)
         result = df.to_string()
         expected = "         A\n0    1E+00\n1  141E+00\n2   14E+03\n3    1E+06"
         assert result == expected

-        tm.reset_display_options()
-
     def compare(self, formatter, input, output):
         formatted_input = formatter(input)
         assert formatted_input == output
@@ -67,7 +74,7 @@ def compare_all(self, formatter, in_out):
         self.compare(formatter, -input, "-" + output[1:])

     def test_exponents_with_eng_prefix(self):
-        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+        formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
         f = np.sqrt(2)
         in_out = [
             (f * 10**-24, " 1.414y"),
@@ -125,7 +132,7 @@ def test_exponents_with_eng_prefix(self):
         self.compare_all(formatter, in_out)

     def test_exponents_without_eng_prefix(self):
-        formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False)
+        formatter = EngFormatter(accuracy=4, use_eng_prefix=False)
         f = np.pi
         in_out = [
             (f * 10**-24, " 3.1416E-24"),
@@ -183,7 +190,7 @@ def test_exponents_without_eng_prefix(self):
         self.compare_all(formatter, in_out)

     def test_rounding(self):
-        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
+        formatter = EngFormatter(accuracy=3, use_eng_prefix=True)
         in_out = [
(5.55555, " 5.556"), (55.5555, " 55.556"), @@ -194,7 +201,7 @@ def test_rounding(self): ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) in_out = [ (5.55555, " 5.6"), (55.5555, " 55.6"), @@ -205,7 +212,7 @@ def test_rounding(self): ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) + formatter = EngFormatter(accuracy=0, use_eng_prefix=True) in_out = [ (5.55555, " 6"), (55.5555, " 56"), @@ -216,14 +223,14 @@ def test_rounding(self): ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + formatter = EngFormatter(accuracy=3, use_eng_prefix=True) result = formatter(0) assert result == " 0.000" def test_nan(self): # Issue #11981 - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.nan) assert result == "NaN" @@ -235,14 +242,13 @@ def test_nan(self): } ) pt = df.pivot_table(values="a", index="b", columns="c") - fmt.set_eng_float_format(accuracy=1) + set_eng_float_format(accuracy=1) result = pt.to_string() assert "NaN" in result - tm.reset_display_options() def test_inf(self): # Issue #11981 - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.inf) assert result == "inf" diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b520e2463353c..edac82193d1c8 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -872,20 +872,22 @@ def test_to_string_truncate_multilevel(self): with option_context("display.max_rows", 7, "display.max_columns", 7): assert has_doubly_truncated_repr(df) - def test_truncate_with_different_dtypes(self): + @pytest.mark.parametrize("dtype", ["object", "datetime64[us]"]) + def test_truncate_with_different_dtypes(self, dtype): # 11594, 12045 # when truncated the dtypes of the splits can differ # 11594 - s = Series( + ser = Series( [datetime(2012, 1, 1)] * 10 + [datetime(1012, 1, 2)] - + [datetime(2012, 1, 3)] * 10 + + [datetime(2012, 1, 3)] * 10, + dtype=dtype, ) with option_context("display.max_rows", 8): - result = str(s) - assert "object" in result + result = str(ser) + assert dtype in result def test_truncate_with_different_dtypes2(self): # 12045 diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 8cfcc26b95b4c..a7648cf1c471a 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -914,7 +914,6 @@ def get_ipython(): repstr = df._repr_html_() assert "class" in repstr # info fallback - tm.reset_display_options() def test_repr_html(self, float_frame): df = float_frame @@ -926,16 +925,12 @@ def test_repr_html(self, float_frame): with option_context("display.notebook_repr_html", False): df._repr_html_() - tm.reset_display_options() - df = DataFrame([[1, 2], [3, 4]]) with option_context("display.show_dimensions", True): assert "2 rows" in df._repr_html_() with option_context("display.show_dimensions", False): assert "2 rows" not in df._repr_html_() - tm.reset_display_options() - def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) assert "tex2jax_ignore" not in df._repr_html_() diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 
3a8dcb339b063..e607b6eb454a1 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -412,7 +412,6 @@ def test_to_string_complex_float_formatting(self): def test_to_string_format_inf(self): # GH#24861 - tm.reset_display_options() df = DataFrame( { "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], @@ -460,7 +459,6 @@ def test_to_string_int_formatting(self): assert output == expected def test_to_string_float_formatting(self): - tm.reset_display_options() with option_context( "display.precision", 5, @@ -495,7 +493,6 @@ def test_to_string_float_formatting(self): expected = " x\n0 3234.000\n1 0.253" assert df_s == expected - tm.reset_display_options() assert get_option("display.precision") == 6 df = DataFrame({"x": [1e9, 0.2512]}) @@ -516,14 +513,12 @@ def test_to_string_decimal(self): assert df.to_string(decimal=",") == expected def test_to_string_left_justify_cols(self): - tm.reset_display_options() df = DataFrame({"x": [3234, 0.253]}) df_s = df.to_string(justify="left") expected = " x \n0 3234.000\n1 0.253" assert df_s == expected def test_to_string_format_na(self): - tm.reset_display_options() df = DataFrame( { "A": [np.nan, -1, -2.1234, 3, 4], diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index f3794c056a256..4a4ae2b259289 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -186,7 +186,8 @@ def test_error_bad_lines(all_parsers): msg = "Expected 1 fields in line 3, saw 3" if parser.engine == "pyarrow": - msg = "CSV parse error: Expected 1 columns, got 3: 1,2,3" + # "CSV parse error: Expected 1 columns, got 3: 1,2,3" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), on_bad_lines="error") @@ -222,7 +223,8 @@ def test_read_csv_wrong_num_columns(all_parsers): msg = "Expected 6 fields in line 3, saw 7" if parser.engine == "pyarrow": - msg = "Expected 6 columns, got 7: 6,7,8,9,10,11,12" + # Expected 6 columns, got 7: 6,7,8,9,10,11,12 + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) @@ -246,10 +248,9 @@ def test_null_byte_char(request, all_parsers): tm.assert_frame_equal(out, expected) else: if parser.engine == "pyarrow": - msg = ( - "CSV parse error: Empty CSV file or block: " - "cannot infer number of columns" - ) + # CSV parse error: Empty CSV file or block: " + # cannot infer number of columns" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") else: msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): @@ -299,7 +300,8 @@ def test_bad_header_uniform_error(all_parsers): "number of columns, but 3 left to parse." 
        )
     elif parser.engine == "pyarrow":
-        msg = "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
+        # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4"
+        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

     with pytest.raises(ParserError, match=msg):
         parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index ee19a20a155aa..113402cda1b9a 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -831,9 +831,7 @@ def test_parse_dates_string(all_parsers):
     parser = all_parsers
     result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"])
     # freq doesn't round-trip
-    index = DatetimeIndex(
-        list(date_range("1/1/2009", periods=3)), name="date", freq=None
-    )
+    index = date_range("1/1/2009", periods=3, name="date")._with_freq(None)

     expected = DataFrame(
         {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index
@@ -1736,17 +1734,12 @@ def test_parse_timezone(all_parsers):
2018-01-04 09:05:00+09:00,23400"""
     result = parser.read_csv(StringIO(data), parse_dates=["dt"])

-    dti = DatetimeIndex(
-        list(
-            date_range(
-                start="2018-01-04 09:01:00",
-                end="2018-01-04 09:05:00",
-                freq="1min",
-                tz=timezone(timedelta(minutes=540)),
-            )
-        ),
-        freq=None,
-    )
+    dti = date_range(
+        start="2018-01-04 09:01:00",
+        end="2018-01-04 09:05:00",
+        freq="1min",
+        tz=timezone(timedelta(minutes=540)),
+    )._with_freq(None)
     expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]}
     expected = DataFrame(expected_data)
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index 695dc84db5e25..b099ed53b7a5f 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -178,8 +178,8 @@ def test_close_file_handle_on_invalid_usecols(all_parsers):

     error = ValueError
     if parser.engine == "pyarrow":
-        pyarrow = pytest.importorskip("pyarrow")
-        error = pyarrow.lib.ArrowKeyError
+        # Raises pyarrow.lib.ArrowKeyError
+        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

     with tm.ensure_clean("test.csv") as fname:
         Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
index 23138f2710caf..92db98bc4ec28 100644
--- a/pandas/tests/io/parser/usecols/test_usecols_basic.py
+++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -477,11 +477,8 @@ def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, reques
         with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
             parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
         return
-    mark = pytest.mark.xfail(
-        reason="pyarrow.lib.ArrowKeyError: Column 'A' in include_columns "
-        "does not exist"
-    )
-    request.applymarker(mark)
+    # "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist"
+    pytest.skip(reason="https://github.com/apache/arrow/issues/38676")

     result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
     expected = DataFrame({"A": [1, 5], "C": [3, 7]})
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index dace8435595ee..50cf7d737eb99 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -99,7 +99,7 @@ def test_append(setup_path):
def test_append_series(setup_path):
     with ensure_clean_store(setup_path) as store:
         # basic
-        ss = tm.makeStringSeries()
+        ss = Series(range(20), dtype=np.float64, index=[f"i_{i}" for i in range(20)])
         ts = tm.makeTimeSeries()
         ns = Series(np.arange(100))
diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py
index 0dcc9f7f1b9c2..fd7df29595090 100644
--- a/pandas/tests/io/pytables/test_keys.py
+++ b/pandas/tests/io/pytables/test_keys.py
@@ -3,6 +3,7 @@
 from pandas import (
     DataFrame,
     HDFStore,
+    Series,
     _testing as tm,
 )
 from pandas.tests.io.pytables.common import (
@@ -16,7 +17,9 @@ def test_keys(setup_path):
     with ensure_clean_store(setup_path) as store:
         store["a"] = tm.makeTimeSeries()
-        store["b"] = tm.makeStringSeries()
+        store["b"] = Series(
+            range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]
+        )
         store["c"] = tm.makeDataFrame()

         assert len(store) == 3
diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py
index 24ee0d28a1887..c109b72e9c239 100644
--- a/pandas/tests/io/pytables/test_put.py
+++ b/pandas/tests/io/pytables/test_put.py
@@ -1,4 +1,3 @@
-import datetime
 import re

 import numpy as np
@@ -15,6 +14,7 @@
     Series,
     _testing as tm,
     concat,
+    date_range,
 )
 from pandas.tests.io.pytables.common import (
     _maybe_remove,
@@ -279,15 +279,9 @@ def test_store_multiindex(setup_path):
     with ensure_clean_store(setup_path) as store:

         def make_index(names=None):
-            return MultiIndex.from_tuples(
-                [
-                    (datetime.datetime(2013, 12, d), s, t)
-                    for d in range(1, 3)
-                    for s in range(2)
-                    for t in range(3)
-                ],
-                names=names,
-            )
+            dti = date_range("2013-12-01", "2013-12-02")
+            mi = MultiIndex.from_product([dti, range(2), range(3)], names=names)
+            return mi

         # no names
         _maybe_remove(store, "df")
@@ -306,11 +300,11 @@ def make_index(names=None):
         tm.assert_frame_equal(store.select("df"), df)

         # series
-        _maybe_remove(store, "s")
-        s = Series(np.zeros(12), index=make_index(["date", None, None]))
-        store.append("s", s)
+        _maybe_remove(store, "ser")
+        ser = Series(np.zeros(12), index=make_index(["date", None, None]))
+        store.append("ser", ser)
         xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"]))
-        tm.assert_series_equal(store.select("s"), xp)
+        tm.assert_series_equal(store.select("ser"), xp)

         # dup with column
         _maybe_remove(store, "df")
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index 9b4590ca3f01f..2030b1eca3203 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -154,7 +154,7 @@ def test_pytables_native_read(datapath):
         datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r"
     ) as store:
         d2 = store["detector/readout"]
-        assert isinstance(d2, DataFrame)
+    assert isinstance(d2, DataFrame)


 @pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows")
@@ -164,7 +164,7 @@ def test_pytables_native2_read(datapath):
     ) as store:
         str(store)
         d1 = store["detector"]
-        assert isinstance(d1, DataFrame)
+    assert isinstance(d1, DataFrame)


 def test_legacy_table_fixed_format_read_py2(datapath):
@@ -174,28 +174,29 @@ def test_legacy_table_fixed_format_read_py2(datapath):
         datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
     ) as store:
         result = store.select("df")
-        expected = DataFrame(
-            [[1, 2, 3, "D"]],
-            columns=["A", "B", "C", "D"],
-            index=Index(["ABC"], name="INDEX_NAME"),
-        )
-        tm.assert_frame_equal(expected, result)
+    expected = DataFrame(
+        [[1, 2, 3, "D"]],
+        columns=["A", "B", "C", "D"],
+        index=Index(["ABC"], name="INDEX_NAME"),
+    )
+    tm.assert_frame_equal(expected, result)


 def test_legacy_table_fixed_format_read_datetime_py2(datapath):
     # GH 31750
     # legacy table with fixed format and datetime64 column written in Python 2
+    expected = DataFrame(
+        [[Timestamp("2020-02-06T18:00")]],
+        columns=["A"],
+        index=Index(["date"]),
+        dtype="M8[ns]",
+    )
     with ensure_clean_store(
         datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
         mode="r",
     ) as store:
         result = store.select("df")
-        expected = DataFrame(
-            [[Timestamp("2020-02-06T18:00")]],
-            columns=["A"],
-            index=Index(["date"]),
-        )
-        tm.assert_frame_equal(expected, result)
+    tm.assert_frame_equal(expected, result)


 def test_legacy_table_read_py2(datapath):
@@ -264,7 +265,7 @@ def test_read_hdf_iterator(tmp_path, setup_path):
     with closing(iterator.store):
         assert isinstance(iterator, TableIterator)
         indirect = next(iterator.__iter__())
-        tm.assert_frame_equal(direct, indirect)
+    tm.assert_frame_equal(direct, indirect)


 def test_read_nokey(tmp_path, setup_path):
@@ -355,7 +356,7 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
     # GH 16583
     # Tests that reading a Series saved to an HDF file
    # still works if a mode='r' argument is supplied
-    series = tm.makeFloatSeries()
+    series = Series(range(10), dtype=np.float64)
     path = tmp_path / setup_path
     series.to_hdf(path, key="data", format=format)
     result = read_hdf(path, key="data", mode="r")
@@ -387,7 +388,7 @@ def test_read_py2_hdf_file_in_py3(datapath):
         mode="r",
     ) as store:
         result = store["p"]
-        tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected)


 def test_read_infer_string(tmp_path, setup_path):
diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py
index aef6fc0460cd9..6284b826c3cf0 100644
--- a/pandas/tests/io/pytables/test_retain_attributes.py
+++ b/pandas/tests/io/pytables/test_retain_attributes.py
@@ -1,9 +1,8 @@
 import pytest

-from pandas._libs.tslibs import Timestamp
-
 from pandas import (
     DataFrame,
+    DatetimeIndex,
     Series,
     _testing as tm,
     date_range,
@@ -18,11 +17,10 @@
 pytestmark = pytest.mark.single_cpu


-def test_retain_index_attributes(setup_path):
+def test_retain_index_attributes(setup_path, unit):
     # GH 3499, losing frequency info on index recreation
-    df = DataFrame(
-        {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="h"))}
-    )
+    dti = date_range("2000-1-1", periods=3, freq="h", unit=unit)
+    df = DataFrame({"A": Series(range(3), index=dti)})

     with ensure_clean_store(setup_path) as store:
         _maybe_remove(store, "data")
@@ -37,37 +35,30 @@ def test_retain_index_attributes(setup_path, unit):
                 getattr(result, idx), attr, None
             )

+        dti2 = date_range("2002-1-1", periods=3, freq="D", unit=unit)
         # try to append a table with a different frequency
         with tm.assert_produces_warning(errors.AttributeConflictWarning):
-            df2 = DataFrame(
-                {
-                    "A": Series(
-                        range(3), index=date_range("2002-1-1", periods=3, freq="D")
-                    )
-                }
-            )
+            df2 = DataFrame({"A": Series(range(3), index=dti2)})
             store.append("data", df2)

         assert store.get_storer("data").info["index"]["freq"] is None

         # this is ok
         _maybe_remove(store, "df2")
+        dti3 = DatetimeIndex(
+            ["2001-01-01", "2001-01-02", "2002-01-01"], dtype=f"M8[{unit}]"
+        )
         df2 = DataFrame(
             {
                 "A": Series(
                     range(3),
-                    index=[
-                        Timestamp("20010101"),
-                        Timestamp("20010102"),
-                        Timestamp("20020101"),
-                    ],
+                    index=dti3,
                 )
             }
         )
         store.append("df2", df2)
-        df3 = DataFrame(
-            {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))}
-        )
+        dti4 = date_range("2002-1-1", periods=3, freq="D", unit=unit)
+        df3 = DataFrame({"A": Series(range(3), index=dti4)})
         store.append("df2", df3)
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index af24a5cf7e0ab..6c24843f18d0d 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -10,6 +10,7 @@
 import pandas as pd
 from pandas import (
     DataFrame,
+    DatetimeIndex,
     Index,
     Series,
     _testing as tm,
@@ -35,7 +36,7 @@ def roundtrip(key, obj, **kwargs):
     o = tm.makeTimeSeries()
     tm.assert_series_equal(o, roundtrip("series", o))

-    o = tm.makeStringSeries()
+    o = Series(range(10), dtype="float64", index=[f"i_{i}" for i in range(10)])
     tm.assert_series_equal(o, roundtrip("string_series", o))

     o = tm.makeDataFrame()
@@ -248,7 +249,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
 @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
 def test_series(setup_path):
-    s = tm.makeStringSeries()
+    s = Series(range(10), dtype="float64", index=[f"i_{i}" for i in range(10)])
     _check_roundtrip(s, tm.assert_series_equal, path=setup_path)

     ts = tm.makeTimeSeries()
@@ -320,7 +321,11 @@ def test_index_types(setup_path):
     ser = Series(values, [1, 5])
     _check_roundtrip(ser, func, path=setup_path)

-    ser = Series(values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)])
+    dti = DatetimeIndex(["2012-01-01", "2012-01-02"], dtype="M8[ns]")
+    ser = Series(values, index=dti)
+    _check_roundtrip(ser, func, path=setup_path)
+
+    ser.index = ser.index.as_unit("s")
     _check_roundtrip(ser, func, path=setup_path)
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index b61b6f0772251..96c160ab40bd8 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -103,7 +103,9 @@ def test_repr(setup_path):
         repr(store)
         store.info()
         store["a"] = tm.makeTimeSeries()
-        store["b"] = tm.makeStringSeries()
+        store["b"] = Series(
+            range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]
+        )
         store["c"] = tm.makeDataFrame()

         df = tm.makeDataFrame()
@@ -542,17 +544,16 @@ def test_store_index_name(setup_path):


 @pytest.mark.parametrize("tz", [None, "US/Pacific"])
-@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
 @pytest.mark.parametrize("table_format", ["table", "fixed"])
 def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz):
     # GH #13492
-    idx = Index(
-        pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]),
+    idx = DatetimeIndex(
+        [dt.date(2000, 1, 1), dt.date(2000, 1, 2)],
         name="cols\u05d2",
     ).tz_localize(tz)
     idx1 = (
-        Index(
-            pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]),
+        DatetimeIndex(
+            [dt.date(2010, 1, 1), dt.date(2010, 1, 2)],
             name="rows\u05d0",
         )
         .as_unit(unit)
diff --git a/pandas/tests/io/pytables/test_time_series.py b/pandas/tests/io/pytables/test_time_series.py
index 26492f72f192d..cfe25f03e1aab 100644
--- a/pandas/tests/io/pytables/test_time_series.py
+++ b/pandas/tests/io/pytables/test_time_series.py
@@ -5,6 +5,7 @@

 from pandas import (
     DataFrame,
+    DatetimeIndex,
     Series,
     _testing as tm,
 )
@@ -13,10 +14,12 @@
 pytestmark = pytest.mark.single_cpu


-def test_store_datetime_fractional_secs(setup_path):
+@pytest.mark.parametrize("unit", ["us", "ns"])
+def test_store_datetime_fractional_secs(setup_path, unit):
+    dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456)
+    dti = DatetimeIndex([dt], dtype=f"M8[{unit}]")
+    series = Series([0], index=dti)
     with ensure_clean_store(setup_path) as store:
-        dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456)
-        series = Series([0], [dt])
         store["a"] = series
         assert store["a"].index[0] == dt
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 676b9374514e8..8c61830ebe038 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -148,16 +148,20 @@ def test_append_with_timezones_as_index(setup_path, gettz):
     tm.assert_frame_equal(result, df)


-def test_roundtrip_tz_aware_index(setup_path):
+def test_roundtrip_tz_aware_index(setup_path, unit):
     # GH 17618
-    time = Timestamp("2000-01-01 01:00:00", tz="US/Eastern")
-    df = DataFrame(data=[0], index=[time])
+    ts = Timestamp("2000-01-01 01:00:00", tz="US/Eastern")
+    dti = DatetimeIndex([ts]).as_unit(unit)
+    df = DataFrame(data=[0], index=dti)

     with ensure_clean_store(setup_path) as store:
         store.put("frame", df, format="fixed")
         recons = store["frame"]
         tm.assert_frame_equal(recons, df)
-        assert recons.index[0]._value == 946706400000000000
+
+        value = recons.index[0]._value
+        denom = {"ns": 1, "us": 1000, "ms": 10**6, "s": 10**9}[unit]
+        assert value == 946706400000000000 // denom


 def test_store_index_name_with_tz(setup_path):
@@ -365,7 +369,7 @@ def test_py2_created_with_datetimez(datapath):
     # Python 3.
     #
     # GH26443
-    index = [Timestamp("2019-01-01T18:00").tz_localize("America/New_York")]
+    index = DatetimeIndex(["2019-01-01T18:00"], dtype="M8[ns, America/New_York]")
     expected = DataFrame({"data": 123}, index=index)
     with ensure_clean_store(
         datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r"
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index d56139d32b1da..0ce428cef9520 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -187,7 +187,18 @@ def test_date_time(datapath):
         fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
     )
     # GH 19732: Timestamps imported from sas will incur floating point errors
-    df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
+    # 2023-11-16 we don't know the correct "expected" result bc we do not have
+    # access to SAS to read the sas7bdat file. We are really just testing
+    # that we are "close". This only seems to be an issue near the
+    # implementation bounds.
+    res = df.iloc[:, 3].dt.round("us").copy()
+
+    # the first and last elements are near the implementation bounds, where we
+    # would expect floating point error to occur.
+    res.iloc[0] -= pd.Timedelta(microseconds=1)
+    res.iloc[-1] += pd.Timedelta(microseconds=1)
+
+    df["DateTimeHi"] = res
     tm.assert_frame_equal(df, df0)
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
index 10e0467c5d74d..3d150aaa28eb4 100644
--- a/pandas/tests/io/test_clipboard.py
+++ b/pandas/tests/io/test_clipboard.py
@@ -1,13 +1,8 @@
-import os
 from textwrap import dedent

 import numpy as np
 import pytest

-from pandas.compat import (
-    is_ci_environment,
-    is_platform_mac,
-)
 from pandas.errors import (
     PyperclipException,
     PyperclipWindowsException,
@@ -30,8 +25,7 @@
 from pandas.io.clipboard import (
     CheckedCall,
     _stringifyText,
-    clipboard_get,
-    clipboard_set,
+    init_qt_clipboard,
 )
@@ -205,60 +199,34 @@ def test_stringify_text(text):

 @pytest.fixture
-def mock_clipboard(monkeypatch, request):
-    """Fixture mocking clipboard IO.
- - This mocks pandas.io.clipboard.clipboard_get and - pandas.io.clipboard.clipboard_set. - - This uses a local dict for storing data. The dictionary - key used is the test ID, available with ``request.node.name``. - - This returns the local dictionary, for direct manipulation by - tests. - """ - # our local clipboard for tests - _mock_data = {} - - def _mock_set(data): - _mock_data[request.node.name] = data - - def _mock_get(): - return _mock_data[request.node.name] - - monkeypatch.setattr("pandas.io.clipboard.clipboard_set", _mock_set) - monkeypatch.setattr("pandas.io.clipboard.clipboard_get", _mock_get) - - yield _mock_data - +def set_pyqt_clipboard(monkeypatch): + qt_cut, qt_paste = init_qt_clipboard() + with monkeypatch.context() as m: + m.setattr(pd.io.clipboard, "clipboard_set", qt_cut) + m.setattr(pd.io.clipboard, "clipboard_get", qt_paste) + yield -@pytest.mark.clipboard -def test_mock_clipboard(mock_clipboard): - import pandas.io.clipboard - pandas.io.clipboard.clipboard_set("abc") - assert "abc" in set(mock_clipboard.values()) - result = pandas.io.clipboard.clipboard_get() - assert result == "abc" +@pytest.fixture +def clipboard(qapp): + clip = qapp.clipboard() + yield clip + clip.clear() @pytest.mark.single_cpu @pytest.mark.clipboard -@pytest.mark.usefixtures("mock_clipboard") +@pytest.mark.usefixtures("set_pyqt_clipboard") +@pytest.mark.usefixtures("clipboard") class TestClipboard: - def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): - data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) - tm.assert_frame_equal(data, result) - # Test that default arguments copy as tab delimited - def test_round_trip_frame(self, df): - self.check_round_trip_frame(df) - # Test that explicit delimiters are respected - @pytest.mark.parametrize("sep", ["\t", ",", "|"]) - def test_round_trip_frame_sep(self, df, sep): - self.check_round_trip_frame(df, sep=sep) + @pytest.mark.parametrize("sep", [None, "\t", ",", "|"]) + @pytest.mark.parametrize("encoding", [None, "UTF-8", "utf-8", "utf8"]) + def test_round_trip_frame_sep(self, df, sep, encoding): + df.to_clipboard(excel=None, sep=sep, encoding=encoding) + result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) + tm.assert_frame_equal(df, result) # Test white space separator def test_round_trip_frame_string(self, df): @@ -286,22 +254,21 @@ def test_copy_delim_warning(self, df): # delimited and excel="True" @pytest.mark.parametrize("sep", ["\t", None, "default"]) @pytest.mark.parametrize("excel", [True, None, "default"]) - def test_clipboard_copy_tabs_default(self, sep, excel, df, request, mock_clipboard): + def test_clipboard_copy_tabs_default(self, sep, excel, df, clipboard): kwargs = build_kwargs(sep, excel) df.to_clipboard(**kwargs) - assert mock_clipboard[request.node.name] == df.to_csv(sep="\t") + assert clipboard.text() == df.to_csv(sep="\t") # Tests reading of white space separated tables @pytest.mark.parametrize("sep", [None, "default"]) - @pytest.mark.parametrize("excel", [False]) - def test_clipboard_copy_strings(self, sep, excel, df): - kwargs = build_kwargs(sep, excel) + def test_clipboard_copy_strings(self, sep, df): + kwargs = build_kwargs(sep, False) df.to_clipboard(**kwargs) result = read_clipboard(sep=r"\s+") assert result.to_string() == df.to_string() assert df.shape == result.shape - def test_read_clipboard_infer_excel(self, request, mock_clipboard): + def test_read_clipboard_infer_excel(self, clipboard): # 
gh-19010: avoid warnings clip_kwargs = {"engine": "python"} @@ -312,7 +279,7 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): 4\tHarry Carney """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) df = read_clipboard(**clip_kwargs) # excel data is parsed correctly @@ -326,7 +293,7 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): 3 4 """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) res = read_clipboard(**clip_kwargs) text = dedent( @@ -336,16 +303,16 @@ def test_read_clipboard_infer_excel(self, request, mock_clipboard): 3 4 """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) exp = read_clipboard(**clip_kwargs) tm.assert_frame_equal(res, exp) - def test_infer_excel_with_nulls(self, request, mock_clipboard): + def test_infer_excel_with_nulls(self, clipboard): # GH41108 text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen" - mock_clipboard[request.node.name] = text + clipboard.setText(text) df = read_clipboard() df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]} @@ -376,10 +343,10 @@ def test_infer_excel_with_nulls(self, request, mock_clipboard): ), ], ) - def test_infer_excel_with_multiindex(self, request, mock_clipboard, multiindex): + def test_infer_excel_with_multiindex(self, clipboard, multiindex): # GH41108 - mock_clipboard[request.node.name] = multiindex[0] + clipboard.setText(multiindex[0]) df = read_clipboard() df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}, @@ -397,26 +364,17 @@ def test_invalid_encoding(self, df): with pytest.raises(NotImplementedError, match=msg): read_clipboard(encoding="ascii") - @pytest.mark.parametrize("enc", ["UTF-8", "utf-8", "utf8"]) - def test_round_trip_valid_encodings(self, enc, df): - self.check_round_trip_frame(df, encoding=enc) - - @pytest.mark.single_cpu @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑`...", "abcd..."]) - @pytest.mark.xfail( - (os.environ.get("DISPLAY") is None and not is_platform_mac()) - or is_ci_environment(), - reason="Cannot pass if a headless system is not put in place with Xvfb", - strict=not is_ci_environment(), # Flaky failures in the CI - ) def test_raw_roundtrip(self, data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows - clipboard_set(data) - assert data == clipboard_get() + df = DataFrame({"data": [data]}) + df.to_clipboard() + result = read_clipboard() + tm.assert_frame_equal(df, result) @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, request, mock_clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine ): # GH#50502 if string_storage == "pyarrow" or dtype_backend == "pyarrow": @@ -433,7 +391,7 @@ def test_read_clipboard_dtype_backend( text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False y,2,5.0,,,,,False,""" - mock_clipboard[request.node.name] = text + clipboard.setText(text) with pd.option_context("mode.string_storage", string_storage): result = read_clipboard(sep=",", dtype_backend=dtype_backend, engine=engine) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9ac20774b8c93..45d0a839b9e1a 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -19,6 +19,11 @@ import pytest from pandas._libs import lib +from pandas.compat import ( + pa_version_under13p0, + pa_version_under14p1, +) +from pandas.compat._optional import 
import_optional_dependency import pandas.util._test_decorators as td import pandas as pd @@ -56,6 +61,11 @@ import sqlalchemy +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + @pytest.fixture def sql_strings(): return { @@ -110,8 +120,7 @@ def iris_table_metadata(): return iris -def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): - cur = conn.cursor() +def create_and_load_iris_sqlite3(conn, iris_file: Path): stmt = """CREATE TABLE iris ( "SepalLength" REAL, "SepalWidth" REAL, @@ -119,17 +128,65 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): "PetalWidth" REAL, "Name" TEXT )""" + + cur = conn.cursor() cur.execute(stmt) with iris_file.open(newline=None, encoding="utf-8") as csvfile: reader = csv.reader(csvfile) next(reader) stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" - cur.executemany(stmt, reader) + # ADBC requires explicit types - no implicit str -> float conversion + records = [] + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) + cur.close() + + conn.commit() + + +def create_and_load_iris_postgresql(conn, iris_file: Path): + stmt = """CREATE TABLE iris ( + "SepalLength" DOUBLE PRECISION, + "SepalWidth" DOUBLE PRECISION, + "PetalLength" DOUBLE PRECISION, + "PetalWidth" DOUBLE PRECISION, + "Name" TEXT + )""" + with conn.cursor() as cur: + cur.execute(stmt) + with iris_file.open(newline=None, encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + # ADBC requires explicit types - no implicit str -> float conversion + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) + + conn.commit() def create_and_load_iris(conn, iris_file: Path): from sqlalchemy import insert - from sqlalchemy.engine import Engine iris = iris_table_metadata() @@ -138,17 +195,10 @@ def create_and_load_iris(conn, iris_file: Path): header = next(reader) params = [dict(zip(header, row)) for row in reader] stmt = insert(iris).values(params) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) - else: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) + with conn.begin() as con: + iris.drop(con, checkfirst=True) + iris.create(bind=con) + con.execute(stmt) def create_and_load_iris_view(conn): @@ -157,17 +207,17 @@ def create_and_load_iris_view(conn): cur = conn.cursor() cur.execute(stmt) else: - from sqlalchemy import text - from sqlalchemy.engine import Engine - - stmt = text(stmt) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - conn.execute(stmt) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(stmt) + conn.commit() else: - with conn.begin(): - conn.execute(stmt) + from sqlalchemy import text + + stmt = text(stmt) + with conn.begin() as con: + con.execute(stmt) def types_table_metadata(dialect: str): @@ -201,8 +251,7 @@ def types_table_metadata(dialect: str): return types -def create_and_load_types_sqlite3(conn: sqlite3.Connection, types_data: list[dict]): - cur = conn.cursor() +def 
     stmt = """CREATE TABLE types (
             "TextCol" TEXT,
             "DateCol" TEXT,
@@ -214,13 +263,47 @@ def create_and_load_types_sqlite3(conn: sqlite3.Connection, types_data: list[dic
             "IntColWithNull" INTEGER,
             "BoolColWithNull" INTEGER
         )"""
-    cur.execute(stmt)
-    stmt = """
-            INSERT INTO types
-            VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)
-            """
-    cur.executemany(stmt, types_data)
+
+    ins_stmt = """
+                INSERT INTO types
+                VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """
+
+    if isinstance(conn, sqlite3.Connection):
+        cur = conn.cursor()
+        cur.execute(stmt)
+        cur.executemany(ins_stmt, types_data)
+    else:
+        with conn.cursor() as cur:
+            cur.execute(stmt)
+            cur.executemany(ins_stmt, types_data)
+
+    conn.commit()
+
+
+def create_and_load_types_postgresql(conn, types_data: list[dict]):
+    with conn.cursor() as cur:
+        stmt = """CREATE TABLE types (
+                    "TextCol" TEXT,
+                    "DateCol" TIMESTAMP,
+                    "IntDateCol" INTEGER,
+                    "IntDateOnlyCol" INTEGER,
+                    "FloatCol" DOUBLE PRECISION,
+                    "IntCol" INTEGER,
+                    "BoolCol" BOOLEAN,
+                    "IntColWithNull" INTEGER,
+                    "BoolColWithNull" BOOLEAN
+                )"""
+        cur.execute(stmt)
+
+        stmt = """
+                INSERT INTO types
+                VALUES($1, $2::timestamp, $3, $4, $5, $6, $7, $8, $9)
+                """
+
+        cur.executemany(stmt, types_data)
+
+    conn.commit()


 def create_and_load_types(conn, types_data: list[dict], dialect: str):
@@ -292,15 +375,22 @@ def check_iris_frame(frame: DataFrame):
     pytype = frame.dtypes.iloc[0].type
     row = frame.iloc[0]
     assert issubclass(pytype, np.floating)
-    tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"])
+    tm.assert_series_equal(
+        row, Series([5.1, 3.5, 1.4, 0.2, "Iris-setosa"], index=frame.columns, name=0)
+    )
     assert frame.shape in ((150, 5), (8, 5))


 def count_rows(conn, table_name: str):
     stmt = f"SELECT count(*) AS count_1 FROM {table_name}"
+    adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
     if isinstance(conn, sqlite3.Connection):
         cur = conn.cursor()
         return cur.execute(stmt).fetchone()[0]
+    elif adbc and isinstance(conn, adbc.Connection):
+        with conn.cursor() as cur:
+            cur.execute(stmt)
+            return cur.fetchone()[0]
     else:
         from sqlalchemy import create_engine
         from sqlalchemy.engine import Engine
@@ -423,9 +513,24 @@ def get_all_views(conn):
         c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'")
         return [view[0] for view in c.fetchall()]
     else:
-        from sqlalchemy import inspect
+        adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+        if adbc and isinstance(conn, adbc.Connection):
+            results = []
+            info = conn.adbc_get_objects().read_all().to_pylist()
+            for catalog in info:
+                for schema in catalog["catalog_db_schemas"]:
+                    for table in schema["db_schema_tables"]:
+                        if table["table_type"] == "view":
+                            view_name = table["table_name"]
+                            results.append(view_name)
+
+            return results
+        else:
+            from sqlalchemy import inspect

-        return inspect(conn).get_view_names()
+            return inspect(conn).get_view_names()


 def get_all_tables(conn):
@@ -433,9 +538,23 @@ def get_all_tables(conn):
         c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
         return [table[0] for table in c.fetchall()]
     else:
-        from sqlalchemy import inspect
+        adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+
+        if adbc and isinstance(conn, adbc.Connection):
+            results = []
+            info = conn.adbc_get_objects().read_all().to_pylist()
+            for catalog in info:
+                for schema in catalog["catalog_db_schemas"]:
+                    for table in schema["db_schema_tables"]:
+                        if table["table_type"] == "table":
table["table_type"] == "table": + table_name = table["table_name"] + results.append(table_name) + + return results + else: + from sqlalchemy import inspect - return inspect(conn).get_table_names() + return inspect(conn).get_table_names() def drop_table( @@ -445,10 +564,16 @@ def drop_table( if isinstance(conn, sqlite3.Connection): conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") conn.commit() + else: - with conn.begin() as con: - with sql.SQLDatabase(con) as db: - db.drop_table(table_name) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f'DROP TABLE IF EXISTS "{table_name}"') + else: + with conn.begin() as con: + with sql.SQLDatabase(con) as db: + db.drop_table(table_name) def drop_view( @@ -461,12 +586,17 @@ def drop_view( conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") conn.commit() else: - quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( - view_name - ) - stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}") - with conn.begin() as con: - con.execute(stmt) # type: ignore[union-attr] + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f'DROP VIEW IF EXISTS "{view_name}"') + else: + quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( + view_name + ) + stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}") + with conn.begin() as con: + con.execute(stmt) # type: ignore[union-attr] @pytest.fixture @@ -552,6 +682,57 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): yield conn +@pytest.fixture +def postgresql_adbc_conn(): + pytest.importorskip("adbc_driver_postgresql") + from adbc_driver_postgresql import dbapi + + uri = "postgresql://postgres:postgres@localhost:5432/pandas" + with dbapi.connect(uri) as conn: + yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + conn.commit() + + +@pytest.fixture +def postgresql_adbc_iris(postgresql_adbc_conn, iris_path): + import adbc_driver_manager as mgr + + conn = postgresql_adbc_conn + + try: + conn.adbc_get_table_schema("iris") + except mgr.ProgrammingError: + conn.rollback() + create_and_load_iris_postgresql(conn, iris_path) + try: + conn.adbc_get_table_schema("iris_view") + except mgr.ProgrammingError: # note arrow-adbc issue 1022 + conn.rollback() + create_and_load_iris_view(conn) + yield conn + + +@pytest.fixture +def postgresql_adbc_types(postgresql_adbc_conn, types_data): + import adbc_driver_manager as mgr + + conn = postgresql_adbc_conn + + try: + conn.adbc_get_table_schema("types") + except mgr.ProgrammingError: + conn.rollback() + new_data = [tuple(entry.values()) for entry in types_data] + + create_and_load_types_postgresql(conn, new_data) + + yield conn + + @pytest.fixture def postgresql_psycopg2_conn_iris(postgresql_psycopg2_engine_iris): with postgresql_psycopg2_engine_iris.connect() as conn: @@ -633,6 +814,62 @@ def sqlite_conn_types(sqlite_engine_types): yield conn +@pytest.fixture +def sqlite_adbc_conn(): + pytest.importorskip("adbc_driver_sqlite") + from adbc_driver_sqlite import dbapi + + with tm.ensure_clean() as name: + uri = f"file:{name}" + with dbapi.connect(uri) as conn: + yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + 
+                drop_table(tbl, conn)
+            conn.commit()
+
+
+@pytest.fixture
+def sqlite_adbc_iris(sqlite_adbc_conn, iris_path):
+    import adbc_driver_manager as mgr
+
+    conn = sqlite_adbc_conn
+    try:
+        conn.adbc_get_table_schema("iris")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        create_and_load_iris_sqlite3(conn, iris_path)
+    try:
+        conn.adbc_get_table_schema("iris_view")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        create_and_load_iris_view(conn)
+    yield conn
+
+
+@pytest.fixture
+def sqlite_adbc_types(sqlite_adbc_conn, types_data):
+    import adbc_driver_manager as mgr
+
+    conn = sqlite_adbc_conn
+    try:
+        conn.adbc_get_table_schema("types")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        new_data = []
+        for entry in types_data:
+            entry["BoolCol"] = int(entry["BoolCol"])
+            if entry["BoolColWithNull"] is not None:
+                entry["BoolColWithNull"] = int(entry["BoolColWithNull"])
+            new_data.append(tuple(entry.values()))
+
+        create_and_load_types_sqlite3(conn, new_data)
+        conn.commit()
+
+    yield conn
+
+
 @pytest.fixture
 def sqlite_buildin():
     with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn:
@@ -712,11 +949,31 @@ def sqlite_buildin_types(sqlite_buildin, types_data):
     mysql_connectable_types + postgresql_connectable_types + sqlite_connectable_types
 )

-all_connectable = sqlalchemy_connectable + ["sqlite_buildin"]
+adbc_connectable = [
+    "sqlite_adbc_conn",
+    pytest.param("postgresql_adbc_conn", marks=pytest.mark.db),
+]
+
+adbc_connectable_iris = [
+    pytest.param("postgresql_adbc_iris", marks=pytest.mark.db),
+    pytest.param("sqlite_adbc_iris", marks=pytest.mark.db),
+]
+
+adbc_connectable_types = [
+    pytest.param("postgresql_adbc_types", marks=pytest.mark.db),
+    pytest.param("sqlite_adbc_types", marks=pytest.mark.db),
+]
+
+
+all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] + adbc_connectable

-all_connectable_iris = sqlalchemy_connectable_iris + ["sqlite_buildin_iris"]
+all_connectable_iris = (
+    sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] + adbc_connectable_iris
+)

-all_connectable_types = sqlalchemy_connectable_types + ["sqlite_buildin_types"]
+all_connectable_types = (
+    sqlalchemy_connectable_types + ["sqlite_buildin_types"] + adbc_connectable_types
+)


 @pytest.mark.parametrize("conn", all_connectable)
@@ -728,6 +985,13 @@ def test_dataframe_to_sql(conn, test_frame1, request):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_dataframe_to_sql_empty(conn, test_frame1, request):
+    if conn == "postgresql_adbc_conn":
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="postgres ADBC driver cannot insert index with null type",
+                strict=True,
+            )
+        )
     # GH 51086 if conn is sqlite_engine
     conn = request.getfixturevalue(conn)
     empty_df = test_frame1.iloc[:0]
@@ -749,8 +1013,22 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request):
             "string": pd.array(["a"], dtype="string[pyarrow]"),
         }
     )
+
+    if "adbc" in conn:
+        if conn == "sqlite_adbc_conn":
+            df = df.drop(columns=["timedelta"])
+        if pa_version_under14p1:
+            exp_warning = FutureWarning
+            msg = "is_sparse is deprecated"
+        else:
+            exp_warning = None
+            msg = ""
+    else:
+        exp_warning = UserWarning
+        msg = "the 'timedelta'"
+
     conn = request.getfixturevalue(conn)
-    with tm.assert_produces_warning(UserWarning, match="the 'timedelta'"):
+    with tm.assert_produces_warning(exp_warning, match=msg, check_stacklevel=False):
         df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False)


@@ -772,6 +1050,13 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture):
@pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): + if method == "multi" and "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'method' not implemented for ADBC drivers", strict=True + ) + ) + conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", method=method) @@ -816,6 +1101,13 @@ def test_read_iris_query(conn, request): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_query("SELECT * FROM iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -828,6 +1120,13 @@ def test_read_iris_query_chunksize(conn, request): @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_query_expression_with_parameter(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) from sqlalchemy import ( MetaData, @@ -849,6 +1148,14 @@ def test_read_iris_query_expression_with_parameter(conn, request): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_string_with_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) + for db, query in sql_strings["read_parameters"].items(): if db in conn: break @@ -871,6 +1178,10 @@ def test_read_iris_table(conn, request): @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_table("iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -1209,6 +1520,13 @@ def flavor(conn_name): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) query = sql_strings["read_parameters"][flavor(conn_name)] @@ -1221,6 +1539,14 @@ def test_read_sql_iris_parameter(conn, request, sql_strings): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_named_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) + conn_name = conn conn = request.getfixturevalue(conn) query = sql_strings["read_named_parameters"][flavor(conn_name)] @@ -1233,7 +1559,7 @@ def test_read_sql_iris_named_parameter(conn, request, sql_strings): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings): - if "mysql" in conn or "postgresql" in conn: + if "mysql" in conn or ("postgresql" in conn and "adbc" not in conn): request.applymarker(pytest.mark.xfail(reason="broken test")) conn_name 
@@ -1259,6 +1585,10 @@ def test_api_read_sql_view(conn, request):

 @pytest.mark.parametrize("conn", all_connectable_iris)
 def test_api_read_sql_with_chunksize_no_result(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
     conn = request.getfixturevalue(conn)
     query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0'
     with_batch = sql.read_sql_query(query, conn, chunksize=5)
@@ -1357,6 +1687,7 @@ def test_api_to_sql_series(conn, request):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_api_roundtrip(conn, request, test_frame1):
+    conn_name = conn
     conn = request.getfixturevalue(conn)
     if sql.has_table("test_frame_roundtrip", conn):
         with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
@@ -1366,6 +1697,8 @@ def test_api_roundtrip(conn, request, test_frame1):
     result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn)

     # HACK!
+    if "adbc" in conn_name:
+        result = result.rename(columns={"__index_level_0__": "level_0"})
     result.index = test_frame1.index
     result.set_index("level_0", inplace=True)
     result.index.astype(int)
@@ -1375,6 +1708,10 @@ def test_api_roundtrip(conn, request, test_frame1):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_api_roundtrip_chunksize(conn, request, test_frame1):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
     conn = request.getfixturevalue(conn)
     if sql.has_table("test_frame_roundtrip", conn):
         with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
@@ -1398,7 +1735,8 @@ def test_api_execute_sql(conn, request):
     with sql.pandasSQL_builder(conn) as pandas_sql:
         iris_results = pandas_sql.execute("SELECT * FROM iris")
         row = iris_results.fetchone()
-        tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"])
+        iris_results.close()
+        assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]


 @pytest.mark.parametrize("conn", all_connectable_types)
@@ -1496,6 +1834,19 @@ def test_api_custom_dateparsing_error(
     result["BoolCol"] = result["BoolCol"].astype(int)
     result["BoolColWithNull"] = result["BoolColWithNull"].astype(float)

+    if conn_name == "postgresql_adbc_types":
+        expected = expected.astype(
+            {
+                "IntDateCol": "int32",
+                "IntDateOnlyCol": "int32",
+                "IntCol": "int32",
+            }
+        )
+
+        if not pa_version_under13p0:
+            # TODO: is this astype safe?
+ expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + tm.assert_frame_equal(result, expected) @@ -1517,24 +1868,60 @@ def test_api_date_and_index(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_timedelta(conn, request): # see #6921 + conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_timedelta", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: pandasSQL.drop_table("test_timedelta") df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() - with tm.assert_produces_warning(UserWarning): + + if conn_name == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite ADBC driver doesn't implement timedelta", + ) + ) + + if "adbc" in conn_name: + if pa_version_under14p1: + exp_warning = FutureWarning + else: + exp_warning = None + else: + exp_warning = UserWarning + + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): result_count = df.to_sql(name="test_timedelta", con=conn) assert result_count == 2 result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) + + if conn_name == "postgresql_adbc_conn": + # TODO: Postgres stores an INTERVAL, which ADBC reads as a Month-Day-Nano + # Interval; the default pandas type mapper maps this to a DateOffset + # but maybe we should try and restore the timedelta here? + expected = Series( + [ + pd.DateOffset(months=0, days=0, microseconds=1000000, nanoseconds=0), + pd.DateOffset(months=0, days=0, microseconds=3000000, nanoseconds=0), + ], + name="foo", + ) + else: + expected = df["foo"].view("int64") + tm.assert_series_equal(result["foo"], expected) @pytest.mark.parametrize("conn", all_connectable) def test_api_complex_raises(conn, request): + conn_name = conn conn = request.getfixturevalue(conn) df = DataFrame({"a": [1 + 1j, 2j]}) - msg = "Complex datatypes not supported" + + if "adbc" in conn_name: + msg = "datatypes not supported" + else: + msg = "Complex datatypes not supported" with pytest.raises(ValueError, match=msg): assert df.to_sql("test_complex", con=conn) is None @@ -1558,6 +1945,10 @@ def test_api_complex_raises(conn, request): ], ) def test_api_to_sql_index_label(conn, request, index_name, index_label, expected): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_index_label", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1580,6 +1971,10 @@ def test_api_to_sql_index_label_multiindex(conn, request): reason="MySQL can fail using TEXT without length as key", strict=False ) ) + elif "adbc" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_index_label", conn): @@ -1702,6 +2097,13 @@ def test_api_integer_col_names(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) create_sql = sql.get_schema(test_frame1, "test", con=conn) assert "CREATE" in create_sql @@ -1710,6 +2112,13 @@ def test_api_get_schema(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def 
     # GH28486
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'get_schema' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
     conn = request.getfixturevalue(conn)
     create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi")
     assert "CREATE TABLE pypi." in create_sql
@@ -1717,6 +2126,13 @@ def test_api_get_schema_with_schema(conn, request, test_frame1):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_api_get_schema_dtypes(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'get_schema' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
     conn_name = conn
     conn = request.getfixturevalue(conn)
     float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]})
@@ -1734,6 +2150,13 @@ def test_api_get_schema_dtypes(conn, request):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_api_get_schema_keys(conn, request, test_frame1):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'get_schema' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
     conn_name = conn
     conn = request.getfixturevalue(conn)
     frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]})
@@ -1756,6 +2179,10 @@ def test_api_get_schema_keys(conn, request, test_frame1):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_api_chunksize_read(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
     conn_name = conn
     conn = request.getfixturevalue(conn)
     if sql.has_table("test_chunksize", conn):
@@ -1801,6 +2228,13 @@ def test_api_chunksize_read(conn, request):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_api_categorical(conn, request):
+    if conn == "postgresql_adbc_conn":
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="categorical dtype not implemented for ADBC postgres driver",
+                strict=True,
+            )
+        )
     # GH8624
     # test that categorical gets written correctly as dense column
     conn = request.getfixturevalue(conn)
@@ -1859,6 +2293,10 @@ def test_api_escaped_table_name(conn, request):

 @pytest.mark.parametrize("conn", all_connectable)
 def test_api_read_sql_duplicate_columns(conn, request):
     # GH#53117
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True)
+        )
     conn = request.getfixturevalue(conn)
     if sql.has_table("test_table", conn):
         with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
@@ -1867,7 +2305,7 @@ def test_api_read_sql_duplicate_columns(conn, request):
     df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1})
     df.to_sql(name="test_table", con=conn, index=False)

-    result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", conn)
+    result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table", conn)
     expected = DataFrame(
         [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]],
         columns=["a", "b", "a", "c"],
@@ -1961,7 +2399,7 @@ def test_not_reflect_all_tables(sqlite_conn):
 @pytest.mark.parametrize("conn", all_connectable)
 def test_warning_case_insensitive_table_name(conn, request, test_frame1):
     conn_name = conn
-    if conn_name == "sqlite_buildin":
+    if conn_name == "sqlite_buildin" or "adbc" in conn_name:
         request.applymarker(pytest.mark.xfail(reason="Does not raise warning"))

     conn = request.getfixturevalue(conn)
@@ -2249,12 +2687,15 @@ def test_roundtrip(conn, request, test_frame1):
     if conn == "sqlite_str":
         pytest.skip("sqlite_str has no inspection system")
inspection system") + conn_name = conn conn = request.getfixturevalue(conn) pandasSQL = pandasSQL_builder(conn) with pandasSQL.run_transaction(): assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + if "adbc" in conn_name: + result = result.rename(columns={"__index_level_0__": "level_0"}) result.set_index("level_0", inplace=True) # result.index.astype(int) @@ -2270,7 +2711,8 @@ def test_execute_sql(conn, request): with pandasSQL.run_transaction(): iris_results = pandasSQL.execute("SELECT * FROM iris") row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + iris_results.close() + assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"] @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) @@ -2286,7 +2728,7 @@ def test_sqlalchemy_read_table_columns(conn, request): iris_frame = sql.read_sql_table( "iris", con=conn, columns=["SepalLength", "SepalLength"] ) - tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) + tm.assert_index_equal(iris_frame.columns, Index(["SepalLength", "SepalLength__1"])) @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) @@ -2629,6 +3071,12 @@ def test_nan_string(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_save_index(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="ADBC implementation does not create index", strict=True + ) + ) conn_name = conn conn = request.getfixturevalue(conn) df = DataFrame.from_records( @@ -2667,7 +3115,7 @@ def test_transactions(conn, request): conn = request.getfixturevalue(conn) stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if conn_name != "sqlite_buildin": + if conn_name != "sqlite_buildin" and "adbc" not in conn_name: from sqlalchemy import text stmt = text(stmt) @@ -2679,11 +3127,12 @@ def test_transactions(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_transaction_rollback(conn, request): + conn_name = conn conn = request.getfixturevalue(conn) with pandasSQL_builder(conn) as pandasSQL: with pandasSQL.run_transaction() as trans: stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(pandasSQL, SQLiteDatabase): + if "adbc" in conn_name or isinstance(pandasSQL, SQLiteDatabase): trans.execute(stmt) else: from sqlalchemy import text @@ -2987,9 +3436,11 @@ class Temporary(Base): @pytest.mark.parametrize("conn", all_connectable) def test_invalid_engine(conn, request, test_frame1): - if conn == "sqlite_buildin": + if conn == "sqlite_buildin" or "adbc" in conn: request.applymarker( - pytest.mark.xfail(reason="SQLiteDatabase does not raise for bad engine") + pytest.mark.xfail( + reason="SQLiteDatabase/ADBCDatabase does not raise for bad engine" + ) ) conn = request.getfixturevalue(conn) @@ -3089,6 +3540,12 @@ def test_read_sql_dtype_backend( expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) + if "adbc" in conn_name: + # adbc does not support chunksize argument + request.applymarker( + pytest.mark.xfail(reason="adbc does not support chunksize argument") + ) + with pd.option_context("mode.string_storage", string_storage): iterator = getattr(pd, func)( f"Select * from {table}", @@ -3112,7 +3569,7 @@ def test_read_sql_dtype_backend_table( dtype_backend_data, dtype_backend_expected, ): - if "sqlite" in conn: + if "sqlite" in conn and "adbc" not in conn: request.applymarker( pytest.mark.xfail( reason=( @@ 
@@ -3133,6 +3590,10 @@ def test_read_sql_dtype_backend_table(
     expected = dtype_backend_expected(string_storage, dtype_backend, conn_name)
     tm.assert_frame_equal(result, expected)

+    if "adbc" in conn_name:
+        # adbc does not support chunksize argument
+        return
+
     with pd.option_context("mode.string_storage", string_storage):
         iterator = getattr(pd, func)(
             table,
@@ -3229,6 +3690,10 @@ def func(storage, dtype_backend, conn_name) -> DataFrame:

 @pytest.mark.parametrize("conn", all_connectable)
 def test_chunksize_empty_dtypes(conn, request):
     # GH#50245
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
     conn = request.getfixturevalue(conn)
     dtypes = {"a": "int64", "b": "object"}
     df = DataFrame(columns=["a", "b"]).astype(dtypes)
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index fb24902efc0f5..a85576ff13f5c 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -9,6 +9,7 @@

 from pandas import (
     DataFrame,
+    DatetimeIndex,
     Series,
     to_datetime,
 )
@@ -146,7 +147,9 @@ def test_dtypes_with_names(parser):
             "Col1": ["square", "circle", "triangle"],
             "Col2": Series(["00360", "00360", "00180"]).astype("string"),
             "Col3": Series([4.0, float("nan"), 3.0]).astype("Int64"),
-            "Col4": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
+            "Col4": DatetimeIndex(
+                ["2020-01-01", "2021-01-01", "2022-01-01"], dtype="M8[ns]"
+            ),
         }
     )

diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index f1fc1174416ca..2a864abc5ea4a 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -12,6 +12,8 @@
 import numpy as np
 import pytest

+import pandas.util._test_decorators as td
+
 from pandas.core.dtypes.api import is_list_like

 import pandas as pd
@@ -22,6 +24,7 @@
     Series,
     bdate_range,
     date_range,
+    option_context,
     plotting,
 )
 import pandas._testing as tm
@@ -794,13 +797,17 @@ def test_scatterplot_datetime_data(self, x, y):

         _check_plot_works(df.plot.scatter, x=x, y=y)

+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
     @pytest.mark.parametrize("x, y", [("a", "b"), (0, 1)])
     @pytest.mark.parametrize("b_col", [[2, 3, 4], ["a", "b", "c"]])
-    def test_scatterplot_object_data(self, b_col, x, y):
+    def test_scatterplot_object_data(self, b_col, x, y, infer_string):
         # GH 18755
-        df = DataFrame({"a": ["A", "B", "C"], "b": b_col})
+        with option_context("future.infer_string", infer_string):
+            df = DataFrame({"a": ["A", "B", "C"], "b": b_col})

-        _check_plot_works(df.plot.scatter, x=x, y=y)
+            _check_plot_works(df.plot.scatter, x=x, y=y)

     @pytest.mark.parametrize("ordered", [True, False])
     @pytest.mark.parametrize(
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index 0f66d94535053..bfb9a5a9a1647 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -347,7 +347,7 @@ def test_irregular_datetime64_repr_bug(self):
             assert rs == xp

     def test_business_freq(self):
-        bts = tm.makePeriodSeries()
+        bts = Series(range(5), period_range("2020-01-01", periods=5))
         msg = r"PeriodDtype\[B\] is deprecated"
         dt = bts.index[0].to_timestamp()
         with tm.assert_produces_warning(FutureWarning, match=msg):
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index f78f5c4879b9f..c8b47666e1b4a 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -14,6 +14,7 @@
     DataFrame,
     Series,
     date_range,
+    period_range,
     plotting,
 )
 import pandas._testing as tm
@@ -42,12 +43,9 @@ def ts():

 @pytest.fixture
 def series():
-    return tm.makeStringSeries(name="series")
-
-
-@pytest.fixture
-def iseries():
-    return tm.makePeriodSeries(name="iseries")
+    return Series(
+        range(20), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(20)]
+    )


 class TestSeriesPlots:
@@ -82,8 +80,9 @@ def test_plot_ts_bar(self, ts):
     def test_plot_ts_area_stacked(self, ts):
         _check_plot_works(ts.plot.area, stacked=False)

-    def test_plot_iseries(self, iseries):
-        _check_plot_works(iseries.plot)
+    def test_plot_iseries(self):
+        ser = Series(range(5), period_range("2020-01-01", periods=5))
+        _check_plot_works(ser.plot)

     @pytest.mark.parametrize(
         "kind",
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 17227e7cfabc7..9c4ae92224148 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -28,6 +28,7 @@
 )
 import pandas._testing as tm
 from pandas.core import nanops
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics


 def get_objs():
@@ -57,7 +58,11 @@ class TestReductions:
     def test_ops(self, opname, obj):
         result = getattr(obj, opname)()
         if not isinstance(obj, PeriodIndex):
-            expected = getattr(obj.values, opname)()
+            if isinstance(obj.values, ArrowStringArrayNumpySemantics):
+                # max not on the interface
+                expected = getattr(np.array(obj.values), opname)()
+            else:
+                expected = getattr(obj.values, opname)()
         else:
             expected = Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq)

@@ -863,7 +868,7 @@ def test_idxmin_dt64index(self, unit):
     def test_idxmin(self):
         # test idxmin
         # _check_stat_op approach can not be used here because of isna check.
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")

         # add some NaNs
         string_series[5:15] = np.nan
@@ -896,7 +901,7 @@ def test_idxmin(self):
     def test_idxmax(self):
         # test idxmax
         # _check_stat_op approach can not be used here because of isna check.
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")

         # add some NaNs
         string_series[5:15] = np.nan
@@ -1165,7 +1170,7 @@ def test_assert_idxminmax_empty_raises(self, test_input, error_type):
         with pytest.raises(ValueError, match=msg):
             test_input.idxmax(skipna=False)

-    def test_idxminmax_object_dtype(self):
+    def test_idxminmax_object_dtype(self, using_infer_string):
         # pre-2.1 object-dtype was disallowed for argmin/max
         ser = Series(["foo", "bar", "baz"])
         assert ser.idxmax() == 0
@@ -1179,18 +1184,19 @@ def test_idxminmax_object_dtype(self):
         assert ser2.idxmin() == 0
         assert ser2.idxmin(skipna=False) == 0

-        # attempting to compare np.nan with string raises
-        ser3 = Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"])
-        msg = "'>' not supported between instances of 'float' and 'str'"
-        with pytest.raises(TypeError, match=msg):
-            ser3.idxmax()
-        with pytest.raises(TypeError, match=msg):
-            ser3.idxmax(skipna=False)
-        msg = "'<' not supported between instances of 'float' and 'str'"
-        with pytest.raises(TypeError, match=msg):
-            ser3.idxmin()
-        with pytest.raises(TypeError, match=msg):
-            ser3.idxmin(skipna=False)
+        if not using_infer_string:
+            # attempting to compare np.nan with string raises
+            ser3 = Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"])
+            msg = "'>' not supported between instances of 'float' and 'str'"
+            with pytest.raises(TypeError, match=msg):
+                ser3.idxmax()
+            with pytest.raises(TypeError, match=msg):
+                ser3.idxmax(skipna=False)
+            msg = "'<' not supported between instances of 'float' and 'str'"
+            with pytest.raises(TypeError, match=msg):
+                ser3.idxmin()
+            with pytest.raises(TypeError, match=msg):
+                ser3.idxmin(skipna=False)

     def test_idxminmax_object_frame(self):
         # GH#4279
@@ -1445,14 +1451,14 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
         s = Series(data, dtype=object)
         result = s.mode(dropna)
-        expected2 = Series(expected2, dtype=object)
+        expected2 = Series(expected2, dtype=None if expected2 == ["bar"] else object)
         tm.assert_series_equal(result, expected2)

         data = ["foo", "bar", "bar", np.nan, np.nan, np.nan]

         s = Series(data, dtype=object).astype(str)
         result = s.mode(dropna)
-        expected3 = Series(expected3, dtype=str)
+        expected3 = Series(expected3)
         tm.assert_series_equal(result, expected3)

     @pytest.mark.parametrize(
@@ -1467,7 +1473,7 @@ def test_mode_mixeddtype(self, dropna, expected1, expected2):
         s = Series([1, "foo", "foo", np.nan, np.nan, np.nan])
         result = s.mode(dropna)
-        expected = Series(expected2, dtype=object)
+        expected = Series(expected2, dtype=None if expected2 == ["foo"] else object)
         tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize(
diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py
index 74e521ab71f41..81f560caff3fa 100644
--- a/pandas/tests/reductions/test_stat_reductions.py
+++ b/pandas/tests/reductions/test_stat_reductions.py
@@ -154,15 +154,15 @@ def _check_stat_op(
             f(string_series_, numeric_only=True)

     def test_sum(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         self._check_stat_op("sum", np.sum, string_series, check_allna=False)

     def test_mean(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         self._check_stat_op("mean", np.mean, string_series)

     def test_median(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         self._check_stat_op("median", np.median, string_series)

         # test with integers, test failure
@@ -170,19 +170,19 @@ def test_median(self):
         tm.assert_almost_equal(np.median(int_ts), int_ts.median())

     def test_prod(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         self._check_stat_op("prod", np.prod, string_series)

     def test_min(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         self._check_stat_op("min", np.min, string_series, check_objects=True)

     def test_max(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         self._check_stat_op("max", np.max, string_series, check_objects=True)

     def test_var_std(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         datetime_series = tm.makeTimeSeries().rename("ts")

         alt = lambda x: np.std(x, ddof=1)
@@ -208,7 +208,7 @@ def test_var_std(self):
         assert pd.isna(result)

     def test_sem(self):
-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")
         datetime_series = tm.makeTimeSeries().rename("ts")

         alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
@@ -228,7 +228,7 @@ def test_sem(self):
     def test_skew(self):
         sp_stats = pytest.importorskip("scipy.stats")

-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")

         alt = lambda x: sp_stats.skew(x, bias=False)
         self._check_stat_op("skew", alt, string_series)
@@ -250,7 +250,7 @@ def test_skew(self):
     def test_kurt(self):
         sp_stats = pytest.importorskip("scipy.stats")

-        string_series = tm.makeStringSeries().rename("series")
+        string_series = Series(range(20), dtype=np.float64, name="series")

         alt = lambda x: sp_stats.kurtosis(x, bias=False)
         self._check_stat_op("kurt", alt, string_series)
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
index fee52780585b8..7176cdf6ff9e4 100644
--- a/pandas/tests/resample/test_base.py
+++ b/pandas/tests/resample/test_base.py
@@ -5,6 +5,8 @@

 from pandas import (
     DataFrame,
+    DatetimeIndex,
+    Index,
     MultiIndex,
     NaT,
     PeriodIndex,
@@ -255,7 +257,7 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti):

     index = _asfreq_compat(empty_frame_dti.index, freq)

-    expected = DataFrame({"a": []}, dtype="int64", index=index)
+    expected = DataFrame(dtype="int64", index=index, columns=Index(["a"], dtype=object))

     tm.assert_frame_equal(result, expected)

@@ -287,16 +289,19 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti):
     tm.assert_series_equal(result, expected)


-@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
-@pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0))
+@pytest.mark.parametrize(
+    "index",
+    [
+        PeriodIndex([], freq="M", name="a"),
+        DatetimeIndex([], name="a"),
+        TimedeltaIndex([], name="a"),
+    ],
+)
 @pytest.mark.parametrize("dtype", [float, int, object, "datetime64[ns]"])
 def test_resample_empty_dtypes(index, dtype, resample_method):
     # Empty series were sometimes causing a segfault (for the functions
     # with Cython bounds-checking disabled) or an IndexError.  We just run
     # them to ensure they no longer do.  (GH #10228)
-    if isinstance(index, PeriodIndex):
-        # GH#53511
-        index = PeriodIndex([], freq="B", name=index.name)
     empty_series_dti = Series([], index, dtype)
     rs = empty_series_dti.resample("d", group_keys=False)
     try:
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 554dc92d8508e..98cd7d7ae22f6 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -1866,7 +1866,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit):
     # GH 24127
     n1_ = n1 * k
     n2_ = n2 * k
-    dti = date_range("19910905 13:00", "19911005 07:00", freq=freq1).as_unit(unit)
+    dti = date_range("1991-09-05", "1991-09-12", freq=freq1).as_unit(unit)
     ser = Series(range(len(dti)), index=dti)

     result1 = ser.resample(str(n1_) + freq1).mean()
diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py
index 5d233a940de2f..b3d346eb22ac5 100644
--- a/pandas/tests/resample/test_period_index.py
+++ b/pandas/tests/resample/test_period_index.py
@@ -504,8 +504,8 @@ def test_resample_weekly_all_na(self):
         expected = ts.asfreq("W-THU").ffill()
         tm.assert_series_equal(result, expected)

-    def test_resample_tz_localized(self):
-        dr = date_range(start="2012-4-13", end="2012-5-1")
+    def test_resample_tz_localized(self, unit):
+        dr = date_range(start="2012-4-13", end="2012-5-1", unit=unit)
         ts = Series(range(len(dr)), index=dr)

         ts_utc = ts.tz_localize("UTC")
@@ -514,9 +514,7 @@ def test_resample_tz_localized(self):
         result = ts_local.resample("W").mean()

         ts_local_naive = ts_local.copy()
-        ts_local_naive.index = [
-            x.replace(tzinfo=None) for x in ts_local_naive.index.to_pydatetime()
-        ]
+        ts_local_naive.index = ts_local_naive.index.tz_localize(None)

         exp = ts_local_naive.resample("W").mean().tz_localize("America/Los_Angeles")
         exp.index = pd.DatetimeIndex(exp.index, freq="W")
@@ -958,11 +956,8 @@ def test_resample_t_l_deprecated(self):
         ("2M", "2ME"),
         ("2Q", "2QE"),
         ("2Q-FEB", "2QE-FEB"),
-        ("2BQ", "2BQE"),
-        ("2BQ-FEB", "2BQE-FEB"),
         ("2Y", "2YE"),
         ("2Y-MAR", "2YE-MAR"),
-        ("2BA-MAR", "2BYE-MAR"),
     ],
 )
 def test_resample_frequency_ME_QE_YE_error_message(series_and_frame, freq, freq_depr):
@@ -980,3 +975,22 @@ def test_corner_cases_period(simple_period_range_series):
     # it works
     result = len0pts.resample("Y-DEC").mean()
     assert len(result) == 0
+
+
+@pytest.mark.parametrize(
+    "freq_depr",
+    [
+        "2BME",
+        "2CBME",
+        "2SME",
+        "2BQE-FEB",
+        "2BYE-MAR",
+    ],
+)
+def test_resample_frequency_invalid_freq(series_and_frame, freq_depr):
+    # GH#9586
+    msg = f"Invalid frequency: {freq_depr[1:]}"
+
+    obj = series_and_frame
+    with pytest.raises(ValueError, match=msg):
+        obj.resample(freq_depr)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 81a054bbbc3df..7e8779ab48b7e 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -197,7 +197,7 @@ def tests_raises_on_nuisance(test_frame):
     tm.assert_frame_equal(result, expected)

     expected = r[["A", "B", "C"]].mean()
-    msg = re.escape("agg function failed [how->mean,dtype->object]")
+    msg = re.escape("agg function failed [how->mean,dtype->")
     with pytest.raises(TypeError, match=msg):
         r.mean()
     result = r.mean(numeric_only=True)
@@ -948,7 +948,7 @@ def test_frame_downsample_method(method, numeric_only, expected_data):
     if isinstance(expected_data, str):
         if method in ("var", "mean", "median", "prod"):
             klass = TypeError
-            msg = re.escape(f"agg function failed [how->{method},dtype->object]")
[how->{method},dtype->object]") + msg = re.escape(f"agg function failed [how->{method},dtype->") else: klass = ValueError msg = expected_data @@ -998,7 +998,7 @@ def test_series_downsample_method(method, numeric_only, expected_data): with pytest.raises(TypeError, match=msg): func(**kwargs) elif method == "prod": - msg = re.escape("agg function failed [how->prod,dtype->object]") + msg = re.escape("agg function failed [how->prod,dtype->") with pytest.raises(TypeError, match=msg): func(**kwargs) else: diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index fd02216934576..3cf5201d573d4 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -118,12 +118,11 @@ def test_getitem_multiple(): df = DataFrame(data, index=date_range("2016-01-01", periods=2)) r = df.groupby("id").resample("1D") result = r["buyer"].count() + + exp_mi = pd.MultiIndex.from_arrays([[1, 2], df.index], names=("id", None)) expected = Series( [1, 1], - index=pd.MultiIndex.from_tuples( - [(1, Timestamp("2016-01-01")), (2, Timestamp("2016-01-02"))], - names=["id", None], - ), + index=exp_mi, name="buyer", ) tm.assert_series_equal(result, expected) @@ -349,9 +348,9 @@ def weighted_quantile(series, weights, q): tm.assert_series_equal(result, expected) -def test_resample_groupby_with_label(): +def test_resample_groupby_with_label(unit): # GH 13235 - index = date_range("2000-01-01", freq="2D", periods=5) + index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -359,8 +358,9 @@ def test_resample_groupby_with_label(): mi = [ np.array([0, 0, 1, 2], dtype=np.int64), - pd.to_datetime( - np.array(["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"]) + np.array( + ["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"], + dtype=f"M8[{unit}]", ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) @@ -505,7 +505,9 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, - pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2), + pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2).as_unit( + "ns" + ), ], names=["key", "date"], ) @@ -530,19 +532,15 @@ def test_groupby_resample_with_list_of_keys(): } ) result = df.groupby("group").resample("2D", on="date")[["val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df["date"]._values[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "val": [4.0, 3.5, 6.5, 3.0], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) @@ -605,17 +603,17 @@ def test_groupby_resample_size_all_index_same(): msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("A").resample("D").size() + + mi_exp = pd.MultiIndex.from_arrays( + [ + [1, 1, 2, 2], + pd.DatetimeIndex(["2000-12-31", "2001-01-01"] * 2, dtype="M8[ns]"), + ], + names=["A", None], + ) expected = Series( 3, - index=pd.MultiIndex.from_tuples( - [ - (1, Timestamp("2000-12-31")), - (1, Timestamp("2001-01-01")), 
- (2, Timestamp("2000-12-31")), - (2, Timestamp("2001-01-01")), - ], - names=["A", None], - ), + index=mi_exp, ) tm.assert_series_equal(result, expected) @@ -627,25 +625,18 @@ def test_groupby_resample_on_index_with_list_of_keys(): "group": [0, 0, 0, 0, 1, 1, 1, 1], "val": [3, 1, 4, 1, 5, 9, 2, 6], }, - index=Series( - date_range(start="2016-01-01", periods=8), - name="date", - ), + index=date_range(start="2016-01-01", periods=8, name="date"), ) result = df.groupby("group").resample("2D")[["val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df.index[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "val": [2.0, 2.5, 7.0, 4.0], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) @@ -659,26 +650,19 @@ def test_groupby_resample_on_index_with_list_of_keys_multi_columns(): "second_val": [2, 7, 1, 8, 2, 8, 1, 8], "third_val": [1, 4, 1, 4, 2, 1, 3, 5], }, - index=Series( - date_range(start="2016-01-01", periods=8), - name="date", - ), + index=date_range(start="2016-01-01", periods=8, name="date"), ) result = df.groupby("group").resample("2D")[["first_val", "second_val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df.index[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "first_val": [2.0, 2.5, 7.0, 4.0], "second_val": [4.5, 4.5, 5.0, 4.5], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 0d4b4d1865d45..1dc25cb9d4c1e 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -7,6 +7,7 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, Timestamp, ) @@ -138,13 +139,17 @@ def test_aggregate_normal(resample_method): normal_df["key"] = [1, 2, 3, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - datetime(2013, 1, 3), - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -216,13 +221,17 @@ def test_aggregate_with_nat(func, fill_value): normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - pd.NaT, - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -253,13 +262,17 @@ def test_aggregate_with_nat_size(): normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - pd.NaT, - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + 
dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -314,10 +327,11 @@ def test_repr(): ], ) def test_upsample_sum(method, method_args, expected_values): - s = Series(1, index=date_range("2017", periods=2, freq="h")) - resampled = s.resample("30min") + ser = Series(1, index=date_range("2017", periods=2, freq="h")) + resampled = ser.resample("30min") index = pd.DatetimeIndex( ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"], + dtype="M8[ns]", freq="30min", ) result = methodcaller(method, **method_args)(resampled) @@ -342,25 +356,12 @@ def test_groupby_resample_interpolate(): .interpolate(method="linear") ) - expected_ind = pd.MultiIndex.from_tuples( - [ - (50, Timestamp("2018-01-07")), - (50, Timestamp("2018-01-08")), - (50, Timestamp("2018-01-09")), - (50, Timestamp("2018-01-10")), - (50, Timestamp("2018-01-11")), - (50, Timestamp("2018-01-12")), - (50, Timestamp("2018-01-13")), - (50, Timestamp("2018-01-14")), - (50, Timestamp("2018-01-15")), - (50, Timestamp("2018-01-16")), - (50, Timestamp("2018-01-17")), - (50, Timestamp("2018-01-18")), - (50, Timestamp("2018-01-19")), - (50, Timestamp("2018-01-20")), - (50, Timestamp("2018-01-21")), - (60, Timestamp("2018-01-14")), - ], + volume = [50] * 15 + [60] + week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ + Timestamp("2018-01-14") + ] + expected_ind = pd.MultiIndex.from_arrays( + [volume, week_starting], names=["volume", "week_starting"], ) diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index a87042180df8f..ab20e8c8f6930 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -315,7 +315,7 @@ def test_concatlike_datetimetz_short(self, tz): exp_idx = pd.DatetimeIndex( ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], tz=tz, - ) + ).as_unit("ns") exp = DataFrame(0, index=exp_idx, columns=["A", "B"]) tm.assert_frame_equal(df1._append(df2), exp) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 22e1d75255b3f..c061a393bb1a6 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -63,6 +63,7 @@ def test_concat_datetime_timezone(self): ) .tz_convert("UTC") .tz_convert("Europe/Paris") + .as_unit("ns") ) expected = DataFrame( @@ -84,7 +85,7 @@ def test_concat_datetime_timezone(self): "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", ] - ) + ).as_unit("ns") expected = DataFrame( [ @@ -197,8 +198,8 @@ def test_concat_NaT_series2(self): def test_concat_NaT_dataframes(self, tz): # GH 12396 - first = DataFrame([[pd.NaT], [pd.NaT]]) - first = first.apply(lambda x: x.dt.tz_localize(tz)) + dti = DatetimeIndex([pd.NaT, pd.NaT], tz=tz) + first = DataFrame({0: dti}) second = DataFrame( [[Timestamp("2015/01/01", tz=tz)], [Timestamp("2016/01/01", tz=tz)]], index=[2, 3], diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 021cbe9a695e2..c4c83e2046b76 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -771,13 +771,13 @@ def test_join_datetime_string(self): ], columns=["x", "y", "a"], ) - dfa["x"] = pd.to_datetime(dfa["x"]).dt.as_unit("ns") + 
dfa["x"] = pd.to_datetime(dfa["x"]).astype("M8[ns]") dfb = DataFrame( [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], columns=["x", "y", "z"], index=[2, 4], ) - dfb["x"] = pd.to_datetime(dfb["x"]).dt.as_unit("ns") + dfb["x"] = pd.to_datetime(dfb["x"]).astype("M8[ns]") result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) expected = DataFrame( [ @@ -787,7 +787,7 @@ def test_join_datetime_string(self): index=[2, 4], columns=["x", "y", "z", "a"], ) - expected["x"] = expected["x"].dt.as_unit("ns") + expected["x"] = expected["x"].astype("M8[ns]") tm.assert_frame_equal(result, expected) def test_join_with_categorical_index(self): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 6ca6fde7c120d..f8ececf6c0540 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -545,40 +545,48 @@ def test_pivot_index_with_nan_dates(self, method): tm.assert_frame_equal(result, pv.T) @pytest.mark.parametrize("method", [True, False]) - def test_pivot_with_tz(self, method): + def test_pivot_with_tz(self, method, unit): # GH 5878 df = DataFrame( { - "dt1": [ - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - ], - "dt2": [ - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 2, 9, 0), - datetime(2014, 1, 2, 9, 0), - ], + "dt1": pd.DatetimeIndex( + [ + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + ], + dtype=f"M8[{unit}, US/Pacific]", + ), + "dt2": pd.DatetimeIndex( + [ + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 2, 9, 0), + datetime(2014, 1, 2, 9, 0), + ], + dtype=f"M8[{unit}, Asia/Tokyo]", + ), "data1": np.arange(4, dtype="int64"), "data2": np.arange(4, dtype="int64"), } ) - df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) - df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) - exp_col1 = Index(["data1", "data1", "data2", "data2"]) exp_col2 = pd.DatetimeIndex( - ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo" + ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, + name="dt2", + dtype=f"M8[{unit}, Asia/Tokyo]", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) + exp_idx = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], + name="dt1", + dtype=f"M8[{unit}, US/Pacific]", + ) expected = DataFrame( [[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.DatetimeIndex( - ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" - ), + index=exp_idx, columns=exp_col, ) @@ -590,12 +598,8 @@ def test_pivot_with_tz(self, method): expected = DataFrame( [[0, 2], [1, 3]], - index=pd.DatetimeIndex( - ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" - ), - columns=pd.DatetimeIndex( - ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" - ), + index=exp_idx, + columns=exp_col2[:2], ) if method: diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index b108ec24732ac..0c788b371a03a 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -252,7 +252,7 @@ def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_w assert string_series[numSlice.index[0]] == numSlice[numSlice.index[0]] assert numSlice.index[1] == string_series.index[11] - assert tm.equalContents(numSliceEnd, 
np.array(string_series)[-10:]) + tm.assert_numpy_array_equal(np.array(numSliceEnd), np.array(string_series)[-10:]) # Test return view. sl = string_series[10:20] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 02d51d5119469..168be838ec768 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -502,10 +502,11 @@ def test_setitem_empty_series(self): def test_setitem_empty_series_datetimeindex_preserves_freq(self): # GH#33573 our index should retain its freq - series = Series([], DatetimeIndex([], freq="D"), dtype=object) + dti = DatetimeIndex([], freq="D", dtype="M8[ns]") + series = Series([], index=dti, dtype=object) key = Timestamp("2012-01-01") series[key] = 47 - expected = Series(47, DatetimeIndex([key], freq="D")) + expected = Series(47, DatetimeIndex([key], freq="D").as_unit("ns")) tm.assert_series_equal(series, expected) assert series.index.freq == expected.index.freq diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 130b77db0a1fb..5bbf35f6e39bb 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -135,11 +135,12 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) - def test_clip_with_timestamps_and_oob_datetimes(self): + @pytest.mark.parametrize("dtype", [object, "M8[us]"]) + def test_clip_with_timestamps_and_oob_datetimes(self, dtype): # GH-42794 - ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)]) + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) - expected = Series([Timestamp.min, Timestamp.max], dtype="object") + expected = Series([Timestamp.min, Timestamp.max], dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 016635a50fdf4..fa0563271d7df 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -105,8 +105,8 @@ def test_quantile_interpolation_dtype(self): def test_quantile_nan(self): # GH 13098 - s = Series([1, 2, 3, 4, np.nan]) - result = s.quantile(0.5) + ser = Series([1, 2, 3, 4, np.nan]) + result = ser.quantile(0.5) expected = 2.5 assert result == expected @@ -114,14 +114,14 @@ def test_quantile_nan(self): s1 = Series([], dtype=object) cases = [s1, Series([np.nan, np.nan])] - for s in cases: - res = s.quantile(0.5) + for ser in cases: + res = ser.quantile(0.5) assert np.isnan(res) - res = s.quantile([0.5]) + res = ser.quantile([0.5]) tm.assert_series_equal(res, Series([np.nan], index=[0.5])) - res = s.quantile([0.2, 0.3]) + res = ser.quantile([0.2, 0.3]) tm.assert_series_equal(res, Series([np.nan, np.nan], index=[0.2, 0.3])) @pytest.mark.parametrize( @@ -160,11 +160,11 @@ def test_quantile_nan(self): ], ) def test_quantile_box(self, case): - s = Series(case, name="XXX") - res = s.quantile(0.5) + ser = Series(case, name="XXX") + res = ser.quantile(0.5) assert res == case[1] - res = s.quantile([0.5]) + res = ser.quantile([0.5]) exp = Series([case[1]], index=[0.5], name="XXX") tm.assert_series_equal(res, exp) @@ -190,35 +190,37 @@ def test_quantile_sparse(self, values, dtype): expected = Series(np.asarray(ser)).quantile([0.5]).astype("Sparse[float]") tm.assert_series_equal(result, expected) - def test_quantile_empty(self): + def test_quantile_empty_float64(self): # floats - s = 
Series([], dtype="float64") + ser = Series([], dtype="float64") - res = s.quantile(0.5) + res = ser.quantile(0.5) assert np.isnan(res) - res = s.quantile([0.5]) + res = ser.quantile([0.5]) exp = Series([np.nan], index=[0.5]) tm.assert_series_equal(res, exp) + def test_quantile_empty_int64(self): # int - s = Series([], dtype="int64") + ser = Series([], dtype="int64") - res = s.quantile(0.5) + res = ser.quantile(0.5) assert np.isnan(res) - res = s.quantile([0.5]) + res = ser.quantile([0.5]) exp = Series([np.nan], index=[0.5]) tm.assert_series_equal(res, exp) + def test_quantile_empty_dt64(self): # datetime - s = Series([], dtype="datetime64[ns]") + ser = Series([], dtype="datetime64[ns]") - res = s.quantile(0.5) + res = ser.quantile(0.5) assert res is pd.NaT - res = s.quantile([0.5]) - exp = Series([pd.NaT], index=[0.5]) + res = ser.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5], dtype=ser.dtype) tm.assert_series_equal(res, exp) @pytest.mark.parametrize("dtype", [int, float, "Int64"]) diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 6c40e36419551..7f60c94f10e4f 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -56,9 +56,10 @@ def test_round_builtin(self, any_float_dtype): @pytest.mark.parametrize("method", ["round", "floor", "ceil"]) @pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) - def test_round_nat(self, method, freq): - # GH14940 - ser = Series([pd.NaT]) - expected = Series(pd.NaT) + def test_round_nat(self, method, freq, unit): + # GH14940, GH#56158 + ser = Series([pd.NaT], dtype=f"M8[{unit}]") + expected = Series(pd.NaT, dtype=f"M8[{unit}]") round_method = getattr(ser.dt, method) - tm.assert_series_equal(round_method(freq), expected) + result = round_method(freq) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index e7233f005e427..f38a29bfe7e88 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -47,7 +47,11 @@ class TestSeriesFlexArithmetic: (lambda x: x, lambda x: x * 2, False), (lambda x: x, lambda x: x[::2], False), (lambda x: x, lambda x: 5, True), - (lambda x: tm.makeFloatSeries(), lambda x: tm.makeFloatSeries(), True), + ( + lambda x: Series(range(10), dtype=np.float64), + lambda x: Series(range(10), dtype=np.float64), + True, + ), ], ) @pytest.mark.parametrize( @@ -662,9 +666,9 @@ def test_comparison_operators_with_nas(self, comparison_op): def test_ne(self): ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) - expected = [True, True, False, True, True] - assert tm.equalContents(ts.index != 5, expected) - assert tm.equalContents(~(ts.index == 5), expected) + expected = np.array([True, True, False, True, True]) + tm.assert_numpy_array_equal(ts.index != 5, expected) + tm.assert_numpy_array_equal(~(ts.index == 5), expected) @pytest.mark.parametrize( "left, right", diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 587c73cc535d8..84c612c43da29 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -160,7 +160,7 @@ def test_constructor(self, datetime_series, using_infer_string): derived = Series(datetime_series) assert derived.index._is_all_dates - assert tm.equalContents(derived.index, datetime_series.index) + tm.assert_index_equal(derived.index, datetime_series.index) # Ensure new index is not created 
assert id(datetime_series.index) == id(derived.index) @@ -1169,9 +1169,10 @@ def test_constructor_with_datetime_tz3(self): def test_constructor_with_datetime_tz2(self): # with all NaT - s = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") - expected = Series(DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) - tm.assert_series_equal(s, expected) + ser = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + dti = DatetimeIndex(["NaT", "NaT"], tz="US/Eastern").as_unit("ns") + expected = Series(dti) + tm.assert_series_equal(ser, expected) def test_constructor_no_partial_datetime_casting(self): # GH#40111 diff --git a/pandas/tests/series/test_unary.py b/pandas/tests/series/test_unary.py index ad0e344fa4420..8f153788e413c 100644 --- a/pandas/tests/series/test_unary.py +++ b/pandas/tests/series/test_unary.py @@ -8,13 +8,11 @@ class TestSeriesUnaryOps: # __neg__, __pos__, __invert__ def test_neg(self): - ser = tm.makeStringSeries() - ser.name = "series" + ser = Series(range(5), dtype="float64", name="series") tm.assert_series_equal(-ser, -1 * ser) def test_invert(self): - ser = tm.makeStringSeries() - ser.name = "series" + ser = Series(range(5), dtype="float64", name="series") tm.assert_series_equal(-(ser < 0), ~(ser < 0)) @pytest.mark.parametrize( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78f0730d730e8..bd64a5dce3b9a 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -5,6 +5,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -893,7 +894,10 @@ def test_find_nan(any_string_dtype): # -------------------------------------------------------------------------------------- -def test_translate(index_or_series, any_string_dtype): +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_translate(index_or_series, any_string_dtype, infer_string): obj = index_or_series( ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ce8d962fc3115..a6e63dfd5f409 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1246,9 +1246,10 @@ def test_value_counts_nat(self): result_td = algos.value_counts(td) tm.assert_series_equal(result_td, exp_td) - def test_value_counts_datetime_outofbounds(self): + @pytest.mark.parametrize("dtype", [object, "M8[us]"]) + def test_value_counts_datetime_outofbounds(self, dtype): # GH 13663 - s = Series( + ser = Series( [ datetime(3000, 1, 1), datetime(5000, 1, 1), @@ -1256,13 +1257,14 @@ def test_value_counts_datetime_outofbounds(self): datetime(6000, 1, 1), datetime(3000, 1, 1), datetime(3000, 1, 1), - ] + ], + dtype=dtype, ) - res = s.value_counts() + res = ser.value_counts() exp_index = Index( [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], - dtype=object, + dtype=dtype, ) exp = Series([3, 2, 1], index=exp_index, name="count") tm.assert_series_equal(res, exp) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index ef76d99260764..ee209f74eb4e0 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -183,7 +183,7 @@ def test_to_datetime_format_YYYYMMDD_ignore_with_outofbounds(self, cache): errors="ignore", cache=cache, ) - expected = Index(["15010101", "20150101", np.nan]) + expected 
= Index(["15010101", "20150101", np.nan], dtype=object) tm.assert_index_equal(result, expected) def test_to_datetime_format_YYYYMMDD_coercion(self, cache): @@ -1206,7 +1206,9 @@ def test_out_of_bounds_errors_ignore2(self): # GH#12424 msg = "errors='ignore' is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): - res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + res = to_datetime( + Series(["2362-01-01", np.nan], dtype=object), errors="ignore" + ) exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) @@ -1489,7 +1491,7 @@ def test_datetime_invalid_index(self, values, format): warn, match="Could not infer format", raise_on_extra_warnings=False ): res = to_datetime(values, errors="ignore", format=format) - tm.assert_index_equal(res, Index(values)) + tm.assert_index_equal(res, Index(values, dtype=object)) with tm.assert_produces_warning( warn, match="Could not infer format", raise_on_extra_warnings=False @@ -1585,28 +1587,23 @@ def test_convert_object_to_datetime_with_cache( @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( - ("input", "expected"), - ( - ( - Series([NaT] * 20 + [None] * 20, dtype="object"), - Series([NaT] * 40, dtype="datetime64[ns]"), - ), - ( - Series([NaT] * 60 + [None] * 60, dtype="object"), - Series([NaT] * 120, dtype="datetime64[ns]"), - ), - (Series([None] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([None] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([""] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([""] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([pd.NA] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([pd.NA] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([np.nan] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([np.nan] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - ), + "input", + [ + Series([NaT] * 20 + [None] * 20, dtype="object"), + Series([NaT] * 60 + [None] * 60, dtype="object"), + Series([None] * 20), + Series([None] * 60), + Series([""] * 20), + Series([""] * 60), + Series([pd.NA] * 20), + Series([pd.NA] * 60), + Series([np.nan] * 20), + Series([np.nan] * 60), + ], ) - def test_to_datetime_converts_null_like_to_nat(self, cache, input, expected): + def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 + expected = Series([NaT] * len(input), dtype="M8[ns]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1672,7 +1669,7 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): "errors, expected", [ ("coerce", Index([NaT, NaT])), - ("ignore", Index(["200622-12-31", "111111-24-11"])), + ("ignore", Index(["200622-12-31", "111111-24-11"], dtype=object)), ], ) def test_to_datetime_malformed_no_raise(self, errors, expected): @@ -1864,16 +1861,14 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request): result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache) tm.assert_index_equal(result, expected) - # TODO: this should also work - if isinstance(item, float): - request.applymarker( - pytest.mark.xfail( - reason=f"{type(item).__name__} in np.array should work" - ) - ) result = to_datetime(np.array([item]), unit=unit, cache=cache) tm.assert_index_equal(result, expected) + # with a nan! 
+ result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache) + assert result.isna()[1] + tm.assert_index_equal(result[:1], expected) + @pytest.mark.parametrize("unit", ["Y", "M"]) def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 @@ -1883,6 +1878,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): msg = f"Conversion of non-round float with unit={unit} is ambiguous" with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") + with pytest.raises(ValueError, match=msg): + to_datetime(np.array([1.5]), unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): to_datetime(["1.5"], unit=unit, errors="raise") @@ -1948,7 +1945,7 @@ def test_unit_array_mixed_nans_large_int(self, cache): tm.assert_index_equal(result, expected) result = to_datetime(values, errors="coerce", unit="s", cache=cache) - expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"]) + expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[ns]") tm.assert_index_equal(result, expected) msg = "cannot convert input 1420043460000000000000000 with the unit 's'" @@ -2030,10 +2027,14 @@ def test_unit_mixed(self, cache, arr): def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors # but no premature rounding - result = to_datetime(1434743731.8770001, unit="s", cache=cache) - expected = Timestamp("2015-06-19 19:55:31.877000192") + value = 1434743731.8770001 + result = to_datetime(value, unit="s", cache=cache) + expected = Timestamp("2015-06-19 19:55:31.877000093") assert result == expected + alt = Timestamp(value, unit="s") + assert alt == result + def test_unit_ignore_keeps_name(self, cache): # GH 21697 expected = Index([15e9] * 2, name="name") @@ -2682,7 +2683,7 @@ def test_string_na_nat_conversion_malformed(self, cache): result = to_datetime(malformed, errors="ignore", cache=cache) # GH 21864 - expected = Index(malformed) + expected = Index(malformed, dtype=object) tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): @@ -3671,7 +3672,7 @@ def test_to_datetime_mixed_not_necessarily_iso8601_raise(): ("errors", "expected"), [ ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])), - ("ignore", Index(["2020-01-01", "01-01-2000"])), + ("ignore", Index(["2020-01-01", "01-01-2000"], dtype=object)), ], ) def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index c4c9b41c218a0..e588bc83b0de8 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -232,6 +233,7 @@ def test_to_timedelta_on_missing_values_list(self, val): actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") + @pytest.mark.xfail(not IS64, reason="Floating point error") def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 arr = np.arange(0, 1, 1e-6)[-10:] diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index ee8f161858b2b..75528a8b99c4d 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ 
-431,12 +431,18 @@ def test_series_invalid_type(end): frequencies.infer_freq(s) -def test_series_inconvertible_string(): +def test_series_inconvertible_string(using_infer_string): # see gh-6407 - msg = "Unknown datetime string format" + if using_infer_string: + msg = "cannot infer freq from" - with pytest.raises(ValueError, match=msg): - frequencies.infer_freq(Series(["foo", "bar"])) + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(Series(["foo", "bar"])) + else: + msg = "Unknown datetime string format" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(Series(["foo", "bar"])) @pytest.mark.parametrize("freq", [None, "ms"]) diff --git a/pandas/tests/tseries/offsets/test_business_hour.py b/pandas/tests/tseries/offsets/test_business_hour.py index 227cf6a495c91..2779100f5355c 100644 --- a/pandas/tests/tseries/offsets/test_business_hour.py +++ b/pandas/tests/tseries/offsets/test_business_hour.py @@ -946,38 +946,6 @@ def test_apply_nanoseconds(self): for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - def test_datetimeindex(self): - idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="bh") - idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="bh") - idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="bh") - expected = DatetimeIndex( - [ - "2014-07-04 15:00", - "2014-07-04 16:00", - "2014-07-07 09:00", - "2014-07-07 10:00", - "2014-07-07 11:00", - "2014-07-07 12:00", - "2014-07-07 13:00", - "2014-07-07 14:00", - "2014-07-07 15:00", - "2014-07-07 16:00", - "2014-07-08 09:00", - "2014-07-08 10:00", - ], - freq="bh", - ) - for idx in [idx1, idx2, idx3]: - tm.assert_index_equal(idx, expected) - - idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="bh") - idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="bh") - idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="bh") - - expected = idx1 - for idx in [idx1, idx2, idx3]: - tm.assert_index_equal(idx, expected) - @pytest.mark.parametrize("td_unit", ["s", "ms", "us", "ns"]) @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_bday_ignores_timedeltas(self, unit, td_unit): diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 15e34c68c4d2f..632d3b4cc3c84 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -26,6 +26,12 @@ class TestArrayToDatetimeResolutionInference: # TODO: tests that include tzs, ints + def test_infer_all_nat(self): + arr = np.array([NaT, np.nan], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + assert result.dtype == "M8[s]" + def test_infer_homogeoneous_datetimes(self): dt = datetime(2023, 10, 27, 18, 3, 5, 678000) arr = np.array([dt, dt, dt], dtype=object) @@ -120,11 +126,11 @@ def test_array_to_datetime_with_tz_resolution_all_nat(self): tz = tzoffset("custom", 3600) vals = np.array(["NaT"], dtype=object) res = tslib.array_to_datetime_with_tz(vals, tz, False, False, creso_infer) - assert res.dtype == "M8[ns]" + assert res.dtype == "M8[s]" vals2 = np.array([NaT, NaT], dtype=object) res2 = tslib.array_to_datetime_with_tz(vals2, tz, False, False, creso_infer) - assert res2.dtype == "M8[ns]" + assert res2.dtype == "M8[s]" @pytest.mark.parametrize( diff --git a/pandas/tests/tslibs/test_strptime.py b/pandas/tests/tslibs/test_strptime.py index ce45bdd10b8e8..d726006b03f6d 100644 --- 
a/pandas/tests/tslibs/test_strptime.py +++ b/pandas/tests/tslibs/test_strptime.py @@ -9,13 +9,26 @@ from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.strptime import array_strptime -from pandas import Timestamp +from pandas import ( + NaT, + Timestamp, +) import pandas._testing as tm creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value class TestArrayStrptimeResolutionInference: + def test_array_strptime_resolution_all_nat(self): + arr = np.array([NaT, np.nan], dtype=object) + + fmt = "%Y-%m-%d %H:%M:%S" + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + assert res.dtype == "M8[s]" + + res, _ = array_strptime(arr, fmt=fmt, utc=True, creso=creso_infer) + assert res.dtype == "M8[s]" + @pytest.mark.parametrize("tz", [None, timezone.utc]) def test_array_strptime_resolution_inference_homogeneous_strings(self, tz): dt = datetime(2016, 1, 2, 3, 4, 5, 678900, tzinfo=tz) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 2d3b47cd2e994..0c93ee453bb1c 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -109,12 +109,16 @@ def test_empty_dtypes(check_dtype): @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_index_mismatch(check_like, obj_fixture): +def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" msg = f"""{obj_fixture}\\.index are different {obj_fixture}\\.index values are different \\(33\\.33333 %\\) -\\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\) +\\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='{dtype}'\\) +\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='{dtype}'\\) At positional index 2, first diff: c != d""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) @@ -125,12 +129,16 @@ def test_frame_equal_index_mismatch(check_like, obj_fixture): @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_columns_mismatch(check_like, obj_fixture): +def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" msg = f"""{obj_fixture}\\.columns are different {obj_fixture}\\.columns values are different \\(50\\.0 %\\) -\\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""" +\\[left\\]: Index\\(\\['A', 'B'\\], dtype='{dtype}'\\) +\\[right\\]: Index\\(\\['A', 'b'\\], dtype='{dtype}'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 15263db9ec645..dc6efdcec380e 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -205,14 +205,18 @@ def test_index_equal_names(name1, name2): tm.assert_index_equal(idx1, idx2) -def test_index_equal_category_mismatch(check_categorical): - msg = """Index are different +def test_index_equal_category_mismatch(check_categorical, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" + msg = f"""Index are different Attribute "dtype" are different \\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \ 
-categories_dtype=object\\) +categories_dtype={dtype}\\) \\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ -ordered=False, categories_dtype=object\\)""" +ordered=False, categories_dtype={dtype}\\)""" idx1 = Index(Categorical(["a", "b"])) idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"])) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 12b5987cdb3de..ffc5e105d6f2f 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -214,8 +214,18 @@ def test_series_equal_numeric_values_mismatch(rtol): tm.assert_series_equal(s1, s2, rtol=rtol) -def test_series_equal_categorical_values_mismatch(rtol): - msg = """Series are different +def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): + if using_infer_string: + msg = """Series are different + +Series values are different \\(66\\.66667 %\\) +\\[index\\]: \\[0, 1, 2\\] +\\[left\\]: \\['a', 'b', 'c'\\] +Categories \\(3, string\\): \\[a, b, c\\] +\\[right\\]: \\['a', 'c', 'b'\\] +Categories \\(3, string\\): \\[a, b, c\\]""" + else: + msg = """Series are different Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] @@ -246,14 +256,18 @@ def test_series_equal_datetime_values_mismatch(rtol): tm.assert_series_equal(s1, s2, rtol=rtol) -def test_series_equal_categorical_mismatch(check_categorical): - msg = """Attributes of Series are different +def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" + msg = f"""Attributes of Series are different Attribute "dtype" are different \\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \ -categories_dtype=object\\) +categories_dtype={dtype}\\) \\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ -ordered=False, categories_dtype=object\\)""" +ordered=False, categories_dtype={dtype}\\)""" s1 = Series(Categorical(["a", "b"])) s2 = Series(Categorical(["a", "b"], categories=list("abc"))) diff --git a/pandas/tests/util/test_make_objects.py b/pandas/tests/util/test_make_objects.py deleted file mode 100644 index 74e2366db2a1c..0000000000000 --- a/pandas/tests/util/test_make_objects.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -Tests for tm.makeFoo functions. 
-""" - - -import numpy as np - -import pandas._testing as tm - - -def test_make_multiindex_respects_k(): - # GH#38795 respect 'k' arg - N = np.random.default_rng(2).integers(0, 100) - mi = tm.makeMultiIndex(k=N) - assert len(mi) == N diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 4e4eca6e772e7..136f81632cb0a 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -301,8 +301,9 @@ def test_center_reindex_series(raw, series): tm.assert_series_equal(series_xp, series_rs) -def test_center_reindex_frame(raw, frame): +def test_center_reindex_frame(raw): # shifter index + frame = DataFrame(range(100), index=date_range("2020-01-01", freq="D", periods=100)) s = [f"x{x:d}" for x in range(12)] minp = 10 diff --git a/pyproject.toml b/pyproject.toml index 38a22b2e7f90d..65c647e439cb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,17 +76,19 @@ hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/i #'blosc>=1.20.1', 'tables>=3.8.0'] spss = ['pyreadstat>=1.2.0'] -postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6'] +postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.8.0'] mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2'] -sql-other = ['SQLAlchemy>=2.0.0'] +sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.8.0', 'adbc-driver-sqlite>=0.8.0'] html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2'] xml = ['lxml>=4.9.2'] plot = ['matplotlib>=3.6.3'] output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0'] -clipboard = ['PyQt5>=5.15.8', 'qtpy>=2.3.0'] +clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0'] compression = ['zstandard>=0.19.0'] consortium-standard = ['dataframe-api-compat>=0.1.7'] -all = ['beautifulsoup4>=4.11.2', +all = ['adbc-driver-postgresql>=0.8.0', + 'adbc-driver-sqlite>=0.8.0', + 'beautifulsoup4>=4.11.2', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.3', 'bottleneck>=1.3.6', @@ -107,7 +109,7 @@ all = ['beautifulsoup4>=4.11.2', 'psycopg2>=2.9.6', 'pyarrow>=10.0.1', 'pymysql>=1.0.2', - 'PyQt5>=5.15.8', + 'PyQt5>=5.15.9', 'pyreadstat>=1.2.0', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', diff --git a/requirements-dev.txt b/requirements-dev.txt index 23a8112bdb344..fa3ef1c214a14 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,6 +9,8 @@ meson-python==0.13.1 pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 +pytest-qt>=4.2.0 +PyQt5>=5.15.9 coverage python-dateutil numpy<2 @@ -81,6 +83,8 @@ feedparser pyyaml requests pygments +adbc-driver-postgresql>=0.8.0 +adbc-driver-sqlite>=0.8.0 dataframe-api-compat>=0.1.7 sphinx-toggleprompt typing_extensions; python_version<"3.11" diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 49fecca253ba4..5fcf09cd073fe 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -25,12 +25,13 @@ EXCLUDE = {"python", "c-compiler", "cxx-compiler"} REMAP_VERSION = {"tzdata": "2022.7"} -RENAME = { +CONDA_TO_PIP = { "pytables": "tables", "psycopg2": "psycopg2-binary", "dask-core": "dask", "seaborn-base": "seaborn", "sqlalchemy": "SQLAlchemy", + "pyqt": "PyQt5", } @@ -40,7 +41,7 @@ def conda_package_to_pip(package: str): In most cases they are the same, those are the exceptions: - Packages that should be excluded (in `EXCLUDE`) - - Packages that should be renamed (in `RENAME`) + - Packages that should be renamed (in `CONDA_TO_PIP`) - A package requiring a specific version, in 
conda is defined with a single equal (e.g. ``pandas=1.0``) and in pip with two (e.g. ``pandas==1.0``) """ @@ -53,14 +54,14 @@ def conda_package_to_pip(package: str): return if pkg in REMAP_VERSION: return "".join((pkg, compare, REMAP_VERSION[pkg])) - if pkg in RENAME: - return "".join((RENAME[pkg], compare, version)) + if pkg in CONDA_TO_PIP: + return "".join((CONDA_TO_PIP[pkg], compare, version)) if package in EXCLUDE: return - if package in RENAME: - return RENAME[package] + if package in CONDA_TO_PIP: + return CONDA_TO_PIP[package] return package diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py index 35cbbef08124e..6307afa1bc822 100644 --- a/scripts/run_stubtest.py +++ b/scripts/run_stubtest.py @@ -69,7 +69,6 @@ "pandas._libs.sparse.SparseIndex.to_block_index", "pandas._libs.sparse.SparseIndex.to_int_index", # TODO (decorator changes argument names) - "pandas._libs.tslibs.offsets.BaseOffset._apply_array", "pandas._libs.tslibs.offsets.BusinessHour.rollback", "pandas._libs.tslibs.offsets.BusinessHour.rollforward ", # type alias diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index d5c491f6ebc12..7dd3e96e6ec18 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -26,7 +26,7 @@ from typing import Any -from scripts.generate_pip_deps_from_conda import RENAME +from scripts.generate_pip_deps_from_conda import CONDA_TO_PIP DOC_PATH = pathlib.Path("doc/source/getting_started/install.rst").resolve() CI_PATH = next( @@ -36,7 +36,7 @@ SETUP_PATH = pathlib.Path("pyproject.toml").resolve() YAML_PATH = pathlib.Path("ci/deps") ENV_PATH = pathlib.Path("environment.yml") -EXCLUDE_DEPS = {"tzdata", "blosc", "pandas-gbq"} +EXCLUDE_DEPS = {"tzdata", "blosc", "pandas-gbq", "pyqt", "pyqt5"} EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"]) # pandas package is not available # in pre-commit environment @@ -169,8 +169,8 @@ def pin_min_versions_to_yaml_file( old_dep = yaml_package if yaml_versions is not None: old_dep = old_dep + ", ".join(yaml_versions) - if RENAME.get(yaml_package, yaml_package) in toml_map: - min_dep = toml_map[RENAME.get(yaml_package, yaml_package)] + if CONDA_TO_PIP.get(yaml_package, yaml_package) in toml_map: + min_dep = toml_map[CONDA_TO_PIP.get(yaml_package, yaml_package)] elif yaml_package in toml_map: min_dep = toml_map[yaml_package] else: