Commit

Merge branch 'main' of github.com:pandas-dev/pandas into arrow-to-csv
lithomas1 committed Nov 26, 2023
2 parents cb5f6cd + 762b61d commit a9d3cc4
Showing 168 changed files with 2,987 additions and 1,393 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/unit-tests.yml
@@ -88,14 +88,15 @@ jobs:
name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
env:
PATTERN: ${{ matrix.pattern }}
EXTRA_APT: ${{ matrix.extra_apt || '' }}
LANG: ${{ matrix.lang || 'C.UTF-8' }}
LC_ALL: ${{ matrix.lc_all || '' }}
PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
TEST_ARGS: ${{ matrix.test_args || '' }}
PYTEST_WORKERS: 'auto'
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
# Clipboard tests
QT_QPA_PLATFORM: offscreen
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }}
@@ -145,8 +146,8 @@ jobs:
fetch-depth: 0

- name: Extra installs
# xsel for clipboard tests
run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }}
run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }}
if: ${{ matrix.extra_apt }}

- name: Generate extra locales
# These extra locales will be available for locale.setlocale() calls in tests
3 changes: 2 additions & 1 deletion asv_bench/benchmarks/strings.py
@@ -245,7 +245,8 @@ def time_extract_single_group(self, dtype, expand):
class Dummies(Dtypes):
def setup(self, dtype):
super().setup(dtype)
self.s = self.s.str.join("|")
N = len(self.s) // 5
self.s = self.s[:N].str.join("|")

def time_get_dummies(self, dtype):
self.s.str.get_dummies("|")
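The benchmark above exercises ``Series.str.get_dummies``, which splits each string on a separator, collects the categories across the Series, and emits one 0/1 indicator column per category. A pure-Python sketch of that semantics (the ``get_dummies`` helper here is illustrative only; no pandas required):

```python
# Pure-Python sketch of the semantics behind Series.str.get_dummies("|"):
# split each string on the separator, collect all categories across the
# inputs, and emit one 0/1 indicator row per input string.
def get_dummies(values, sep="|"):
    cats = sorted({c for v in values for c in v.split(sep)})
    rows = [[1 if c in set(v.split(sep)) else 0 for c in cats] for v in values]
    return cats, rows

cats, rows = get_dummies(["a|b", "b|c"])
print(cats)  # → ['a', 'b', 'c']
print(rows)  # → [[1, 1, 0], [0, 1, 1]]
```

Shrinking the input (as the benchmark change above does with ``self.s[:N]``) reduces both the number of rows and the category set the indicator matrix has to cover.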
5 changes: 4 additions & 1 deletion ci/deps/actions-310.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-localserver>=0.7.1
- pytest-qt>=4.2.0
- boto3

# required dependencies
@@ -42,6 +43,7 @@ dependencies:
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pymysql>=1.0.2
- pyqt>=5.15.9
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.6
@@ -56,5 +58,6 @@ dependencies:
- zstandard>=0.19.0

- pip:
- pyqt5>=5.15.8
- adbc-driver-postgresql>=0.8.0
- adbc-driver-sqlite>=0.8.0
- tzdata>=2022.7
5 changes: 4 additions & 1 deletion ci/deps/actions-311-downstream_compat.yaml
@@ -16,6 +16,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-localserver>=0.7.1
- pytest-qt>=4.2.0
- boto3

# required dependencies
@@ -43,6 +44,7 @@ dependencies:
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pymysql>=1.0.2
- pyqt>=5.15.9
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.6
@@ -70,6 +72,7 @@ dependencies:
- pyyaml
- py
- pip:
- adbc-driver-postgresql>=0.8.0
- adbc-driver-sqlite>=0.8.0
- dataframe-api-compat>=0.1.7
- pyqt5>=5.15.8
- tzdata>=2022.7
5 changes: 4 additions & 1 deletion ci/deps/actions-311.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-localserver>=0.7.1
- pytest-qt>=4.2.0
- boto3

# required dependencies
@@ -38,6 +39,7 @@ dependencies:
- numexpr>=2.8.4
- odfpy>=1.4.1
- qtpy>=2.3.0
- pyqt>=5.15.9
- openpyxl>=3.1.0
- psycopg2>=2.9.6
- pyarrow>=10.0.1
@@ -56,5 +58,6 @@ dependencies:
- zstandard>=0.19.0

- pip:
- pyqt5>=5.15.8
- adbc-driver-postgresql>=0.8.0
- adbc-driver-sqlite>=0.8.0
- tzdata>=2022.7
5 changes: 4 additions & 1 deletion ci/deps/actions-39-minimum_versions.yaml
@@ -17,6 +17,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-localserver>=0.7.1
- pytest-qt>=4.2.0
- boto3

# required dependencies
@@ -44,6 +45,7 @@ dependencies:
- psycopg2=2.9.6
- pyarrow=10.0.1
- pymysql=1.0.2
- pyqt=5.15.9
- pyreadstat=1.2.0
- pytables=3.8.0
- python-calamine=0.1.6
@@ -58,6 +60,7 @@ dependencies:
- zstandard=0.19.0

- pip:
- adbc-driver-postgresql==0.8.0
- adbc-driver-sqlite==0.8.0
- dataframe-api-compat==0.1.7
- pyqt5==5.15.8
- tzdata==2022.7
5 changes: 4 additions & 1 deletion ci/deps/actions-39.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-localserver>=0.7.1
- pytest-qt>=4.2.0
- boto3

# required dependencies
@@ -42,6 +43,7 @@ dependencies:
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pymysql>=1.0.2
- pyqt>=5.15.9
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.6
@@ -56,5 +58,6 @@ dependencies:
- zstandard>=0.19.0

- pip:
- pyqt5>=5.15.8
- adbc-driver-postgresql>=0.8.0
- adbc-driver-sqlite>=0.8.0
- tzdata>=2022.7
5 changes: 5 additions & 0 deletions ci/deps/circle-310-arm64.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-localserver>=0.7.1
- pytest-qt>=4.2.0
- boto3

# required dependencies
@@ -42,6 +43,7 @@ dependencies:
- psycopg2>=2.9.6
- pyarrow>=10.0.1
- pymysql>=1.0.2
- pyqt>=5.15.9
- pyreadstat>=1.2.0
- pytables>=3.8.0
- python-calamine>=0.1.6
@@ -54,3 +56,6 @@ dependencies:
- xlrd>=2.0.1
- xlsxwriter>=3.0.5
- zstandard>=0.19.0
- pip:
- adbc-driver-postgresql>=0.8.0
- adbc-driver-sqlite>=0.8.0
2 changes: 1 addition & 1 deletion doc/source/development/contributing_environment.rst
@@ -86,7 +86,7 @@ Before we begin, please:
Option 1: using mamba (recommended)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* Install `mamba <https://mamba.readthedocs.io/en/latest/installation.html>`_
* Install `mamba <https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html>`_
* Make sure your mamba is up to date (``mamba update mamba``)

.. code-block:: none
6 changes: 4 additions & 2 deletions doc/source/getting_started/install.rst
@@ -335,7 +335,7 @@ lxml 4.9.2 xml XML parser for read
SQL databases
^^^^^^^^^^^^^

Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``.
Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``

========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
@@ -345,6 +345,8 @@ SQLAlchemy 2.0.0 postgresql, SQL support for dat
sql-other
psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy
pymysql 1.0.2 mysql MySQL engine for sqlalchemy
adbc-driver-postgresql 0.8.0 postgresql ADBC Driver for PostgreSQL
adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite
========================= ================== =============== =============================================================

Other data sources
@@ -395,7 +397,7 @@ Installable with ``pip install "pandas[clipboard]"``.
========================= ================== =============== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== =============== =============================================================
PyQt4/PyQt5 5.15.8 clipboard Clipboard I/O
PyQt4/PyQt5 5.15.9 clipboard Clipboard I/O
qtpy 2.3.0 clipboard Clipboard I/O
========================= ================== =============== =============================================================

114 changes: 104 additions & 10 deletions doc/source/user_guide/io.rst
@@ -5565,9 +5565,23 @@ SQL queries
-----------

The :mod:`pandas.io.sql` module provides a collection of query wrappers to both
facilitate data retrieval and to reduce dependency on DB-specific API. Database abstraction
is provided by SQLAlchemy if installed. In addition you will need a driver library for
your database. Examples of such drivers are `psycopg2 <https://www.psycopg.org/>`__
facilitate data retrieval and to reduce dependency on DB-specific API.

Where available, users may first want to opt for `Apache Arrow ADBC
<https://arrow.apache.org/adbc/current/index.html>`_ drivers. These drivers
should provide the best performance, null handling, and type detection.

.. versionadded:: 2.2.0

Added native support for ADBC drivers

For a full list of ADBC drivers and their development status, see the `ADBC Driver
Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
documentation.

Where an ADBC driver is not available or may be missing functionality,
users should opt for installing SQLAlchemy alongside their database driver library.
Examples of such drivers are `psycopg2 <https://www.psycopg.org/>`__
for PostgreSQL or `pymysql <https://github.com/PyMySQL/PyMySQL>`__ for MySQL.
For `SQLite <https://docs.python.org/3/library/sqlite3.html>`__ this is
included in Python's standard library by default.
@@ -5600,6 +5614,18 @@ In the following example, we use the `SQlite <https://www.sqlite.org/index.html>
engine. You can use a temporary SQLite database where data are stored in
"memory".

To connect using an ADBC driver you will want to install the ``adbc_driver_sqlite`` using your
package manager. Once installed, you can use the DBAPI interface provided by the ADBC driver
to connect to your database.

.. code-block:: python

   import adbc_driver_sqlite.dbapi as sqlite_dbapi

   # Create the connection
   with sqlite_dbapi.connect("sqlite:///:memory:") as conn:
        df = pd.read_sql_table("data", conn)

To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine
object from database URI. You only need to create the engine once per database you are
connecting to.
@@ -5675,9 +5701,74 @@ writes ``data`` to the database in batches of 1000 rows at a time:
SQL data types
++++++++++++++

:func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate
SQL data type based on the dtype of the data. When you have columns of dtype
``object``, pandas will try to infer the data type.
Ensuring consistent data type management across SQL databases is challenging.
Not every SQL database offers the same types, and even when they do the implementation
of a given type can vary in ways that have subtle effects on how types can be
preserved.

For the best odds at preserving database types users are advised to use
ADBC drivers when available. The Arrow type system offers a wider array of
types that more closely match database types than the historical pandas/NumPy
type system. To illustrate, note this (non-exhaustive) listing of types
available in different databases and pandas backends:

+-----------------+-----------------------+----------------+---------+
|numpy/pandas |arrow |postgres |sqlite |
+=================+=======================+================+=========+
|int16/Int16 |int16 |SMALLINT |INTEGER |
+-----------------+-----------------------+----------------+---------+
|int32/Int32 |int32 |INTEGER |INTEGER |
+-----------------+-----------------------+----------------+---------+
|int64/Int64 |int64 |BIGINT |INTEGER |
+-----------------+-----------------------+----------------+---------+
|float32 |float32 |REAL |REAL |
+-----------------+-----------------------+----------------+---------+
|float64 |float64 |DOUBLE PRECISION|REAL |
+-----------------+-----------------------+----------------+---------+
|object |string |TEXT |TEXT |
+-----------------+-----------------------+----------------+---------+
|bool |``bool_`` |BOOLEAN | |
+-----------------+-----------------------+----------------+---------+
|datetime64[ns] |timestamp(us) |TIMESTAMP | |
+-----------------+-----------------------+----------------+---------+
|datetime64[ns,tz]|timestamp(us,tz) |TIMESTAMPTZ | |
+-----------------+-----------------------+----------------+---------+
| |date32 |DATE | |
+-----------------+-----------------------+----------------+---------+
| |month_day_nano_interval|INTERVAL | |
+-----------------+-----------------------+----------------+---------+
| |binary |BINARY |BLOB |
+-----------------+-----------------------+----------------+---------+
| |decimal128 |DECIMAL [#f1]_ | |
+-----------------+-----------------------+----------------+---------+
| |list |ARRAY [#f1]_ | |
+-----------------+-----------------------+----------------+---------+
| |struct |COMPOSITE TYPE | |
| | | [#f1]_ | |
+-----------------+-----------------------+----------------+---------+

.. rubric:: Footnotes

.. [#f1] Not implemented as of writing, but theoretically possible

If you are interested in preserving database types as best as possible
throughout the lifecycle of your DataFrame, users are encouraged to
leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql`

.. code-block:: ipython

   # for roundtripping
   with pg_dbapi.connect(uri) as conn:
       df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")

This will prevent your data from being converted to the traditional pandas/NumPy
type system, which often converts SQL types in ways that make them impossible to
round-trip.

In case an ADBC driver is not available, :func:`~pandas.DataFrame.to_sql`
will try to map your data to an appropriate SQL data type based on the dtype of
the data. When you have columns of dtype ``object``, pandas will try to infer
the data type.

You can always override the default type by specifying the desired SQL type of
any of the columns by using the ``dtype`` argument. This argument needs a
@@ -5696,7 +5787,9 @@ default ``Text`` type for string columns:

Due to the limited support for timedelta's in the different database
flavors, columns with type ``timedelta64`` will be written as integer
values as nanoseconds to the database and a warning will be raised.
values as nanoseconds to the database and a warning will be raised. The only
exception to this is when using the ADBC PostgreSQL driver in which case a
timedelta will be written to the database as an ``INTERVAL``
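The nanosecond conversion described above can be sketched with the standard library alone; pandas' internal path differs, so treat this purely as an illustration of the arithmetic:

```python
from datetime import timedelta

# A timedelta64 column is written as integer nanoseconds: whole seconds
# times 1e9. This mirrors the documented behavior only; pandas' actual
# conversion works on the underlying int64 representation directly.
td = timedelta(days=1, seconds=2)
ns = int(td.total_seconds()) * 1_000_000_000
print(ns)  # → 86402000000000
```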

.. note::

@@ -5711,7 +5804,7 @@
Datetime data types
'''''''''''''''''''

Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing
Using ADBC or SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing
datetime data that is timezone naive or timezone aware. However, the resulting
data stored in the database ultimately depends on the supported data type
for datetime data of the database system being used.
@@ -5802,15 +5895,16 @@ table name and optionally a subset of columns to read.
.. note::

In order to use :func:`~pandas.read_sql_table`, you **must** have the
SQLAlchemy optional dependency installed.
ADBC driver or SQLAlchemy optional dependency installed.

.. ipython:: python

   pd.read_sql_table("data", engine)

.. note::

Note that pandas infers column dtypes from query outputs, and not by looking
ADBC drivers will map database types directly back to arrow types. For other drivers
note that pandas infers column dtypes from query outputs, and not by looking
up data types in the physical database schema. For example, assume ``userid``
is an integer column in a table. Then, intuitively, ``select userid ...`` will
return integer-valued series, while ``select cast(userid as text) ...`` will
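The ``io.rst`` changes above center on the DBAPI pattern: ``connect(...)`` yields a context-managed connection that is handed to ``read_sql``/``read_sql_table``. A minimal self-contained sketch of that pattern, using the standard library's ``sqlite3`` module as a stand-in for an ADBC driver (so neither pandas nor ADBC needs to be installed to follow along):

```python
import sqlite3

# DBAPI pattern sketch: ADBC drivers expose a connect() returning a
# context-managed connection used the same way; stdlib sqlite3 stands in
# here so the example is self-contained.
with sqlite3.connect(":memory:") as conn:
    conn.execute("CREATE TABLE data (userid INTEGER, name TEXT)")
    conn.executemany("INSERT INTO data VALUES (?, ?)", [(1, "a"), (2, "b")])
    rows = conn.execute("SELECT userid, name FROM data ORDER BY userid").fetchall()

print(rows)  # → [(1, 'a'), (2, 'b')]
```

With a real ADBC driver, the ``conn`` object in the same ``with`` block would be passed to ``pd.read_sql("data", conn)`` instead of queried directly.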
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.1.4.rst
@@ -22,9 +22,10 @@ Fixed regressions
Bug fixes
~~~~~~~~~
- Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`)
- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`)
- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
-
- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)

.. ---------------------------------------------------------------------------
.. _whatsnew_214.other:
