diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index b107a5d3ba100..3b0b15585a389 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -134,6 +134,7 @@ MultiIndex I/O ^^^ - :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`) +- Bug in :func:`read_sql` raising an unintended exception when converting byte data to string with ``dtype_backend="pyarrow"`` (:issue:`59242`) - Period diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1dd1b12d6ae95..a430e8f23c046 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2261,7 +2261,9 @@ def type(self): elif pa.types.is_null(pa_type): # TODO: None? pd.NA? pa.null? return type(pa_type) - elif isinstance(pa_type, pa.ExtensionType): + elif isinstance(pa_type, pa.ExtensionType) or isinstance( + pa_type, pa.OpaqueType + ): return type(self)(pa_type.storage_type).type raise NotImplementedError(pa_type) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index dfff34656f82b..ef5b144ee690b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -967,10 +967,6 @@ def convert(arr): # i.e. 
maybe_convert_objects didn't convert convert_to_nullable_dtype = dtype_backend != "numpy" arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype) - if convert_to_nullable_dtype and arr.dtype == np.dtype("O"): - new_dtype = StringDtype() - arr_cls = new_dtype.construct_array_type() - arr = arr_cls._from_sequence(arr, dtype=new_dtype) elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": arr = pd_array(arr, copy=False) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7e1220ecee218..51c5235996907 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4358,3 +4358,46 @@ def test_xsqlite_if_exists(sqlite_buildin): (5, "E"), ] drop_table(table_name, sqlite_buildin) + + +@pytest.mark.parametrize("con", all_connectable) +@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default]) +def test_bytes_column(con, dtype_backend, request): + # GitHub Issue #59242 + conn = request.getfixturevalue(con) + pa = pytest.importorskip("pyarrow") + + dtype = "O" + val = b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" + + if "postgres" in con: + val = ( + b"\x00\x00\x00\x80\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" + if "adbc" in con + else "0000000100100011010001010110011110001001101010" + "11110011011110111100000001001000110100010101100" + "11110001001101010111100110111101111" + ) + if dtype_backend == "pyarrow": + dtype = ( + pd.ArrowDtype(pa.string()) + if "adbc" not in con + else pd.ArrowDtype(pa.opaque(pa.binary(), "bit", "PostgreSQL")) + ) + + if "psycopg2" in con: + if dtype_backend == "numpy_nullable": + dtype = pd.StringDtype() + elif dtype_backend == lib.no_default and pd.options.future.infer_string: + dtype = pd.StringDtype(storage="pyarrow", na_value=np.nan) + + if "postgres" not in con and dtype_backend == "pyarrow": + dtype = pd.ArrowDtype(pa.binary()) + + expected = DataFrame([{"a": val}], dtype=dtype) + df = pd.read_sql( + "select 
x'0123456789abcdef0123456789abcdef' a", + conn, + dtype_backend=dtype_backend, + ) + tm.assert_frame_equal(df, expected)