diff --git a/.circleci/config.yml b/.circleci/config.yml index 50ff7a81ae103..2c52d7aee4e28 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,7 +15,6 @@ jobs: - checkout - run: .circleci/setup_env.sh - run: | - sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \ LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \ ci/run_tests.sh diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 3e699e1a27b55..473d67acf6e74 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -119,7 +119,7 @@ Interval Indexing ^^^^^^^^ -- +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) - Missing diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 75db47bf3160e..9c3791a642768 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # ndarray[uint64_t, ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ee6a11ddab004..365cc7c3cecfc 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine): return loc +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + bint uses_na + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + self.uses_na = na_value is C_NA + + cdef bint _checknull(self, object val): + if self.uses_na: + return val is C_NA + else: + return util.is_nan(val) + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif self._checknull(val): + return self.na_value + else: + raise KeyError(val) + + cdef class DatetimeEngine(Int64Engine): cdef: diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..71a4d3ae2575f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -86,6 +86,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: Literal[False] = ..., + convert_string: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @@ -97,6 +98,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @@ -108,6 +110,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc039917aef87..f72d6a5dad877 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, + bint convert_string=True, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif using_string_dtype() and is_string_array(objects, skipna=True): + elif ( + convert_string + and using_string_dtype() + and is_string_array(objects, skipna=True) + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 13e10c8d3a738..0c1e1d0c63c85 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1633,7 +1633,11 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a1ef8a3e27e9c..d4263f7488a14 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -87,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -1586,7 +1586,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1604,10 +1604,11 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter result = np.empty(len(values), dtype="object") - result[:] = values + for i, obj in enumerate(values): + result[i] = obj return result diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5da327a82c02b..ad39907e7400e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -884,6 +884,8 @@ def _engine( # error: Item "ExtensionArray" of "Union[ExtensionArray, # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] + elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -6133,7 +6135,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): @@ -6695,7 +6696,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 4162ebc33f0d6..53f18883ea3ad 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -6,6 +6,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency from pandas.errors import SettingWithCopyError @@ -124,8 +126,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -324,8 +324,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Add to our list of strings str_list[i] = string - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers + if using_string_dtype(): + res = pd.Series(str_list, dtype="str") + else: + res = np.asarray(str_list, dtype="object") # type: ignore[assignment] + + return res, buffers # type: ignore[return-value] def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a68e1a5e9c3e1..4a62f58077aee 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -563,7 +563,12 @@ def _maybe_downcast( return blocks nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + [ + blk.convert( + using_cow=using_cow, copy=not using_cow, convert_string=False + ) + for blk in blocks + ] ) if caller == "fillna": if len(nbs) != len(blocks) or not all( @@ -636,6 +641,7 @@ def convert( *, copy: bool = True, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Attempt to coerce any object types to better types. Return a copy @@ -648,7 +654,10 @@ def convert( if self.ndim != 1 and self.shape[0] != 1: blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow + Block.convert, + copy=copy, + using_cow=using_cow, + convert_string=convert_string, ) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op @@ -666,6 +675,7 @@ def convert( res_values = lib.maybe_convert_objects( values, # type: ignore[arg-type] convert_non_numeric=True, + convert_string=convert_string, ) refs = None if ( @@ -851,6 +861,7 @@ def replace( mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, already_warned=None, + convert_string=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -915,7 +926,11 @@ def replace( if get_option("future.no_silent_downcasting") is True: blocks = [blk] else: - blocks = blk.convert(copy=False, using_cow=using_cow) + blocks = blk.convert( + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype != _dtype_obj, + ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( # GH#54710 @@ -944,6 +959,7 @@ def replace( inplace=True, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: @@ -958,6 +974,7 @@ def replace( inplace=True, mask=mask[i : i + 1], using_cow=using_cow, + convert_string=convert_string, ) ) return blocks @@ -970,6 +987,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, + convert_string: bool = True, already_warned=None, ) -> list[Block]: """ @@ -1029,7 +1047,9 @@ def _replace_regex( ) already_warned.warned_already = True - nbs = block.convert(copy=False, using_cow=using_cow) + nbs = block.convert( + copy=False, using_cow=using_cow, convert_string=convert_string + ) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: warnings.warn( @@ -1068,6 +1088,8 @@ def replace_list( values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] + convert_string = self.dtype != _dtype_obj + # Exclude anything that we know we won't contain pairs = [ (x, y) @@ -1152,6 +1174,7 @@ def replace_list( inplace=inplace, regex=regex, using_cow=using_cow, + convert_string=convert_string, ) if using_cow and i != src_len: @@ -1174,7 +1197,9 @@ def replace_list( nbs = [] for res_blk in result: converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow + copy=True and not using_cow, + using_cow=using_cow, + convert_string=convert_string, ) if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( @@ -1204,6 +1229,7 @@ def _replace_coerce( inplace: bool = True, regex: bool = False, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1233,6 +1259,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: if value is None: @@ -1256,6 +1283,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) # --------------------------------------------------------------------- diff --git a/pandas/io/_util.py b/pandas/io/_util.py index f3e6dba1391be..9373888e28d28 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,14 +1,27 @@ from __future__ import annotations -from typing import Callable +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np +from pandas._config import using_string_dtype + +from pandas._libs import lib from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + + import pyarrow + + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -30,7 +43,7 @@ def _arrow_dtype_mapping() -> dict: } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") mapping = { @@ -41,3 +54,35 @@ def arrow_string_types_mapper() -> Callable: mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) return mapping.get + + +def arrow_table_to_pandas( + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, +) -> pd.DataFrame: + if to_pandas_kwargs is None: + to_pandas_kwargs = {} + + pa = import_optional_dependency("pyarrow") + + types_mapper: type[pd.ArrowDtype] | None | Callable + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + types_mapper = _arrow_string_types_mapper() + elif dtype_backend is lib.no_default or dtype_backend == "numpy": + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return df diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 68c73483add3f..1bdb732cb10de 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -13,11 +13,10 @@ from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -128,16 +127,4 @@ def read_feather( pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 9414f45215029..c0499ce750cf0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -52,6 +51,7 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, @@ -997,18 +997,7 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - return pa_table.to_pandas(types_mapper=mapping) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 5706336b71697..d7f473a929568 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,16 +9,13 @@ Literal, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -117,21 +114,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if using_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cc33c87dfc55a..01e320cdb1b72 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,9 +10,11 @@ Literal, ) import warnings -from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) -from pandas._config import using_string_dtype from pandas._config.config import _get_option from pandas._libs import lib @@ -22,14 +24,13 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -250,20 +251,10 @@ def read( kwargs["use_pandas_metadata"] = True to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = _get_option("mode.data_manager", silent=True) if manager == "array": - to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] - + to_pandas_kwargs["split_blocks"] = True path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -278,7 +269,18 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if manager == "array": result = result._as_manager("array", copy=False) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index c774638fd73f7..a7f01e6322755 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -287,17 +281,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 09f0f2af8e5c6..40e3ea6450647 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] converters = self._clean_mapping(self.converters) - for i, arr in enumerate(index): + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): if try_parse_dates and self._should_parse_dates(i): arr = self._date_conv( arr, @@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) + else: + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - return index + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _convert_to_ndarrays( @@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 07c95806d7326..7027702a696fe 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -49,10 +49,7 @@ is_object_dtype, is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - DatetimeTZDtype, -) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas import get_option @@ -68,6 +65,8 @@ from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime +from pandas.io._util import arrow_table_to_pandas + if TYPE_CHECKING: from collections.abc import ( Iterator, @@ -2221,23 +2220,10 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - mapping = arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(stmt) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2305,23 +2291,10 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - mapping = arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(sql) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index d2b76decaa75d..80f0349b205e6 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -7,6 +7,7 @@ ) import warnings +import matplotlib as mpl from matplotlib.artist import setp import numpy as np @@ -20,6 +21,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +56,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 898b5b25e7b01..98441c5afbaa4 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -57,7 +57,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 8963265b0a800..68f3fe36546a0 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -218,18 +218,12 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - if df.dtypes.iloc[0].storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) + expected = (expected, NotImplementedError) msg = ( - "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + "can't multiply sequence by non-int of type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - if series.dtype.storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) - msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" + if using_infer_string and func in ("cumprod", np.cumprod, np.nancumprod): + expected = (expected, NotImplementedError) + + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 8956aed5e9ceb..f916567c6b883 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_number from pandas import ( @@ -88,7 +86,6 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -147,7 +144,6 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -170,10 +166,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func in ("cumsum", np.cumsum, np.nancumsum): + request.applymarker( + pytest.mark.xfail( + raises=(TypeError, NotImplementedError), + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0ce7a66e0e00c..470ca0673c60e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -436,7 +436,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) @@ -1637,7 +1637,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 98dd1c5cb615f..6292e6051aa90 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -90,6 +90,31 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 31a0837290cd5..c0fc72768e27f 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -128,19 +128,15 @@ def test_fillna_different_dtype(self): [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - msg = "Downcasting object dtype arrays" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) # column is originally float (all-NaN) -> filling with string gives object dtype - # expected[2] = expected[2].astype("object") + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - msg = "Downcasting object dtype arrays" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -379,13 +375,8 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna("nan") - else: - result = df.fillna("nan") - # expected = DataFrame("nan", dtype="object", index=range(3),columns=["A", "B"]) - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 7003957b532e3..a62d51987f454 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -280,20 +280,12 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character( - self, any_string_dtype, using_infer_string - ): + def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) - - else: - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): @@ -430,31 +422,12 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, - data, - to_replace, - expected, - frame_or_series, - any_string_dtype, - using_infer_string, - request, + self, data, to_replace, expected, frame_or_series, any_string_dtype ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - if len(to_replace) > 1 and isinstance(obj, DataFrame): - request.node.add_marker( - pytest.mark.xfail( - reason="object input array that gets downcasted raises on " - "second pass" - ) - ) - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = obj.replace(to_replace, regex=True) - dtype = "str" - else: - result = obj.replace(to_replace, regex=True) + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index bdb9b2c055061..0731750aed0cf 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas.util._test_decorators as td @@ -41,6 +42,9 @@ def test_to_numpy_copy(self, using_copy_on_write): else: assert df.to_numpy(copy=False, na_value=np.nan).base is arr + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fd770b368c9da..f16068e0b6538 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2450,6 +2450,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 447df952fd0e5..cba02ae869889 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -67,6 +67,7 @@ def f(a): } +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) @@ -338,6 +339,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper @@ -1556,6 +1558,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( assert (res.loc[unobserved_cats] == expected).all().all() +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9b362164c6149..b5588898d4580 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3118,6 +3118,7 @@ def test_groupby_numeric_only_std_no_result(numeric_only): dfgb.std(numeric_only=numeric_only) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_grouping_with_categorical_interval_columns(): # GH#34164 df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]}) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f858ae137ca4e..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -59,6 +59,9 @@ def test_repr_floats(self): expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index fd03047b2c127..b5be7e0713cdf 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -341,6 +341,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index d3df349027c00..42ef7e7a96f5e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: def test_get_indexer_non_unique_nas(self, nulls_fixture): diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,51 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 3845744dc0717..f6a865ccbb3a0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -519,6 +519,7 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ac3bfe3a13a44..4e1697eabf734 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - def test_replace_series(self, how, to_key, from_key, replacer): + def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") obj = obj.astype(from_key) @@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") + if using_infer_string and exp.dtype == "string" and obj.dtype == object: + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) + msg = "Downcasting behavior in `replace`" warn = FutureWarning if ( diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index ef94c4c7aff2c..c32b31c297c5d 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -412,7 +410,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) @@ -421,13 +418,12 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") result = pd.api.interchange.from_dataframe(df.__dataframe__()) - expected = pd.DataFrame({"a": ["x"]}, dtype="object") + expected = pd.DataFrame({"a": ["x"]}, dtype="str") tm.assert_frame_equal(result, expected) @@ -438,7 +434,6 @@ def test_non_str_names(): assert names == ["0"] -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) @@ -449,7 +444,7 @@ def test_non_str_names_w_duplicates(): "Expected a Series, got a DataFrame. This likely happened because you " "called __dataframe__ on a DataFrame which, after converting column " r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " - r"dtype='object'\). Please rename these columns before using the " + r"dtype='(str|object)'\). Please rename these columns before using the " "interchange protocol." ), ): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 57091b268a9db..f133423bc6a85 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -755,6 +755,9 @@ def test_excel_date_datetime_format(self, ext, path): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, path, using_infer_string): # see gh-19242 # diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 7b70601addcad..5226476ef6eac 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): "Specify dtype option on import or set low_memory=False.", buf, ) - - assert df.a.dtype == object + if parser.engine == "c" and parser.low_memory: + assert df.a.dtype == object + elif using_infer_string: + assert df.a.dtype == "str" + else: + assert df.a.dtype == object @pytest.mark.parametrize("iterator", [True, False]) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index c13b77f365496..d573b47bb3279 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( EmptyDataError, ParserError, @@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 0121af53f1aa4..cdd65223a9c9f 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -8,8 +8,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): +def test_multi_index_no_level_names( + request, all_parsers, index_col, using_infer_string +): + if using_infer_string and all_parsers.engine == "pyarrow": + # result should have string columns instead of object dtype + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 787941c5d0376..d28c43c45647a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas as pd @@ -24,6 +22,8 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -68,7 +67,6 @@ def test_dtype_per_column(all_parsers): [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] ) expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) @@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 1501479510e17..5b72f76440349 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index a3c6dc8fd0898..1848e1e571fc1 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -186,7 +184,6 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 @@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 6dbfed2b6ae83..9224b743b8917 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 @@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine") ) result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) - expected = DataFrame({"b": [2]}, index=Index([val], name="a")) + expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 32a8d3b81f470..80c32d3a6262e 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -7,9 +7,10 @@ import pytest -from pandas._config import using_string_dtype - -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 @@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers): # This test recursively updates `df`. for i in range(3): - expected = DataFrame() + expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 5f9823f7225f9..dd168aaa45808 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -260,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -306,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -324,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -435,8 +435,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -444,14 +442,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) data = """\ A,B 1,A nan,B 3,C """ - parser = all_parsers result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index be2015fca27d1..616fcb81cf055 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -16,8 +16,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs import parsing import pandas as pd @@ -1799,7 +1797,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -1807,7 +1804,7 @@ def test_parse_timezone(all_parsers): ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame({0: [date_string]}, dtype="str") result = parser.read_csv( StringIO(date_string), header=None, @@ -2054,7 +2051,6 @@ def test_parse_dates_and_keep_original_column(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -2087,7 +2083,7 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") # A. use dayfirst=True res5 = read_csv( @@ -2204,7 +2200,6 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers @@ -2214,7 +2209,7 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="str", name="a", ) warn = None diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 01e576ba40f26..bc4c4c2e24e9c 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 5ed64e3eb0958..cf59e3e4c4934 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,6 +5,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, date_range, @@ -168,7 +170,9 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" +) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index c7671bfb513aa..9fc0f6eb47766 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -196,7 +194,6 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e43aae6a2e9e7..87f9b0108402c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1223,11 +1223,17 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - @pytest.mark.xfail( - Version(np.__version__) >= Version("2.0.0"), - reason="fastparquet uses np.float_ in numpy2", - ) - def test_bool_with_none(self, fp): + def test_bool_with_none(self, fp, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0") and Version( + np.__version__ + ) >= Version("2.0.0"): + request.applymarker( + pytest.mark.xfail( + reason=("fastparquet uses np.float_ in numpy2"), + ) + ) df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes @@ -1342,12 +1348,21 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.xfail( - _HAVE_FASTPARQUET and Version(fastparquet.__version__) > Version("2022.12"), - reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929", - ) - @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") - def test_timezone_aware_index(self, fp, timezone_aware_date_list): + def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): + import fastparquet + + if Version(fastparquet.__version__) > Version("2022.12") and Version( + fastparquet.__version__ + ) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=( + "fastparquet bug, see " + "https://github.com/dask/fastparquet/issues/929" + ), + ) + ) + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 29efe7a457ff8..3676721c5e6b7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -959,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data): adbc_connectable_iris = [ pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), - pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), + "sqlite_adbc_iris", ] adbc_connectable_types = [ pytest.param("postgresql_adbc_types", marks=pytest.mark.db), - pytest.param("sqlite_adbc_types", marks=pytest.mark.db), + "sqlite_adbc_types", ] diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 4ca4067214bbd..33366b4eabba5 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1059,28 +1059,43 @@ def test_boxplot_series_positions(self, hist_df): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(rot=50, fontsize=8, **kwargs) _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) _check_text_labels(ax.get_yticklabels(), labels) assert len(ax.lines) == 7 * len(numeric_cols) - @pytest.mark.filterwarnings("ignore:Attempt:UserWarning") + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib version 3.10", + ) def test_boxplot_vertical_subplots(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) axes = _check_plot_works( - df.plot.box, - default_axes=True, - subplots=True, - vert=False, - logx=True, + df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs ) _check_axes_shape(axes, axes_num=3, layout=(1, 3)) _check_ax_scales(axes, xaxis="log") @@ -1088,12 +1103,22 @@ def test_boxplot_vertical_subplots(self, hist_df): _check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == 7 + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical_positions(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(positions=positions, **kwargs) _check_text_labels(ax.get_yticklabels(), labels) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 76f7fa1f22eec..e1b03a34086c0 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,7 @@ """ Test cases for .boxplot method """ +from __future__ import annotations + import itertools import string @@ -22,6 +24,7 @@ _check_ticks_props, _check_visible, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -35,6 +38,17 @@ def _check_ax_limits(col, ax): assert y_max >= col.max() +if Version(mpl.__version__) < Version("3.10"): + verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}] +else: + verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}] + + +@pytest.fixture(params=verts) +def vert(request): + return request.param + + class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 @@ -315,7 +329,7 @@ def test_specified_props_kwd(self, props, expected): assert result[expected][0].get_color() == "C1" - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -325,11 +339,11 @@ def test_plot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) @@ -338,14 +352,14 @@ def test_plot_box(self, vert): xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) - df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) - df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert) + df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert) for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -355,11 +369,11 @@ def test_boxplot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_group_xlabel_ylabel(self, vert): df = DataFrame( { @@ -369,14 +383,20 @@ def test_boxplot_group_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert) for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel mpl.pyplot.close() - @pytest.mark.parametrize("vert", [True, False]) - def test_boxplot_group_no_xlabel_ylabel(self, vert): + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + def test_boxplot_group_no_xlabel_ylabel(self, vert, request): + if Version(mpl.__version__) >= Version("3.10") and vert == { + "orientation": "horizontal" + }: + request.applymarker( + pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10") + ) df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -384,9 +404,14 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): "group": np.random.default_rng(2).choice(["group1", "group2"], 10), } ) - ax = df.boxplot(by="group", vert=vert) + ax = df.boxplot(by="group", **vert) for subplot in ax: - target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() + target_label = ( + subplot.get_xlabel() + if vert == {"vert": True} # noqa: PLR1714 + or vert == {"orientation": "vertical"} + else subplot.get_ylabel() + ) assert target_label == pprint_thing(["group"]) mpl.pyplot.close() diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 0811c69859c0d..cab2302b3d877 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -727,6 +727,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0b0cf57a70c3f..0c2e0fdc2616f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self): # GH#25797 df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 97d6415e0de05..d1e69cfa2b4ee 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1283,7 +1283,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") with tm.assert_produces_warning(FutureWarning, match=msg): result_td = algos.value_counts(td) tm.assert_series_equal(result_td, exp_td)