diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 2584e1f13853a..192f19c36b47d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,6 +1,7 @@ from importlib import import_module import numpy as np +import pyarrow as pa import pandas as pd @@ -72,7 +73,16 @@ class Duplicated: params = [ [True, False], ["first", "last", False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int", + "uint", + "float", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "timestamp[ms][pyarrow]", + "duration[s][pyarrow]", + ], ] param_names = ["unique", "keep", "dtype"] @@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype): "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), + "timestamp[ms][pyarrow]": pd.Index( + np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) + ), + "duration[s][pyarrow]": pd.Index( + np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")) + ), }[dtype] if not unique: data = data.repeat(5) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 83f830bb11198..e412793a328a3 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -49,6 +49,7 @@ objects. api.extensions.ExtensionArray.copy api.extensions.ExtensionArray.view api.extensions.ExtensionArray.dropna + api.extensions.ExtensionArray.duplicated api.extensions.ExtensionArray.equals api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 5def84b91705c..2c612e31d33b6 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -451,7 +451,7 @@ Merge Concat ~~~~~~ -pandas provides various facilities for easily combining together :class:`Series`` and +pandas provides various facilities for easily combining together :class:`Series` and :class:`DataFrame` objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations. diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index bc2f4420da784..c4721f3a6b09c 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -184,7 +184,7 @@ can be improved by passing an ``np.ndarray``. 
...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): ...: assert (col_a.dtype == np.float64 - ...: and col_b.dtype == np.float64 and col_N.dtype == np.int_) + ...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int)) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index a5ba365f2d456..df19b01610a7a 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -22,8 +22,15 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) +- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) +- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) +- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index bc8c258b40ebc..b04e0e228e72f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -76,6 +76,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) +- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) - @@ -132,10 +133,36 @@ and ``sort=False``: result -.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels: -notable_bug_fix2 -^^^^^^^^^^^^^^^^ +:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder +index levels when joining on two indexes with different levels (:issue:`34133`). + +.. ipython:: python + + left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) + right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + result = left.join(right) + +*Old Behavior* + +.. 
code-block:: ipython + + In [5]: result + Out[5]: + left right + B A C + 1 x 1 1 2 + 2 x 2 1 2 + +*New Behavior* + +.. ipython:: python + + result .. --------------------------------------------------------------------------- .. _whatsnew_220.api_breaking: @@ -241,6 +268,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - Performance improvement when localizing time to UTC (:issue:`55241`) @@ -251,7 +279,9 @@ Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) - Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) Categorical @@ -339,6 +369,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) +- Sparse ^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index f18c67fcb0c90..de01434c09c39 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -509,3 +509,12 @@ def closed(self) -> bool: # Offsets OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] + +# read_csv: usecols +UsecolsArgType = Union[ + SequenceNotStr[Hashable], + range, + AnyArrayLike, + Callable[[HashableT], bool], + None, +] diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index d376fa4c1919e..1b974a92f8188 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -21,6 +21,21 @@ ) +np_long: type +np_ulong: type + +if _nlv >= Version("2.0.0.dev0"): + try: + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] + except AttributeError: + np_long = np.int_ + np_ulong = np.uint +else: + np_long = np.int_ + np_ulong = np.uint + + __all__ = [ "np", "_np_version", diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c952178f4c998..dd45969a13fd7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,7 +55,6 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( - ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -979,14 +978,16 @@ def value_counts_arraylike( def duplicated( - values: ArrayLike, keep: Literal["first", "last", False] = "first" + values: ArrayLike, + keep: Literal["first", "last", False] = "first", + mask: npt.NDArray[np.bool_] 
| None = None, ) -> npt.NDArray[np.bool_]: """ Return boolean ndarray denoting duplicate values. Parameters ---------- - values : nd.array, ExtensionArray or Series + values : np.ndarray or ExtensionArray Array over which to check for duplicate values. keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first @@ -994,21 +995,15 @@ def duplicated( - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. + mask : ndarray[bool], optional + array indicating which elements to exclude from checking Returns ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": - values = values._to_masked() # type: ignore[union-attr] - - if isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) - values = _ensure_data(values) - return htable.duplicated(values, keep=keep) + return htable.duplicated(values, keep=keep, mask=mask) def mode( @@ -1641,7 +1636,7 @@ def safe_sort( else: mask = None else: - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=int) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `-1` next, so we # may deal with them here without performance loss using `mode='wrap'` diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4b79d0dbb683e..12fe9b30f3f52 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -30,8 +30,12 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.cast import can_hold_element +from pandas.core.dtypes.cast import ( + can_hold_element, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import ( + CategoricalDtype, is_array_like, is_bool_dtype, is_integer, @@ -42,6 +46,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ( + algorithms as algos, missing, roperator, ) @@ -627,7 +632,9 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): + if isinstance( + other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) + ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): result = pc_func(self._pa_array, self._box_pa(other)) elif is_scalar(other): try: @@ -1289,6 +1296,30 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + pa_type = self._pa_array.type + if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type): + values = self.to_numpy(na_value=0) + elif pa.types.is_boolean(pa_type): + values = self.to_numpy(na_value=False) + elif pa.types.is_temporal(pa_type): + if pa_type.bit_width == 32: + pa_type = pa.int32() + else: + pa_type = pa.int64() + arr = self.astype(ArrowDtype(pa_type)) + values = arr.to_numpy(na_value=0) + else: + # factorize the values to avoid the performance penalty of + # converting to object dtype + values = self.factorize()[0] + + mask = self.isna() if self._hasna else None + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ 
Compute the ArrowExtensionArray of unique values. @@ -1599,13 +1630,21 @@ def _reduce( pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: - result = pa.array([pa_result.as_py()], type=pa_result.type) + if isinstance(pa_result, pa.Scalar): + result = pa.array([pa_result.as_py()], type=pa_result.type) + else: + result = pa.array( + [pa_result], + type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), + ) return type(self)(result) if pc.is_null(pa_result).as_py(): return self.dtype.na_value - else: + elif isinstance(pa_result, pa.Scalar): return pa_result.as_py() + else: + return pa_result def _explode(self): """ @@ -1708,7 +1747,7 @@ def __setitem__(self, key, value) -> None: data = pa.chunked_array([data]) self._pa_array = data - def _rank( + def _rank_calc( self, *, axis: AxisInt = 0, @@ -1717,9 +1756,6 @@ def _rank( ascending: bool = True, pct: bool = False, ): - """ - See Series.rank.__doc__. - """ if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, @@ -1734,7 +1770,7 @@ def _rank( else: pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) - return type(self)(result) + return result data = self._pa_array.combine_chunks() sort_keys = "ascending" if ascending else "descending" @@ -1773,7 +1809,29 @@ def _rank( divisor = pc.count(result) result = pc.divide(result, divisor) - return type(self)(result) + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 933944dbd4632..177c105688d0c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -61,6 +61,7 @@ roperator, ) from pandas.core.algorithms import ( + duplicated, factorize_array, isin, map_array, @@ -125,6 +126,7 @@ class ExtensionArray: astype copy dropna + duplicated factorize fillna equals @@ -891,7 +893,6 @@ def interpolate( limit, limit_direction, limit_area, - fill_value, copy: bool, **kwargs, ) -> Self: @@ -1116,6 +1117,31 @@ def dropna(self) -> Self: # error: Unsupported operand type for ~ ("ExtensionArray") return self[~self.isna()] # type: ignore[operator] + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + """ + Return boolean ndarray denoting duplicate values. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + ndarray[bool] + + Examples + -------- + >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() + array([False, True, False, False, True]) + """ + mask = self.isna().astype(np.bool_, copy=False) + return duplicated(values=self, keep=keep, mask=mask) + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. 
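For context, the default ``ExtensionArray.duplicated`` added above builds a boolean NA mask via ``isna()`` and forwards it to the mask-aware ``algorithms.duplicated``/``htable.duplicated`` path; ``BaseMaskedArray`` (next file below) passes its ``_mask`` directly, and ``ArrowExtensionArray`` factorizes or converts to numpy before hashing. A minimal usage sketch, assuming a pandas build with this patch applied — the ``keep`` variants extend the docstring example above:

>>> import pandas as pd
>>> arr = pd.array([1, 1, 2, 3, 3], dtype="Int64")
>>> arr.duplicated()
array([False,  True, False, False,  True])
>>> arr.duplicated(keep="last")
array([ True, False, False,  True, False])
>>> arr.duplicated(keep=False)
array([ True,  True, False,  True,  True])

``Series.duplicated`` and ``Series.drop_duplicates`` pick this up automatically: ``IndexOpsMixin._duplicated`` (changed in ``pandas/core/base.py`` further below) now dispatches to ``ExtensionArray.duplicated`` whenever the underlying values are an extension array, and falls back to ``algorithms.duplicated`` otherwise.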
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9b85fb0477e6f..56d3711c7d13b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -952,6 +952,14 @@ def copy(self) -> Self: mask = self._mask.copy() return self._simple_new(data, mask) + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = self._data + mask = self._mask + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the BaseMaskedArray of unique values. diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4d5eef960293f..cf349220e4ba7 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -28,6 +28,7 @@ from pandas._libs.tslibs import NaT from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, @@ -830,6 +831,14 @@ def _first_fill_value_loc(self): diff = np.r_[np.diff(indices), 2] return indices[(diff > 1).argmax()] + 1 + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = np.asarray(self) + mask = np.asarray(self.isna()) + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: uniques = algos.unique(self.sp_values) if len(self.sp_values) != len(self): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6262055827428..db11e74951ee5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -53,6 +53,7 @@ from collections.abc import Sequence from pandas._typing import ( + AxisInt, Dtype, Scalar, npt, @@ -501,6 +502,28 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. 
+ """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" @@ -584,7 +607,10 @@ def _str_map( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): - result = result.to_numpy() + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result @@ -613,3 +639,8 @@ def _reduce( ) else: return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: + if item is np.nan: + item = libmissing.NA + return super().insert(loc, item) # type: ignore[return-value] diff --git a/pandas/core/base.py b/pandas/core/base.py index 3026189e747bb..d4421560bcea7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"): @final def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: - return algorithms.duplicated(self._values, keep=keep) + arr = self._values + if isinstance(arr, ExtensionArray): + return arr.duplicated(keep=keep) + return algorithms.duplicated(arr, keep=keep) def _arith_method(self, other, op): res_name = ops.get_op_result_name(self, other) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index aaac0dc73486f..e661d590ab330 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -562,7 +562,12 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") + if isinstance(data, str) and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data elif isinstance(data, ABCExtensionArray): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 9da4eac6a42c8..42e909a6b9856 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1664,9 +1664,12 @@ def is_all_strings(value: ArrayLike) -> bool: dtype = value.dtype if isinstance(dtype, np.dtype): - return dtype == np.dtype("object") and lib.is_string_array( - np.asarray(value), skipna=False - ) + if len(value) == 0: + return dtype == np.dtype("object") + else: + return dtype == np.dtype("object") and lib.is_string_array( + np.asarray(value), skipna=False + ) elif isinstance(dtype, CategoricalDtype): return dtype.categories.inferred_type == "string" return dtype == "string" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9017ff121976b..11f2cc8ebf1ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4725,6 +4725,13 @@ def _join_multi(self, other: Index, how: JoinHow): multi_join_idx = multi_join_idx.remove_unused_levels() + # maintain the order of the index levels + if how == "right": + level_order = other_names_list + ldrop_names + else: + level_order = self_names_list + rdrop_names + multi_join_idx = multi_join_idx.reorder_levels(level_order) + return multi_join_idx, lidx, ridx jl = next(iter(overlap)) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6f30bc650aa36..d6aeda3d418ed 100644 --- 
a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -383,7 +383,7 @@ def ndarray_to_mgr( new_block( dtype.construct_array_type()._from_sequence(data, dtype=dtype), BlockPlacement(slice(i, i + 1)), - ndim=1, + ndim=2, ) for i, data in enumerate(obj_columns) ] diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 74e6a6a28ccb0..387d43f47fe9b 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -134,7 +134,9 @@ def melt( mcolumns = id_vars + var_name + [value_name] - if frame.shape[1] > 0: + if frame.shape[1] > 0 and not any( + not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes + ): mdata[value_name] = concat( [frame.iloc[:, i] for i in range(frame.shape[1])] ).values diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6ce6ac71b1ddd..32f8a1fe81a9f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -77,11 +77,11 @@ DtypeArg, DtypeBackend, FilePath, - HashableT, IndexLabel, ReadCsvBuffer, Self, StorageOptions, + UsecolsArgType, ) _doc_read_csv_and_table = ( r""" @@ -142,7 +142,7 @@ Note: ``index_col=False`` can be used to force pandas to *not* use the first column as the index, e.g., when you have a malformed file with delimiters at the end of each line. -usecols : list of Hashable or Callable, optional +usecols : Sequence of Hashable or Callable, optional Subset of columns to select, denoted either by column labels or column indices. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings @@ -645,10 +645,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -707,10 +704,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -770,10 +764,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -833,10 +824,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -907,10 +895,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None 
| lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1005,10 +990,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1065,10 +1047,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1125,10 +1104,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1185,10 +1161,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1258,10 +1231,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index e6408a0eae841..5a7ceabbf554e 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -13,6 +13,7 @@ Any, cast, ) +import warnings import matplotlib.dates as mdates from matplotlib.ticker import ( @@ -239,18 +240,29 @@ def _convert_1d(values, units, axis): if not hasattr(axis, "freq"): raise TypeError("Axis must have `freq` set to convert to Periods") valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) - if isinstance(values, valid_types) or is_integer(values) or is_float(values): - return get_datevalue(values, axis.freq) - elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).asi8 - elif isinstance(values, Index): - return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values, skipna=False) == "period": - # 
https://github.com/pandas-dev/pandas/issues/24304 - # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq).asi8 - elif isinstance(values, (list, tuple, np.ndarray, Index)): - return [get_datevalue(x, axis.freq) for x in values] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + if ( + isinstance(values, valid_types) + or is_integer(values) + or is_float(values) + ): + return get_datevalue(values, axis.freq) + elif isinstance(values, PeriodIndex): + return values.asfreq(axis.freq).asi8 + elif isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + elif lib.infer_dtype(values, skipna=False) == "period": + # https://github.com/pandas-dev/pandas/issues/24304 + # convert ndarray[period] -> PeriodIndex + return PeriodIndex(values, freq=axis.freq).asi8 + elif isinstance(values, (list, tuple, np.ndarray, Index)): + return [get_datevalue(x, axis.freq) for x in values] return values @@ -575,11 +587,18 @@ def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 - dates_ = period_range( - start=Period(ordinal=vmin, freq=freq), - end=Period(ordinal=vmax, freq=freq), - freq=freq, - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + dates_ = period_range( + start=Period(ordinal=vmin, freq=freq), + end=Period(ordinal=vmax, freq=freq), + freq=freq, + ) # Initialize the output info = np.zeros( @@ -1072,7 +1091,13 @@ def __call__(self, x, pos: int = 0) -> str: fmt = self.formatdict.pop(x, "") if isinstance(fmt, np.bytes_): fmt = fmt.decode("utf-8") - period = Period(ordinal=int(x), freq=self.freq) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Period with BDay freq is deprecated", + category=FutureWarning, + ) + period = Period(ordinal=int(x), freq=self.freq) assert isinstance(period, Period) return period.strftime(fmt) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index dd8c3eda9ed05..71156a4d84ae5 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd @@ -51,7 +53,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): s = s.dropna() if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.int_) + assert isinstance(getattr(s, op)(), np_long) elif op == "count": # Oddly on the 32 bit build (but not Windows), this is intc (!= intp) assert isinstance(getattr(s, op)(), np.integer) diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index ed08df74461ef..50eaa1f4d8713 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -170,7 +170,7 @@ def test_infer_dtype_from_scalar(value, expected): @pytest.mark.parametrize( "arr, expected", [ - ([1], np.int_), + ([1], np.dtype(int)), (np.array([1], dtype=np.int64), np.int64), ([np.nan, 1, ""], np.object_), (np.array([[1.0, 2.0]]), np.float64), diff --git 
a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 4507857418e9e..6f6cc5a5ad5d8 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -301,14 +301,23 @@ def test_is_categorical_dtype(): assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) -def test_is_string_dtype(): - assert not com.is_string_dtype(int) - assert not com.is_string_dtype(pd.Series([1, 2])) - - assert com.is_string_dtype(str) - assert com.is_string_dtype(object) - assert com.is_string_dtype(np.array(["a", "b"])) - assert com.is_string_dtype(pd.StringDtype()) +@pytest.mark.parametrize( + "dtype, expected", + [ + (int, False), + (pd.Series([1, 2]), False), + (str, True), + (object, True), + (np.array(["a", "b"]), True), + (pd.StringDtype(), True), + (pd.Index([], dtype="O"), True), + ], +) +def test_is_string_dtype(dtype, expected): + # GH#54661 + + result = com.is_string_dtype(dtype) + assert result is expected @pytest.mark.parametrize( diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 3d1274df0a21b..12006248b1db3 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -248,7 +248,7 @@ def get_reduction_result_dtype(dtype): return NUMPY_INT_TO_DTYPE[np.dtype(int)] else: # i.e. dtype.kind == "u" - return NUMPY_INT_TO_DTYPE[np.dtype(np.uint)] + return NUMPY_INT_TO_DTYPE[np.dtype("uint")] if method in ["sum", "prod"]: # std and var are not dtype-preserving diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4e0bc8d804bab..e10c6ef9a7018 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_duplicated(self, data, keep): + arr = data.take([0, 1, 0, 1]) + result = arr.duplicated(keep=keep) + if keep == "first": + expected = np.array([False, False, True, True]) + elif keep == "last": + expected = np.array([True, True, False, False]) + else: + expected = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 370cbf0f33174..de8df15a9d747 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -108,11 +108,11 @@ def test_setitem_list(self, float_frame): data["A"] = newcolumndata def test_setitem_list2(self): - df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=int) df.loc[1, ["tt1", "tt2"]] = [1, 2] result = df.loc[df.index[1], ["tt1", "tt2"]] - expected = Series([1, 2], df.columns, dtype=np.int_, name=1) + expected = Series([1, 2], df.columns, dtype=int, name=1) tm.assert_series_equal(result, expected) df["tt1"] = df["tt2"] = "0" @@ -1905,6 +1905,19 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) +def test_add_new_column_infer_string(): + # GH#55366 + pytest.importorskip("pyarrow") + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = 
DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 291a79815a81c..67aa07dd83764 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -497,3 +497,9 @@ def test_interpolate_empty_df(self): result = df.interpolate(inplace=True) assert result is None tm.assert_frame_equal(df, expected) + + def test_interpolate_ea_raise(self): + # GH#55347 + df = DataFrame({"a": [1, None, 2]}, dtype="Int64") + with pytest.raises(NotImplementedError, match="does not implement"): + df.interpolate() diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..b5b5e42691e59 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -488,3 +488,15 @@ def test_rank_mixed_axis_zero(self, data, expected): df.rank() result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], + ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH#55362 + pytest.importorskip("pyarrow") + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 60c05767a5e1a..6bd441400dc54 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long import pandas.util._test_decorators as td import pandas as pd @@ -471,22 +472,22 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self): df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int)) df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int)) df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(2, axis=1, fill_value=np.int_(0)) + result = df3.shift(2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([-1, -1, 0, 1], axis=1) - expected.iloc[:, :2] = np.int_(0) + expected.iloc[:, :2] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) # Case with periods < 0 df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(-2, axis=1, fill_value=np.int_(0)) + result = df3.shift(-2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([2, 3, -1, -1], axis=1) - expected.iloc[:, -2:] = np.int_(0) + expected.iloc[:, -2:] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fd851ab244cb8..543a5be9544cc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1823,7 +1823,7 @@ def test_constructor_single_value(self): DataFrame("a", [1, 2], ["a", "c"], float) def test_constructor_with_datetimes(self): - intname = np.dtype(np.int_).name + 
intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name @@ -2743,6 +2743,13 @@ def test_frame_string_inference_array_string_dtype(self): df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) tm.assert_frame_equal(df, expected) + def test_frame_string_inference_block_dim(self): + # GH#55363 + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + assert df._mgr.blocks[0].ndim == 2 + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e66557f132c1d..66f813c80ed16 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,10 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import ( + np_long, + np_ulong, +) import pandas.util._test_decorators as td import pandas as pd @@ -1056,6 +1060,19 @@ def test_idxmax_numeric_only(self, numeric_only): expected = Series([1, 0, 1], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + def test_idxmax_arrow_types(self): + # GH#55368 + pytest.importorskip("pyarrow") + + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]") + result = df.idxmax() + expected = Series([1, 0], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([2, 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" @@ -1700,11 +1717,11 @@ class TestEmptyDataFrameReductions: "opname, dtype, exp_value, exp_dtype", [ ("sum", np.int8, 0, np.int64), - ("prod", np.int8, 1, np.int_), + ("prod", np.int8, 1, np_long), ("sum", np.int64, 0, np.int64), ("prod", np.int64, 1, np.int64), ("sum", np.uint8, 0, np.uint64), - ("prod", np.uint8, 1, np.uint), + ("prod", np.uint8, 1, np_ulong), ("sum", np.uint64, 0, np.uint64), ("prod", np.uint64, 1, np.uint64), ("sum", np.float32, 0, np.float32), diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 865fda0ab54a2..5c99882cef6d2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -229,7 +229,7 @@ def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these df = DataFrame([11, 12, 13], columns=["a"]) - grps = np.arange(0, 25, 5, dtype=np.int_) + grps = np.arange(0, 25, 5, dtype=int) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "sum", alt=None, numeric_only=True diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 41bbfcf6840a9..a92880c87b847 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -824,7 +824,7 @@ def test_nlargest_and_smallest_noop(data, groups, dtype, method): data = list(reversed(data)) ser = Series(data, name="a") result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9132be50d5857..4ca8b0e317bd2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3004,12 +3004,12 @@ def test_groupby_reduce_period(): res = gb.max() expected = ser[-10:] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) res = gb.min() expected = ser[:10] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 37eb52be0b37b..30c7e1df1e691 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -108,7 +108,7 @@ def test_max_inat_not_all_na(): # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_series_equal(result, expected, check_exact=True) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index efe7b171d630d..805fef2125fda 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -468,7 +468,7 @@ def test_groupby_quantile_dt64tz_period(): # Check that we match the group-by-group result exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} expected = DataFrame(exp).T.infer_objects() - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 76b26c04f9f3a..c275db9d1788c 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -39,7 +39,7 @@ def test_size_axis_1(df, axis_1, by, sort, dropna): if sort: expected = expected.sort_index() if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 944dda8977882..45a33d3b70f71 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -996,7 +996,7 @@ def test_mixed_groupings(normalize, expected_label, expected_values): result = gp.value_counts(sort=True, normalize=normalize) expected = DataFrame( { - "level_0": np.array([4, 4, 5], dtype=np.int_), + "level_0": np.array([4, 4, 5], dtype=int), "A": [1, 1, 2], "level_2": [8, 8, 7], "B": [1, 3, 2], diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 5ecb2c753644d..6586f5f9de480 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -54,6 +54,14 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) + def test_insert_none_into_string_numpy(self): + # GH#55365 + pytest.importorskip("pyarrow") + index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + result = index.insert(-1, None) + expected 
= Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "pos,expected", [ diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 1ed8f3a903439..a8353f301a3c3 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -88,3 +88,9 @@ def test_equals_multiindex(self): ci = mi.to_flat_index().astype("category") assert not ci.equals(mi) + + def test_equals_string_dtype(self, any_string_dtype): + # GH#55364 + idx = CategoricalIndex(list("abc"), name="B") + other = Index(["a", "b", "c"], name="B", dtype=any_string_dtype) + assert idx.equals(other) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index f44cbbf560584..d08757a206e67 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DataFrame, @@ -54,7 +56,7 @@ def test_time_overflow_for_32bit_machines(self): # (which has value 1e9) and since the max value for np.int32 is ~2e9, # and since those machines won't promote np.int32 to np.int64, we get # overflow. - periods = np.int_(1000) + periods = np_long(1000) idx1 = date_range(start="2000", periods=periods, freq="s") assert len(idx1) == periods diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index c3944a4443d67..d877110e72b26 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DatetimeIndex, @@ -91,7 +93,7 @@ def test_dti_business_getitem(self, freq): assert fancy_indexed.freq is None # 32-bit vs. 
64-bit platforms - assert rng[4] == rng[np.int_(4)] + assert rng[4] == rng[np_long(4)] @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_getitem_matplotlib_hackaround(self, freq): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bc04c1c6612f4..8fd1e296fb79a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -457,7 +457,7 @@ def test_fancy(self, simple_index): ["string", "int64", "int32", "uint64", "uint32", "float64", "float32"], indirect=True, ) - @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) + @pytest.mark.parametrize("dtype", [int, np.bool_]) def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) empty_index = type(index)([], dtype=index.dtype) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index de989ad550f2b..081da385ebcc3 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -166,7 +166,7 @@ def test_getitem_intkey_leading_level( mi = ser.index assert isinstance(mi, MultiIndex) if dtype is int: - assert mi.levels[0].dtype == np.int_ + assert mi.levels[0].dtype == np.dtype(int) else: assert mi.levels[0].dtype == np.float64 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index a2693c85e507f..20568342222c3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -442,7 +442,7 @@ def test_loc_to_fail(self): ) msg = ( - rf"\"None of \[Index\(\[1, 2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[1, 2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -460,7 +460,7 @@ def test_loc_to_fail2(self): s.loc[-1] msg = ( - rf"\"None of \[Index\(\[-1, -2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-1, -2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -476,7 +476,7 @@ def test_loc_to_fail2(self): s["a"] = 2 msg = ( - rf"\"None of \[Index\(\[-2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -493,7 +493,7 @@ def test_loc_to_fail3(self): df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) msg = ( - rf"\"None of \[Index\(\[3\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[3\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -510,7 +510,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.int_().dtype}')] are in the [index]" + msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] @@ -1209,7 +1209,7 @@ def test_loc_setitem_empty_append_raises(self): df = DataFrame(columns=["x", "y"]) df.index = df.index.astype(np.int64) msg = ( - rf"None of \[Index\(\[0, 1\], dtype='{np.int_().dtype}'\)\] " + rf"None of \[Index\(\[0, 1\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 8e5cde42ec91b..8f499644f1013 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -402,7 +402,7 @@ def test_series_partial_set(self): # raises 
as nothing is in the index
         msg = (
-            rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}'\)\] "
+            rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}'\)\] "
             r"are in the \[index\]\""
         )
         with pytest.raises(KeyError, match=msg):
@@ -483,7 +483,7 @@ def test_series_partial_set_with_name(self):
 
         # raises as nothing is in the index
         msg = (
-            rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}', "
+            rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}', "
             r"name='idx'\)\] are in the \[index\]\""
         )
         with pytest.raises(KeyError, match=msg):
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 597bc2975268e..09567e7012313 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -468,7 +468,7 @@ def test_set_change_dtype(self, mgr):
             np.random.default_rng(2).standard_normal(N).astype(int),
         )
         idx = mgr2.items.get_loc("quux")
-        assert mgr2.iget(idx).dtype == np.int_
+        assert mgr2.iget(idx).dtype == np.dtype(int)
 
         mgr2.iset(
             mgr2.items.get_loc("quux"), np.random.default_rng(2).standard_normal(N)
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index c79fdd9145a6a..9f7840588f89e 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -47,7 +47,7 @@ def test_read_csv_with_custom_date_parser(all_parsers):
     # GH36111
     def __custom_date_parser(time):
         time = time.astype(np.float64)
-        time = time.astype(np.int_)  # convert float seconds to int type
+        time = time.astype(int)  # convert float seconds to int type
         return pd.to_timedelta(time, unit="s")
 
     testdata = StringIO(
@@ -87,7 +87,7 @@ def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers):
     # GH44366
     def __custom_date_parser(time):
         time = time.astype(np.float64)
-        time = time.astype(np.int_)  # convert float seconds to int type
+        time = time.astype(int)  # convert float seconds to int type
         return pd.to_timedelta(time, unit="s")
 
     testdata = StringIO(
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index 47114cab47619..2ef1f065f603d 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -2487,6 +2487,15 @@ def test_secondary_y(self, secondary_y):
         assert ax.get_ylim() == (0, 100)
         assert ax.get_yticks()[0] == 99
 
+    @pytest.mark.slow
+    def test_plot_no_warning(self):
+        # GH 55138
+        # TODO(3.0): this can be removed once Period[B] deprecation is enforced
+        df = tm.makeTimeDataFrame()
+        with tm.assert_produces_warning(False):
+            _ = df.plot()
+            _ = df.T.plot()
+
 
 def _generate_4_axes_via_gridspec():
     import matplotlib.pyplot as plt
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index e77dc0b305171..bf4474d085b11 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -6,7 +6,10 @@
 import pytest
 
 from pandas.compat import is_platform_linux
-from pandas.compat.numpy import np_version_gte1p24
+from pandas.compat.numpy import (
+    np_long,
+    np_version_gte1p24,
+)
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -561,7 +564,7 @@ def test_plot_fails_with_dupe_color_and_style(self):
         [
             ["scott", 20],
             [None, 20],
-            [None, np.int_(20)],
+            [None, np_long(20)],
             [0.5, np.linspace(-100, 100, 20)],
         ],
     )
@@ -973,3 +976,10 @@ def test_series_none_color(self):
         ax = series.plot(color=None)
         expected = _unpack_cycler(mpl.pyplot.rcParams)[:1]
         _check_colors(ax.get_lines(), linecolors=expected)
+
+    @pytest.mark.slow
+    def test_plot_no_warning(self, ts):
+        # GH 55138
+        # TODO(3.0): this can be removed once Period[B] deprecation is enforced
+        with tm.assert_produces_warning(False):
+            _ = ts.plot()
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
index 20daa388c2c88..c630ba6a43cb1 100644
--- a/pandas/tests/reshape/merge/test_join.py
+++ b/pandas/tests/reshape/merge/test_join.py
@@ -902,7 +902,7 @@ def test_join_inner_multiindex_deterministic_order():
     result = left.join(right, how="inner")
     expected = DataFrame(
         {"e": [5], "f": [6]},
-        index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")),
+        index=MultiIndex.from_tuples([(1, 2, 4, 3)], names=("a", "b", "d", "c")),
     )
     tm.assert_frame_equal(result, expected)
 
@@ -926,10 +926,16 @@ def test_join_multiindex_one_level(join_type):
     )
     right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",)))
     result = left.join(right, how=join_type)
-    expected = DataFrame(
-        {"c": [3], "d": [4]},
-        index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
-    )
+    if join_type == "right":
+        expected = DataFrame(
+            {"c": [3], "d": [4]},
+            index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
+        )
+    else:
+        expected = DataFrame(
+            {"c": [3], "d": [4]},
+            index=MultiIndex.from_tuples([(1, 2)], names=["a", "b"]),
+        )
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index d889ae2e4806b..d203a04a7fffc 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -365,7 +365,7 @@ def test_merge_join_key_dtype_cast(self):
         lkey = np.array([1])
        rkey = np.array([2])
         df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer")
-        assert df["key_0"].dtype == np.int_
+        assert df["key_0"].dtype == np.dtype(int)
 
     def test_handle_join_key_pass_array(self):
         left = DataFrame(
@@ -389,7 +389,7 @@ def test_handle_join_key_pass_array(self):
         rkey = np.array([1, 1, 2, 3, 4, 5])
 
         merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer")
-        expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=np.int_, name="key_0")
+        expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=int, name="key_0")
         tm.assert_series_equal(merged["key_0"], expected)
 
         left = DataFrame({"value": np.arange(3)})
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
index ab010bdb909f1..c029acf0c8938 100644
--- a/pandas/tests/reshape/merge/test_multi.py
+++ b/pandas/tests/reshape/merge/test_multi.py
@@ -69,11 +69,6 @@ def on_cols_multi():
     return ["Origin", "Destination", "Period"]
 
 
-@pytest.fixture
-def idx_cols_multi():
-    return ["Origin", "Destination", "Period", "TripPurp", "LinkType"]
-
-
 class TestMergeMulti:
     def test_merge_on_multikey(self, left, right, join_type):
         on_cols = ["key1", "key2"]
@@ -815,9 +810,13 @@ def test_join_multi_levels2(self):
 
 
 class TestJoinMultiMulti:
-    def test_join_multi_multi(
-        self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
-    ):
+    def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi):
+        left_names = left_multi.index.names
+        right_names = right_multi.index.names
+        if join_type == "right":
+            level_order = right_names + left_names.difference(right_names)
+        else:
+            level_order = left_names + right_names.difference(left_names)
         # Multi-index join tests
         expected = (
             merge(
@@ -826,7 +825,7 @@ def test_join_multi_multi(
                 how=join_type,
                 on=on_cols_multi,
             )
-            .set_index(idx_cols_multi)
+            .set_index(level_order)
             .sort_index()
         )
 
@@ -834,11 +833,18 @@ def test_join_multi_multi(
         tm.assert_frame_equal(result, expected)
 
     def test_join_multi_empty_frames(
-        self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi
+        self, left_multi, right_multi, join_type, on_cols_multi
     ):
         left_multi = left_multi.drop(columns=left_multi.columns)
         right_multi = right_multi.drop(columns=right_multi.columns)
 
+        left_names = left_multi.index.names
+        right_names = right_multi.index.names
+        if join_type == "right":
+            level_order = right_names + left_names.difference(right_names)
+        else:
+            level_order = left_names + right_names.difference(left_names)
+
         expected = (
             merge(
                 left_multi.reset_index(),
@@ -846,7 +852,7 @@ def test_join_multi_empty_frames(
                 how=join_type,
                 on=on_cols_multi,
             )
-            .set_index(idx_cols_multi)
+            .set_index(level_order)
             .sort_index()
         )
 
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index 941478066a7d8..ef748e264188c 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -459,6 +459,47 @@ def test_melt_ea_columns(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    def test_melt_preserves_datetime(self):
+        df = DataFrame(
+            data=[
+                {
+                    "type": "A0",
+                    "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
+                    "end_date": pd.Timestamp("2023/03/10", tz="Asia/Tokyo"),
+                },
+                {
+                    "type": "A1",
+                    "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
+                    "end_date": pd.Timestamp("2023/03/11", tz="Asia/Tokyo"),
+                },
+            ],
+            index=["aaaa", "bbbb"],
+        )
+        result = df.melt(
+            id_vars=["type"],
+            value_vars=["start_date", "end_date"],
+            var_name="start/end",
+            value_name="date",
+        )
+        expected = DataFrame(
+            {
+                "type": {0: "A0", 1: "A1", 2: "A0", 3: "A1"},
+                "start/end": {
+                    0: "start_date",
+                    1: "start_date",
+                    2: "end_date",
+                    3: "end_date",
+                },
+                "date": {
+                    0: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
+                    1: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
+                    2: pd.Timestamp("2023-03-10 00:00:00+0900", tz="Asia/Tokyo"),
+                    3: pd.Timestamp("2023-03-11 00:00:00+0900", tz="Asia/Tokyo"),
+                },
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestLreshape:
     def test_pairs(self):
diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
index 287b7557f50f9..44ce5c79db348 100644
--- a/pandas/tests/scalar/test_na_scalar.py
+++ b/pandas/tests/scalar/test_na_scalar.py
@@ -9,6 +9,7 @@
 import pytest
 
 from pandas._libs.missing import NA
+from pandas.compat.numpy import np_long
 
 from pandas.core.dtypes.common import is_scalar
 
@@ -102,9 +103,9 @@ def test_comparison_ops(comparison_op, other):
         -0.0,
         False,
         np.bool_(False),
-        np.int_(0),
+        np_long(0),
         np.float64(0),
-        np.int_(-0),
+        np_long(-0),
         np.float64(-0),
     ],
 )
@@ -123,7 +124,7 @@ def test_pow_special(value, asarray):
 
 
 @pytest.mark.parametrize(
-    "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float64(1)]
+    "value", [1, 1.0, True, np.bool_(True), np_long(1), np.float64(1)]
 )
 @pytest.mark.parametrize("asarray", [True, False])
 def test_rpow_special(value, asarray):
@@ -133,14 +134,14 @@ def test_rpow_special(value, asarray):
 
     if asarray:
         result = result[0]
-    elif not isinstance(value, (np.float64, np.bool_, np.int_)):
+    elif not isinstance(value, (np.float64, np.bool_, np_long)):
         # this assertion isn't possible with asarray=True
         assert isinstance(result, type(value))
 
     assert result == value
 
 
-@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float64(-1)])
+@pytest.mark.parametrize("value", [-1, -1.0, np_long(-1), np.float64(-1)])
 @pytest.mark.parametrize("asarray", [True, False])
 def test_rpow_minus_one(value, asarray):
     if asarray:
diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py
index e1b3dd4888ef5..d36fd5335bdfc 100644
--- a/pandas/tests/series/methods/test_align.py
+++ b/pandas/tests/series/methods/test_align.py
@@ -240,10 +240,10 @@ def test_align_left_different_named_levels():
     result_left, result_right = left.align(right)
 
     expected_left = Series(
-        [2], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"])
+        [2], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"])
     )
     expected_right = Series(
-        [1], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"])
+        [1], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"])
     )
     tm.assert_series_equal(result_left, expected_left)
     tm.assert_series_equal(result_right, expected_right)
diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py
index f3075c116883a..9d6611cd53068 100644
--- a/pandas/tests/series/methods/test_reindex.py
+++ b/pandas/tests/series/methods/test_reindex.py
@@ -197,7 +197,7 @@ def test_reindex_int(datetime_series):
 
     # NO NaNs introduced
     reindexed_int = int_ts.reindex(int_ts.index[::2])
-    assert reindexed_int.dtype == np.int_
+    assert reindexed_int.dtype == np.dtype(int)
 
 
 def test_reindex_bool(datetime_series):
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index 8547fd6988791..d40ff6c139653 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -941,8 +941,8 @@ def test_series_varied_multiindex_alignment():
     expected = Series(
         [1000, 2001, 3002, 4003],
         index=pd.MultiIndex.from_tuples(
-            [("x", 1, "a"), ("x", 2, "a"), ("y", 1, "a"), ("y", 2, "a")],
-            names=["xy", "num", "ab"],
+            [("a", "x", 1), ("a", "x", 2), ("a", "y", 1), ("a", "y", 2)],
+            names=["ab", "xy", "num"],
         ),
     )
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py
index f294885fb8f4d..be68918d2a380 100644
--- a/pandas/tests/series/test_repr.py
+++ b/pandas/tests/series/test_repr.py
@@ -348,7 +348,7 @@ def test_categorical_series_repr(self):
 8    8
 9    9
 dtype: category
-Categories (10, {np.int_().dtype}): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
+Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
 
         assert repr(s) == exp
 
@@ -374,7 +374,7 @@ def test_categorical_series_repr_ordered(self):
 8    8
 9    9
 dtype: category
-Categories (10, {np.int_().dtype}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
+Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
 
         assert repr(s) == exp
 
diff --git a/pyproject.toml b/pyproject.toml
index 89432c2353ea8..a8388a9ff52de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -505,6 +505,8 @@ filterwarnings = [
   "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr",
   "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet",
   "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec",
+  # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged
+  "ignore:.*In the future `np.long` will be defined as.*:FutureWarning",
 ]
 junit_family = "xunit2"
 markers = [