Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into ci/debug
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Nov 6, 2023
2 parents dc4f3cb + 2c12853 commit 182f432
Show file tree
Hide file tree
Showing 36 changed files with 348 additions and 271 deletions.
2 changes: 1 addition & 1 deletion doc/source/user_guide/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2013,7 +2013,7 @@ frequency. Arithmetic is not allowed between ``Period`` with different ``freq``
p == pd.Period("2012-01", freq="3M")
If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.
If ``Period`` freq is daily or higher (``D``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.

.. ipython:: python
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Fixed regressions
Bug fixes
~~~~~~~~~
- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`)
-
- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`)

.. ---------------------------------------------------------------------------
.. _whatsnew_213.other:
Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,10 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

Expand All @@ -346,6 +348,7 @@ Datetimelike
- Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`)
- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`)
- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame. (:issue:`52093`)
- Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
Expand Down
7 changes: 0 additions & 7 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,6 @@ class BaseMultiIndexCodesEngine:
) -> None: ...
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
def get_indexer_with_fill(
self,
target: np.ndarray, # np.ndarray[object] of tuples
values: np.ndarray, # np.ndarray[object] of tuples
method: str,
limit: int | None,
) -> npt.NDArray[np.intp]: ...

class ExtensionEngine:
def __init__(self, values: ExtensionArray) -> None: ...
Expand Down
109 changes: 16 additions & 93 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ cdef class IndexEngine:
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end
Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end
bint check_na_values = False

values = self.values
Expand All @@ -364,6 +364,7 @@ cdef class IndexEngine:

n = len(values)
n_t = len(targets)
max_alloc = n * n_t
if n > 10_000:
n_alloc = 10_000
else:
Expand Down Expand Up @@ -453,7 +454,9 @@ cdef class IndexEngine:

# realloc if needed
if count >= n_alloc:
n_alloc += 10_000
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)

result[count] = j
Expand All @@ -463,7 +466,9 @@ cdef class IndexEngine:
else:

if count >= n_alloc:
n_alloc += 10_000
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc
result = np.resize(result, n_alloc)
result[count] = -1
count += 1
Expand Down Expand Up @@ -748,91 +753,6 @@ cdef class BaseMultiIndexCodesEngine:
"""
return self._base.get_indexer(self, target)

def get_indexer_with_fill(self, ndarray target, ndarray values,
str method, object limit) -> np.ndarray:
"""
Returns an array giving the positions of each value of `target` in
`values`, where -1 represents a value in `target` which does not
appear in `values`

If `method` is "backfill" then the position for a value in `target`
which does not appear in `values` is that of the next greater value
in `values` (if one exists), and -1 if there is no such value.

Similarly, if the method is "pad" then the position for a value in
`target` which does not appear in `values` is that of the next smaller
value in `values` (if one exists), and -1 if there is no such value.

Parameters
----------
target: ndarray[object] of tuples
need not be sorted, but all must have the same length, which must be
the same as the length of all tuples in `values`
values : ndarray[object] of tuples
must be sorted and all have the same length. Should be the set of
the MultiIndex's values.
method: string
"backfill" or "pad"
limit: int or None
if provided, limit the number of fills to this value

Returns
-------
np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
filled with the `method` (and optionally `limit`) specified
"""
assert method in ("backfill", "pad")
cdef:
int64_t i, j, next_code
int64_t num_values, num_target_values
ndarray[int64_t, ndim=1] target_order
ndarray[object, ndim=1] target_values
ndarray[int64_t, ndim=1] new_codes, new_target_codes
ndarray[intp_t, ndim=1] sorted_indexer

target_order = np.argsort(target).astype("int64")
target_values = target[target_order]
num_values, num_target_values = len(values), len(target_values)
new_codes, new_target_codes = (
np.empty((num_values,)).astype("int64"),
np.empty((num_target_values,)).astype("int64"),
)

# `values` and `target_values` are both sorted, so we walk through them
# and memoize the (ordered) set of indices in the (implicit) merged-and
# sorted list of the two which belong to each of them
# the effect of this is to create a factorization for the (sorted)
# merger of the index values, where `new_codes` and `new_target_codes`
# are the subset of the factors which appear in `values` and `target`,
# respectively
i, j, next_code = 0, 0, 0
while i < num_values and j < num_target_values:
val, target_val = values[i], target_values[j]
if val <= target_val:
new_codes[i] = next_code
i += 1
if target_val <= val:
new_target_codes[j] = next_code
j += 1
next_code += 1

# at this point, at least one should have reached the end
# the remaining values of the other should be added to the end
assert i == num_values or j == num_target_values
while i < num_values:
new_codes[i] = next_code
i += 1
next_code += 1
while j < num_target_values:
new_target_codes[j] = next_code
j += 1
next_code += 1

# get the indexer, and undo the sorting of `target.values`
algo = algos.backfill if method == "backfill" else algos.pad
sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
return sorted_indexer[np.argsort(target_order)]

def get_loc(self, object key):
if is_definitely_invalid_key(key):
raise TypeError(f"'{key}' is an invalid key")
Expand Down Expand Up @@ -1211,7 +1131,7 @@ cdef class MaskedIndexEngine(IndexEngine):
dict d = {}
object val
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end, na_idx

target_vals = self._get_data(targets)
target_mask = self._get_mask(targets)
Expand All @@ -1224,6 +1144,7 @@ cdef class MaskedIndexEngine(IndexEngine):

n = len(values)
n_t = len(target_vals)
max_alloc = n * n_t
if n > 10_000:
n_alloc = 10_000
else:
Expand Down Expand Up @@ -1274,8 +1195,9 @@ cdef class MaskedIndexEngine(IndexEngine):
for na_idx in na_pos:
# realloc if needed
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = na_idx
count += 1
Expand All @@ -1289,8 +1211,9 @@ cdef class MaskedIndexEngine(IndexEngine):

# realloc if needed
if count >= n_alloc:
n_alloc += 10_000
result = np.resize(result, n_alloc)
n_alloc *= 2
if n_alloc > max_alloc:
n_alloc = max_alloc

result[count] = j
count += 1
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ subdir('tslibs')
libs_sources = {
# Dict of extension name -> dict of {sources, include_dirs, and deps}
# numpy include dir is implicitly included
'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]},
'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep},
'arrays': {'sources': ['arrays.pyx']},
'groupby': {'sources': ['groupby.pyx']},
'hashing': {'sources': ['hashing.pyx']},
'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]},
'index': {'sources': ['index.pyx', _index_class_helper]},
'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep},
'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep},
'indexing': {'sources': ['indexing.pyx']},
'internals': {'sources': ['internals.pyx']},
'interval': {'sources': ['interval.pyx', _intervaltree_helper],
Expand Down
21 changes: 21 additions & 0 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,8 @@ def array_to_datetime_with_tz(
object item
int64_t ival
_TSObject tsobj
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
DatetimeParseState state = DatetimeParseState(creso)

for i in range(n):
# Analogous to `item = values[i]`
Expand All @@ -707,6 +709,9 @@ def array_to_datetime_with_tz(
item, tz=tz, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst, nanos=0
)
if tsobj.value != NPY_NAT:
state.update_creso(tsobj.creso)
if infer_reso:
creso = state.creso
tsobj.ensure_reso(creso, item, round_ok=True)
ival = tsobj.value

Expand All @@ -715,4 +720,20 @@ def array_to_datetime_with_tz(

cnp.PyArray_MultiIter_NEXT(mi)

if infer_reso:
if state.creso_ever_changed:
# We encountered mismatched resolutions, need to re-parse with
# the correct one.
return array_to_datetime_with_tz(values, tz=tz, creso=creso)

# Otherwise we can use the single reso that we encountered and avoid
# a second pass.
abbrev = npy_unit_to_abbrev(creso)
result = result.view(f"M8[{abbrev}]")
elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
# We didn't find any non-NaT to infer from, default to "ns"
result = result.view("M8[ns]")
else:
abbrev = npy_unit_to_abbrev(creso)
result = result.view(f"M8[{abbrev}]")
return result
25 changes: 23 additions & 2 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
is_float_dtype,
is_sequence,
is_signed_integer_dtype,
is_string_dtype,
is_unsigned_integer_dtype,
pandas_dtype,
)
Expand Down Expand Up @@ -1019,6 +1020,17 @@ def iat(x):

# -----------------------------------------------------------------------------

_UNITS = ["s", "ms", "us", "ns"]


def get_finest_unit(left: str, right: str):
"""
Find the higher of two datetime64 units.
"""
if _UNITS.index(left) >= _UNITS.index(right):
return left
return right


def shares_memory(left, right) -> bool:
"""
Expand All @@ -1044,10 +1056,18 @@ def shares_memory(left, right) -> bool:
if isinstance(left, pd.core.arrays.IntervalArray):
return shares_memory(left._left, right) or shares_memory(left._right, right)

if isinstance(left, ExtensionArray) and left.dtype == "string[pyarrow]":
if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] # noqa: E501
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] # noqa: E501
):
right = cast("ArrowExtensionArray", right)
left_pa_data = left._pa_array
right_pa_data = right._pa_array
Expand Down Expand Up @@ -1121,6 +1141,7 @@ def shares_memory(left, right) -> bool:
"getitem",
"get_locales",
"getMixedTypeDict",
"get_finest_unit",
"get_obj",
"get_op_from_name",
"getPeriodData",
Expand Down
2 changes: 1 addition & 1 deletion pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ def assert_extension_array_equal(
else:
l_unit = np.datetime_data(left.dtype)[0]
if not isinstance(right.dtype, np.dtype):
r_unit = cast(DatetimeTZDtype, left.dtype).unit
r_unit = cast(DatetimeTZDtype, right.dtype).unit
else:
r_unit = np.datetime_data(right.dtype)[0]
if (
Expand Down
3 changes: 3 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,9 @@ def unit(request):
return request.param


unit2 = unit


# ----------------------------------------------------------------
# Dtypes
# ----------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2241,14 +2241,14 @@ def _sequence_to_dt64(
data = data.astype(np.int64)
elif tz is not None and ambiguous == "raise":
obj_data = np.asarray(data, dtype=object)
i8data = tslib.array_to_datetime_with_tz(
result = tslib.array_to_datetime_with_tz(
obj_data,
tz=tz,
dayfirst=dayfirst,
yearfirst=yearfirst,
creso=abbrev_to_npy_unit(out_unit),
)
return i8data.view(out_dtype), tz, None
return result, tz, None
else:
# data comes back here as either i8 to denote UTC timestamps
# or M8[ns] to denote wall times
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,9 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
if not len(value_set):
return np.zeros(len(self), dtype=bool)

result = pc.is_in(self._pa_array, value_set=pa.array(value_set))
result = pc.is_in(
self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type)
)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
Expand Down
Loading

0 comments on commit 182f432

Please sign in to comment.