Merge branch 'main' into perf_map_infer_mask_fused
rhshadrach authored Nov 6, 2023
2 parents 59c836d + ef52fea commit da55ccc
Showing 80 changed files with 1,906 additions and 604 deletions.
17 changes: 17 additions & 0 deletions doc/source/reference/series.rst
@@ -526,6 +526,23 @@ Sparse-dtype specific methods and attributes are provided under the
Series.sparse.to_coo


.. _api.series.list:

List accessor
~~~~~~~~~~~~~

Arrow list-dtype specific methods and attributes are provided under the
``Series.list`` accessor; a short usage sketch follows the listing.

.. autosummary::
   :toctree: api/
   :template: autosummary/accessor_method.rst

   Series.list.flatten
   Series.list.len
   Series.list.__getitem__
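
A minimal usage sketch (illustrative, not part of this diff; assumes pandas 2.2+
with PyArrow installed):

.. code-block:: python

   import pandas as pd
   import pyarrow as pa

   s = pd.Series(
       [[1, 2, 3], [4, 5], [6]],
       dtype=pd.ArrowDtype(pa.list_(pa.int64())),
   )
   s.list.len()      # length of each list: 3, 2, 1
   s.list.flatten()  # one flat Series: 1, 2, 3, 4, 5, 6
   s.list[0]         # first element of each list: 1, 4, 6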


.. _api.series.struct:

Struct accessor
19 changes: 1 addition & 18 deletions doc/source/user_guide/basics.rst
@@ -2007,7 +2007,7 @@ documentation sections for more on each type.
| | | | | ``'Int64'``, ``'UInt8'``, ``'UInt16'``,|
| | | | | ``'UInt32'``, ``'UInt64'`` |
+-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+
| ``nullable float`` | :class:`Float64Dtype`, ...| (none) | :class:`arrays.FloatingArray` | ``'Float32'``, ``'Float64'`` |
| :ref:`nullable float <api.arrays.float_na>` | :class:`Float64Dtype`, ...| (none) | :class:`arrays.FloatingArray` | ``'Float32'``, ``'Float64'`` |
+-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+
| :ref:`Strings <text>` | :class:`StringDtype` | :class:`str` | :class:`arrays.StringArray` | ``'string'`` |
+-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+
@@ -2261,23 +2261,6 @@ non-conforming elements intermixed that you want to represent as missing:
m = ["apple", pd.Timedelta("1day")]
pd.to_timedelta(m, errors="coerce")
The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it
encounters any errors with the conversion to a desired data type:

.. ipython:: python
   :okwarning:

   import datetime

   m = ["apple", datetime.datetime(2016, 3, 2)]
   pd.to_datetime(m, errors="ignore")

   m = ["apple", 2, 3]
   pd.to_numeric(m, errors="ignore")

   m = ["apple", pd.Timedelta("1day")]
   pd.to_timedelta(m, errors="ignore")

In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the
option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory:
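
For illustration, a minimal sketch of ``downcast`` (not part of this diff):

.. code-block:: python

   import pandas as pd

   s = pd.to_numeric(pd.Series([1, 2, 3]), downcast="unsigned")
   s.dtype  # uint8, the smallest unsigned dtype that can hold the values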

8 changes: 1 addition & 7 deletions doc/source/user_guide/timeseries.rst
@@ -294,12 +294,6 @@ The default behavior, ``errors='raise'``, is to raise when unparsable:
   pd.to_datetime(['2009/07/31', 'asd'], errors='raise')

Pass ``errors='ignore'`` to return the original input when unparsable:

.. ipython:: python

   pd.to_datetime(["2009/07/31", "asd"], errors="ignore")

Pass ``errors='coerce'`` to convert unparsable data to ``NaT`` (not a time):

.. ipython:: python
Expand Down Expand Up @@ -2019,7 +2013,7 @@ frequency. Arithmetic is not allowed between ``Period`` with different ``freq``
   p == pd.Period("2012-01", freq="3M")

If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.
If ``Period`` freq is daily or higher (``D``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised.

.. ipython:: python
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v0.17.0.rst
@@ -632,9 +632,10 @@ Of course you can coerce this as well.
To keep the previous behavior, you can use ``errors='ignore'``:

.. ipython:: python
.. code-block:: ipython

   pd.to_datetime(["2009-07-31", "asd"], errors="ignore")
   In [4]: pd.to_datetime(["2009-07-31", "asd"], errors="ignore")
   Out[4]: Index(['2009-07-31', 'asd'], dtype='object')

Furthermore, ``pd.to_timedelta`` has gained a similar API of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
has been deprecated in favor of ``errors='coerce'``.
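
For illustration, a minimal sketch of ``errors='coerce'`` with ``pd.to_timedelta``
(not part of this diff):

.. code-block:: python

   import pandas as pd

   # unparsable entries become NaT instead of raising
   pd.to_timedelta(["1 day", "asd"], errors="coerce")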
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.3.rst
@@ -22,7 +22,7 @@ Fixed regressions
Bug fixes
~~~~~~~~~
- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`)
-
- Bug in :meth:`Index.isin` raising for Arrow-backed strings and ``None`` value (:issue:`55821`)

.. ---------------------------------------------------------------------------
.. _whatsnew_213.other:
34 changes: 31 additions & 3 deletions doc/source/whatsnew/v2.2.0.rst
@@ -64,10 +64,30 @@ DataFrame. (:issue:`54938`)
   )
   series.struct.explode()

.. _whatsnew_220.enhancements.enhancement2:
.. _whatsnew_220.enhancements.list_accessor:

enhancement2
^^^^^^^^^^^^
Series.list accessor for PyArrow list data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``Series.list`` accessor provides attributes and methods for processing
data with ``list[pyarrow]`` dtype Series. For example,
:meth:`Series.list.__getitem__` allows indexing pyarrow lists in
a Series. (:issue:`55323`)

.. ipython:: python

   import pyarrow as pa

   series = pd.Series(
       [
           [1, 2, 3],
           [4, 5],
           [6],
       ],
       dtype=pd.ArrowDtype(
           pa.list_(pa.int64())
       ),
   )
   series.list[0]

.. _whatsnew_220.enhancements.other:

@@ -284,11 +304,13 @@ Other Deprecations
- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
- Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`)
- Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`)
- Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`)
- Deprecated downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`)
-
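
With ``errors="ignore"`` deprecated, callers should catch conversion errors
explicitly. A minimal sketch of the replacement pattern (illustrative, not
part of this commit):

.. code-block:: python

   import pandas as pd

   data = ["2009/07/31", "asd"]
   try:
       result = pd.to_datetime(data, errors="raise")
   except (ValueError, TypeError):
       # errors="ignore" used to return the input unchanged; do so explicitly
       result = data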

.. ---------------------------------------------------------------------------
.. _whatsnew_220.performance:
@@ -302,9 +324,11 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
- Performance improvement in :meth:`Series.str` methods (:issue:`55736`)
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
- Performance improvement when localizing time to UTC (:issue:`55241`)

@@ -322,7 +346,11 @@ Categorical

Datetimelike
^^^^^^^^^^^^
- Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`)
- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`)
- Bug in :func:`concat` raising ``AttributeError`` when concatenating an all-NA DataFrame with a :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`)
- Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`)
- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`)
7 changes: 0 additions & 7 deletions pandas/_libs/index.pyi
@@ -80,13 +80,6 @@ class BaseMultiIndexCodesEngine:
    ) -> None: ...
    def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
    def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
    def get_indexer_with_fill(
        self,
        target: np.ndarray,  # np.ndarray[object] of tuples
        values: np.ndarray,  # np.ndarray[object] of tuples
        method: str,
        limit: int | None,
    ) -> npt.NDArray[np.intp]: ...

class ExtensionEngine:
    def __init__(self, values: ExtensionArray) -> None: ...
109 changes: 16 additions & 93 deletions pandas/_libs/index.pyx
@@ -354,7 +354,7 @@ cdef class IndexEngine:
            dict d = {}
            object val
            Py_ssize_t count = 0, count_missing = 0
            Py_ssize_t i, j, n, n_t, n_alloc, start, end
            Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end
            bint check_na_values = False

        values = self.values
@@ -364,6 +364,7 @@

        n = len(values)
        n_t = len(targets)
        max_alloc = n * n_t
        if n > 10_000:
            n_alloc = 10_000
        else:
@@ -453,7 +454,9 @@

                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        n_alloc *= 2
                        if n_alloc > max_alloc:
                            n_alloc = max_alloc
                        result = np.resize(result, n_alloc)

                    result[count] = j
@@ -463,7 +466,9 @@
            else:

                if count >= n_alloc:
                    n_alloc += 10_000
                    n_alloc *= 2
                    if n_alloc > max_alloc:
                        n_alloc = max_alloc
                    result = np.resize(result, n_alloc)
                result[count] = -1
                count += 1
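
# Note: the hunks above replace linear buffer growth (n_alloc += 10_000 per
# realloc) with geometric doubling capped at max_alloc = n * n_t, the
# worst-case number of matches. Doubling amortizes the O(n) copy inside
# np.resize to O(1) per appended index, whereas a fixed increment is
# quadratic overall. A standalone Python sketch of the policy (illustrative,
# hypothetical helper name, not part of this file):

import numpy as np

def _grow_if_full(result, count, n_alloc, max_alloc):
    # Double the buffer when full, never past the worst-case size.
    if count >= n_alloc:
        n_alloc = min(n_alloc * 2, max_alloc)
        result = np.resize(result, n_alloc)
    return result, n_alloc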
@@ -748,91 +753,6 @@ cdef class BaseMultiIndexCodesEngine:
"""
return self._base.get_indexer(self, target)

    def get_indexer_with_fill(self, ndarray target, ndarray values,
                              str method, object limit) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `values`, where -1 represents a value in `target` which does not
        appear in `values`

        If `method` is "backfill" then the position for a value in `target`
        which does not appear in `values` is that of the next greater value
        in `values` (if one exists), and -1 if there is no such value.

        Similarly, if the method is "pad" then the position for a value in
        `target` which does not appear in `values` is that of the next smaller
        value in `values` (if one exists), and -1 if there is no such value.

        Parameters
        ----------
        target : ndarray[object] of tuples
            need not be sorted, but all must have the same length, which must be
            the same as the length of all tuples in `values`
        values : ndarray[object] of tuples
            must be sorted and all have the same length. Should be the set of
            the MultiIndex's values.
        method : string
            "backfill" or "pad"
        limit : int or None
            if provided, limit the number of fills to this value

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
        filled with the `method` (and optionally `limit`) specified
        """
        assert method in ("backfill", "pad")
        cdef:
            int64_t i, j, next_code
            int64_t num_values, num_target_values
            ndarray[int64_t, ndim=1] target_order
            ndarray[object, ndim=1] target_values
            ndarray[int64_t, ndim=1] new_codes, new_target_codes
            ndarray[intp_t, ndim=1] sorted_indexer

        target_order = np.argsort(target).astype("int64")
        target_values = target[target_order]
        num_values, num_target_values = len(values), len(target_values)
        new_codes, new_target_codes = (
            np.empty((num_values,)).astype("int64"),
            np.empty((num_target_values,)).astype("int64"),
        )

        # `values` and `target_values` are both sorted, so we walk through them
        # and memoize the (ordered) set of indices in the (implicit) merged-and
        # sorted list of the two which belong to each of them
        # the effect of this is to create a factorization for the (sorted)
        # merger of the index values, where `new_codes` and `new_target_codes`
        # are the subset of the factors which appear in `values` and `target`,
        # respectively
        i, j, next_code = 0, 0, 0
        while i < num_values and j < num_target_values:
            val, target_val = values[i], target_values[j]
            if val <= target_val:
                new_codes[i] = next_code
                i += 1
            if target_val <= val:
                new_target_codes[j] = next_code
                j += 1
            next_code += 1

        # at this point, at least one should have reached the end
        # the remaining values of the other should be added to the end
        assert i == num_values or j == num_target_values
        while i < num_values:
            new_codes[i] = next_code
            i += 1
            next_code += 1
        while j < num_target_values:
            new_target_codes[j] = next_code
            j += 1
            next_code += 1

        # get the indexer, and undo the sorting of `target.values`
        algo = algos.backfill if method == "backfill" else algos.pad
        sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
        return sorted_indexer[np.argsort(target_order)]

    def get_loc(self, object key):
        if is_definitely_invalid_key(key):
            raise TypeError(f"'{key}' is an invalid key")
@@ -1211,7 +1131,7 @@ cdef class MaskedIndexEngine(IndexEngine):
            dict d = {}
            object val
            Py_ssize_t count = 0, count_missing = 0
            Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
            Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end, na_idx

        target_vals = self._get_data(targets)
        target_mask = self._get_mask(targets)
@@ -1224,6 +1144,7 @@

        n = len(values)
        n_t = len(target_vals)
        max_alloc = n * n_t
        if n > 10_000:
            n_alloc = 10_000
        else:
@@ -1274,8 +1195,9 @@
                for na_idx in na_pos:
                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        result = np.resize(result, n_alloc)
                        n_alloc *= 2
                        if n_alloc > max_alloc:
                            n_alloc = max_alloc

                    result[count] = na_idx
                    count += 1
@@ -1289,8 +1211,9 @@

                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        result = np.resize(result, n_alloc)
                        n_alloc *= 2
                        if n_alloc > max_alloc:
                            n_alloc = max_alloc

                    result[count] = j
                    count += 1
6 changes: 3 additions & 3 deletions pandas/_libs/meson.build
@@ -61,12 +61,12 @@ subdir('tslibs')
libs_sources = {
  # Dict of extension name -> dict of {sources, include_dirs, and deps}
  # numpy include dir is implicitly included
  'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]},
  'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep},
  'arrays': {'sources': ['arrays.pyx']},
  'groupby': {'sources': ['groupby.pyx']},
  'hashing': {'sources': ['hashing.pyx']},
  'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]},
  'index': {'sources': ['index.pyx', _index_class_helper]},
  'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep},
  'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep},
  'indexing': {'sources': ['indexing.pyx']},
  'internals': {'sources': ['internals.pyx']},
  'interval': {'sources': ['interval.pyx', _intervaltree_helper],
6 changes: 5 additions & 1 deletion pandas/_libs/tslib.pyi
@@ -29,5 +29,9 @@ def array_to_datetime(
# returned ndarray may be object dtype or datetime64[ns]

def array_to_datetime_with_tz(
    values: npt.NDArray[np.object_], tz: tzinfo, creso: int
    values: npt.NDArray[np.object_],
    tz: tzinfo,
    dayfirst: bool,
    yearfirst: bool,
    creso: int,
) -> npt.NDArray[np.int64]: ...
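
# The added dayfirst/yearfirst parameters carry the fix for dayfirst/yearfirst
# being ignored when a tz is also passed (see the v2.2.0 note above, GH 55813).
# An illustrative sketch of the now-honored behavior (assumes this build):
#
#     import pandas as pd
#     idx = pd.DatetimeIndex(["01-02-2023"], dayfirst=True, tz="UTC")
#     # parses as 2023-02-01 00:00:00+00:00 (day first), not January 2nd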