Commit 07ceac4
Merge branch 'main' into patch-1
ggold7046 authored Oct 26, 2023
2 parents 6231fd2 + 0cbe41a
Showing 37 changed files with 284 additions and 139 deletions.
13 changes: 13 additions & 0 deletions doc/source/whatsnew/v2.1.2.rst
@@ -8,24 +8,37 @@ including other versions of pandas.

{{ header }}

.. ---------------------------------------------------------------------------
.. _whatsnew_212.deprecations:

Deprecations
~~~~~~~~~~~~

- Reverted deprecation of ``fill_method=None`` in :meth:`DataFrame.pct_change`, :meth:`Series.pct_change`, :meth:`DataFrameGroupBy.pct_change`, and :meth:`SeriesGroupBy.pct_change`; the values ``'backfill'``, ``'bfill'``, ``'pad'``, and ``'ffill'`` are still deprecated (:issue:`53491`)
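
The reverted default can be illustrated with a minimal numpy sketch of the ``pct_change`` computation (a hypothetical helper for illustration, not the pandas implementation): with ``fill_method=None`` missing values propagate into the result, while ``'ffill'`` (still deprecated as an argument) forward-fills first.

```python
import numpy as np

def pct_change(values, fill_method=None):
    # Minimal sketch of Series.pct_change semantics (not pandas code):
    # optionally forward-fill NaNs, then compute the fractional change
    # relative to the previous element.
    vals = np.array(values, dtype=float)  # always copy the input
    if fill_method == "ffill":
        for i in range(1, len(vals)):
            if np.isnan(vals[i]):
                vals[i] = vals[i - 1]
    shifted = np.concatenate(([np.nan], vals[:-1]))
    return vals / shifted - 1.0
```

With ``fill_method=None``, a NaN in the input yields NaN both at its own position and at the next one, since the change is undefined on either side of the gap.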

.. ---------------------------------------------------------------------------
.. _whatsnew_212.regressions:

Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
- Fixed regression in :meth:`~DataFrame.rolling` where non-nanosecond index or ``on`` column would produce incorrect results (:issue:`55026`, :issue:`55106`, :issue:`55299`)
- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`)
- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`)
- Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`)
- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)
- Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`)
- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`)
- Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`)
- Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`)

.. ---------------------------------------------------------------------------
.. _whatsnew_212.bug_fixes:

Bug fixes
~~~~~~~~~
- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`)
- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`)
- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`)
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
1 change: 0 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
@@ -288,7 +288,6 @@ Other Deprecations
- Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`)
- Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`)
- Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_220.performance:
14 changes: 6 additions & 8 deletions pandas/_libs/missing.pyx
@@ -32,8 +32,6 @@ from pandas._libs.tslibs.nattype cimport (
)
from pandas._libs.tslibs.np_datetime cimport (
get_datetime64_unit,
get_datetime64_value,
get_timedelta64_value,
import_pandas_datetime,
)

@@ -122,16 +120,16 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False
)
elif cnp.is_datetime64_object(left):
return (
get_datetime64_value(left) == NPY_NAT
cnp.get_datetime64_value(left) == NPY_NAT
and cnp.is_datetime64_object(right)
and get_datetime64_value(right) == NPY_NAT
and cnp.get_datetime64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
elif cnp.is_timedelta64_object(left):
return (
get_timedelta64_value(left) == NPY_NAT
cnp.get_timedelta64_value(left) == NPY_NAT
and cnp.is_timedelta64_object(right)
and get_timedelta64_value(right) == NPY_NAT
and cnp.get_timedelta64_value(right) == NPY_NAT
and get_datetime64_unit(left) == get_datetime64_unit(right)
)
elif is_decimal_na(left):
@@ -170,9 +168,9 @@ cpdef bint checknull(object val, bint inf_as_na=False):
return val == INF or val == NEGINF
return False
elif cnp.is_timedelta64_object(val):
return get_timedelta64_value(val) == NPY_NAT
return cnp.get_timedelta64_value(val) == NPY_NAT
elif cnp.is_datetime64_object(val):
return get_datetime64_value(val) == NPY_NAT
return cnp.get_datetime64_value(val) == NPY_NAT
else:
return is_decimal_na(val)

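
The datetime64/timedelta64 branches of ``is_matching_na`` above can be sketched in pure numpy (an illustrative stand-in, not the cython implementation): two NaTs only match when they are the same scalar kind and carry the same unit.

```python
import numpy as np

def is_matching_nat(left, right):
    # Sketch of the datetime64/timedelta64 branches of is_matching_na:
    # both sides must be NaT, of the same scalar type, with equal units.
    for kind in (np.datetime64, np.timedelta64):
        if isinstance(left, kind):
            return (
                isinstance(right, kind)
                and np.isnat(left)
                and np.isnat(right)
                and left.dtype == right.dtype  # dtype carries the unit
            )
    return False
```

The dtype comparison is what enforces the ``get_datetime64_unit(left) == get_datetime64_unit(right)`` condition in the cython code.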
3 changes: 1 addition & 2 deletions pandas/_libs/tslibs/conversion.pyx
@@ -41,7 +41,6 @@ from pandas._libs.tslibs.np_datetime cimport (
convert_reso,
get_conversion_factor,
get_datetime64_unit,
get_datetime64_value,
get_implementation_bounds,
import_pandas_datetime,
npy_datetime,
@@ -198,7 +197,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1:
NPY_DATETIMEUNIT unit
npy_datetime ival

ival = get_datetime64_value(val)
ival = cnp.get_datetime64_value(val)
if ival == NPY_NAT:
return NPY_NAT

8 changes: 2 additions & 6 deletions pandas/_libs/tslibs/nattype.pyx
@@ -21,10 +21,6 @@ from numpy cimport int64_t
cnp.import_array()

cimport pandas._libs.tslibs.util as util
from pandas._libs.tslibs.np_datetime cimport (
get_datetime64_value,
get_timedelta64_value,
)

# ----------------------------------------------------------------------
# Constants
@@ -1439,7 +1435,7 @@ cdef bint is_dt64nat(object val):
Is this a np.datetime64 object np.datetime64("NaT").
"""
if cnp.is_datetime64_object(val):
return get_datetime64_value(val) == NPY_NAT
return cnp.get_datetime64_value(val) == NPY_NAT
return False


@@ -1448,5 +1444,5 @@ cdef bint is_td64nat(object val):
Is this a np.timedelta64 object np.timedelta64("NaT").
"""
if cnp.is_timedelta64_object(val):
return get_timedelta64_value(val) == NPY_NAT
return cnp.get_timedelta64_value(val) == NPY_NAT
return False
7 changes: 0 additions & 7 deletions pandas/_libs/tslibs/np_datetime.pxd
@@ -25,11 +25,6 @@ cdef extern from "numpy/arrayscalars.h":
npy_datetime obval
PyArray_DatetimeMetaData obmeta

ctypedef struct PyTimedeltaScalarObject:
# PyObject_HEAD
npy_timedelta obval
PyArray_DatetimeMetaData obmeta

cdef extern from "numpy/ndarraytypes.h":
ctypedef struct npy_datetimestruct:
int64_t year
@@ -96,8 +91,6 @@ cdef int64_t pydate_to_dt64(
)
cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept

cdef npy_datetime get_datetime64_value(object obj) noexcept nogil
cdef npy_timedelta get_timedelta64_value(object obj) noexcept nogil
cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil

cdef int string_to_dts(
17 changes: 1 addition & 16 deletions pandas/_libs/tslibs/np_datetime.pyx
@@ -59,22 +59,6 @@ cdef extern from "pandas/datetime/pd_datetime.h":
# ----------------------------------------------------------------------
# numpy object inspection

cdef npy_datetime get_datetime64_value(object obj) noexcept nogil:
"""
returns the int64 value underlying scalar numpy datetime64 object
Note that to interpret this as a datetime, the corresponding unit is
also needed. That can be found using `get_datetime64_unit`.
"""
return (<PyDatetimeScalarObject*>obj).obval


cdef npy_timedelta get_timedelta64_value(object obj) noexcept nogil:
"""
returns the int64 value underlying scalar numpy timedelta64 object
"""
return (<PyTimedeltaScalarObject*>obj).obval


cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil:
"""
@@ -278,6 +262,7 @@ cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept:
dts.ps = dts.as = 0
return


cdef int64_t pydate_to_dt64(
date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=NPY_FR_ns
):
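
The removed ``get_datetime64_value``/``get_timedelta64_value`` helpers read the ``obval`` field of the numpy scalar struct; the cython code now calls ``cnp.get_datetime64_value`` and ``cnp.get_timedelta64_value`` directly. At the Python level, a rough equivalent (illustration only) recovers the raw tick count by viewing the scalar as ``int64``:

```python
import numpy as np

NPY_NAT = np.iinfo(np.int64).min  # the sentinel both helpers compare against

def scalar_value(obj):
    # Python-level analogue of cnp.get_datetime64_value /
    # cnp.get_timedelta64_value: reinterpret the scalar's payload as a raw
    # int64 tick count. The unit must be tracked separately, e.g. via
    # np.datetime_data(obj.dtype).
    return int(obj.view("i8"))
```

As the deleted docstring noted, the raw value is only meaningful together with the unit, which is why the cython callers also compare ``get_datetime64_unit`` results.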
3 changes: 1 addition & 2 deletions pandas/_libs/tslibs/period.pyx
@@ -53,7 +53,6 @@ from pandas._libs.tslibs.np_datetime cimport (
NPY_FR_D,
astype_overflowsafe,
check_dts_bounds,
get_timedelta64_value,
import_pandas_datetime,
npy_datetimestruct,
npy_datetimestruct_to_datetime,
@@ -1822,7 +1821,7 @@ cdef class _Period(PeriodMixin):

if (
cnp.is_timedelta64_object(other) and
get_timedelta64_value(other) == NPY_NAT
cnp.get_timedelta64_value(other) == NPY_NAT
):
# i.e. np.timedelta64("nat")
return NaT
13 changes: 6 additions & 7 deletions pandas/_libs/tslibs/timedeltas.pyx
@@ -63,7 +63,6 @@ from pandas._libs.tslibs.np_datetime cimport (
cmp_scalar,
convert_reso,
get_datetime64_unit,
get_timedelta64_value,
get_unit_from_dtype,
import_pandas_datetime,
npy_datetimestruct,
@@ -261,7 +260,7 @@ cpdef int64_t delta_to_nanoseconds(
"delta_to_nanoseconds does not support Y or M units, "
"as their duration in nanoseconds is ambiguous."
)
n = get_timedelta64_value(delta)
n = cnp.get_timedelta64_value(delta)

elif PyDelta_Check(delta):
in_reso = NPY_DATETIMEUNIT.NPY_FR_us
@@ -313,7 +312,7 @@ cdef object ensure_td64ns(object ts):
):
unitstr = npy_unit_to_abbrev(td64_unit)

td64_value = get_timedelta64_value(ts)
td64_value = cnp.get_timedelta64_value(ts)

mult = precision_from_unit(unitstr)[0]
try:
@@ -484,7 +483,7 @@ cdef int64_t _item_to_timedelta64(
See array_to_timedelta64.
"""
try:
return get_timedelta64_value(convert_to_timedelta64(item, parsed_unit))
return cnp.get_timedelta64_value(convert_to_timedelta64(item, parsed_unit))
except ValueError as err:
if errors == "coerce":
return NPY_NAT
@@ -1859,7 +1858,7 @@ class Timedelta(_Timedelta):
elif is_timedelta64_object(value):
# Retain the resolution if possible, otherwise cast to the nearest
# supported resolution.
new_value = get_timedelta64_value(value)
new_value = cnp.get_timedelta64_value(value)
if new_value == NPY_NAT:
# i.e. np.timedelta64("NaT")
return NaT
@@ -2223,7 +2222,7 @@ def truediv_object_array(ndarray left, ndarray right):
td64 = left[i]
obj = right[i]

if get_timedelta64_value(td64) == NPY_NAT:
if cnp.get_timedelta64_value(td64) == NPY_NAT:
# td here should be interpreted as a td64 NaT
if _should_cast_to_timedelta(obj):
res_value = np.nan
@@ -2252,7 +2251,7 @@ def floordiv_object_array(ndarray left, ndarray right):
td64 = left[i]
obj = right[i]

if get_timedelta64_value(td64) == NPY_NAT:
if cnp.get_timedelta64_value(td64) == NPY_NAT:
# td here should be interpreted as a td64 NaT
if _should_cast_to_timedelta(obj):
res_value = np.nan
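
The Y/M guard in ``delta_to_nanoseconds`` can be mirrored at the numpy level (a simplified sketch, not the cython routine): months and years have no fixed nanosecond duration, so they are rejected before any conversion is attempted.

```python
import numpy as np

def delta_to_ns(delta):
    # Simplified sketch of delta_to_nanoseconds for timedelta64 input:
    # reject ambiguous calendar units, then rescale to nanoseconds.
    unit = np.datetime_data(delta.dtype)[0]
    if unit in ("Y", "M"):
        raise ValueError(
            "delta_to_ns does not support Y or M units, "
            "as their duration in nanoseconds is ambiguous."
        )
    return int(delta.astype("timedelta64[ns]").view("i8"))
```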
3 changes: 1 addition & 2 deletions pandas/_libs/tslibs/timestamps.pyx
@@ -88,7 +88,6 @@ from pandas._libs.tslibs.np_datetime cimport (
cmp_scalar,
convert_reso,
get_datetime64_unit,
get_datetime64_value,
get_unit_from_dtype,
import_pandas_datetime,
npy_datetimestruct,
@@ -307,7 +306,7 @@ cdef class _Timestamp(ABCTimestamp):
NPY_DATETIMEUNIT reso

reso = get_datetime64_unit(dt64)
value = get_datetime64_value(dt64)
value = cnp.get_datetime64_value(dt64)
return cls._from_value_and_reso(value, reso, None)

# -----------------------------------------------------------------
2 changes: 0 additions & 2 deletions pandas/_libs/tslibs/util.pxd
@@ -33,8 +33,6 @@ cdef extern from "numpy/arrayobject.h":
PyTypeObject PyFloatingArrType_Type

cdef extern from "numpy/ndarrayobject.h":
PyTypeObject PyTimedeltaArrType_Type
PyTypeObject PyDatetimeArrType_Type
PyTypeObject PyComplexFloatingArrType_Type
PyTypeObject PyBoolArrType_Type

3 changes: 3 additions & 0 deletions pandas/core/arrays/base.py
@@ -2352,6 +2352,9 @@ def _groupby_op(
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
npvalues = self.to_numpy(object, na_value=np.nan)
else:
raise NotImplementedError(
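
The added lookup fails early, before paying for the object-dtype conversion, when the requested groupby op has no cython kernel. A generic sketch of the pattern (``SUPPORTED_OPS`` and the conversion are hypothetical stand-ins, not pandas API):

```python
SUPPORTED_OPS = {"any", "all"}  # hypothetical set of ops with a kernel

def groupby_op(values, how):
    # Validate the op *before* the costly conversion, mirroring the early
    # _get_cython_function lookup added in the diff above.
    if how not in SUPPORTED_OPS:
        raise NotImplementedError(f"{how} is not supported")
    converted = [str(v) for v in values]  # stand-in for the expensive step
    return converted
```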
12 changes: 10 additions & 2 deletions pandas/core/arrays/string_.py
@@ -228,11 +228,19 @@ def __from_arrow__(
# pyarrow.ChunkedArray
chunks = array.chunks

results = []
for arr in chunks:
# convert chunk by chunk to numpy and then concatenate, to avoid
# overflow for large string data when concatenating the pyarrow arrays
arr = arr.to_numpy(zero_copy_only=False)
arr = ensure_string_array(arr, na_value=libmissing.NA)
results.append(arr)

if len(chunks) == 0:
arr = np.array([], dtype=object)
else:
arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
arr = ensure_string_array(arr, na_value=libmissing.NA)
arr = np.concatenate(results)

# Bypass validation inside StringArray constructor, see GH#47781
new_string_array = StringArray.__new__(StringArray)
NDArrayBacked.__init__(
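
The chunk-by-chunk strategy above avoids materializing a single pyarrow array holding more than 2 GB of string data before conversion. A pyarrow-free sketch of the shape of the fix (plain sequences stand in for pyarrow chunks):

```python
import numpy as np

def chunks_to_object_array(chunks):
    # Convert each chunk to a small object-dtype array first, then
    # concatenate once at the end, so no oversized intermediate is built.
    results = [np.asarray(chunk, dtype=object) for chunk in chunks]
    if not results:
        return np.array([], dtype=object)
    return np.concatenate(results)
```

The empty-input branch mirrors the ``len(chunks) == 0`` case added in the diff.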
18 changes: 8 additions & 10 deletions pandas/core/frame.py
@@ -643,27 +643,25 @@ def _constructor(self) -> Callable[..., DataFrame]:
return DataFrame

def _constructor_from_mgr(self, mgr, axes):
df = self._from_mgr(mgr, axes=axes)
if type(self) is DataFrame:
# fastpath avoiding constructor call
return df
if self._constructor is DataFrame:
# we are pandas.DataFrame (or a subclass that doesn't override _constructor)
return self._from_mgr(mgr, axes=axes)
else:
assert axes is mgr.axes
return self._constructor(df, copy=False)
return self._constructor(mgr)

_constructor_sliced: Callable[..., Series] = Series

def _sliced_from_mgr(self, mgr, axes) -> Series:
return Series._from_mgr(mgr, axes)

def _constructor_sliced_from_mgr(self, mgr, axes):
ser = self._sliced_from_mgr(mgr, axes=axes)
ser._name = None # caller is responsible for setting real name
if type(self) is DataFrame:
# fastpath avoiding constructor call
if self._constructor_sliced is Series:
ser = self._sliced_from_mgr(mgr, axes)
ser._name = None # caller is responsible for setting real name
return ser
assert axes is mgr.axes
return self._constructor_sliced(ser, copy=False)
return self._constructor_sliced(mgr)

# ----------------------------------------------------------------------
# Constructors
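
The regression fix (GH#54922) routes through ``_constructor`` only when a subclass overrides it, keeping a cheap path otherwise. A toy sketch of the dispatch (``Base`` and ``Sub`` are stand-ins, not pandas classes):

```python
class Base:
    """Toy stand-in for DataFrame; not pandas code."""

    def __init__(self, data):
        # a subclass __init__ may enforce invariants; the fastpath skips it
        self.data = data

    @property
    def _constructor(self):
        # mirrors DataFrame._constructor returning the DataFrame class
        return Base

    def _from_mgr(self, mgr):
        # fastpath: allocate without running __init__
        obj = type(self).__new__(type(self))
        obj.data = mgr
        return obj

    def _constructor_from_mgr(self, mgr):
        if self._constructor is Base:
            # Base, or a subclass that does not override _constructor
            return self._from_mgr(mgr)
        # an overriding subclass gets its own constructor called
        return self._constructor(mgr)


class Sub(Base):
    def __init__(self, data):
        super().__init__(data)
        self.flagged = True  # invariant the fastpath must not skip

    @property
    def _constructor(self):
        return Sub
```

Because ``Sub`` overrides ``_constructor``, construction goes through ``Sub.__init__`` and its invariants are preserved, which is what the earlier ``type(self) is DataFrame`` check broke for such subclasses.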