Merge remote-tracking branch 'upstream/main' into warn_cow_mode_update
phofl committed Nov 21, 2023
2 parents 427417a + e8d9a32 commit 5dfdcd4
Showing 152 changed files with 2,866 additions and 2,603 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/comment-commands.yml
@@ -77,7 +77,7 @@ jobs:
          echo 'EOF' >> $GITHUB_ENV
          echo "REGEX=$REGEX" >> $GITHUB_ENV
-      - uses: actions/github-script@v6
+      - uses: actions/github-script@v7
        env:
          BENCH_OUTPUT: ${{env.BENCH_OUTPUT}}
          REGEX: ${{env.REGEX}}
2 changes: 1 addition & 1 deletion .github/workflows/deprecation-tracking-bot.yml
@@ -21,7 +21,7 @@ jobs:
    env:
      DEPRECATION_TRACKER_ISSUE: 50578
    steps:
-      - uses: actions/github-script@v6
+      - uses: actions/github-script@v7
        id: update-deprecation-issue
        with:
          script: |
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/indexing.py
@@ -306,6 +306,10 @@ def time_loc_null_slice_plus_slice(self, unique_levels):
        target = (self.tgt_null_slice, self.tgt_slice)
        self.df.loc[target, :]

    def time_loc_multiindex(self, unique_levels):
        target = self.df.index[::10]
        self.df.loc[target]

    def time_xs_level_0(self, unique_levels):
        target = self.tgt_scalar
        self.df.xs(target, level=0)
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/io/csv.py
@@ -621,4 +621,15 @@ def time_read_csv_index_col(self):
)


class ReadCSVCParserLowMemory:
    # GH 16798
    def setup(self):
        self.csv = StringIO(
            "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])
        )

    def peakmem_over_2gb_input(self):
        read_csv(self.csv, engine="c", low_memory=False)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
10 changes: 10 additions & 0 deletions doc/source/user_guide/copy_on_write.rst
@@ -6,6 +6,12 @@
Copy-on-Write (CoW)
*******************

.. note::

    Copy-on-Write will become the default in pandas 3.0. We recommend
    :ref:`turning it on now <copy_on_write_enabling>`
    to benefit from all improvements.
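
A minimal sketch of the opt-in (both spellings below use the public options API; see the linked section for details):

.. code-block:: python

    import pandas as pd

    pd.set_option("mode.copy_on_write", True)
    # equivalently:
    pd.options.mode.copy_on_write = True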

Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the
optimizations that become possible through CoW are implemented and supported. All possible
optimizations are supported starting from pandas 2.1.
@@ -123,6 +129,8 @@ CoW triggers a copy when ``df`` is changed to avoid mutating ``view`` as well:
    df
    view

.. _copy_on_write_chained_assignment:

Chained Assignment
------------------

@@ -238,6 +246,8 @@ and :meth:`DataFrame.rename`.
These methods return views when Copy-on-Write is enabled, which provides a significant
performance improvement compared to the regular execution.
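
A hedged sketch of that behavior (``rename`` is one of the methods listed above):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    df2 = df.rename(columns=str.lower)
    # With Copy-on-Write enabled, df2 initially shares df's data buffers;
    # an actual copy is deferred until one of the two objects is modified.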

.. _copy_on_write_enabling:

How to enable CoW
-----------------

48 changes: 48 additions & 0 deletions doc/source/user_guide/indexing.rst
@@ -1727,6 +1727,22 @@ You can assign a custom index to the ``index`` attribute:
Returning a view versus a copy
------------------------------

.. warning::

    :ref:`Copy-on-Write <copy_on_write>`
    will become the new default in pandas 3.0. This means that chained indexing will
    never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary
    anymore.
    See :ref:`this section <copy_on_write_chained_assignment>`
    for more context.
    We recommend turning Copy-on-Write on to leverage the improvements with

    .. code-block:: python

        pd.options.mode.copy_on_write = True

    even before pandas 3.0 is available.

When setting values in a pandas object, care must be taken to avoid what is called
``chained indexing``. Here is an example.

@@ -1765,6 +1781,22 @@ faster, and allows one to index *both* axes if so desired.
Why does assignment fail when using chained indexing?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. warning::

    :ref:`Copy-on-Write <copy_on_write>`
    will become the new default in pandas 3.0. This means that chained indexing will
    never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary
    anymore.
    See :ref:`this section <copy_on_write_chained_assignment>`
    for more context.
    We recommend turning Copy-on-Write on to leverage the improvements with

    .. code-block:: python

        pd.options.mode.copy_on_write = True

    even before pandas 3.0 is available.

The problem in the previous section is just a performance issue. What's up with
the ``SettingWithCopy`` warning? We don't **usually** throw warnings around when
you do something that might cost a few extra milliseconds!
@@ -1821,6 +1853,22 @@ Yikes!
Evaluation order matters
~~~~~~~~~~~~~~~~~~~~~~~~

.. warning::

    :ref:`Copy-on-Write <copy_on_write>`
    will become the new default in pandas 3.0. This means that chained indexing will
    never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary
    anymore.
    See :ref:`this section <copy_on_write_chained_assignment>`
    for more context.
    We recommend turning Copy-on-Write on to leverage the improvements with

    .. code-block:: python

        pd.options.mode.copy_on_write = True

    even before pandas 3.0 is available.

When you use chained indexing, the order and type of the indexing operation
partially determine whether the result is a slice into the original object, or
a copy of the slice.
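
A hedged sketch of the two orderings (``dfmi`` is illustrative; the section's own
continuation is collapsed here):

.. code-block:: python

    import numpy as np
    import pandas as pd

    dfmi = pd.DataFrame(
        np.arange(12).reshape(3, 4),
        columns=pd.MultiIndex.from_product([["one", "two"], ["first", "second"]]),
    )

    # Two calls: the second __getitem__ may act on a temporary copy.
    dfmi["one"]["second"]

    # One call: pandas resolves the full key against the original object.
    dfmi.loc[:, ("one", "second")]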
12 changes: 10 additions & 2 deletions doc/source/whatsnew/v2.2.0.rst
@@ -270,7 +270,9 @@ Other Deprecations
- Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`)
- Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`)
- Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`)
- Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`)
- Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`)
- Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`)
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`)
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`)
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`)
@@ -289,6 +291,7 @@ Other Deprecations
- Deprecated automatic downcasting of object-dtype results in :meth:`Series.replace` and :meth:`DataFrame.replace`, explicitly call ``result = result.infer_objects(copy=False)`` instead. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54710`)
- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`)
- Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`)
- Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`)
- Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`)
- Deprecated string ``AS`` denoting frequency in :class:`YearBegin` and strings ``AS-DEC``, ``AS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`)
- Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`)
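
As a hedged illustration of the :class:`PeriodIndex` entry above (:issue:`55960`), the keyword form migrates to the classmethod:

.. code-block:: python

    import pandas as pd

    # deprecated in 2.2:
    # pd.PeriodIndex(year=[2000, 2001], quarter=[1, 2])

    # use instead:
    pd.PeriodIndex.from_fields(year=[2000, 2001], quarter=[1, 2])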
@@ -316,10 +319,12 @@ Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` (:issue:`55949`, :issue:`55971`)
- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`)
- Performance improvement in :func:`get_dummies` (:issue:`56089`)
- Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`)
- Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`)
- Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`)
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` when indexing with a :class:`MultiIndex` (:issue:`56062`)
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
@@ -363,6 +368,8 @@ Datetimelike
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`)
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`)
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
-
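
A hedged sketch of the two parsing fixes above (:issue:`56051`, :issue:`55737`):

.. code-block:: python

    import pandas as pd

    # Non-ISO format with nanosecond digits: the sub-microsecond part is
    # no longer truncated.
    pd.Timestamp("01-01-2000 00:00:00.123456789")

    # Trailing zeros in the fraction no longer shrink the inferred
    # resolution below what the digits actually express.
    pd.Timestamp("2000-01-01 00:00:00.123000")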

Timedelta
@@ -374,7 +381,7 @@ Timezones
^^^^^^^^^
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
- Bug in :class:`Timestamp` construction with an ambiguous value and a ``pytz`` timezone failing to raise ``pytz.AmbiguousTimeError`` (:issue:`55657`)
-
- Bug in :meth:`Timestamp.tz_localize` with ``nonexistent="shift_forward"`` around UTC+0 during DST (:issue:`51501`)

Numeric
^^^^^^^
@@ -463,10 +470,11 @@ Reshaping
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
- Bug in :meth:`pandas.DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
- Bug in :meth:`pandas.DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`)

Sparse
^^^^^^
-
- Bug in :meth:`SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`)
-

ExtensionArray
10 changes: 7 additions & 3 deletions pandas/_libs/tslibs/conversion.pyx
@@ -508,7 +508,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
        npy_datetimestruct dts
        int out_local = 0, out_tzoffset = 0, string_to_dts_failed
        datetime dt
-        int64_t ival
+        int64_t ival, nanos = 0
        NPY_DATETIMEUNIT out_bestunit, reso
        _TSObject obj

@@ -560,10 +560,14 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
            return obj

        dt = parse_datetime_string(
-            ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit
+            ts,
+            dayfirst=dayfirst,
+            yearfirst=yearfirst,
+            out_bestunit=&out_bestunit,
+            nanos=&nanos,
        )
        reso = get_supported_reso(out_bestunit)
-        return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso)
+        return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso)

    return convert_datetime_to_tsobject(dt, tz)

4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/parsing.pxd
@@ -1,4 +1,5 @@
from cpython.datetime cimport datetime
from numpy cimport int64_t

from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT

@@ -10,5 +11,6 @@ cdef datetime parse_datetime_string(
    str date_string,
    bint dayfirst,
    bint yearfirst,
-    NPY_DATETIMEUNIT* out_bestunit
+    NPY_DATETIMEUNIT* out_bestunit,
+    int64_t* nanos,
)
58 changes: 47 additions & 11 deletions pandas/_libs/tslibs/parsing.pyx
@@ -34,6 +34,7 @@ from numpy cimport (
    PyArray_IterNew,
    flatiter,
    float64_t,
    int64_t,
)

cnp.import_array()
@@ -272,8 +273,11 @@ def py_parse_datetime_string(
    # parse_datetime_string cpdef bc it has a pointer argument)
    cdef:
        NPY_DATETIMEUNIT out_bestunit
        int64_t nanos

-    return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit)
+    return parse_datetime_string(
+        date_string, dayfirst, yearfirst, &out_bestunit, &nanos
+    )


cdef datetime parse_datetime_string(
@@ -283,7 +287,8 @@
    str date_string,
    bint dayfirst,
    bint yearfirst,
-    NPY_DATETIMEUNIT* out_bestunit
+    NPY_DATETIMEUNIT* out_bestunit,
+    int64_t* nanos,
):
    """
    Parse datetime string, only returns datetime.
@@ -311,7 +316,7 @@ cdef datetime parse_datetime_string(
        default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        dt = dateutil_parse(date_string, default=default,
                            dayfirst=dayfirst, yearfirst=yearfirst,
-                            ignoretz=False, out_bestunit=out_bestunit)
+                            ignoretz=False, out_bestunit=out_bestunit, nanos=nanos)
        return dt

    dt = _parse_delimited_date(date_string, dayfirst, out_bestunit)
@@ -330,7 +335,7 @@

    dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
                        dayfirst=dayfirst, yearfirst=yearfirst,
-                        ignoretz=False, out_bestunit=out_bestunit)
+                        ignoretz=False, out_bestunit=out_bestunit, nanos=nanos)
    return dt


@@ -436,7 +441,7 @@ def parse_datetime_string_with_reso(

    parsed = dateutil_parse(date_string, _DEFAULT_DATETIME,
                            dayfirst=dayfirst, yearfirst=yearfirst,
-                            ignoretz=False, out_bestunit=&out_bestunit)
+                            ignoretz=False, out_bestunit=&out_bestunit, nanos=NULL)
    reso = npy_unit_to_attrname[out_bestunit]
    return parsed, reso

@@ -639,7 +644,8 @@ cdef datetime dateutil_parse(
    bint ignoretz,
    bint dayfirst,
    bint yearfirst,
-    NPY_DATETIMEUNIT* out_bestunit
+    NPY_DATETIMEUNIT* out_bestunit,
+    int64_t* nanos,
):
    """ lifted from dateutil to get resolution"""

@@ -671,11 +677,8 @@ cdef datetime dateutil_parse(
    if reso is None:
        raise DateParseError(f"Unable to parse datetime string: {timestr}")

-    if reso == "microsecond":
-        if repl["microsecond"] == 0:
-            reso = "second"
-        elif repl["microsecond"] % 1000 == 0:
-            reso = "millisecond"
+    if reso == "microsecond" and repl["microsecond"] % 1000 == 0:
+        reso = _find_subsecond_reso(timestr, nanos=nanos)

    try:
        ret = default.replace(**repl)
@@ -745,6 +748,38 @@ cdef datetime dateutil_parse(
    return ret


cdef object _reso_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P<frac>\d+)")


cdef _find_subsecond_reso(str timestr, int64_t* nanos):
    # GH#55737
    # Check for trailing zeros in a H:M:S.f pattern
    match = _reso_pattern.search(timestr)
    if not match:
        reso = "second"
    else:
        frac = match.groupdict()["frac"]
        if len(frac) <= 3:
            reso = "millisecond"
        elif len(frac) > 6:
            if frac[6:] == "0" * len(frac[6:]):
                # corner case where we haven't lost any data
                reso = "nanosecond"
            elif len(frac) <= 9:
                reso = "nanosecond"
                if nanos is not NULL:
                    if len(frac) < 9:
                        frac = frac + "0" * (9 - len(frac))
                    nanos[0] = int(frac[6:])
            else:
                # TODO: should we warn/raise in higher-than-nano cases?
                reso = "nanosecond"
                if nanos is not NULL:
                    nanos[0] = int(frac[6:9])
        else:
            reso = "microsecond"
    return reso
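
A hedged summary of the branches above, keyed on the digits captured as ``frac`` (assuming ``_reso_pattern`` matched):

```python
# ".123"        -> "millisecond"  (1-3 digits)
# ".12345"      -> "microsecond"  (4-6 digits)
# ".12345600"   -> "nanosecond"   (digits past 6 all zero: no data lost)
# ".12345678"   -> "nanosecond",  nanos[0] = 780 (padded to 9 digits)
# ".1234567891" -> "nanosecond",  nanos[0] = 789 (digits past 9 dropped)
```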


# ----------------------------------------------------------------------
# Parsing for type-inference

@@ -916,6 +951,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
            yearfirst=False,
            ignoretz=False,
            out_bestunit=&out_bestunit,
            nanos=NULL,
        )
    except (ValueError, OverflowError, InvalidOperation):
        # In case the datetime can't be parsed, its format cannot be guessed
9 changes: 7 additions & 2 deletions pandas/_libs/tslibs/tzconversion.pyx
@@ -416,8 +416,13 @@ timedelta-like}

                else:
                    delta_idx = bisect_right_i8(info.tdata, new_local, info.ntrans)

-                    delta_idx = delta_idx - delta_idx_offset
+                    # Logic similar to the precompute section. But check the current
+                    # delta in case we are moving between UTC+0 and non-zero timezone
+                    if (shift_forward or shift_delta > 0) and \
+                            info.deltas[delta_idx - 1] >= 0:
+                        delta_idx = delta_idx - 1
+                    else:
+                        delta_idx = delta_idx - delta_idx_offset
                result[i] = new_local - info.deltas[delta_idx]
            elif fill_nonexist:
                result[i] = NPY_NAT
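
A hedged example of the user-facing behavior this branch fixes (the GH#51501 entry in the whatsnew above); Europe/London sits at UTC+0 in winter, and clocks jump from 01:00 to 02:00 on 2023-03-26:

```python
import pandas as pd

ts = pd.Timestamp("2023-03-26 01:30:00")  # nonexistent wall time in London
ts.tz_localize("Europe/London", nonexistent="shift_forward")
# expected result: Timestamp('2023-03-26 02:00:00+0100', tz='Europe/London')
```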
Expand Down