
Commit

Merge branch 'main' into fix_docstring_groupby
githubalexliu authored Dec 5, 2024
2 parents 9d2d4cf + a36c44e commit 2e25207
Showing 12 changed files with 97 additions and 41 deletions.
6 changes: 0 additions & 6 deletions ci/code_checks.sh
@@ -73,8 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Period.freq GL08" \
-i "pandas.Period.ordinal GL08" \
-i "pandas.RangeIndex.from_range PR01,SA01" \
-i "pandas.Series.dt.unit GL08" \
-i "pandas.Series.pad PR01,SA01" \
-i "pandas.Timedelta.max PR02" \
-i "pandas.Timedelta.min PR02" \
-i "pandas.Timedelta.resolution PR02" \
@@ -87,12 +85,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.arrays.NumpyExtensionArray SA01" \
-i "pandas.arrays.TimedeltaArray PR07,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-i "pandas.core.resample.Resampler.get_group RT03,SA01" \
-i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
-i "pandas.core.resample.Resampler.mean SA01" \
-i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -797,6 +797,7 @@ Other
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)

.. ***DO NOT USE THIS SECTION***
1 change: 1 addition & 0 deletions environment.yml
@@ -35,6 +35,7 @@ dependencies:
- hypothesis>=6.84.0
- gcsfs>=2022.11.0
- ipython
- pickleshare # Needed for IPython Sphinx directive in the docs GH#60429
- jinja2>=3.1.2
- lxml>=4.9.2
- matplotlib>=3.6.3
24 changes: 23 additions & 1 deletion pandas/core/arrays/datetimelike.py
@@ -2073,7 +2073,29 @@ def _creso(self) -> int:

@cache_readonly
def unit(self) -> str:
# e.g. "ns", "us", "ms"
"""
The precision unit of the datetime data.
Returns the precision unit for the dtype.
That is, the smallest time increment that can be stored within this dtype.
Returns
-------
str
Unit string representation (e.g. "ns").
See Also
--------
TimelikeOps.as_unit : Converts to a specific unit.
Examples
--------
>>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"])
>>> idx.unit
'ns'
>>> idx.as_unit("s").unit
's'
"""
# error: Argument 1 to "dtype_to_unit" has incompatible type
# "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]"
return dtype_to_unit(self.dtype) # type: ignore[arg-type]
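As an editor's illustrative sketch (not part of the diff): the same unit attribute is exposed on Series through the .dt accessor, which is why the pandas.Series.dt.unit GL08 ignore could be dropped from ci/code_checks.sh above. Behaviour is assumed to mirror the DatetimeIndex example in the docstring:
>>> import pandas as pd
>>> ser = pd.Series(pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]))
>>> ser.dt.unit
'ns'
>>> ser.dt.as_unit("ms").dt.unit
'ms'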
12 changes: 5 additions & 7 deletions pandas/core/dtypes/cast.py
@@ -87,8 +87,8 @@

if TYPE_CHECKING:
from collections.abc import (
Collection,
Sequence,
Sized,
)

from pandas._typing import (
@@ -1581,7 +1581,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj):
return _maybe_unbox_datetimelike(value, dtype)


def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray:
"""
Transform any list-like object in a 1-dimensional numpy array of object
dtype.
@@ -1599,11 +1599,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
-------
1-dimensional numpy array of dtype object
"""
# numpy will try to interpret nested lists as further dimensions, hence
# making a 1D array that contains list-likes is a bit tricky:
result = np.empty(len(values), dtype="object")
result[:] = values
return result
# numpy will try to interpret nested lists as further dimensions in np.array(),
# hence explicitly making a 1D array using np.fromiter
return np.fromiter(values, dtype="object", count=len(values))
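A small hypothetical sketch (not part of the diff) of the behaviour the comment describes: np.array() turns nested lists into extra dimensions, while np.fromiter() with an explicit count keeps each list-like as a single object element.
>>> import numpy as np
>>> values = [[1, 2], [3, 4]]
>>> np.array(values, dtype=object).shape  # nested lists become a 2-D array
(2, 2)
>>> out = np.fromiter(values, dtype="object", count=len(values))
>>> out.shape  # each inner list stays a single element of a 1-D array
(2,)
>>> out[0]
[1, 2]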


def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray:
8 changes: 8 additions & 0 deletions pandas/core/generic.py
@@ -3878,6 +3878,14 @@ def to_csv(
>>> import os # doctest: +SKIP
>>> os.makedirs("folder/subfolder", exist_ok=True) # doctest: +SKIP
>>> df.to_csv("folder/subfolder/out.csv") # doctest: +SKIP
Format floats to two decimal places:
>>> df.to_csv("out1.csv", float_format="%.2f") # doctest: +SKIP
Format floats using scientific notation:
>>> df.to_csv("out2.csv", float_format="{{:.2e}}".format) # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
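An illustrative, hedged example of what float_format does to the written values (data made up; calling to_csv without a path returns the CSV text as a string):
>>> df = pd.DataFrame({"x": [0.123456, 1.5]})
>>> print(df.to_csv(float_format="%.2f"), end="")
,x
0,0.12
1,1.50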

48 changes: 30 additions & 18 deletions pandas/core/groupby/generic.py
@@ -1321,8 +1321,8 @@ def idxmin(self, skipna: bool = True) -> Series:
Returns
-------
Index
Label of the minimum value.
Series
Indexes of minima in each group.
Raises
------
@@ -1374,8 +1374,8 @@ def idxmax(self, skipna: bool = True) -> Series:
Returns
-------
Index
Label of the maximum value.
Series
Indexes of maxima in each group.
Raises
------
@@ -2453,6 +2453,10 @@ def nunique(self, dropna: bool = True) -> DataFrame:
nunique: DataFrame
Counts of unique elements in each position.
See Also
--------
DataFrame.nunique : Count number of distinct elements in specified axis.
Examples
--------
>>> df = pd.DataFrame(
@@ -2508,8 +2512,8 @@ def idxmax(
Returns
-------
Series
Indexes of maxima in each group.
DataFrame
Indexes of maxima in each column according to the group.
Raises
------
@@ -2519,6 +2523,7 @@
See Also
--------
Series.idxmax : Return index of the maximum element.
DataFrame.idxmax : Indexes of maxima along the specified axis.
Notes
-----
@@ -2532,6 +2537,7 @@
... {
... "consumption": [10.51, 103.11, 55.48],
... "co2_emissions": [37.2, 19.66, 1712],
... "food_type": ["meat", "plant", "meat"],
... },
... index=["Pork", "Wheat Products", "Beef"],
... )
@@ -2542,12 +2548,14 @@
Wheat Products 103.11 19.66
Beef 55.48 1712.00
By default, it returns the index for the maximum value in each column.
By default, it returns the index for the maximum value in each column
according to the group.
>>> df.idxmax()
consumption Wheat Products
co2_emissions Beef
dtype: object
>>> df.groupby("food_type").idxmax()
consumption co2_emissions
food_type
meat Beef Beef
plant Wheat Products Wheat Products
"""
return self._idxmax_idxmin("idxmax", numeric_only=numeric_only, skipna=skipna)

@@ -2570,8 +2578,8 @@
Returns
-------
Series
Indexes of minima in each group.
DataFrame
Indexes of minima in each column according to the group.
Raises
------
@@ -2581,6 +2589,7 @@
See Also
--------
Series.idxmin : Return index of the minimum element.
DataFrame.idxmin : Indexes of minima along the specified axis.
Notes
-----
@@ -2594,6 +2603,7 @@
... {
... "consumption": [10.51, 103.11, 55.48],
... "co2_emissions": [37.2, 19.66, 1712],
... "food_type": ["meat", "plant", "meat"],
... },
... index=["Pork", "Wheat Products", "Beef"],
... )
@@ -2604,12 +2614,14 @@
Wheat Products 103.11 19.66
Beef 55.48 1712.00
By default, it returns the index for the minimum value in each column.
By default, it returns the index for the minimum value in each column
according to the group.
>>> df.idxmin()
consumption Pork
co2_emissions Wheat Products
dtype: object
>>> df.groupby("food_type").idxmin()
consumption co2_emissions
food_type
meat Pork Pork
plant Wheat Products Wheat Products
"""
return self._idxmax_idxmin("idxmin", numeric_only=numeric_only, skipna=skipna)

16 changes: 14 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -715,7 +715,19 @@ def get_group(self, name) -> DataFrame | Series:
Returns
-------
DataFrame or Series
Series or DataFrame
The Series or DataFrame corresponding to the group provided.
See Also
--------
DataFrameGroupBy.groups: Dictionary representation of the groupings formed
during a groupby operation.
DataFrameGroupBy.indices: Provides a mapping of group rows to positions
of the elements.
SeriesGroupBy.groups: Dictionary representation of the groupings formed
during a groupby operation.
SeriesGroupBy.indices: Provides a mapping of group rows to positions
of the elements.
Examples
--------
@@ -2660,8 +2672,8 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT:
See Also
--------
Series.sem : Return unbiased standard error of the mean over requested axis.
DataFrame.sem : Return unbiased standard error of the mean over requested axis.
Series.sem : Return unbiased standard error of the mean over requested axis.
Examples
--------
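As an editor's sketch of the get_group behaviour documented in the hunk above (not part of the diff; values are illustrative):
>>> ser = pd.Series(
...     [390.0, 350.0, 30.0, 20.0],
...     index=["Falcon", "Falcon", "Parrot", "Parrot"],
...     name="max_speed",
... )
>>> ser.groupby(level=0).get_group("Falcon")
Falcon    390.0
Falcon    350.0
Name: max_speed, dtype: float64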
8 changes: 4 additions & 4 deletions pandas/io/formats/format.py
@@ -669,9 +669,9 @@ def _truncate_horizontally(self) -> None:
assert self.max_cols_fitted is not None
col_num = self.max_cols_fitted // 2
if col_num >= 1:
left = self.tr_frame.iloc[:, :col_num]
right = self.tr_frame.iloc[:, -col_num:]
self.tr_frame = concat((left, right), axis=1)
_len = len(self.tr_frame.columns)
_slice = np.hstack([np.arange(col_num), np.arange(_len - col_num, _len)])
self.tr_frame = self.tr_frame.iloc[:, _slice]
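An illustrative sketch (not part of the diff) of the position-based selection above: picking the first and last col_num columns by integer position in a single .iloc call sidesteps concat(), which is what raised the ValueError when a DataFrame was stored in .attrs (GH#60455). Values here are hypothetical.
>>> import numpy as np
>>> df = pd.DataFrame([[0] * 10])
>>> df.attrs["meta"] = pd.DataFrame([])  # used to break the concat-based truncation
>>> col_num = 1
>>> _len = len(df.columns)
>>> keep = np.hstack([np.arange(col_num), np.arange(_len - col_num, _len)])
>>> df.iloc[:, keep]
   0  9
0  0  0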

# truncate formatter
if isinstance(self.formatters, (list, tuple)):
@@ -682,7 +682,7 @@ def _truncate_horizontally(self) -> None:
else:
col_num = cast(int, self.max_cols)
self.tr_frame = self.tr_frame.iloc[:, :col_num]
self.tr_col_num = col_num
self.tr_col_num: int = col_num

def _truncate_vertically(self) -> None:
"""Remove rows, which are not to be displayed.
6 changes: 3 additions & 3 deletions pandas/io/stata.py
@@ -2207,14 +2207,14 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict:
new_dict = {}
for key, value in convert_dates.items():
if not value.startswith("%"): # make sure proper fmts
if not convert_dates[key].startswith("%"): # make sure proper fmts
convert_dates[key] = "%" + value
if key in varlist:
new_dict[varlist.index(key)] = value
new_dict[varlist.index(key)] = convert_dates[key]
else:
if not isinstance(key, int):
raise ValueError("convert_dates key must be a column or an integer")
new_dict[key] = value
new_dict[key] = convert_dates[key]
return new_dict
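A distilled, hypothetical sketch of the fix's point (names and values made up): once the format string has been normalised in the dict, the new position-keyed mapping must be built from the updated dict value, not from the stale loop variable.
>>> convert_dates = {"date_col": "tc"}  # user-supplied Stata fmt without the leading "%"
>>> varlist = ["id", "date_col"]
>>> new_dict = {}
>>> for key, value in convert_dates.items():
...     if not convert_dates[key].startswith("%"):
...         convert_dates[key] = "%" + value
...     new_dict[varlist.index(key)] = convert_dates[key]  # updated value, not `value`
...
>>> new_dict
{1: '%tc'}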


7 changes: 7 additions & 0 deletions pandas/tests/io/formats/test_format.py
@@ -129,6 +129,13 @@ def test_repr_truncation_preserves_na(self):
with option_context("display.max_rows", 2, "display.show_dimensions", False):
assert repr(df) == " a\n0 <NA>\n.. ...\n9 <NA>"

def test_repr_truncation_dataframe_attrs(self):
# GH#60455
df = DataFrame([[0] * 10])
df.attrs["b"] = DataFrame([])
with option_context("display.max_columns", 2, "display.show_dimensions", False):
assert repr(df) == " 0 ... 9\n0 0 ... 0"

def test_max_colwidth_negative_int_raises(self):
# Deprecation enforced from:
# https://github.com/pandas-dev/pandas/issues/31532
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -24,6 +24,7 @@ html5lib>=1.1
hypothesis>=6.84.0
gcsfs>=2022.11.0
ipython
pickleshare
jinja2>=3.1.2
lxml>=4.9.2
matplotlib>=3.6.3
