From 86954016384c53745d6144af80da5957ad2e82fd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Dec 2024 19:34:25 +0100 Subject: [PATCH 1/4] PERF: improve construct_1d_object_array_from_listlike (#60461) * PERF: improve construct_1d_object_array_from_listlike * use np.fromiter and update annotation --- pandas/core/dtypes/cast.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 137a49c4487f6..02b9291da9b31 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -87,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -1581,7 +1581,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. 
@@ -1599,11 +1599,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype="object") - result[:] = values - return result + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter + return np.fromiter(values, dtype="object", count=len(values)) def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: From aa4b621172f2710cdb970e10248d669c5d9b5e0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=93scar=20G=C3=B3mez?= Date: Tue, 3 Dec 2024 19:39:25 +0100 Subject: [PATCH 2/4] DOC: Fix some docstring validations in pd.Series (#60481) * DOC: Fix some docstring validations in pd.Series * new circle --- ci/code_checks.sh | 2 -- pandas/core/arrays/datetimelike.py | 24 +++++++++++++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index dde98a01cc770..a21b87950cee1 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,8 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.Series.dt.unit GL08" \ - -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9c821bf0d184e..c6b6367e347ba 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2073,7 +2073,29 @@ def _creso(self) -> int: @cache_readonly def unit(self) -> str: - # e.g. "ns", "us", "ms" + """ + The precision unit of the datetime data. 
+ + Returns the precision unit for the dtype. + It means the smallest time frame that can be stored within this dtype. + + Returns + ------- + str + Unit string representation (e.g. "ns"). + + See Also + -------- + TimelikeOps.as_unit : Converts to a specific unit. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) + >>> idx.unit + 'ns' + >>> idx.as_unit("s").unit + 's' + """ # error: Argument 1 to "dtype_to_unit" has incompatible type # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] From 0c0938399cfb1c2a4baa9e83a03a0ada692246ed Mon Sep 17 00:00:00 2001 From: Chris <76128089+thedataninja1786@users.noreply.github.com> Date: Tue, 3 Dec 2024 20:40:09 +0200 Subject: [PATCH 3/4] Adds See Also sections to pandas.core.groupby.DataFrameGroupBy.sem, pandas.core.groupby.DataFrameGroupBy.nunique (#60480) * Added See Also Sections * pre-commit checks * Update code_checks.sh * Udpate code_checks.sh * Update ci/code_checks.sh --- ci/code_checks.sh | 4 ---- pandas/core/groupby/generic.py | 4 ++++ pandas/core/groupby/groupby.py | 5 +++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a21b87950cee1..f23481b3da3a2 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -86,19 +86,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ - -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ - -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ -i 
"pandas.core.resample.Resampler.max PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.mean SA01" \ -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.prod SA01" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ - -i "pandas.core.resample.Resampler.sem SA01" \ -i "pandas.core.resample.Resampler.std SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ec09892ede6..3a917e0147396 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2453,6 +2453,10 @@ def nunique(self, dropna: bool = True) -> DataFrame: nunique: DataFrame Counts of unique elements in each position. + See Also + -------- + DataFrame.nunique : Count number of distinct elements in specified axis. + Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 48d4e0456d4fa..e750c606a4c44 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2658,6 +2658,11 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: Series or DataFrame Standard error of the mean of values within each group. + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. 
+ Examples -------- For SeriesGroupBy: From 844b3191bd45b95cbaae341048bf7f367f086f2f Mon Sep 17 00:00:00 2001 From: Axeldnahcram <33946160+Axeldnahcram@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:42:59 +0100 Subject: [PATCH 4/4] DOC: DataFrameGroupBy.idxmin() returns DataFrame, documentation says Serie (#60474) * DOC: modify examples and return in docs * DOC: fix examples * DOC: unify * Whitespace * Pre commit * Double line breaks * DOC: finally rann pre commit * Remove unused notebook --- pandas/core/groupby/generic.py | 44 ++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3a917e0147396..3fa34007a739b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1321,8 +1321,8 @@ def idxmin(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the minimum value. + Series + Indexes of minima in each group. Raises ------ @@ -1374,8 +1374,8 @@ def idxmax(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the maximum value. + Series + Indexes of maxima in each group. Raises ------ @@ -2512,8 +2512,8 @@ def idxmax( Returns ------- - Series - Indexes of maxima in each group. + DataFrame + Indexes of maxima in each column according to the group. Raises ------ @@ -2523,6 +2523,7 @@ def idxmax( See Also -------- Series.idxmax : Return index of the maximum element. + DataFrame.idxmax : Indexes of maxima along the specified axis. Notes ----- @@ -2536,6 +2537,7 @@ def idxmax( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... ) @@ -2546,12 +2548,14 @@ def idxmax( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the maximum value in each column. 
+ By default, it returns the index for the maximum value in each column + according to the group. - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object + >>> df.groupby("food_type").idxmax() + consumption co2_emissions + food_type + meat Beef Beef + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmax", numeric_only=numeric_only, skipna=skipna) @@ -2574,8 +2578,8 @@ def idxmin( Returns ------- - Series - Indexes of minima in each group. + DataFrame + Indexes of minima in each column according to the group. Raises ------ @@ -2585,6 +2589,7 @@ def idxmin( See Also -------- Series.idxmin : Return index of the minimum element. + DataFrame.idxmin : Indexes of minima along the specified axis. Notes ----- @@ -2598,6 +2603,7 @@ def idxmin( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... ) @@ -2608,12 +2614,14 @@ def idxmin( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the minimum value in each column. + By default, it returns the index for the minimum value in each column + according to the group. - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object + >>> df.groupby("food_type").idxmin() + consumption co2_emissions + food_type + meat Pork Pork + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmin", numeric_only=numeric_only, skipna=skipna)