Merge branch 'main' into string

pandas-dev · Dec 17, 2024 · 48dc7fd · 48dc7fd
2 parents 043c667 + 1e530b6
commit 48dc7fd
Show file tree

Hide file tree

Showing 35 changed files with 268 additions and 96 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -34,7 +34,6 @@ jobs:
             fi
             python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
             PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
-            sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
             ci/run_tests.sh
   test-linux-musl:
     docker:

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -385,10 +385,12 @@ jobs:
           nogil: true
 
       - name: Build Environment
+        # TODO: Once numpy 2.2.1 is out, don't install nightly version
+        # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955
         run: |
           python --version
-          python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1
-          python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
+          python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
+          python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy
           python -m pip install versioneer[toml]
           python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
           python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror"

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -81,7 +81,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-        -i "pandas.arrays.IntervalArray.length SA01" \
         -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
@@ -94,11 +93,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.resample.Resampler.std SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.var SA01" \
-        -i "pandas.errors.UndefinedVariableError PR01,SA01" \
         -i "pandas.errors.ValueLabelTypeMismatch SA01" \
-        -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
         -i "pandas.plotting.andrews_curves RT03,SA01" \
-        -i "pandas.plotting.scatter_matrix PR07,SA01" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
         -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
         -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -56,6 +56,7 @@ Other enhancements
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
+- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - :meth:`str.get_dummies` now accepts a  ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
@@ -801,6 +802,7 @@ Other
 - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
 - Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
 - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
+- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
 
 .. ***DO NOT USE THIS SECTION***
 

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -1306,6 +1306,20 @@ def length(self) -> Index:
         """
         Return an Index with entries denoting the length of each Interval.
 
+        The length of an interval is calculated as the difference between
+        its `right` and `left` bounds. This property is particularly useful
+        when working with intervals where the size of the interval is an important
+        attribute, such as in time-series analysis or spatial data analysis.
+
+        See Also
+        --------
+        arrays.IntervalArray.left : Return the left endpoints of each Interval in
+            the IntervalArray as an Index.
+        arrays.IntervalArray.right : Return the right endpoints of each Interval in
+            the IntervalArray as an Index.
+        arrays.IntervalArray.mid : Return the midpoint of each Interval in the
+            IntervalArray as an Index.
+
         Examples
         --------
 

diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py
@@ -108,7 +108,7 @@ def _evaluate_numexpr(op, op_str, left_op, right_op):
         try:
             result = ne.evaluate(
                 f"left_value {op_str} right_value",
-                local_dict={"left_value": left_value, "right_value": right_op},
+                local_dict={"left_value": left_value, "right_value": right_value},
                 casting="safe",
             )
         except TypeError:
@@ -257,11 +257,10 @@ def where(cond, left_op, right_op, use_numexpr: bool = True):
         Whether to try to use numexpr.
     """
     assert _where is not None
-    return (
-        _where(cond, left_op, right_op)
-        if use_numexpr
-        else _where_standard(cond, left_op, right_op)
-    )
+    if use_numexpr:
+        return _where(cond, left_op, right_op)
+    else:
+        return _where_standard(cond, left_op, right_op)
 
 
 def set_test_mode(v: bool = True) -> None:

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -430,7 +430,7 @@ def is_period_dtype(arr_or_dtype) -> bool:
     Check whether an array-like or dtype is of the Period dtype.
 
     .. deprecated:: 2.2.0
-        Use isinstance(dtype, pd.Period) instead.
+        Use isinstance(dtype, pd.PeriodDtype) instead.
 
     Parameters
     ----------

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -665,7 +665,7 @@ def size(self) -> int:
 
         See Also
         --------
-        ndarray.size : Number of elements in the array.
+        numpy.ndarray.size : Number of elements in the array.
 
         Examples
         --------

diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py
@@ -490,7 +490,7 @@ def online(
         klass="Series/Dataframe",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         return super().aggregate(func, *args, **kwargs)
 
     agg = aggregate
@@ -981,7 +981,7 @@ def reset(self) -> None:
         """
         self._mean.reset()
 
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         raise NotImplementedError("aggregate is not implemented.")
 
     def std(self, bias: bool = False, *args, **kwargs):

diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
@@ -167,7 +167,7 @@ def _get_window_indexer(self) -> BaseIndexer:
         klass="Series/Dataframe",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         return super().aggregate(func, *args, **kwargs)
 
     agg = aggregate

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -44,7 +44,10 @@
 
 from pandas.core._numba import executor
 from pandas.core.algorithms import factorize
-from pandas.core.apply import ResamplerWindowApply
+from pandas.core.apply import (
+    ResamplerWindowApply,
+    reconstruct_func,
+)
 from pandas.core.arrays import ExtensionArray
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
@@ -646,8 +649,12 @@ def _numba_apply(
             out = obj._constructor(result, index=index, columns=columns)
             return self._resolve_output(out, obj)
 
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
+        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
         result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
+        if isinstance(result, ABCDataFrame) and relabeling:
+            result = result.iloc[:, order]
+            result.columns = columns  # type: ignore[union-attr]
         if result is None:
             return self.apply(func, raw=False, args=args, kwargs=kwargs)
         return result
@@ -1239,7 +1246,7 @@ def calc(x):
         klass="Series/DataFrame",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
         if result is None:
             # these must apply directly
@@ -1951,7 +1958,7 @@ def _raise_monotonic_error(self, msg: str):
         klass="Series/Dataframe",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         return super().aggregate(func, *args, **kwargs)
 
     agg = aggregate

diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
@@ -588,6 +588,20 @@ class UndefinedVariableError(NameError):
 
     It will also specify whether the undefined variable is local or not.
 
+    Parameters
+    ----------
+    name : str
+        The name of the undefined variable.
+    is_local : bool or None, optional
+        Indicates whether the undefined variable is considered a local variable.
+        If ``True``, the error message specifies it as a local variable.
+        If ``False`` or ``None``, the variable is treated as a non-local name.
+
+    See Also
+    --------
+    DataFrame.query : Query the columns of a DataFrame with a boolean expression.
+    DataFrame.eval : Evaluate a string describing operations on DataFrame columns.
+
     Examples
     --------
     >>> df = pd.DataFrame({"A": [1, 1, 1]})

diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -78,7 +78,6 @@
 )
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.indexes.timedeltas import TimedeltaIndex
-from pandas.core.reshape.concat import concat
 
 from pandas.io.common import (
     check_parent_directory,
@@ -245,7 +244,11 @@ def _chk_truncate(self) -> None:
                 series = series.iloc[:max_rows]
             else:
                 row_num = max_rows // 2
-                series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
+                _len = len(series)
+                _slice = np.hstack(
+                    [np.arange(row_num), np.arange(_len - row_num, _len)]
+                )
+                series = series.iloc[_slice]
             self.tr_row_num = row_num
         else:
             self.tr_row_num = None

diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py
@@ -239,9 +239,16 @@ def build_table_schema(
     """
     Create a Table schema from ``data``.
 
+    This method is a utility to generate a JSON-serializable schema
+    representation of a pandas Series or DataFrame, compatible with the
+    Table Schema specification. It enables structured data to be shared
+    and validated in various applications, ensuring consistency and
+    interoperability.
+
     Parameters
     ----------
-    data : Series, DataFrame
+    data : Series or DataFrame
+        The input data for which the table schema is to be created.
     index : bool, default True
         Whether to include ``data.index`` in the schema.
     primary_key : bool or None, default True
@@ -256,6 +263,12 @@ def build_table_schema(
     Returns
     -------
     dict
+        A dictionary representing the Table schema.
+
+    See Also
+    --------
+    DataFrame.to_json : Convert the object to a JSON string.
+    read_json : Convert a JSON string to pandas object.
 
     Notes
     -----

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -241,7 +241,7 @@ def read_sql_table(  # pyright: ignore[reportOverlappingOverload]
     schema=...,
     index_col: str | list[str] | None = ...,
     coerce_float=...,
-    parse_dates: list[str] | dict[str, str] | None = ...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
     columns: list[str] | None = ...,
     chunksize: None = ...,
     dtype_backend: DtypeBackend | lib.NoDefault = ...,
@@ -255,7 +255,7 @@ def read_sql_table(
     schema=...,
     index_col: str | list[str] | None = ...,
     coerce_float=...,
-    parse_dates: list[str] | dict[str, str] | None = ...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
     columns: list[str] | None = ...,
     chunksize: int = ...,
     dtype_backend: DtypeBackend | lib.NoDefault = ...,
@@ -268,7 +268,7 @@ def read_sql_table(
     schema: str | None = None,
     index_col: str | list[str] | None = None,
     coerce_float: bool = True,
-    parse_dates: list[str] | dict[str, str] | None = None,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None,
     columns: list[str] | None = None,
     chunksize: int | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
@@ -372,7 +372,7 @@ def read_sql_query(  # pyright: ignore[reportOverlappingOverload]
     index_col: str | list[str] | None = ...,
     coerce_float=...,
     params: list[Any] | Mapping[str, Any] | None = ...,
-    parse_dates: list[str] | dict[str, str] | None = ...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
     chunksize: None = ...,
     dtype: DtypeArg | None = ...,
     dtype_backend: DtypeBackend | lib.NoDefault = ...,
@@ -386,7 +386,7 @@ def read_sql_query(
     index_col: str | list[str] | None = ...,
     coerce_float=...,
     params: list[Any] | Mapping[str, Any] | None = ...,
-    parse_dates: list[str] | dict[str, str] | None = ...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
     chunksize: int = ...,
     dtype: DtypeArg | None = ...,
     dtype_backend: DtypeBackend | lib.NoDefault = ...,
@@ -399,7 +399,7 @@ def read_sql_query(
     index_col: str | list[str] | None = None,
     coerce_float: bool = True,
     params: list[Any] | Mapping[str, Any] | None = None,
-    parse_dates: list[str] | dict[str, str] | None = None,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None,
     chunksize: int | None = None,
     dtype: DtypeArg | None = None,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,

diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
@@ -178,14 +178,21 @@ def scatter_matrix(
     """
     Draw a matrix of scatter plots.
 
+    Each pair of numeric columns in the DataFrame is plotted against each other,
+    resulting in a matrix of scatter plots. The diagonal plots can display either
+    histograms or Kernel Density Estimation (KDE) plots for each variable.
+
     Parameters
     ----------
     frame : DataFrame
+        The data to be plotted.
     alpha : float, optional
         Amount of transparency applied.
     figsize : (float,float), optional
         A tuple (width, height) in inches.
     ax : Matplotlib axis object, optional
+        An existing Matplotlib axis object for the plots. If None, a new axis is
+        created.
     grid : bool, optional
         Setting this to True will show the grid.
     diagonal : {'hist', 'kde'}
@@ -208,6 +215,14 @@ def scatter_matrix(
     numpy.ndarray
         A matrix of scatter plots.
 
+    See Also
+    --------
+    plotting.parallel_coordinates : Plots parallel coordinates for multivariate data.
+    plotting.andrews_curves : Generates Andrews curves for visualizing clusters of
+        multivariate data.
+    plotting.radviz : Creates a RadViz visualization.
+    plotting.bootstrap_plot : Visualizes uncertainty in data via bootstrap sampling.
+
     Examples
     --------
 

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -1647,7 +1647,7 @@ def test_from_arrow_respecting_given_dtype():
 
 def test_from_arrow_respecting_given_dtype_unsafe():
     array = pa.array([1.5, 2.5], type=pa.float64())
-    with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"):
+    with tm.external_error_raised(pa.ArrowInvalid):
         array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get)