Merge branch 'main' into table_prefixes

pandas-dev · Dec 4, 2024 · 3c8a12e · 3c8a12e
2 parents b595021 + cfd0d3f
commit 3c8a12e
Show file tree

Hide file tree

Showing 83 changed files with 487 additions and 367 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.2
+    rev: v0.8.1
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -47,7 +47,7 @@ repos:
         types_or: [python, rst, markdown, cython, c]
         additional_dependencies: [tomli]
 -   repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.2
+    rev: v0.16.6
     hooks:
     -   id: cython-lint
     -   id: double-quote-cython-strings
@@ -95,7 +95,7 @@ repos:
     - id: sphinx-lint
       args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.3
+    rev: v19.1.4
     hooks:
     - id: clang-format
       files: ^pandas/_libs/src|^pandas/_libs/include

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -511,8 +511,7 @@ def setup(self, dtype, method, application, ncols, engine):
         # grouping on multiple columns
         # and we lack kernels for a bunch of methods
         if (
-            engine == "numba"
-            and method in _numba_unsupported_methods
+            (engine == "numba" and method in _numba_unsupported_methods)
             or ncols > 1
             or application == "transformation"
             or dtype == "datetime"

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -73,8 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Period.freq GL08" \
         -i "pandas.Period.ordinal GL08" \
         -i "pandas.RangeIndex.from_range PR01,SA01" \
-        -i "pandas.Series.dt.unit GL08" \
-        -i "pandas.Series.pad PR01,SA01" \
         -i "pandas.Timedelta.max PR02" \
         -i "pandas.Timedelta.min PR02" \
         -i "pandas.Timedelta.resolution PR02" \
@@ -88,19 +86,15 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-        -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-        -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
         -i "pandas.core.resample.Resampler.get_group RT03,SA01" \
         -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.mean SA01" \
         -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.prod SA01" \
         -i "pandas.core.resample.Resampler.quantile PR01,PR07" \
-        -i "pandas.core.resample.Resampler.sem SA01" \
         -i "pandas.core.resample.Resampler.std SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.var SA01" \

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -667,6 +667,7 @@ Indexing
 ^^^^^^^^
 - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
 - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`)
+- Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`)
 - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`)
 
 Missing
@@ -700,6 +701,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
@@ -736,6 +738,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.cumsum` and :meth:`DataFrameGroupBy.cumprod` where ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly. (:issue:`58811`)
 - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`)
 - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
+- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
 - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
 - Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`)
 
@@ -794,6 +797,7 @@ Other
 - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
 - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
+- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
 
 .. ***DO NOT USE THIS SECTION***
 

diff --git a/environment.yml b/environment.yml
@@ -35,6 +35,7 @@ dependencies:
   - hypothesis>=6.84.0
   - gcsfs>=2022.11.0
   - ipython
+  - pickleshare  # Needed for IPython Sphinx directive in the docs GH#60429
   - jinja2>=3.1.2
   - lxml>=4.9.2
   - matplotlib>=3.6.3

diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -235,6 +235,7 @@
 # Pandas is not (yet) a py.typed library: the public API is determined
 # based on the documentation.
 __all__ = [
+    "NA",
     "ArrowDtype",
     "BooleanDtype",
     "Categorical",
@@ -253,15 +254,14 @@
     "HDFStore",
     "Index",
     "IndexSlice",
+    "Int8Dtype",
     "Int16Dtype",
     "Int32Dtype",
     "Int64Dtype",
-    "Int8Dtype",
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
     "MultiIndex",
-    "NA",
     "NaT",
     "NamedAgg",
     "Period",
@@ -274,10 +274,10 @@
     "Timedelta",
     "TimedeltaIndex",
     "Timestamp",
+    "UInt8Dtype",
     "UInt16Dtype",
     "UInt32Dtype",
     "UInt64Dtype",
-    "UInt8Dtype",
     "api",
     "array",
     "arrays",
@@ -290,8 +290,8 @@
     "errors",
     "eval",
     "factorize",
-    "get_dummies",
     "from_dummies",
+    "get_dummies",
     "get_option",
     "infer_freq",
     "interval_range",

diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
@@ -8,13 +8,13 @@
 
 __all__ = [
     "config",
+    "describe_option",
     "detect_console_encoding",
     "get_option",
-    "set_option",
-    "reset_option",
-    "describe_option",
     "option_context",
     "options",
+    "reset_option",
+    "set_option",
 ]
 from pandas._config import config
 from pandas._config import dates  # pyright: ignore[reportUnusedImport]  # noqa: F401

diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py
@@ -1,12 +1,12 @@
 __all__ = [
+    "Interval",
     "NaT",
     "NaTType",
     "OutOfBoundsDatetime",
     "Period",
     "Timedelta",
     "Timestamp",
     "iNaT",
-    "Interval",
 ]
 
 

diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py
@@ -1,39 +1,39 @@
 __all__ = [
-    "dtypes",
-    "localize_pydatetime",
+    "BaseOffset",
+    "IncompatibleFrequency",
     "NaT",
     "NaTType",
-    "iNaT",
-    "nat_strings",
     "OutOfBoundsDatetime",
     "OutOfBoundsTimedelta",
-    "IncompatibleFrequency",
     "Period",
     "Resolution",
+    "Tick",
     "Timedelta",
-    "normalize_i8_timestamps",
-    "is_date_array_normalized",
-    "dt64arr_to_periodarr",
+    "Timestamp",
+    "add_overflowsafe",
+    "astype_overflowsafe",
     "delta_to_nanoseconds",
+    "dt64arr_to_periodarr",
+    "dtypes",
+    "get_resolution",
+    "get_supported_dtype",
+    "get_unit_from_dtype",
+    "guess_datetime_format",
+    "iNaT",
     "ints_to_pydatetime",
     "ints_to_pytimedelta",
-    "get_resolution",
-    "Timestamp",
-    "tz_convert_from_utc_single",
-    "tz_convert_from_utc",
-    "to_offset",
-    "Tick",
-    "BaseOffset",
-    "tz_compare",
+    "is_date_array_normalized",
+    "is_supported_dtype",
     "is_unitless",
-    "astype_overflowsafe",
-    "get_unit_from_dtype",
+    "localize_pydatetime",
+    "nat_strings",
+    "normalize_i8_timestamps",
     "periods_per_day",
     "periods_per_second",
-    "guess_datetime_format",
-    "add_overflowsafe",
-    "get_supported_dtype",
-    "is_supported_dtype",
+    "to_offset",
+    "tz_compare",
+    "tz_convert_from_utc",
+    "tz_convert_from_utc_single",
 ]
 
 from pandas._libs.tslibs import dtypes

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -540,6 +540,25 @@ def shares_memory(left, right) -> bool:
     "ALL_INT_NUMPY_DTYPES",
     "ALL_NUMPY_DTYPES",
     "ALL_REAL_NUMPY_DTYPES",
+    "BOOL_DTYPES",
+    "BYTES_DTYPES",
+    "COMPLEX_DTYPES",
+    "DATETIME64_DTYPES",
+    "ENDIAN",
+    "FLOAT_EA_DTYPES",
+    "FLOAT_NUMPY_DTYPES",
+    "NARROW_NP_DTYPES",
+    "NP_NAT_OBJECTS",
+    "NULL_OBJECTS",
+    "OBJECT_DTYPES",
+    "SIGNED_INT_EA_DTYPES",
+    "SIGNED_INT_NUMPY_DTYPES",
+    "STRING_DTYPES",
+    "TIMEDELTA64_DTYPES",
+    "UNSIGNED_INT_EA_DTYPES",
+    "UNSIGNED_INT_NUMPY_DTYPES",
+    "SubclassedDataFrame",
+    "SubclassedSeries",
     "assert_almost_equal",
     "assert_attr_equal",
     "assert_categorical_equal",
@@ -563,51 +582,32 @@ def shares_memory(left, right) -> bool:
     "assert_sp_array_equal",
     "assert_timedelta_array_equal",
     "at",
-    "BOOL_DTYPES",
     "box_expected",
-    "BYTES_DTYPES",
     "can_set_locale",
-    "COMPLEX_DTYPES",
     "convert_rows_list_to_csv_str",
-    "DATETIME64_DTYPES",
     "decompress_file",
-    "ENDIAN",
     "ensure_clean",
     "external_error_raised",
-    "FLOAT_EA_DTYPES",
-    "FLOAT_NUMPY_DTYPES",
     "get_cython_table_params",
     "get_dtype",
-    "getitem",
-    "get_locales",
     "get_finest_unit",
+    "get_locales",
     "get_obj",
     "get_op_from_name",
+    "getitem",
     "iat",
     "iloc",
     "loc",
     "maybe_produces_warning",
-    "NARROW_NP_DTYPES",
-    "NP_NAT_OBJECTS",
-    "NULL_OBJECTS",
-    "OBJECT_DTYPES",
     "raise_assert_detail",
     "raises_chained_assignment_error",
     "round_trip_pathlib",
     "round_trip_pickle",
-    "setitem",
     "set_locale",
     "set_timezone",
+    "setitem",
     "shares_memory",
-    "SIGNED_INT_EA_DTYPES",
-    "SIGNED_INT_NUMPY_DTYPES",
-    "STRING_DTYPES",
-    "SubclassedDataFrame",
-    "SubclassedSeries",
-    "TIMEDELTA64_DTYPES",
     "to_array",
-    "UNSIGNED_INT_EA_DTYPES",
-    "UNSIGNED_INT_NUMPY_DTYPES",
     "with_csv_dialect",
     "write_to_compressed",
 ]
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -755,11 +755,8 @@ def assert_extension_array_equal(
         and atol is lib.no_default
     ):
         check_exact = (
-            is_numeric_dtype(left.dtype)
-            and not is_float_dtype(left.dtype)
-            or is_numeric_dtype(right.dtype)
-            and not is_float_dtype(right.dtype)
-        )
+            is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)
+        ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
     elif check_exact is lib.no_default:
         check_exact = False
 
@@ -944,11 +941,8 @@ def assert_series_equal(
         and atol is lib.no_default
     ):
         check_exact = (
-            is_numeric_dtype(left.dtype)
-            and not is_float_dtype(left.dtype)
-            or is_numeric_dtype(right.dtype)
-            and not is_float_dtype(right.dtype)
-        )
+            is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)
+        ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
         left_index_dtypes = (
             [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes
         )

diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -273,7 +273,7 @@ def mode(self) -> str:
         # for _get_filepath_or_buffer
         ...
 
-    def seek(self, __offset: int, __whence: int = ...) -> int:
+    def seek(self, offset: int, whence: int = ..., /) -> int:
         # with one argument: gzip.GzipFile, bz2.BZ2File
         # with two arguments: zip.ZipFile, read_sas
         ...
@@ -288,13 +288,13 @@ def tell(self) -> int:
 
 
 class ReadBuffer(BaseBuffer, Protocol[AnyStr_co]):
-    def read(self, __n: int = ...) -> AnyStr_co:
+    def read(self, n: int = ..., /) -> AnyStr_co:
         # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File
         ...
 
 
 class WriteBuffer(BaseBuffer, Protocol[AnyStr_contra]):
-    def write(self, __b: AnyStr_contra) -> Any:
+    def write(self, b: AnyStr_contra, /) -> Any:
         # for gzip.GzipFile, bz2.BZ2File
         ...
 

diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py
@@ -9,9 +9,9 @@
 )
 
 __all__ = [
-    "interchange",
     "extensions",
     "indexers",
+    "interchange",
     "types",
     "typing",
 ]