diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 354402c572ade..32ca5573ac08a 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -152,7 +152,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.21.3 + uses: pypa/cibuildwheel@v2.22.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 09912bfb6c349..b7b9b1818c122 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.2 + rev: v0.8.1 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -47,7 +47,7 @@ repos: types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.2 + rev: v0.16.6 hooks: - id: cython-lint - id: double-quote-cython-strings @@ -95,7 +95,7 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.3 + rev: v19.1.4 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index abffa1f702b9c..19c556dfe9d1f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -511,8 +511,7 @@ def setup(self, dtype, method, application, ncols, engine): # grouping on multiple columns # and we lack kernels for a bunch of methods if ( - engine == "numba" - and method in _numba_unsupported_methods + (engine == "numba" and method in _numba_unsupported_methods) or ncols > 1 or application == "transformation" or dtype == "datetime" diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 379f7cb5f037d..adc5bc9a01bdd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,8 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.Series.dt.unit GL08" \ - -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ @@ -82,49 +80,27 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ - -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ - -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ - -i 
"pandas.core.resample.Resampler.indices SA01" \ -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.mean SA01" \ -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.prod SA01" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ - -i "pandas.core.resample.Resampler.sem SA01" \ -i "pandas.core.resample.Resampler.std SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.errors.ChainedAssignmentError SA01" \ - -i "pandas.errors.DuplicateLabelError SA01" \ - -i "pandas.errors.IntCastingNaNError SA01" \ - -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ - -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ - -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ - -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ - -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/doc/source/conf.py b/doc/source/conf.py index ddbda0aa3bf65..677ee6274b093 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -242,7 +242,6 @@ "external_links": [], "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", - "twitter_url": "https://twitter.com/pandas_dev", "analytics": { "plausible_analytics_domain": "pandas.pydata.org", "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", @@ -258,6 +257,11 @@ # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be) "show_version_warning_banner": False, "icon_links": [ + { + "name": "X", + "url": "https://x.com/pandas_dev", + "icon": "fa-brands fa-square-x-twitter", + }, { "name": "Mastodon", "url": "https://fosstodon.org/@pandas_dev", diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b3982c4ad091f..bda959f380e8a 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -193,25 +193,25 @@ Visualization Installable with ``pip install "pandas[plot, output-formatting]"``. 
-========================= ================== ================== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== ================== ============================================================= -matplotlib 3.6.3 plot Plotting library -Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style -tabulate 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_) -========================= ================== ================== ============================================================= +========================================================== ================== ================== ======================================================= +Dependency Minimum Version pip extra Notes +========================================================== ================== ================== ======================================================= +`matplotlib `__ 3.6.3 plot Plotting library +`Jinja2 `__ 3.1.2 output-formatting Conditional formatting with DataFrame.style +`tabulate `__ 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_) +========================================================== ================== ================== ======================================================= Computation ^^^^^^^^^^^ Installable with ``pip install "pandas[computation]"``. -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -SciPy 1.10.0 computation Miscellaneous statistical functions -xarray 2022.12.0 computation pandas-like API for N-dimensional data -========================= ================== =============== ============================================================= +============================================== ================== =============== ======================================= +Dependency Minimum Version pip extra Notes +============================================== ================== =============== ======================================= +`SciPy `__ 1.10.0 computation Miscellaneous statistical functions +`xarray `__ 2022.12.0 computation pandas-like API for N-dimensional data +============================================== ================== =============== ======================================= .. _install.excel_dependencies: @@ -220,29 +220,29 @@ Excel files Installable with ``pip install "pandas[excel]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -xlrd 2.0.1 excel Reading for xls files -xlsxwriter 3.0.5 excel Writing for xlsx files -openpyxl 3.1.0 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files -pyxlsb 1.0.10 excel Reading for xlsb files -python-calamine 0.1.7 excel Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files -odfpy 1.4.1 excel Reading / writing for OpenDocument 1.2 files -========================= ================== =============== ============================================================= +================================================================== ================== =============== ============================================================= +Dependency Minimum Version pip extra Notes +================================================================== ================== =============== ============================================================= +`xlrd `__ 2.0.1 excel Reading for xls files +`xlsxwriter `__ 3.0.5 excel Writing for xlsx files +`openpyxl `__ 3.1.0 excel Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files +`pyxlsb `__ 1.0.10 excel Reading for xlsb files +`python-calamine `__ 0.1.7 excel Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files +`odfpy `__ 1.4.1 excel Reading / writing for OpenDocument 1.2 files +================================================================== ================== =============== ============================================================= HTML ^^^^ Installable with ``pip install "pandas[html]"``. -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -BeautifulSoup4 4.11.2 html HTML parser for read_html -html5lib 1.1 html HTML parser for read_html -lxml 4.9.2 html HTML parser for read_html -========================= ================== =============== ============================================================= +=============================================================== ================== =============== ========================== +Dependency Minimum Version pip extra Notes +=============================================================== ================== =============== ========================== +`BeautifulSoup4 `__ 4.11.2 html HTML parser for read_html +`html5lib `__ 1.1 html HTML parser for read_html +`lxml `__ 4.9.2 html HTML parser for read_html +=============================================================== ================== =============== ========================== One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: @@ -273,45 +273,45 @@ XML Installable with ``pip install "pandas[xml]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -lxml 4.9.2 xml XML parser for read_xml and tree builder for to_xml -========================= ================== =============== ============================================================= +======================================== ================== =============== ==================================================== +Dependency Minimum Version pip extra Notes +======================================== ================== =============== ==================================================== +`lxml `__ 4.9.2 xml XML parser for read_xml and tree builder for to_xml +======================================== ================== =============== ==================================================== SQL databases ^^^^^^^^^^^^^ Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -SQLAlchemy 2.0.0 postgresql, SQL support for databases other than sqlite - mysql, - sql-other -psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy -pymysql 1.0.2 mysql MySQL engine for sqlalchemy -adbc-driver-postgresql 0.10.0 postgresql ADBC Driver for PostgreSQL -adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite -========================= ================== =============== ============================================================= +================================================================== ================== =============== ============================================ +Dependency Minimum Version pip extra Notes +================================================================== ================== =============== ============================================ +`SQLAlchemy `__ 2.0.0 postgresql, SQL support for databases other than sqlite + mysql, + sql-other +`psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy +`pymysql `__ 1.0.2 mysql MySQL engine for sqlalchemy +`adbc-driver-postgresql `__ 0.10.0 postgresql ADBC Driver for PostgreSQL +`adbc-driver-sqlite `__ 0.8.0 sql-other ADBC Driver for SQLite +================================================================== ================== =============== ============================================ Other data sources ^^^^^^^^^^^^^^^^^^ Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"`` -========================= ================== ================ ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== ================ ============================================================= -PyTables 3.8.0 hdf5 HDF5-based reading / writing -blosc 1.21.3 hdf5 Compression for HDF5; only available on ``conda`` -zlib hdf5 Compression for HDF5 -fastparquet 2023.10.0 - Parquet reading / writing (pyarrow is default) -pyarrow 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing -pyreadstat 1.2.0 spss SPSS files (.sav) reading -odfpy 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing 
-========================= ================== ================ ============================================================= +====================================================== ================== ================ ========================================================== +Dependency Minimum Version pip extra Notes +====================================================== ================== ================ ========================================================== +`PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing +`blosc `__ 1.21.3 hdf5 Compression for HDF5; only available on ``conda`` +`zlib `__ hdf5 Compression for HDF5 +`fastparquet `__ 2023.10.0 - Parquet reading / writing (pyarrow is default) +`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing +`pyreadstat `__ 1.2.0 spss SPSS files (.sav) reading +`odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing +====================================================== ================== ================ ========================================================== .. _install.warn_orc: @@ -326,26 +326,26 @@ Access data in the cloud Installable with ``pip install "pandas[fss, aws, gcp]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -fsspec 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required - dependency of s3fs, gcsfs). -gcsfs 2022.11.0 gcp Google Cloud Storage access -s3fs 2022.11.0 aws Amazon S3 access -========================= ================== =============== ============================================================= +============================================ ================== =============== ========================================================== +Dependency Minimum Version pip extra Notes +============================================ ================== =============== ========================================================== +`fsspec `__ 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required + dependency of s3fs, gcsfs). +`gcsfs `__ 2022.11.0 gcp Google Cloud Storage access +`s3fs `__ 2022.11.0 aws Amazon S3 access +============================================ ================== =============== ========================================================== Clipboard ^^^^^^^^^ Installable with ``pip install "pandas[clipboard]"``. 
-========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -PyQt4/PyQt5 5.15.9 clipboard Clipboard I/O -qtpy 2.3.0 clipboard Clipboard I/O -========================= ================== =============== ============================================================= +======================================================================================== ================== =============== ============== +Dependency Minimum Version pip extra Notes +======================================================================================== ================== =============== ============== +`PyQt4 `__/`PyQt5 `__ 5.15.9 clipboard Clipboard I/O +`qtpy `__ 2.3.0 clipboard Clipboard I/O +======================================================================================== ================== =============== ============== .. note:: @@ -358,19 +358,19 @@ Compression Installable with ``pip install "pandas[compression]"`` -========================= ================== =============== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =============== ============================================================= -Zstandard 0.19.0 compression Zstandard compression -========================= ================== =============== ============================================================= +================================================= ================== =============== ====================== +Dependency Minimum Version pip extra Notes +================================================= ================== =============== ====================== +`Zstandard `__ 0.19.0 compression Zstandard compression +================================================= ================== =============== ====================== Timezone ^^^^^^^^ Installable with ``pip install "pandas[timezone]"`` -========================= ================== =================== ============================================================= -Dependency Minimum Version pip extra Notes -========================= ================== =================== ============================================================= -pytz 2023.4 timezone Alternative timezone library to ``zoneinfo``. -========================= ================== =================== ============================================================= +========================================== ================== =================== ============================================== +Dependency Minimum Version pip extra Notes +========================================== ================== =================== ============================================== +`pytz `__ 2023.4 timezone Alternative timezone library to ``zoneinfo``. +========================================== ================== =================== ============================================== diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index b9c285ca30c96..89981786d60b5 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -326,7 +326,7 @@ This case is handled identically to a dict of arrays. .. 
ipython:: python - data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")]) + data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")]) data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] pd.DataFrame(data) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 3347f3a2534f4..8c5e98791a9ef 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` ar .. image:: ../_static/reshaping_melt.png The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt` -are useful to massage a :class:`DataFrame` into a format where one or more columns +are useful to reshape a :class:`DataFrame` into a format where one or more columns are *identifier variables*, while all other columns, considered *measured variables*, are "unpivoted" to the row axis, leaving just two non-identifier columns, "variable" and "value". The names of those columns can be customized diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index e25c4c2441920..0581951d5bfad 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -567,9 +567,9 @@ One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass \alpha = \begin{cases} - \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ - \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ - 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 + \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ + \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ + 1 - e^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 \end{cases} One must specify precisely one of **span**, **center of mass**, **half-life** diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index fbf2bed550c85..ab5746eca1b18 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -667,7 +667,8 @@ Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`) -- +- Bug in :meth:`MultiIndex.insert` when a new value inserted into a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`) +- Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` where single quotes were not escaped (:issue:`60190`) Missing ^^^^^^^ @@ -690,6 +691,7 @@ I/O - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) +- Bug in :meth:`DataFrame.to_excel` where a period level of a :class:`MultiIndex` index was not written as a date (:issue:`60099`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder="big"``. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames.
(:issue:`16098`) @@ -699,6 +701,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) +- Bug in :meth:`read_html` where ``rowspan`` in the header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) @@ -730,11 +733,13 @@ Groupby/resample/rolling - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) +- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for an empty :class:`DataFrame` with ``group_keys=False`` still creating the output index using group keys. (:issue:`60471`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) - Bug in :meth:`DataFrameGroupBy.cumsum` and :meth:`DataFrameGroupBy.cumprod` where ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly. (:issue:`58811`) - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`) - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) +- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time.
(:issue:`58380`) @@ -760,11 +765,12 @@ ExtensionArray - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) +- Bug in constructing pandas data structures where passing into ``dtype`` a string of a type followed by ``[pyarrow]`` while PyArrow is not installed raised ``NameError`` rather than ``ImportError`` (:issue:`57928`) - Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) Styler ^^^^^^ -- +- Bug in :meth:`Styler.to_latex` where styling column headers did not work when combined with a hidden index or hidden index-levels. Other ^^^^^ @@ -788,9 +794,11 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :meth:`Series.to_string` when the series contains complex floats with exponents (:issue:`60405`) - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in the DataFrame Interchange Protocol implementation returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) +- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) .. ***DO NOT USE THIS SECTION*** diff --git a/environment.yml b/environment.yml index 9bf6cf2a92347..69647a436e3ad 100644 --- a/environment.yml +++ b/environment.yml @@ -35,6 +35,7 @@ dependencies: - hypothesis>=6.84.0 - gcsfs>=2022.11.0 - ipython + - pickleshare # Needed for IPython Sphinx directive in the docs GH#60429 - jinja2>=3.1.2 - lxml>=4.9.2 - matplotlib>=3.6.3 @@ -87,7 +88,7 @@ dependencies: - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme=0.14 + - pydata-sphinx-theme=0.16 - pytest-cython # doctest - sphinx - sphinx-design diff --git a/pandas/__init__.py b/pandas/__init__.py index 6c97baa890777..c570fb8d70204 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -235,6 +235,7 @@ # Pandas is not (yet) a py.typed library: the public API is determined # based on the documentation.
__all__ = [ + "NA", "ArrowDtype", "BooleanDtype", "Categorical", @@ -253,15 +254,14 @@ "HDFStore", "Index", "IndexSlice", + "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", - "Int8Dtype", "Interval", "IntervalDtype", "IntervalIndex", "MultiIndex", - "NA", "NaT", "NamedAgg", "Period", @@ -274,10 +274,10 @@ "Timedelta", "TimedeltaIndex", "Timestamp", + "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", - "UInt8Dtype", "api", "array", "arrays", @@ -290,8 +290,8 @@ "errors", "eval", "factorize", - "get_dummies", "from_dummies", + "get_dummies", "get_option", "infer_freq", "interval_range", diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 80d9ea1b364f3..463e8af7cc561 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -8,13 +8,13 @@ __all__ = [ "config", + "describe_option", "detect_console_encoding", "get_option", - "set_option", - "reset_option", - "describe_option", "option_context", "options", + "reset_option", + "set_option", ] from pandas._config import config from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 1d57aa806e0f1..35139979f92fe 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -188,6 +188,11 @@ def set_option(*args) -> None: """ Set the value of the specified option or options. + This method allows fine-grained control over the behavior and display settings + of pandas. Options affect various functionalities such as output formatting, + display limits, and operational behavior. Settings can be modified at runtime + without requiring changes to global configurations or environment variables. + Parameters ---------- *args : str | object diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index 26a872a90e493..d499f9a6cd75e 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,4 +1,5 @@ __all__ = [ + "Interval", "NaT", "NaTType", "OutOfBoundsDatetime", @@ -6,7 +7,6 @@ "Timedelta", "Timestamp", "iNaT", - "Interval", ] diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index bf6d8ba8973d3..3af2856d2fbbf 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... 
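A minimal sketch (not part of the patch) of what the new ``StringObjectEngine`` is for, assuming a build that includes this change: the engine normalizes a missing-value probe to the dtype's own ``na_value`` before the hash-table lookup, so ``get_loc`` can work for both NaN-based and NA-based string dtypes. The return values below are illustrative:

    import pandas as pd

    idx = pd.Index(["a", "b", None], dtype="string")
    idx.get_loc("b")    # 1; plain strings behave as before
    idx.get_loc(pd.NA)  # 2; the probe is mapped to the dtype's na_value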
+ class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # np.ndarray[..., ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 1506a76aa94a6..688f943760d1f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -557,6 +557,31 @@ cdef class StringEngine(IndexEngine): raise KeyError(val) return str(val) +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + bint uses_na + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + self.uses_na = na_value is C_NA + + cdef bint _checknull(self, object val): + if self.uses_na: + return val is C_NA + else: + return util.is_nan(val) + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif self._checknull(val): + return self.na_value + else: + raise KeyError(val) + cdef class DatetimeEngine(Int64Engine): diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 31979b293a940..f433a3acf356f 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,39 +1,39 @@ __all__ = [ - "dtypes", - "localize_pydatetime", + "BaseOffset", + "IncompatibleFrequency", "NaT", "NaTType", - "iNaT", - "nat_strings", "OutOfBoundsDatetime", "OutOfBoundsTimedelta", - "IncompatibleFrequency", "Period", "Resolution", + "Tick", "Timedelta", - "normalize_i8_timestamps", - "is_date_array_normalized", - "dt64arr_to_periodarr", + "Timestamp", + "add_overflowsafe", + "astype_overflowsafe", "delta_to_nanoseconds", + "dt64arr_to_periodarr", + "dtypes", + "get_resolution", + "get_supported_dtype", + "get_unit_from_dtype", + "guess_datetime_format", + "iNaT", "ints_to_pydatetime", "ints_to_pytimedelta", - "get_resolution", - "Timestamp", - "tz_convert_from_utc_single", - "tz_convert_from_utc", - "to_offset", - "Tick", - "BaseOffset", - "tz_compare", + "is_date_array_normalized", + "is_supported_dtype", "is_unitless", - "astype_overflowsafe", - "get_unit_from_dtype", + "localize_pydatetime", + "nat_strings", + "normalize_i8_timestamps", "periods_per_day", "periods_per_second", - "guess_datetime_format", - "add_overflowsafe", - "get_supported_dtype", - "is_supported_dtype", + "to_offset", + "tz_compare", + "tz_convert_from_utc", + "tz_convert_from_utc_single", ] from pandas._libs.tslibs import dtypes diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 193556b2697a9..1b7f04fe17238 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -201,6 +201,10 @@ class OutOfBoundsTimedelta(ValueError): Representation should be within a timedelta64[ns]. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. 
+ Examples -------- >>> pd.date_range(start="1/1/1700", freq="B", periods=100000) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index e092d65f08dd4..ec9b5098c97c9 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -540,6 +540,25 @@ def shares_memory(left, right) -> bool: "ALL_INT_NUMPY_DTYPES", "ALL_NUMPY_DTYPES", "ALL_REAL_NUMPY_DTYPES", + "BOOL_DTYPES", + "BYTES_DTYPES", + "COMPLEX_DTYPES", + "DATETIME64_DTYPES", + "ENDIAN", + "FLOAT_EA_DTYPES", + "FLOAT_NUMPY_DTYPES", + "NARROW_NP_DTYPES", + "NP_NAT_OBJECTS", + "NULL_OBJECTS", + "OBJECT_DTYPES", + "SIGNED_INT_EA_DTYPES", + "SIGNED_INT_NUMPY_DTYPES", + "STRING_DTYPES", + "TIMEDELTA64_DTYPES", + "UNSIGNED_INT_EA_DTYPES", + "UNSIGNED_INT_NUMPY_DTYPES", + "SubclassedDataFrame", + "SubclassedSeries", "assert_almost_equal", "assert_attr_equal", "assert_categorical_equal", @@ -563,51 +582,32 @@ def shares_memory(left, right) -> bool: "assert_sp_array_equal", "assert_timedelta_array_equal", "at", - "BOOL_DTYPES", "box_expected", - "BYTES_DTYPES", "can_set_locale", - "COMPLEX_DTYPES", "convert_rows_list_to_csv_str", - "DATETIME64_DTYPES", "decompress_file", - "ENDIAN", "ensure_clean", "external_error_raised", - "FLOAT_EA_DTYPES", - "FLOAT_NUMPY_DTYPES", "get_cython_table_params", "get_dtype", - "getitem", - "get_locales", "get_finest_unit", + "get_locales", "get_obj", "get_op_from_name", + "getitem", "iat", "iloc", "loc", "maybe_produces_warning", - "NARROW_NP_DTYPES", - "NP_NAT_OBJECTS", - "NULL_OBJECTS", - "OBJECT_DTYPES", "raise_assert_detail", "raises_chained_assignment_error", "round_trip_pathlib", "round_trip_pickle", - "setitem", "set_locale", "set_timezone", + "setitem", "shares_memory", - "SIGNED_INT_EA_DTYPES", - "SIGNED_INT_NUMPY_DTYPES", - "STRING_DTYPES", - "SubclassedDataFrame", - "SubclassedSeries", - "TIMEDELTA64_DTYPES", "to_array", - "UNSIGNED_INT_EA_DTYPES", - "UNSIGNED_INT_NUMPY_DTYPES", "with_csv_dialect", "write_to_compressed", ] diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 01c4dcd92ee40..daa5187cdb636 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -755,11 +755,8 @@ def assert_extension_array_equal( and atol is lib.no_default ): check_exact = ( - is_numeric_dtype(left.dtype) - and not is_float_dtype(left.dtype) - or is_numeric_dtype(right.dtype) - and not is_float_dtype(right.dtype) - ) + is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype) + ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) elif check_exact is lib.no_default: check_exact = False @@ -944,11 +941,8 @@ def assert_series_equal( and atol is lib.no_default ): check_exact = ( - is_numeric_dtype(left.dtype) - and not is_float_dtype(left.dtype) - or is_numeric_dtype(right.dtype) - and not is_float_dtype(right.dtype) - ) + is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype) + ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype)) left_index_dtypes = ( [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes ) diff --git a/pandas/_typing.py b/pandas/_typing.py index c1769126a5776..b515305fb6903 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -273,7 +273,7 @@ def mode(self) -> str: # for _get_filepath_or_buffer ... - def seek(self, __offset: int, __whence: int = ...) -> int: + def seek(self, offset: int, whence: int = ..., /) -> int: # with one argument: gzip.GzipFile, bz2.BZ2File # with two arguments: zip.ZipFile, read_sas ... 
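As an aside on the ``_typing.py`` hunks here, a minimal sketch of why the two spellings are equivalent for type checkers: the double-underscore prefix is the legacy convention for marking protocol parameters positional-only, while PEP 570's ``/`` marker states the same contract explicitly. The protocol names below are illustrative:

    from typing import Protocol

    class OldSeek(Protocol):
        # Legacy convention: __-prefixed parameters are treated as
        # positional-only by type checkers.
        def seek(self, __offset: int, __whence: int = ...) -> int: ...

    class NewSeek(Protocol):
        # PEP 570 syntax: every parameter before "/" is positional-only.
        def seek(self, offset: int, whence: int = ..., /) -> int: ...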
@@ -288,13 +288,13 @@ def tell(self) -> int: class ReadBuffer(BaseBuffer, Protocol[AnyStr_co]): - def read(self, __n: int = ...) -> AnyStr_co: + def read(self, n: int = ..., /) -> AnyStr_co: # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File ... class WriteBuffer(BaseBuffer, Protocol[AnyStr_contra]): - def write(self, __b: AnyStr_contra) -> Any: + def write(self, b: AnyStr_contra, /) -> Any: # for gzip.GzipFile, bz2.BZ2File ... diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 9b007e8fe8da4..8f659e3cd14c8 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -9,9 +9,9 @@ ) __all__ = [ - "interchange", "extensions", "indexers", + "interchange", "types", "typing", ] diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index ea5f1ba926899..1c88c0d35b4d7 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -21,13 +21,13 @@ ) __all__ = [ - "no_default", + "ExtensionArray", "ExtensionDtype", - "register_extension_dtype", + "ExtensionScalarOpsMixin", + "no_default", "register_dataframe_accessor", + "register_extension_dtype", "register_index_accessor", "register_series_accessor", "take", - "ExtensionArray", - "ExtensionScalarOpsMixin", ] diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index 78357f11dc3b7..f3c6546218de4 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -10,8 +10,8 @@ ) __all__ = [ - "check_array_indexer", "BaseIndexer", "FixedForwardWindowIndexer", "VariableOffsetWindowIndexer", + "check_array_indexer", ] diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py index 2f3a73bc46b31..aded37abc7224 100644 --- a/pandas/api/interchange/__init__.py +++ b/pandas/api/interchange/__init__.py @@ -5,4 +5,4 @@ from pandas.core.interchange.dataframe_protocol import DataFrame from pandas.core.interchange.from_dataframe import from_dataframe -__all__ = ["from_dataframe", "DataFrame"] +__all__ = ["DataFrame", "from_dataframe"] diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index c601086bb9f86..4a5c742b1628b 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -14,10 +14,10 @@ ) __all__ = [ - "infer_dtype", - "union_categoricals", "CategoricalDtype", "DatetimeTZDtype", "IntervalDtype", "PeriodDtype", + "infer_dtype", + "union_categoricals", ] diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index c58fa0f085266..a18a1e9d5cbb7 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -42,18 +42,16 @@ "ExponentialMovingWindowGroupby", "FrozenList", "JsonReader", - "NaTType", "NAType", + "NaTType", "PeriodIndexResamplerGroupby", "Resampler", "Rolling", "RollingGroupby", + "SASReader", "SeriesGroupBy", "StataReader", - "SASReader", - # See TODO above - # "Styler", - "TimedeltaIndexResamplerGroupby", "TimeGrouper", + "TimedeltaIndexResamplerGroupby", "Window", ] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 756c209661fbb..e7674386408f7 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -150,6 +150,13 @@ def is_ci_environment() -> bool: __all__ = [ + "HAS_PYARROW", + "IS64", + "ISMUSL", + "PY311", + "PY312", + "PYPY", + "WASM", "is_numpy_dev", "pa_version_under10p1", "pa_version_under11p0", @@ -159,11 +166,4 @@ def is_ci_environment() -> bool: "pa_version_under16p0", "pa_version_under17p0", "pa_version_under18p0", - "HAS_PYARROW", - "IS64", - "ISMUSL", - "PY311", - 
"PY312", - "PYPY", - "WASM", ] diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 2fab8f32b8e71..3306b36d71806 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -47,7 +47,7 @@ __all__ = [ - "np", "_np_version", "is_numpy_dev", + "np", ] diff --git a/pandas/core/_numba/kernels/__init__.py b/pandas/core/_numba/kernels/__init__.py index 1116c61c4ca8e..6983711480455 100644 --- a/pandas/core/_numba/kernels/__init__.py +++ b/pandas/core/_numba/kernels/__init__.py @@ -16,12 +16,12 @@ ) __all__ = [ - "sliding_mean", "grouped_mean", - "sliding_sum", + "grouped_min_max", "grouped_sum", - "sliding_var", "grouped_var", + "sliding_mean", "sliding_min_max", - "grouped_min_max", + "sliding_sum", + "sliding_var", ] diff --git a/pandas/core/api.py b/pandas/core/api.py index c8a4e9d8a23b2..ec12d543d8389 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -80,59 +80,59 @@ from pandas.core.frame import DataFrame # isort:skip __all__ = [ - "array", + "NA", "ArrowDtype", - "bdate_range", "BooleanDtype", "Categorical", "CategoricalDtype", "CategoricalIndex", "DataFrame", "DateOffset", - "date_range", "DatetimeIndex", "DatetimeTZDtype", - "factorize", "Flags", "Float32Dtype", "Float64Dtype", "Grouper", "Index", "IndexSlice", + "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", - "Int8Dtype", "Interval", "IntervalDtype", "IntervalIndex", - "interval_range", - "isna", - "isnull", "MultiIndex", - "NA", - "NamedAgg", "NaT", - "notna", - "notnull", + "NamedAgg", "Period", "PeriodDtype", "PeriodIndex", - "period_range", "RangeIndex", "Series", - "set_eng_float_format", "StringDtype", "Timedelta", "TimedeltaIndex", - "timedelta_range", "Timestamp", - "to_datetime", - "to_numeric", - "to_timedelta", + "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", - "UInt8Dtype", + "array", + "bdate_range", + "date_range", + "factorize", + "interval_range", + "isna", + "isnull", + "notna", + "notnull", + "period_range", + "set_eng_float_format", + "timedelta_range", + "to_datetime", + "to_numeric", + "to_timedelta", "unique", ] diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 245a171fea74b..f183e9236471e 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -23,21 +23,21 @@ __all__ = [ "ArrowExtensionArray", - "ExtensionArray", - "ExtensionOpsMixin", - "ExtensionScalarOpsMixin", "ArrowStringArray", "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", "FloatingArray", "IntegerArray", "IntervalArray", "NumpyExtensionArray", "PeriodArray", - "period_array", "SparseArray", "StringArray", "TimedeltaArray", + "period_array", ] diff --git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index 5fc50f786fc6a..50274a2de2cc1 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -4,4 +4,4 @@ ) from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] +__all__ = ["ArrowExtensionArray", "ListAccessor", "StructAccessor"] diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0c93db0afb07..afa219f611992 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1446,8 +1446,7 @@ def to_numpy( pa.types.is_floating(pa_type) and ( na_value is np.nan - or original_na_value is lib.no_default - and 
is_float_dtype(dtype) + or (original_na_value is lib.no_default and is_float_dtype(dtype)) ) ): result = data._pa_array.to_numpy() diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9c821bf0d184e..c6b6367e347ba 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2073,7 +2073,29 @@ def _creso(self) -> int: @cache_readonly def unit(self) -> str: - # e.g. "ns", "us", "ms" + """ + The precision unit of the datetime data. + + Returns the precision unit for the dtype. + It means the smallest time frame that can be stored within this dtype. + + Returns + ------- + str + Unit string representation (e.g. "ns"). + + See Also + -------- + TimelikeOps.as_unit : Converts to a specific unit. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) + >>> idx.unit + 'ns' + >>> idx.as_unit("s").unit + 's' + """ # error: Argument 1 to "dtype_to_unit" has incompatible type # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f85fbd062b0c3..afbadd754cdbc 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -105,6 +105,12 @@ class IntegerArray(NumericArray): ------- IntegerArray + See Also + -------- + array : Create an array using the appropriate dtype, including ``IntegerArray``. + Int32Dtype : An ExtensionDtype for int32 integer data. + UInt16Dtype : An ExtensionDtype for uint16 integer data. + Examples -------- Create an IntegerArray with :func:`pandas.array`. diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f47ef095a8409..bbbf1d9ca60bd 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1055,7 +1055,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: from pandas import Index fill_value = Index(self._left, copy=False)._na_value - empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + empty = IntervalArray.from_breaks( + [fill_value] * (empty_len + 1), closed=self.closed + ) else: empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index adf83963aca39..93d5cb8cc335a 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -12,8 +12,8 @@ __all__ = [ "BlockIndex", "IntIndex", - "make_sparse_index", "SparseAccessor", "SparseArray", "SparseFrameAccessor", + "make_sparse_index", ] diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 4ccfbd71d9ce8..86f83489e71ae 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -371,10 +371,12 @@ def eval( is_extension_array_dtype(parsed_expr.terms.return_type) and not is_string_dtype(parsed_expr.terms.return_type) ) - or getattr(parsed_expr.terms, "operand_types", None) is not None - and any( - (is_extension_array_dtype(elem) and not is_string_dtype(elem)) - for elem in parsed_expr.terms.operand_types + or ( + getattr(parsed_expr.terms, "operand_types", None) is not None + and any( + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) + for elem in parsed_expr.terms.operand_types + ) ) ): warnings.warn( diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 7025d8a72e561..010fad1bbf0b6 100644 --- 
a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -512,8 +512,7 @@ def _maybe_evaluate_binop( ) if self.engine != "pytables" and ( - res.op in CMP_OPS_SYMS - and getattr(lhs, "is_datetime", False) + (res.op in CMP_OPS_SYMS and getattr(lhs, "is_datetime", False)) or getattr(rhs, "is_datetime", False) ): # all date ops must be done in python bc numexpr doesn't work diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 39511048abf49..fe7e27f537b01 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -408,11 +408,12 @@ def prune(self, klass): operand = operand.prune(klass) if operand is not None and ( - issubclass(klass, ConditionBinOp) - and operand.condition is not None - or not issubclass(klass, ConditionBinOp) - and issubclass(klass, FilterBinOp) - and operand.filter is not None + (issubclass(klass, ConditionBinOp) and operand.condition is not None) + or ( + not issubclass(klass, ConditionBinOp) + and issubclass(klass, FilterBinOp) + and operand.filter is not None + ) ): return operand.invert() return None diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 7b31e03e58b4b..336d62b9d9579 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -140,7 +140,7 @@ class Scope: temps : dict """ - __slots__ = ["level", "scope", "target", "resolvers", "temps"] + __slots__ = ["level", "resolvers", "scope", "target", "temps"] level: int scope: DeepChainMap resolvers: DeepChainMap diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 137a49c4487f6..02b9291da9b31 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -87,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -1581,7 +1581,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. 
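A minimal sketch of the ``construct_1d_object_array_from_listlike`` change in the next hunk, using illustrative data: the annotation widens from ``Sized`` to ``Collection`` because ``np.fromiter`` iterates over ``values`` rather than only asking for their length, and ``np.fromiter`` accepts ``dtype="object"`` on NumPy >= 1.23:

    import numpy as np

    values = [[1, 2], [3, 4]]  # np.array(values) would infer a 2-D array

    # Old approach: allocate, then slice-assign to keep the result 1-D.
    old = np.empty(len(values), dtype="object")
    old[:] = values

    # New approach: build the 1-D object array directly.
    new = np.fromiter(values, dtype="object", count=len(values))

    assert old.shape == new.shape == (2,)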
@@ -1599,11 +1599,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype="object") - result[:] = values - return result + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter + return np.fromiter(values, dtype="object", count=len(values)) def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 8f93b1a397c1f..6fa21d9410187 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1889,13 +1889,14 @@ def is_all_strings(value: ArrayLike) -> bool: __all__ = [ - "classes", "DT64NS_DTYPE", + "INT64_DTYPE", + "TD64NS_DTYPE", + "classes", "ensure_float64", "ensure_python_int", "ensure_str", "infer_dtype_from_object", - "INT64_DTYPE", "is_1d_only_ea_dtype", "is_all_strings", "is_any_real_numeric_dtype", @@ -1940,6 +1941,5 @@ def is_all_strings(value: ArrayLike) -> bool: "is_unsigned_integer_dtype", "needs_i8_conversion", "pandas_dtype", - "TD64NS_DTYPE", "validate_all_hashable", ] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 60eff6e10f0be..a430e8f23c046 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -73,7 +73,7 @@ from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: TCH004 + import pyarrow as pa # noqa: TC004 from pandas._typing import ( Dtype, @@ -1115,10 +1115,8 @@ def construct_from_string(cls, string: str_type) -> PeriodDtype: possible """ if ( - isinstance(string, str) - and (string.startswith(("period[", "Period["))) - or isinstance(string, BaseOffset) - ): + isinstance(string, str) and (string.startswith(("period[", "Period["))) + ) or isinstance(string, BaseOffset): # do not parse string like U as period[U] # avoid tuple to be regarded as freq try: @@ -2346,6 +2344,8 @@ def construct_from_string(cls, string: str) -> ArrowDtype: if string == "string[pyarrow]": # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" try: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 6adb34ff0f777..918d107f2ce6c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -190,12 +190,17 @@ def is_re_compilable(obj: object) -> bool: Parameters ---------- obj : The object to check + The object to check whether it can be compiled as a regex pattern. Returns ------- bool Whether `obj` can be compiled as a regex pattern. + See Also + -------- + api.types.is_re : Check if the object is a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re_compilable diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 34eb198b4b4da..34b448a0d8d1c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1018,7 +1018,7 @@ def shape(self) -> tuple[int, int]: See Also -------- - ndarray.shape : Tuple of array dimensions. + numpy.ndarray.shape : Tuple of array dimensions.
Examples -------- @@ -3929,8 +3929,7 @@ def __getitem__(self, key): # GH#45316 Return view if key is not duplicated # Only use drop_duplicates with duplicates for performance if not is_mi and ( - self.columns.is_unique - and key in self.columns + (self.columns.is_unique and key in self.columns) or key in self.columns.drop_duplicates(keep=False) ): return self._get_item(key) @@ -4742,7 +4741,8 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 3 4 4 7 8 0 4 5 2 6 7 3 - For columns with spaces in their name, you can use backtick quoting. + For columns with spaces or other disallowed characters in their name, you can + use backtick quoting. >>> df.eval("B * `C&C`") 0 100 @@ -6775,8 +6775,7 @@ def f(vals) -> tuple[np.ndarray, int]: elif ( not np.iterable(subset) or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self.columns + or (isinstance(subset, tuple) and subset in self.columns) ): subset = (subset,) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 039bdf9c36ee7..d1aa20501b060 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -640,7 +640,7 @@ def ndim(self) -> int: See Also -------- - ndarray.ndim : Number of array dimensions. + numpy.ndarray.ndim : Number of array dimensions. Examples -------- @@ -838,7 +838,7 @@ def pop(self, item: Hashable) -> Series | Any: return result @final - def squeeze(self, axis: Axis | None = None): + def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: """ Squeeze 1 dimensional axis objects into scalars. @@ -3878,6 +3878,14 @@ def to_csv( >>> import os # doctest: +SKIP >>> os.makedirs("folder/subfolder", exist_ok=True) # doctest: +SKIP >>> df.to_csv("folder/subfolder/out.csv") # doctest: +SKIP + + Format floats to two decimal places: + + >>> df.to_csv("out1.csv", float_format="%.2f") # doctest: +SKIP + + Format floats using scientific notation: + + >>> df.to_csv("out2.csv", float_format="{{:.2e}}".format) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 8248f378e2c1a..ec477626a098f 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -8,8 +8,8 @@ __all__ = [ "DataFrameGroupBy", - "NamedAgg", - "SeriesGroupBy", "GroupBy", "Grouper", + "NamedAgg", + "SeriesGroupBy", ] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ec09892ede6..f4e3f3e8b1001 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -583,6 +583,8 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index @@ -1321,8 +1323,8 @@ def idxmin(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the minimum value. + Series + Indexes of minima in each group. Raises ------ @@ -1374,8 +1376,8 @@ def idxmax(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the maximum value. + Series + Indexes of maxima in each group. Raises ------ @@ -1967,6 +1969,8 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index @@ -2453,6 +2457,10 @@ def nunique(self, dropna: bool = True) -> DataFrame: nunique: DataFrame Counts of unique elements in each position. 
+ See Also + -------- + DataFrame.nunique : Count number of distinct elements in specified axis. + Examples -------- >>> df = pd.DataFrame( @@ -2508,8 +2516,8 @@ def idxmax( Returns ------- - Series - Indexes of maxima in each group. + DataFrame + Indexes of maxima in each column according to the group. Raises ------ @@ -2519,6 +2527,7 @@ def idxmax( See Also -------- Series.idxmax : Return index of the maximum element. + DataFrame.idxmax : Indexes of maxima along the specified axis. Notes ----- @@ -2532,6 +2541,7 @@ def idxmax( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... ) @@ -2542,12 +2552,14 @@ def idxmax( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the maximum value in each column. + By default, it returns the index for the maximum value in each column + according to the group. - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object + >>> df.groupby("food_type").idxmax() + consumption co2_emissions + food_type + animal Beef Beef + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmax", numeric_only=numeric_only, skipna=skipna) @@ -2570,8 +2582,8 @@ def idxmin( Returns ------- - Series - Indexes of minima in each group. + DataFrame + Indexes of minima in each column according to the group. Raises ------ @@ -2581,6 +2593,7 @@ def idxmin( See Also -------- Series.idxmin : Return index of the minimum element. + DataFrame.idxmin : Indexes of minima along the specified axis. Notes ----- @@ -2594,6 +2607,7 @@ def idxmin( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... ) @@ -2604,12 +2618,14 @@ def idxmin( Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the minimum value in each column. + By default, it returns the index for the minimum value in each column + according to the group. - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object + >>> df.groupby("food_type").idxmin() + consumption co2_emissions + food_type + animal Pork Pork + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmin", numeric_only=numeric_only, skipna=skipna) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ad23127ad449f..f0513be3498d1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -515,6 +515,15 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Dict {group name -> group indices}. + See Also + -------- + core.groupby.DataFrameGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.groupby.SeriesGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.resample.Resampler.indices : Provides a mapping of group rows to + positions of the elements. + Examples -------- @@ -706,7 +715,19 @@ def get_group(self, name) -> DataFrame | Series: Returns ------- - DataFrame or Series + Series or DataFrame + Get the respective Series or DataFrame corresponding to the group provided. + + See Also + -------- + DataFrameGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + DataFrameGroupBy.indices: Provides a mapping of group rows to positions + of the elements. 
+ SeriesGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + SeriesGroupBy.indices: Provides a mapping of group rows to positions + of the elements. Examples -------- @@ -2649,6 +2670,11 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: Series or DataFrame Standard error of the mean of values within each group. + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. + Examples -------- For SeriesGroupBy: diff --git a/pandas/core/indexers/__init__.py b/pandas/core/indexers/__init__.py index ba8a4f1d0ee7a..036b32b3feac2 100644 --- a/pandas/core/indexers/__init__.py +++ b/pandas/core/indexers/__init__.py @@ -15,17 +15,17 @@ ) __all__ = [ - "is_valid_positional_slice", + "check_array_indexer", + "check_key_length", + "check_setitem_lengths", + "disallow_ndim_indexing", + "is_empty_indexer", "is_list_like_indexer", "is_scalar_indexer", - "is_empty_indexer", - "check_setitem_lengths", - "validate_indices", - "maybe_convert_indices", + "is_valid_positional_slice", "length_of_indexer", - "disallow_ndim_indexing", + "maybe_convert_indices", "unpack_1tuple", - "check_key_length", - "check_array_indexer", "unpack_tuple_and_ellipses", + "validate_indices", ] diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 5144e647e73b4..058e584336905 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -37,26 +37,26 @@ __all__ = [ - "Index", - "MultiIndex", "CategoricalIndex", + "DatetimeIndex", + "Index", "IntervalIndex", - "RangeIndex", "InvalidIndexError", - "TimedeltaIndex", + "MultiIndex", + "NaT", "PeriodIndex", - "DatetimeIndex", + "RangeIndex", + "TimedeltaIndex", "_new_Index", - "NaT", + "all_indexes_same", + "default_index", "ensure_index", "ensure_index_from_sequences", "get_objs_combined_axis", - "union_indexes", "get_unanimous_names", - "all_indexes_same", - "default_index", - "safe_sort_index", "maybe_sequence_to_range", + "safe_sort_index", + "union_indexes", ] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d4ba7e01ebfa9..165fe109c4c94 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -876,7 +876,7 @@ def _engine( # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringEngine(target_values) + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index c559c529586b5..254bd71ade209 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -110,7 +110,9 @@ def _disabled(self, *args, **kwargs) -> NoReturn: raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") def __str__(self) -> str: - return pprint_thing(self, quote_strings=True, 
escape_chars=("\t", "\r", "\n")) + return pprint_thing( + self, quote_strings=True, escape_chars=("\t", "\r", "\n", "'") + ) def __repr__(self) -> str: return f"{type(self).__name__}({self!s})" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 36e68465a99d9..dc48cd1ed958e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -4084,11 +4084,10 @@ def insert(self, loc: int, item) -> MultiIndex: # have to insert into level # must insert at end otherwise you have to recompute all the # other codes - if isna(k): # GH 59003 + lev_loc = len(level) + level = level.insert(lev_loc, k) + if isna(level[lev_loc]): # GH 59003, 60388 lev_loc = -1 - else: - lev_loc = len(level) - level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 7eeaab3b0443f..935762d0455c5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1195,7 +1195,7 @@ def _getitem_slice(self, slobj: slice) -> Self: @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): if is_integer(other) and other != 0: - if len(self) == 0 or self.start % other == 0 and self.step % other == 0: + if len(self) == 0 or (self.start % other == 0 and self.step % other == 0): start = self.start // other step = self.step // other stop = start + len(self) * step diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0d6d7e68f58a4..e0bc0a23acd9f 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1239,8 +1239,10 @@ def _validate_key(self, key, axis: Axis) -> None: if isinstance(key, bool) and not ( is_bool_dtype(ax.dtype) or ax.dtype.name == "boolean" - or isinstance(ax, MultiIndex) - and is_bool_dtype(ax.get_level_values(0).dtype) + or ( + isinstance(ax, MultiIndex) + and is_bool_dtype(ax.get_level_values(0).dtype) + ) ): raise KeyError( f"{key}: boolean label can not be used without a boolean index" @@ -2120,7 +2122,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)) - is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0 + is_null_setter = com.is_empty_slice(pi) or (is_array_like(pi) and len(pi) == 0) if is_null_setter: # no-op, don't cast dtype later @@ -2744,19 +2746,15 @@ def check_dict_or_set_indexers(key) -> None: """ Check if the indexer is or contains a dict or set, which is no longer allowed. """ - if ( - isinstance(key, set) - or isinstance(key, tuple) - and any(isinstance(x, set) for x in key) + if isinstance(key, set) or ( + isinstance(key, tuple) and any(isinstance(x, set) for x in key) ): raise TypeError( "Passing a set as an indexer is not supported. Use a list instead." ) - if ( - isinstance(key, dict) - or isinstance(key, tuple) - and any(isinstance(x, dict) for x in key) + if isinstance(key, dict) or ( + isinstance(key, tuple) and any(isinstance(x, dict) for x in key) ): raise TypeError( "Passing a dict as an indexer is not supported. Use a list instead." 
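Most of the hunks in this stretch only add parentheses flagged by ruff; `and` already binds tighter than `or`, so the rewrites are behavior-preserving. A brute-force sanity check of the precedence rule they make explicit:

    import itertools

    for a, b, c in itertools.product((False, True), repeat=3):
        assert (a or b and c) == (a or (b and c))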
diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 5ab70ba38f9c2..202bebde88c2c 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -7,11 +7,11 @@ __all__ = [ "Block", - "ExtensionBlock", - "make_block", "BlockManager", + "ExtensionBlock", "SingleBlockManager", "concatenate_managers", + "make_block", ] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 54273ff89f1af..f44ad926dda5c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -514,9 +514,8 @@ def convert(self) -> list[Block]: convert_non_numeric=True, ) refs = None - if ( - res_values is values - or isinstance(res_values, NumpyExtensionArray) + if res_values is values or ( + isinstance(res_values, NumpyExtensionArray) and res_values._ndarray is values ): refs = self.refs diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c7f0134e38508..ef5b144ee690b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -417,8 +417,7 @@ def dict_to_mgr( else x.copy(deep=True) if ( isinstance(x, Index) - or isinstance(x, ABCSeries) - and is_1d_only_ea_dtype(x.dtype) + or (isinstance(x, ABCSeries) and is_1d_only_ea_dtype(x.dtype)) ) else x for x in arrays diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 34a0bb1f45e2c..9f9d69a182f72 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -66,15 +66,18 @@ __all__ = [ "ARITHMETIC_BINOPS", "arithmetic_op", - "comparison_op", "comp_method_OBJECT_ARRAY", - "invalid_comparison", + "comparison_op", "fill_binop", + "get_array_op", + "get_op_result_name", + "invalid_comparison", "kleene_and", "kleene_or", "kleene_xor", "logical_op", "make_flex_doc", + "maybe_prepare_scalar_for_op", "radd", "rand_", "rdiv", @@ -88,7 +91,4 @@ "rtruediv", "rxor", "unpack_zerodim_and_defer", - "get_op_result_name", - "maybe_prepare_scalar_for_op", - "get_array_op", ] diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ca4d3fc768efb..fdfb9f21bdb9f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2002,9 +2002,7 @@ def __init__( raise ValueError(f"Unsupported value {convention} for `convention`") if ( - key is None - and obj is not None - and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + (key is None and obj is not None and isinstance(obj.index, PeriodIndex)) # type: ignore[attr-defined] or ( key is not None and obj is not None diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bfd8e3ccd2f7c..f4cb82816bbcf 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -51,9 +51,9 @@ def melt( """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. - This function is useful to massage a DataFrame into a format where one + This function is useful to reshape a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to + columns are considered measured variables (`value_vars`), and are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. 
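As a sketch of the reworded melt description, assuming nothing beyond the public API:

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "height": [150, 160], "weight": [50, 60]})
    # 'id' stays as the identifier; the measured columns are unpivoted
    # into the two non-identifier columns 'variable' and 'value'.
    print(df.melt(id_vars="id", value_vars=["height", "weight"]))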
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6f9bb8cb24f43..5fddd9f9aca5b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2746,8 +2746,7 @@ def _factorize_keys( isinstance(lk.dtype, ArrowDtype) and ( is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort + or (is_string_dtype(lk.dtype) and not sort) ) ): lk, _ = lk._values_for_factorize() diff --git a/pandas/core/series.py b/pandas/core/series.py index 35b576da87ed7..4fa8b86fa4c16 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -567,7 +567,7 @@ def __arrow_c_stream__(self, requested_schema=None): Export the pandas Series as an Arrow C stream PyCapsule. This relies on pyarrow to convert the pandas Series to the Arrow - format (and follows the default behaviour of ``pyarrow.Array.from_pandas`` + format (and follows the default behavior of ``pyarrow.Array.from_pandas`` in its handling of the index, i.e. to ignore it). This conversion is not necessarily zero-copy. @@ -2226,7 +2226,7 @@ def drop_duplicates( 5 hippo Name: animal, dtype: object - With the 'keep' parameter, the selection behaviour of duplicated values + With the 'keep' parameter, the selection behavior of duplicated values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. @@ -3451,7 +3451,7 @@ def sort_values( 4 5.0 dtype: float64 - Sort values ascending order (default behaviour) + Sort values ascending order (default behavior) >>> s.sort_values(ascending=True) 1 1.0 @@ -4098,7 +4098,7 @@ def swaplevel( In the following example, we will swap the levels of the indices. Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. + in a similar manner. Note that column-wise is the default behavior. By not supplying any arguments for i and j, we swap the last and second to last indices. 
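A small illustration of the swaplevel default described above (i=-2, j=-1, i.e. the last two levels), using a made-up three-level index:

    import pandas as pd

    mi = pd.MultiIndex.from_tuples(
        [("a", 1, "x"), ("b", 2, "y")], names=["lvl0", "lvl1", "lvl2"]
    )
    s = pd.Series([10, 20], index=mi)
    print(s.swaplevel().index.names)  # ['lvl0', 'lvl2', 'lvl1'], last two swapped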
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index f159babb7e018..bc45343d6e2d3 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -226,19 +226,18 @@ def to_numeric( set(), coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA, + or ( + isinstance(values_dtype, StringDtype) + and values_dtype.na_value is libmissing.NA + ), ) if new_mask is not None: # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif ( - dtype_backend is not lib.no_default - and new_mask is None - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA + elif (dtype_backend is not lib.no_default and new_mask is None) or ( + isinstance(values_dtype, StringDtype) and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b1c37ab48fa57..4446b21976069 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -269,7 +269,7 @@ def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: """ # filter out the on from the object if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2: - obj = obj.reindex(columns=obj.columns.difference([self.on])) + obj = obj.reindex(columns=obj.columns.difference([self.on], sort=False)) if obj.ndim > 1 and numeric_only: obj = self._make_numeric_only(obj) return obj diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 84f7239c6549d..1de6f06ef316c 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -20,6 +20,16 @@ class IntCastingNaNError(ValueError): """ Exception raised when converting (``astype``) an array with NaN to an integer type. + This error occurs when attempting to cast a data structure containing non-finite + values (such as NaN or infinity) to an integer data type. Integer types do not + support non-finite values, so such conversions are explicitly disallowed to + prevent silent data corruption or unexpected behavior. + + See Also + -------- + DataFrame.astype : Method to cast a pandas DataFrame object to a specified dtype. + Series.astype : Method to cast a pandas Series object to a specified dtype. + Examples -------- >>> pd.DataFrame(np.array([[1, np.nan], [2, 3]]), dtype="i8") @@ -100,6 +110,11 @@ class UnsortedIndexError(KeyError): Subclass of `KeyError`. + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by its index. + DataFrame.set_index : Set the DataFrame index using existing columns. + Examples -------- >>> df = pd.DataFrame( @@ -388,6 +403,19 @@ class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. + This error is typically encountered when performing operations on objects + with `allows_duplicate_labels=False` and the operation would result in + duplicate labels in the index. Duplicate labels can lead to ambiguities + in indexing and reduce data integrity. + + See Also + -------- + Series.set_flags : Return a new ``Series`` object with updated flags. + DataFrame.set_flags : Return a new ``DataFrame`` object with updated flags. + Series.reindex : Conform ``Series`` object to new index with optional filling logic. + DataFrame.reindex : Conform ``DataFrame`` object to new index with optional filling + logic. 
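Why the rolling change earlier in this stretch passes sort=False: Index.difference sorts by default, which could reorder columns relative to the original frame. A minimal sketch:

    import pandas as pd

    idx = pd.Index(["b", "c", "a"])
    print(idx.difference(["c"]).tolist())              # ['a', 'b'], sorted
    print(idx.difference(["c"], sort=False).tolist())  # ['b', 'a'], original order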
+ Examples -------- >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags( @@ -407,6 +435,16 @@ class InvalidIndexError(Exception): """ Exception raised when attempting to use an invalid index key. + This exception is triggered when a user attempts to access or manipulate + data in a pandas DataFrame or Series using an index key that is not valid + for the given object. This may occur in cases such as using a malformed + slice, a mismatched key for a ``MultiIndex``, or attempting to access an index + element that does not exist. + + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Examples -------- >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]]) @@ -487,6 +525,11 @@ class ChainedAssignmentError(Warning): For more information on Copy-on-Write, see :ref:`the user guide`. + See Also + -------- + options.mode.copy_on_write : Global setting for enabling or disabling + Copy-on-Write behavior. + Examples -------- >>> pd.options.mode.copy_on_write = True @@ -505,6 +548,11 @@ class NumExprClobberingError(NameError): to 'numexpr'. 'numexpr' is the default engine value for these methods if the numexpr package is installed. + See Also + -------- + eval : Evaluate a Python expression as a string using various backends. + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + Examples -------- >>> df = pd.DataFrame({"abs": [1, 1, 1]}) @@ -628,6 +676,15 @@ class PossibleDataLossError(Exception): """ Exception raised when trying to open a HDFStore file when already opened. + This error is triggered when there is a potential risk of data loss due to + conflicting operations on an HDFStore file. It serves to prevent unintended + overwrites or data corruption by enforcing exclusive access to the file. + + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + HDFStore.open : Open an HDFStore file in the specified mode. 
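The NumExprClobberingError documented a few hunks above, in action; a hedged sketch that assumes numexpr is installed (otherwise query falls back to the python engine and no error is raised):

    import pandas as pd

    df = pd.DataFrame({"abs": [1, 1, 1]})
    try:
        df.query("abs > 2")  # 'abs' collides with a numexpr builtin
    except pd.errors.NumExprClobberingError as err:
        print(err)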
+ Examples -------- >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP @@ -808,28 +865,28 @@ class InvalidComparison(Exception): __all__ = [ "AbstractMethodError", "AttributeConflictWarning", + "CSSWarning", "CategoricalConversionWarning", "ChainedAssignmentError", "ClosedFileError", - "CSSWarning", - "DatabaseError", "DataError", + "DatabaseError", "DtypeWarning", "DuplicateLabelError", "EmptyDataError", "IncompatibilityWarning", + "IndexingError", "IntCastingNaNError", "InvalidColumnName", "InvalidComparison", "InvalidIndexError", "InvalidVersion", - "IndexingError", "LossySetitemError", "MergeError", "NoBufferPresent", "NullFrequencyError", - "NumbaUtilError", "NumExprClobberingError", + "NumbaUtilError", "OptionError", "OutOfBoundsDatetime", "OutOfBoundsTimedelta", diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py index c804b81c49e7c..1c7e531debb14 100644 --- a/pandas/io/__init__.py +++ b/pandas/io/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index 275cbf0148f94..f13d7afa63d84 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -8,7 +8,7 @@ from pandas.io.excel._util import register_writer from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter -__all__ = ["read_excel", "ExcelWriter", "ExcelFile"] +__all__ = ["ExcelFile", "ExcelWriter", "read_excel"] register_writer(_OpenpyxlWriter) diff --git a/pandas/io/formats/__init__.py b/pandas/io/formats/__init__.py index 5e56b1bc7ba43..895669c342f97 100644 --- a/pandas/io/formats/__init__.py +++ b/pandas/io/formats/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 6a3e215de3f96..5fde6577e9f95 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -37,6 +37,7 @@ DataFrame, Index, MultiIndex, + Period, PeriodIndex, ) import pandas.core.common as com @@ -803,6 +804,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: allow_fill=levels._can_hold_na, fill_value=levels._na_value, ) + # GH#60099 + if isinstance(values[0], Period): + values = values.to_timestamp() for i, span_val in spans.items(): mergestart, mergeend = None, None @@ -827,6 +831,10 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # Format hierarchical rows with non-merged values. 
for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): + # GH#60099 + if isinstance(indexcolval, Period): + indexcolval = indexcolval.to_timestamp() + + yield CssExcelCell( row=self.rowcounter + idx, col=gcolidx, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 861f5885f80c6..17460eae8c049 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -669,9 +669,9 @@ def _truncate_horizontally(self) -> None: assert self.max_cols_fitted is not None col_num = self.max_cols_fitted // 2 if col_num >= 1: - left = self.tr_frame.iloc[:, :col_num] - right = self.tr_frame.iloc[:, -col_num:] - self.tr_frame = concat((left, right), axis=1) + _len = len(self.tr_frame.columns) + _slice = np.hstack([np.arange(col_num), np.arange(_len - col_num, _len)]) + self.tr_frame = self.tr_frame.iloc[:, _slice] # truncate formatter if isinstance(self.formatters, (list, tuple)): @@ -682,7 +682,7 @@ def _truncate_horizontally(self) -> None: else: col_num = cast(int, self.max_cols) self.tr_frame = self.tr_frame.iloc[:, :col_num] - self.tr_col_num = col_num + self.tr_col_num: int = col_num def _truncate_vertically(self) -> None: """Remove rows, which are not to be displayed. @@ -1749,7 +1749,7 @@ def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[str]: # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""] # Therefore, the imaginary part is the 4th and 3rd last elements, # and the real part is everything before the imaginary part - trimmed = re.split(r"([j+-])", x) + trimmed = re.split(r"(?<!e)([j+-])", x) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ def as_escaped_string( thing: Any, escape_chars: EscapeChars | None = escape_chars ) -> str: - translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r", "'": r"\'"} if isinstance(escape_chars, Mapping): if default_escapes: translate.update(escape_chars) diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index ecfe3de10c829..c0f0608f1ab32 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -868,7 +868,8 @@ def _translate_latex(self, d: dict, clines: str | None) -> None: or multirow sparsification (so that \multirow and \multicol work correctly).
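The lookbehind in the rewritten _trim_zeros_complex pattern is what keeps a sign inside an exponent from being treated as the real/imaginary separator; a quick check, assuming the pattern is the negative-lookbehind form shown above:

    import re

    x = "1.881600e-09+3.396760e-09j"
    print(re.split(r"([j+-])", x))        # also splits on the exponent signs
    print(re.split(r"(?<!e)([j+-])", x))  # ['1.881600e-09', '+', '3.396760e-09', 'j', '']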
""" index_levels = self.index.nlevels - visible_index_level_n = index_levels - sum(self.hide_index_) + # GH 52218 + visible_index_level_n = max(1, index_levels - sum(self.hide_index_)) d["head"] = [ [ {**col, "cellstyle": self.ctx_columns[r, c - visible_index_level_n]} diff --git a/pandas/io/html.py b/pandas/io/html.py index c9897f628fdc9..183af3a03221b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -454,15 +454,26 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, section="header") - body = self._expand_colspan_rowspan(body_rows, section="body") - footer = self._expand_colspan_rowspan(footer_rows, section="footer") + header, rem = self._expand_colspan_rowspan(header_rows, section="header") + body, rem = self._expand_colspan_rowspan( + body_rows, + section="body", + remainder=rem, + overflow=len(footer_rows) > 0, + ) + footer, _ = self._expand_colspan_rowspan( + footer_rows, section="footer", remainder=rem, overflow=False + ) return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "footer", "body"] - ) -> list[list]: + self, + rows, + section: Literal["header", "footer", "body"], + remainder: list[tuple[int, str | tuple, int]] | None = None, + overflow: bool = True, + ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]: """ Given a list of s, return a list of text rows. @@ -471,12 +482,20 @@ def _expand_colspan_rowspan( rows : list of node-like List of s section : the section that the rows belong to (header, body or footer). + remainder: list[tuple[int, str | tuple, int]] | None + Any remainder from the expansion of previous section + overflow: bool + If true, return any partial rows as 'remainder'. If not, use up any + partial rows. True by default. Returns ------- list of list Each returned row is a list of str text, or tuple (text, link) if extract_links is not None. + remainder + Remaining partial rows if any. If overflow is False, an empty list + is returned. 
Notes ----- @@ -485,9 +504,7 @@ def _expand_colspan_rowspan( """ all_texts = [] # list of rows, each a list of str text: str | tuple - remainder: list[ - tuple[int, str | tuple, int] - ] = [] # list of (index, text, nrows) + remainder = remainder if remainder is not None else [] for tr in rows: texts = [] # the output for this row @@ -528,19 +545,20 @@ def _expand_colspan_rowspan( all_texts.append(texts) remainder = next_remainder - # Append rows that only appear because the previous row had non-1 - # rowspan - while remainder: - next_remainder = [] - texts = [] - for prev_i, prev_text, prev_rowspan in remainder: - texts.append(prev_text) - if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) - all_texts.append(texts) - remainder = next_remainder + if not overflow: + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder - return all_texts + return all_texts, remainder def _handle_hidden_tables(self, tbl_list, attr_name: str): """ diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index 8f4e7a62834b5..39f78e26d6041 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -7,9 +7,9 @@ from pandas.io.json._table_schema import build_table_schema __all__ = [ - "ujson_dumps", - "ujson_loads", + "build_table_schema", "read_json", "to_json", - "build_table_schema", + "ujson_dumps", + "ujson_loads", ] diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 983780f81043f..237518b3c8d92 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -364,10 +364,8 @@ def __init__( ) # TODO: Do this timedelta properly in objToJSON.c See GH #15137 - if ( - (obj.ndim == 1) - and (obj.name in set(obj.index.names)) - or len(obj.columns.intersection(obj.index.names)) + if ((obj.ndim == 1) and (obj.name in set(obj.index.names))) or len( + obj.columns.intersection(obj.index.names) ): msg = "Overlapping names between the index and columns" raise ValueError(msg) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 7294efe843cce..e263c69376d05 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -368,7 +368,7 @@ def _agg_index(self, index) -> Index: index_converter = converters.get(self.index_names[i]) is not None try_num_bool = not ( - cast_type and is_string_dtype(cast_type) or index_converter + (cast_type and is_string_dtype(cast_type)) or index_converter ) arr, _ = self._infer_types( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 99d584db61755..db9547a18b600 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1052,8 +1052,9 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: for line in lines if ( len(line) > 1 - or len(line) == 1 - and (not isinstance(line[0], str) or line[0].strip()) + or ( + len(line) == 1 and (not isinstance(line[0], str) or line[0].strip()) + ) ) ] return ret diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ed89d5766c306..34d95fb59a21c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2206,15 +2206,15 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: def _maybe_convert_to_int_keys(convert_dates: dict, 
varlist: list[Hashable]) -> dict: new_dict = {} - for key in convert_dates: + for key, value in convert_dates.items(): if not convert_dates[key].startswith("%"): # make sure proper fmts - convert_dates[key] = "%" + convert_dates[key] + convert_dates[key] = "%" + value if key in varlist: - new_dict.update({varlist.index(key): convert_dates[key]}) + new_dict[varlist.index(key)] = convert_dates[key] else: if not isinstance(key, int): raise ValueError("convert_dates key must be a column or an integer") - new_dict.update({key: convert_dates[key]}) + new_dict[key] = convert_dates[key] return new_dict @@ -2748,6 +2748,18 @@ def write_file(self) -> None: """ Export DataFrame object to Stata dta format. + This method writes the contents of a pandas DataFrame to a `.dta` file + compatible with Stata. It includes features for handling value labels, + variable types, and metadata like timestamps and data labels. The output + file can then be read and used in Stata or other compatible statistical + tools. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. + io.stata.StataWriter : A class for writing Stata binary dta files. + Examples -------- >>> df = pd.DataFrame( @@ -2867,7 +2879,7 @@ def _write_header( # ds_format - just use 114 self._write_bytes(struct.pack("b", 114)) # byteorder - self._write(byteorder == ">" and "\x01" or "\x02") + self._write((byteorder == ">" and "\x01") or "\x02") # filetype self._write("\x01") # unused @@ -3413,7 +3425,7 @@ def _write_header( # ds_format - 117 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder - bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) + bio.write(self._tag((byteorder == ">" and "MSF") or "LSF", "byteorder")) # number of vars, 2 bytes in 117 and 118, 4 byte in 119 nvar_type = "H" if self._dta_version <= 118 else "I" bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index c7a4c1eacfcae..837bfaf82ca27 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -80,20 +80,20 @@ __all__ = [ "PlotAccessor", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", "boxplot", "boxplot_frame", "boxplot_frame_groupby", + "deregister_matplotlib_converters", "hist_frame", "hist_series", - "scatter_matrix", - "radviz", - "andrews_curves", - "bootstrap_plot", - "parallel_coordinates", "lag_plot", - "autocorrelation_plot", - "table", + "parallel_coordinates", "plot_params", + "radviz", "register_matplotlib_converters", - "deregister_matplotlib_converters", + "scatter_matrix", + "table", ] diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 87f3ca09ad346..ff28868aa0033 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -74,20 +74,20 @@ def plot(data, kind, **kwargs): __all__ = [ - "plot", - "hist_series", - "hist_frame", - "boxplot", - "boxplot_frame", - "boxplot_frame_groupby", - "table", "andrews_curves", "autocorrelation_plot", "bootstrap_plot", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "deregister", + "hist_frame", + "hist_series", "lag_plot", "parallel_coordinates", + "plot", "radviz", - "scatter_matrix", "register", - "deregister", + "scatter_matrix", + "table", ] diff --git a/pandas/testing.py b/pandas/testing.py index 0445fa5b5efc0..433b22bf1107e 100644 --- a/pandas/testing.py +++ 
b/pandas/testing.py @@ -12,6 +12,6 @@ __all__ = [ "assert_extension_array_equal", "assert_frame_equal", - "assert_series_equal", "assert_index_equal", + "assert_series_equal", ] diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e338fb1331734..5a59617ce5bd3 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -835,3 +835,10 @@ def test_pandas_dtype_string_dtypes(string_storage): with pd.option_context("string_storage", string_storage): result = pandas_dtype("string") assert result == pd.StringDtype(string_storage, na_value=pd.NA) + + +@td.skip_if_installed("pyarrow") +def test_construct_from_string_without_pyarrow_installed(): + # GH 57928 + with pytest.raises(ImportError, match="pyarrow>=10.0.1 is required"): + pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]") diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index 34727b43a7b0f..47b1c7c57a47a 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -5,4 +5,4 @@ to_decimal, ) -__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] +__all__ = ["DecimalArray", "DecimalDtype", "make_data", "to_decimal"] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9defb97394635..c6ac6368f2770 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -896,9 +896,7 @@ def _is_temporal_supported(self, opname, pa_dtype): ) ) and pa.types.is_duration(pa_dtype) - or opname in ("__sub__", "__rsub__") - and pa.types.is_temporal(pa_dtype) - ) + ) or (opname in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype)) def _get_expected_exception( self, op_name: str, obj, other diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 27621193a9b8d..e19351b2ad058 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -187,9 +187,8 @@ def _get_expected_exception( return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return ( - op_name in ["min", "max", "sum"] - or ser.dtype.na_value is np.nan # type: ignore[union-attr] + return op_name in ["min", "max", "sum"] or ( + ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 52e871cc795b4..c6e5304ae3cb4 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -159,7 +159,7 @@ def test_nlargest_n_duplicate_index(self, n, order, request): result = df.nlargest(n, order) expected = df.sort_values(order, ascending=False).head(n) if Version(np.__version__) >= Version("1.25") and ( - (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5 + (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): request.applymarker( pytest.mark.xfail( diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index a0f96ff111444..b52240c208493 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -757,3 +757,12 @@ def test_shift_with_offsets_freq_empty(self): df_shifted = DataFrame(index=shifted_dates) result = df.shift(freq=offset) tm.assert_frame_equal(result, df_shifted) + + def test_series_shift_interval_preserves_closed(self): + 
# GH#60389 + ser = Series( + [pd.Interval(1, 2, closed="right"), pd.Interval(2, 3, closed="right")] + ) + result = ser.shift(1) + expected = Series([np.nan, pd.Interval(1, 2, closed="right")]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 64220f1d3d5b4..b7e6e55739c17 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -159,6 +159,7 @@ def test_agg_apply_corner(ts, tsframe): tm.assert_frame_equal(grouped.agg("sum"), exp_df) res = grouped.apply(np.sum, axis=0) + exp_df = exp_df.reset_index(drop=True) tm.assert_frame_equal(res, exp_df) diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index 945c3e421a132..4625c5c27a803 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -22,7 +22,7 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) - gb = df.groupby(["a", "b", "c"], group_keys=False) + gb = df.groupby(["a", "b", "c"], group_keys=True) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) if groupby_func == "corrwith": diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 366eb59ee226a..4e7c0acb127ed 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -777,10 +777,21 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. df = DataFrame({1: [], 2: []}) - g = df.groupby(1, group_keys=False) + g = df.groupby(1, group_keys=True) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) + def test_groupby_apply_empty_with_group_keys_false(self): + # 60471 + # test apply'ing empty groups with group_keys False + # (not testing other agg fns, because they return + # different index objects. 
+ df = DataFrame({"A": [], "B": [], "C": []}) + g = df.groupby("A", group_keys=False) + result = g.apply(lambda x: x / x.sum(), include_groups=False) + expected = DataFrame({"B": [], "C": []}, index=None) + tm.assert_frame_equal(result, expected) + def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 s = Series([], name="name", dtype="float64") diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,51 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + 
@pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index f70e65e34c584..71ef1201e523f 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -9,6 +9,9 @@ from pandas import ( DataFrame, + MultiIndex, + Timestamp, + period_range, read_excel, ) import pandas._testing as tm @@ -333,3 +336,26 @@ def test_styler_to_s3(s3_public_bucket, s3so): f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so ) tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("merge_cells", [True, False, "columns"]) +def test_format_hierarchical_rows_periodindex(merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + formatter = ExcelFormatter(df, merge_cells=merge_cells) + formatted_cells = formatter._format_hierarchical_rows() + + for cell in formatted_cells: + if cell.row != 0 and cell.col == 0: + assert isinstance( + cell.val, Timestamp + ), "Period should be converted to Timestamp" diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 19fe9855dbb85..18948de72200a 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -23,6 +23,7 @@ MultiIndex, date_range, option_context, + period_range, ) import pandas._testing as tm @@ -335,6 +336,43 @@ def test_multiindex_interval_datetimes(self, tmp_excel): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("merge_cells", [True, False, "columns"]) + def test_excel_round_trip_with_periodindex(self, tmp_excel, merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", 
freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + df.to_excel(tmp_excel, merge_cells=merge_cells) + result = pd.read_excel(tmp_excel, index_col=[0, 1]) + expected = DataFrame( + {"A": [1, 2]}, + MultiIndex.from_arrays( + [ + [ + pd.to_datetime("2006-10-06 00:00:00"), + pd.to_datetime("2006-10-07 00:00:00"), + ], + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + time_format = ( + "datetime64[s]" if tmp_excel.endswith(".ods") else "datetime64[us]" + ) + expected.index = expected.index.set_levels( + expected.index.levels[0].astype(time_format), level=0 + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "engine,ext", diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0dc16e1ebc723..d7db3d5082135 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -129,6 +129,13 @@ def test_repr_truncation_preserves_na(self): with option_context("display.max_rows", 2, "display.show_dimensions", False): assert repr(df) == " a\n0 \n.. ...\n9 " + def test_repr_truncation_dataframe_attrs(self): + # GH#60455 + df = DataFrame([[0] * 10]) + df.attrs["b"] = DataFrame([]) + with option_context("display.max_columns", 2, "display.show_dimensions", False): + assert repr(df) == " 0 ... 9\n0 0 ... 0" + def test_max_colwidth_negative_int_raises(self): # Deprecation enforced from: # https://github.com/pandas-dev/pandas/issues/31532 diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 1009dfec53218..3b63011bf862e 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -3,11 +3,33 @@ from collections.abc import Mapping import string +import pytest + import pandas._config.config as cf +import pandas as pd + from pandas.io.formats import printing +@pytest.mark.parametrize( + "input_names, expected_names", + [ + (["'a b"], "['\\'a b']"), # Escape leading quote + (["test's b"], "['test\\'s b']"), # Escape apostrophe + (["'test' b"], "['\\'test\\' b']"), # Escape surrounding quotes + (["test b'"], "['test b\\'']"), # Escape single quote + (["test\n' b"], "['test\\n\\' b']"), # Escape quotes, preserve newline + ], +) +def test_formatted_index_names(input_names, expected_names): + # GH#60190 + df = pd.DataFrame({name: [1, 2, 3] for name in input_names}).set_index(input_names) + formatted_names = str(df.index.names) + + assert formatted_names == expected_names + + def test_adjoin(): data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] expected = "a dd ggg\nb ee hhh\nc ff iii" diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 1de53993fe646..8d46442611719 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -1405,3 +1405,88 @@ def test_to_latex_multiindex_multirow(self): """ ) assert result == expected + + def test_to_latex_multiindex_format_single_index_hidden(self): + # GH 52218 + df = DataFrame( + { + "A": [1, 2], + "B": [4, 5], + } + ) + result = ( + df.style.hide(axis="index") + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rr} + \textbf{A} & \textbf{B} \\ + 1 & 4 \\ + 2 & 5 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_two_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], 
+ ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{lrrr} + & \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + Level 2 & & & \\ + x & 0 & 0 & 0 \\ + x & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_all_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], + ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1, 2]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rrr} + \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 5731f74a03852..af3cdf2d44af3 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -422,6 +422,24 @@ def test_to_string_complex_float_formatting(self): ) assert result == expected + def test_to_string_complex_float_formatting_with_exponents(self): + # GH #60393 + with option_context("display.precision", 6): + df = DataFrame( + { + "x": [ + (1.8816e-09 + 0j), + (1.8816e-09 + 3.39676e-09j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 1.881600e-09+0.000000e+00j\n" + "1 1.881600e-09+3.396760e-09j" + ) + assert result == expected + def test_to_string_format_inf(self): # GH#24861 df = DataFrame( diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 511db2c6a33d8..3680273f5e98a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 73e9933e3681b..bef28c4f027da 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html): tm.assert_frame_equal(result, expected) + def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html): + # GH60210 + + result = flavor_read_html( + StringIO( + """ + + + + + + + + + + + + +
<table> <tr> <th rowspan="2">A</th> <th>B</th> </tr> <tr> <td>1</td> </tr> <tr> <td>C</td> <td>2</td> </tr> </table>
+ """ + ) + )[0] + + expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 result = flavor_read_html( diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d8a9acdc561fd..f42f7f8232229 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2376,9 +2376,13 @@ def test_pivot_table_with_margins_and_numeric_columns(self): tm.assert_frame_equal(result, expected) - def test_pivot_ea_dtype_dropna(self, dropna): + @pytest.mark.parametrize( + "dtype,expected_dtype", [("Int64", "Float64"), ("int64", "float64")] + ) + def test_pivot_ea_dtype_dropna(self, dropna, dtype, expected_dtype): # GH#47477 - df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype="Int64")}) + # GH#47971 + df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype=dtype)}) result = df.pivot_table( index="x", columns="y", values="age", aggfunc="mean", dropna=dropna ) @@ -2386,7 +2390,7 @@ def test_pivot_ea_dtype_dropna(self, dropna): [[30]], index=Index(["a"], name="x"), columns=Index(["b"], name="y"), - dtype="Float64", + dtype=expected_dtype, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3d1177c23c612..611b92eb022d6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1254,7 +1254,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts_internal(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") result_td = algos.value_counts_internal(td) tm.assert_series_equal(result_td, exp_td) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e87498742061b..a23e6d9b3973a 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -295,6 +295,29 @@ def test_multiindex_insert_level_with_na(self, na): df[na, "B"] = 1 tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"])) + def test_multiindex_dt_with_nan(self): + # GH#60388 + df = DataFrame( + [ + [1, np.nan, 5, np.nan], + [2, np.nan, 6, np.nan], + [np.nan, 3, np.nan, 7], + [np.nan, 4, np.nan, 8], + ], + index=Series(["a", "b", "c", "d"], dtype=object, name="sub"), + columns=MultiIndex.from_product( + [ + ["value1", "value2"], + [datetime.datetime(2024, 11, 1), datetime.datetime(2024, 11, 2)], + ], + names=[None, "Date"], + ), + ) + df = df.reset_index() + result = df[df.columns[0]] + expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) + tm.assert_series_equal(result, expected) + class TestSorted: """everything you wanted to test about sorting""" diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ce41f1e76de79..e7ed8e855a762 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -537,11 +537,8 @@ def _argminmax_wrap(self, value, axis=None, func=None): nullnan = isna(nans) if res.ndim: res[nullnan] = -1 - elif ( - hasattr(nullnan, "all") - and nullnan.all() - or not hasattr(nullnan, "all") - and nullnan + elif (hasattr(nullnan, "all") and nullnan.all()) or ( + not hasattr(nullnan, "all") and nullnan ): res = -1 return res diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index d9ab4723a8f2c..120dbe788a23f 100644 --- a/pandas/tests/window/test_numba.py +++ 
b/pandas/tests/window/test_numba.py @@ -459,6 +459,38 @@ def f(x): ) tm.assert_frame_equal(result, expected) + def test_table_method_rolling_apply_col_order(self): + # GH#59666 + def f(x): + return np.nanmean(x[:, 0] - x[:, 1]) + + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [6, 7, 8, 5, 6, 7], + } + ) + result = df.rolling(3, method="table", min_periods=0)[["a", "b"]].apply( + f, raw=True, engine="numba" + ) + expected = DataFrame( + { + "a": [-5, -5, -5, -3.66667, -2.33333, -1], + "b": [-5, -5, -5, -3.66667, -2.33333, -1], + } + ) + tm.assert_almost_equal(result, expected) + result = df.rolling(3, method="table", min_periods=0)[["b", "a"]].apply( + f, raw=True, engine="numba" + ) + expected = DataFrame( + { + "b": [5, 5, 5, 3.66667, 2.33333, 1], + "a": [5, 5, 5, 3.66667, 2.33333, 1], + } + ) + tm.assert_almost_equal(result, expected) + def test_table_method_rolling_weighted_mean(self, step): def weighted_mean(x): arr = np.ones((1, x.shape[1])) diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py index e361726dc6f80..c00843ecac418 100644 --- a/pandas/tseries/__init__.py +++ b/pandas/tseries/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index ec2d7d2304839..5ea899f1610a7 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -7,4 +7,4 @@ from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq -__all__ = ["infer_freq", "offsets", "guess_datetime_format"] +__all__ = ["guess_datetime_format", "infer_freq", "offsets"] diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 534bee5fede44..9a01568971af8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -89,6 +89,11 @@ def infer_freq( """ Infer the most likely frequency given the input index. + This method attempts to deduce the most probable frequency (e.g., 'D' for daily, + 'H' for hourly) from a sequence of datetime-like objects. It is particularly useful + when the frequency of a time series is not explicitly set or known but can be + inferred from its values. + Parameters ---------- index : DatetimeIndex, TimedeltaIndex, Series or array-like @@ -106,6 +111,13 @@ def infer_freq( ValueError If there are fewer than three values. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Return a fixed frequency TimedeltaIndex with day as the default. + period_range : Return a fixed frequency PeriodIndex. + DatetimeIndex.freq : Return the frequency object if it is set, otherwise None. 
+ Examples -------- >>> idx = pd.date_range(start="2020/12/01", end="2020/12/30", periods=30) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index bf4ec2e551f01..2d195fbbc4e84 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -636,12 +636,17 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal __all__ = [ + "FR", + "MO", + "SA", + "SU", + "TH", + "TU", + "WE", + "HolidayCalendarFactory", "after_nearest_workday", "before_nearest_workday", - "FR", "get_calendar", - "HolidayCalendarFactory", - "MO", "nearest_workday", "next_monday", "next_monday_or_tuesday", @@ -649,11 +654,6 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal "previous_friday", "previous_workday", "register", - "SA", - "SU", "sunday_to_monday", - "TH", - "TU", - "WE", "weekend_to_monday", ] diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 169c9cc18a7fd..a065137e6971c 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -46,46 +46,46 @@ ) __all__ = [ - "Day", + "FY5253", + "BDay", + "BMonthBegin", + "BMonthEnd", + "BQuarterBegin", + "BQuarterEnd", + "BYearBegin", + "BYearEnd", "BaseOffset", "BusinessDay", + "BusinessHour", "BusinessMonthBegin", "BusinessMonthEnd", - "BDay", + "CBMonthBegin", + "CBMonthEnd", + "CDay", "CustomBusinessDay", + "CustomBusinessHour", "CustomBusinessMonthBegin", "CustomBusinessMonthEnd", - "CDay", - "CBMonthEnd", - "CBMonthBegin", + "DateOffset", + "Day", + "Easter", + "FY5253Quarter", + "Hour", + "LastWeekOfMonth", + "Micro", + "Milli", + "Minute", "MonthBegin", - "BMonthBegin", "MonthEnd", - "BMonthEnd", - "SemiMonthEnd", - "SemiMonthBegin", - "BusinessHour", - "CustomBusinessHour", - "YearBegin", - "BYearBegin", - "YearEnd", - "BYearEnd", + "Nano", "QuarterBegin", - "BQuarterBegin", "QuarterEnd", - "BQuarterEnd", - "LastWeekOfMonth", - "FY5253Quarter", - "FY5253", + "Second", + "SemiMonthBegin", + "SemiMonthEnd", + "Tick", "Week", "WeekOfMonth", - "Easter", - "Tick", - "Hour", - "Minute", - "Second", - "Milli", - "Micro", - "Nano", - "DateOffset", + "YearBegin", + "YearEnd", ] diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 165824bec131f..a1a0d51a7c72b 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -83,7 +83,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: if alternative.__doc__.count("\n") < 3: raise AssertionError(doc_error_msg) empty1, summary, empty2, doc_string = alternative.__doc__.split("\n", 3) - if empty1 or empty2 and not summary: + if empty1 or (empty2 and not summary): raise AssertionError(doc_error_msg) wrapper.__doc__ = dedent( f""" @@ -497,13 +497,13 @@ def indent(text: str | None, indents: int = 1) -> str: __all__ = [ "Appender", + "Substitution", "cache_readonly", "deprecate", "deprecate_kwarg", "deprecate_nonkeyword_arguments", "doc", "future_version_msg", - "Substitution", ] diff --git a/pyproject.toml b/pyproject.toml index 0c76ecd0b15b4..7ab9cd2c17669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -304,10 +304,6 @@ ignore = [ "PERF102", # try-except-in-loop, becomes useless in Python 3.11 "PERF203", - # pytest-missing-fixture-name-underscore - "PT004", - # pytest-incorrect-fixture-name-underscore - "PT005", # pytest-parametrize-names-wrong-type "PT006", # pytest-parametrize-values-wrong-type diff --git a/requirements-dev.txt b/requirements-dev.txt index 69568cf661241..fb4d9cdb589ca 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -24,6 +24,7 @@ 
html5lib>=1.1 hypothesis>=6.84.0 gcsfs>=2022.11.0 ipython +pickleshare jinja2>=3.1.2 lxml>=4.9.2 matplotlib>=3.6.3 @@ -62,7 +63,7 @@ gitdb google-auth natsort numpydoc -pydata-sphinx-theme==0.14 +pydata-sphinx-theme==0.16 pytest-cython sphinx sphinx-design diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 076acc359f933..d804e15f6d48f 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -319,10 +319,10 @@ def nodefault_used_not_only_for_typing(file_obj: IO[str]) -> Iterable[tuple[int, while nodes: in_annotation, node = nodes.pop() if not in_annotation and ( - isinstance(node, ast.Name) # Case `NoDefault` - and node.id == "NoDefault" - or isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` - and node.attr == "NoDefault" + (isinstance(node, ast.Name) # Case `NoDefault` + and node.id == "NoDefault") + or (isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` + and node.attr == "NoDefault") ): yield (node.lineno, "NoDefault is used not only for typing") diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index 4c66f28818abd..c26b093b0c4ba 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -73,12 +73,12 @@
           </li>
           <li class="list-inline-item">
-            <a href="https://twitter.com/pandas_dev">
-              <i class="follow-us fab fa-twitter"></i>
+            <a href="https://x.com/pandas_dev">
+              <i class="follow-us fa-brands fa-square-x-twitter"></i>
             </a>
           </li>
           <li class="list-inline-item">
             <a href="https://fosstodon.org/@pandas_dev">
               <i class="follow-us fab fa-mastodon"></i>
             </a>
           </li>
         </ul>
diff --git a/web/pandas/index.html b/web/pandas/index.html
index 63bc11d3ed5d8..98628b856edb6 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -83,8 +83,8 @@
         <h5>Follow us</h5>
         <ul class="list-inline">
           <li class="list-inline-item">
-            <a href="https://twitter.com/pandas_dev">
-              <i class="follow-us fab fa-twitter"></i>
+            <a href="https://x.com/pandas_dev">
+              <i class="follow-us fa-brands fa-square-x-twitter"></i>
             </a>
           </li>
         </ul>
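
A note on the boolean-operator rewrites above: the hunks in pandas/tests/test_nanops.py, pandas/util/_decorators.py, and scripts/validate_unwanted_patterns.py only add parentheses to conditions that mix `and` with `or`. Python binds `and` more tightly than `or`, so these rewrites are behavior-preserving; the parentheses simply make the grouping explicit, a style the updated linter enforces. A minimal standalone sketch of the pitfall (plain Python, not pandas code):

# Python groups "a or b and c" as "a or (b and c)", never "(a or b) and c".
a, b, c = True, False, False
assert (a or b and c) == (a or (b and c))   # identical by precedence
assert (a or b and c) != ((a or b) and c)   # the misreading evaluates differently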
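Likewise, the reshuffled `__all__` lists (pandas/tseries/api.py, pandas/tseries/holiday.py, pandas/tseries/offsets.py, pandas/util/_decorators.py) follow ruff's sorted-`__all__` rule (RUF022), which applies an isort-style ordering rather than a plain `sorted()`: SCREAMING_CASE names sort first, then CamelCase, then everything else, case-sensitively within each band. That is why "FY5253" and the weekday constants now lead their lists. A rough approximation of that ordering, as a sketch only (ruff's exact rules may differ in corner cases):

def band(name: str) -> int:
    # 0: all-caps constants ("FY5253", "MO"); 1: CamelCase ("BDay"); 2: the rest
    if name.isupper():
        return 0
    return 1 if name[:1].isupper() else 2

names = ["infer_freq", "BDay", "FY5253", "offsets", "DateOffset"]
print(sorted(names, key=lambda n: (band(n), n)))
# ['FY5253', 'BDay', 'DateOffset', 'infer_freq', 'offsets']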
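Finally, the new rolling test in pandas/tests/window/test_numba.py pins down that with `method="table"` the UDF receives each window as a single 2-D array whose columns follow the frame's column order, so `x[:, 0] - x[:, 1]` flips sign when the columns are reselected in reverse. A minimal usage sketch mirroring that test (requires numba to be installed; `spread` is just an illustrative name):

import numpy as np
import pandas as pd

def spread(x):
    # x is the whole window as a 2-D ndarray, columns in frame order
    return np.nanmean(x[:, 0] - x[:, 1])

df = pd.DataFrame({"a": [1, 2, 3], "b": [6, 7, 8]})
# method="table" hands each window to the UDF whole;
# apply() then requires raw=True and engine="numba"
out = df.rolling(2, method="table", min_periods=0).apply(
    spread, raw=True, engine="numba"
)
print(out)  # both columns hold the per-window mean of a - b: all -5.0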