diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9c986e5b1b054..139ea9d220453 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -34,7 +34,6 @@ jobs:
           fi
           python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
           PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
-          sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
           ci/run_tests.sh
   test-linux-musl:
     docker:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 07fb0c19262a1..899b49cc4eff5 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -385,10 +385,12 @@ jobs:
           nogil: true

       - name: Build Environment
+        # TODO: Once numpy 2.2.1 is out, don't install nightly version
+        # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955
        run: |
          python --version
-         python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1
-         python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
+         python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1
+         python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy
          python -m pip install versioneer[toml]
          python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
          python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror"
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 354402c572ade..32ca5573ac08a 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -152,7 +152,7 @@ jobs:
        run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

      - name: Build wheels
-       uses: pypa/cibuildwheel@v2.21.3
+       uses: pypa/cibuildwheel@v2.22.0
        with:
          package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
        env:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 09912bfb6c349..b7b9b1818c122 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
     skip: [pyright, mypy]
 repos:
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.2
+    rev: v0.8.1
     hooks:
     -   id: ruff
         args: [--exit-non-zero-on-fix]
@@ -47,7 +47,7 @@ repos:
         types_or: [python, rst, markdown, cython, c]
         additional_dependencies: [tomli]
 -   repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.2
+    rev: v0.16.6
     hooks:
     -   id: cython-lint
     -   id: double-quote-cython-strings
@@ -95,7 +95,7 @@ repos:
     -   id: sphinx-lint
         args: ["--enable", "all", "--disable", "line-too-long"]
 -   repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.3
+    rev: v19.1.4
     hooks:
     -   id: clang-format
         files: ^pandas/_libs/src|^pandas/_libs/include
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index abffa1f702b9c..19c556dfe9d1f 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -511,8 +511,7 @@ def setup(self, dtype, method, application, ncols, engine):
         # grouping on multiple columns
         # and we lack kernels for a bunch of methods
         if (
-            engine == "numba"
-            and method in _numba_unsupported_methods
+            (engine == "numba" and method in _numba_unsupported_methods)
             or ncols > 1
             or application == "transformation"
            or dtype == "datetime"
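Editorial note on the benchmark hunk above: the added parentheses are purely clarifying, since Python's `and` already binds tighter than `or`. A quick self-contained check (hypothetical operand values, not from the benchmark itself):

    # `and` binds tighter than `or`, so both spellings are equivalent;
    # the parentheses only make the grouping explicit for readers.
    engine, ncols = "numba", 2
    unsupported = {"skew"}
    method = "sum"
    old = engine == "numba" and method in unsupported or ncols > 1
    new = (engine == "numba" and method in unsupported) or ncols > 1
    assert old == new  # holds for any combination of operands
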
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index fae1a7abba6a8..39cea0c361a72 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -73,9 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Period.freq GL08" \
         -i "pandas.Period.ordinal GL08" \
         -i "pandas.RangeIndex.from_range PR01,SA01" \
-        -i "pandas.Series.dt.freq GL08" \
-        -i "pandas.Series.dt.unit GL08" \
-        -i "pandas.Series.pad PR01,SA01" \
         -i "pandas.Timedelta.max PR02" \
         -i "pandas.Timedelta.min PR02" \
         -i "pandas.Timedelta.resolution PR02" \
@@ -83,57 +80,21 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.min PR02" \
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
-        -i "pandas.api.types.is_re_compilable PR07,SA01" \
-        -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-        -i "pandas.arrays.IntegerArray SA01" \
-        -i "pandas.arrays.IntervalArray.length SA01" \
         -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
-        -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-        -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-        -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
-        -i "pandas.core.resample.Resampler.get_group RT03,SA01" \
-        -i "pandas.core.resample.Resampler.indices SA01" \
         -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.mean SA01" \
         -i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.prod SA01" \
         -i "pandas.core.resample.Resampler.quantile PR01,PR07" \
-        -i "pandas.core.resample.Resampler.sem SA01" \
         -i "pandas.core.resample.Resampler.std SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.var SA01" \
-        -i "pandas.errors.AttributeConflictWarning SA01" \
-        -i "pandas.errors.ChainedAssignmentError SA01" \
-        -i "pandas.errors.DuplicateLabelError SA01" \
-        -i "pandas.errors.IntCastingNaNError SA01" \
-        -i "pandas.errors.InvalidIndexError SA01" \
-        -i "pandas.errors.NullFrequencyError SA01" \
-        -i "pandas.errors.NumExprClobberingError SA01" \
-        -i "pandas.errors.NumbaUtilError SA01" \
-        -i "pandas.errors.OutOfBoundsTimedelta SA01" \
-        -i "pandas.errors.PerformanceWarning SA01" \
-        -i "pandas.errors.PossibleDataLossError SA01" \
-        -i "pandas.errors.UndefinedVariableError PR01,SA01" \
-        -i "pandas.errors.UnsortedIndexError SA01" \
         -i "pandas.errors.ValueLabelTypeMismatch SA01" \
-        -i "pandas.infer_freq SA01" \
-        -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
-        -i "pandas.io.stata.StataWriter.write_file SA01" \
         -i "pandas.plotting.andrews_curves RT03,SA01" \
-        -i "pandas.plotting.scatter_matrix PR07,SA01" \
         -i "pandas.tseries.offsets.BDay PR02,SA01" \
         -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
         -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \
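Each removed `-i` entry above is an ignore rule being retired: those docstrings now have to pass numpydoc validation. Assuming the current interface of the repository's validator script (an assumption; flags may differ across versions), a single object can be re-checked locally with:

    # hedged sketch: re-run validation for one formerly-ignored docstring
    python scripts/validate_docstrings.py pandas.Series.pad
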
diff --git a/doc/source/conf.py b/doc/source/conf.py
index ddbda0aa3bf65..677ee6274b093 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -242,7 +242,6 @@
     "external_links": [],
     "footer_start": ["pandas_footer", "sphinx-version"],
     "github_url": "https://github.com/pandas-dev/pandas",
-    "twitter_url": "https://twitter.com/pandas_dev",
     "analytics": {
         "plausible_analytics_domain": "pandas.pydata.org",
         "plausible_analytics_url": "https://views.scientific-python.org/js/script.js",
@@ -258,6 +257,11 @@
     # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be)
     "show_version_warning_banner": False,
     "icon_links": [
+        {
+            "name": "X",
+            "url": "https://x.com/pandas_dev",
+            "icon": "fa-brands fa-square-x-twitter",
+        },
         {
             "name": "Mastodon",
             "url": "https://fosstodon.org/@pandas_dev",
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index b3982c4ad091f..bda959f380e8a 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -193,25 +193,25 @@ Visualization

 Installable with ``pip install "pandas[plot, output-formatting]"``.

-========================= ================== ================== =============================================================
-Dependency                Minimum Version    pip extra          Notes
-========================= ================== ================== =============================================================
-matplotlib                3.6.3              plot               Plotting library
-Jinja2                    3.1.2              output-formatting  Conditional formatting with DataFrame.style
-tabulate                  0.9.0              output-formatting  Printing in Markdown-friendly format (see `tabulate`_)
-========================= ================== ================== =============================================================
+========================================================== ================== ================== =======================================================
+Dependency                                                 Minimum Version    pip extra          Notes
+========================================================== ================== ================== =======================================================
+`matplotlib <https://matplotlib.org/>`__                   3.6.3              plot               Plotting library
+`Jinja2 <https://jinja.palletsprojects.com/>`__            3.1.2              output-formatting  Conditional formatting with DataFrame.style
+`tabulate <https://github.com/astanin/python-tabulate>`__  0.9.0              output-formatting  Printing in Markdown-friendly format (see `tabulate`_)
+========================================================== ================== ================== =======================================================

 Computation
 ^^^^^^^^^^^

 Installable with ``pip install "pandas[computation]"``.

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-SciPy                     1.10.0             computation     Miscellaneous statistical functions
-xarray                    2022.12.0          computation     pandas-like API for N-dimensional data
-========================= ================== =============== =============================================================
+============================================== ================== =============== =======================================
+Dependency                                     Minimum Version    pip extra       Notes
+============================================== ================== =============== =======================================
+`SciPy <https://scipy.org/>`__                 1.10.0             computation     Miscellaneous statistical functions
+`xarray <https://xarray.dev/>`__               2022.12.0          computation     pandas-like API for N-dimensional data
+============================================== ================== =============== =======================================

 .. _install.excel_dependencies:

@@ -220,29 +220,29 @@ Excel files
 ^^^^^^^^^^^

 Installable with ``pip install "pandas[excel]"``.

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-xlrd                      2.0.1              excel           Reading for xls files
-xlsxwriter                3.0.5              excel           Writing for xlsx files
-openpyxl                  3.1.0              excel           Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
-pyxlsb                    1.0.10             excel           Reading for xlsb files
-python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
-odfpy                     1.4.1              excel           Reading / writing for OpenDocument 1.2 files
-========================= ================== =============== =============================================================
+================================================================== ================== =============== =============================================================
+Dependency                                                         Minimum Version    pip extra       Notes
+================================================================== ================== =============== =============================================================
+`xlrd <https://github.com/python-excel/xlrd>`__                    2.0.1              excel           Reading for xls files
+`xlsxwriter <https://github.com/jmcnamara/XlsxWriter>`__           3.0.5              excel           Writing for xlsx files
+`openpyxl <https://openpyxl.readthedocs.io/en/stable/>`__          3.1.0              excel           Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
+`pyxlsb <https://github.com/willtrnr/pyxlsb>`__                    1.0.10             excel           Reading for xlsb files
+`python-calamine <https://github.com/dimastbk/python-calamine>`__  0.1.7              excel           Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
+`odfpy <https://github.com/eea/odfpy>`__                           1.4.1              excel           Reading / writing for OpenDocument 1.2 files
+================================================================== ================== =============== =============================================================

 HTML
 ^^^^

 Installable with ``pip install "pandas[html]"``.

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-BeautifulSoup4            4.11.2             html            HTML parser for read_html
-html5lib                  1.1                html            HTML parser for read_html
-lxml                      4.9.2              html            HTML parser for read_html
-========================= ================== =============== =============================================================
+=============================================================== ================== =============== ==========================
+Dependency                                                      Minimum Version    pip extra       Notes
+=============================================================== ================== =============== ==========================
+`BeautifulSoup4 <https://www.crummy.com/software/BeautifulSoup>`__  4.11.2         html            HTML parser for read_html
+`html5lib <https://github.com/html5lib/html5lib-python>`__      1.1                html            HTML parser for read_html
+`lxml <https://lxml.de/>`__                                     4.9.2              html            HTML parser for read_html
+=============================================================== ================== =============== ==========================

 One of the following combinations of libraries is needed to use the
 top-level :func:`~pandas.read_html` function:

@@ -273,45 +273,45 @@ XML

 Installable with ``pip install "pandas[xml]"``.

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-lxml                      4.9.2              xml             XML parser for read_xml and tree builder for to_xml
-========================= ================== =============== =============================================================
+======================================== ================== =============== ====================================================
+Dependency                               Minimum Version    pip extra       Notes
+======================================== ================== =============== ====================================================
+`lxml <https://lxml.de/>`__              4.9.2              xml             XML parser for read_xml and tree builder for to_xml
+======================================== ================== =============== ====================================================

 SQL databases
 ^^^^^^^^^^^^^

 Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-SQLAlchemy                2.0.0              postgresql,     SQL support for databases other than sqlite
-                                             mysql,
-                                             sql-other
-psycopg2                  2.9.6              postgresql      PostgreSQL engine for sqlalchemy
-pymysql                   1.0.2              mysql           MySQL engine for sqlalchemy
-adbc-driver-postgresql    0.10.0             postgresql      ADBC Driver for PostgreSQL
-adbc-driver-sqlite        0.8.0              sql-other       ADBC Driver for SQLite
-========================= ================== =============== =============================================================
+================================================================== ================== =============== ============================================
+Dependency                                                         Minimum Version    pip extra       Notes
+================================================================== ================== =============== ============================================
+`SQLAlchemy <https://www.sqlalchemy.org/>`__                       2.0.0              postgresql,     SQL support for databases other than sqlite
+                                                                                      mysql,
+                                                                                      sql-other
+`psycopg2 <https://www.psycopg.org/>`__                            2.9.6              postgresql      PostgreSQL engine for sqlalchemy
+`pymysql <https://github.com/PyMySQL/PyMySQL>`__                   1.0.2              mysql           MySQL engine for sqlalchemy
+`adbc-driver-postgresql <https://arrow.apache.org/adbc/>`__        0.10.0             postgresql      ADBC Driver for PostgreSQL
+`adbc-driver-sqlite <https://arrow.apache.org/adbc/>`__            0.8.0              sql-other       ADBC Driver for SQLite
+================================================================== ================== =============== ============================================

 Other data sources
 ^^^^^^^^^^^^^^^^^^

 Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``

-========================= ================== ================ =============================================================
-Dependency                Minimum Version    pip extra        Notes
-========================= ================== ================ =============================================================
-PyTables                  3.8.0              hdf5             HDF5-based reading / writing
-blosc                     1.21.3             hdf5             Compression for HDF5; only available on ``conda``
-zlib                                         hdf5             Compression for HDF5
-fastparquet               2023.10.0          -                Parquet reading / writing (pyarrow is default)
-pyarrow                   10.0.1             parquet, feather Parquet, ORC, and feather reading / writing
-pyreadstat                1.2.0              spss             SPSS files (.sav) reading
-odfpy                     1.4.1              excel            Open document format (.odf, .ods, .odt) reading / writing
-========================= ================== ================ =============================================================
+====================================================== ================== ================ ==========================================================
+Dependency                                             Minimum Version    pip extra        Notes
+====================================================== ================== ================ ==========================================================
+`PyTables <https://www.pytables.org/>`__               3.8.0              hdf5             HDF5-based reading / writing
+`blosc <https://www.blosc.org/>`__                     1.21.3             hdf5             Compression for HDF5; only available on ``conda``
+`zlib <https://www.zlib.net/>`__                                          hdf5             Compression for HDF5
+`fastparquet <https://github.com/dask/fastparquet>`__  2023.10.0          -                Parquet reading / writing (pyarrow is default)
+`pyarrow <https://arrow.apache.org/docs/python/>`__    10.0.1             parquet, feather Parquet, ORC, and feather reading / writing
+`pyreadstat <https://github.com/Roche/pyreadstat>`__   1.2.0              spss             SPSS files (.sav) reading
+`odfpy <https://github.com/eea/odfpy>`__               1.4.1              excel            Open document format (.odf, .ods, .odt) reading / writing
+====================================================== ================== ================ ==========================================================

 .. _install.warn_orc:

@@ -326,26 +326,26 @@ Access data in the cloud
 ^^^^^^^^^^^^^^^^^^^^^^^^

 Installable with ``pip install "pandas[fss, aws, gcp]"``

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-fsspec                    2022.11.0          fss, gcp, aws   Handling files aside from simple local and HTTP (required
-                                                             dependency of s3fs, gcsfs).
-gcsfs                     2022.11.0          gcp             Google Cloud Storage access
-s3fs                      2022.11.0          aws             Amazon S3 access
-========================= ================== =============== =============================================================
+============================================ ================== =============== ==========================================================
+Dependency                                   Minimum Version    pip extra       Notes
+============================================ ================== =============== ==========================================================
+`fsspec <https://filesystem-spec.readthedocs.io/>`__  2022.11.0  fss, gcp, aws  Handling files aside from simple local and HTTP (required
+                                                                                dependency of s3fs, gcsfs).
+`gcsfs <https://gcsfs.readthedocs.io/>`__    2022.11.0          gcp             Google Cloud Storage access
+`s3fs <https://s3fs.readthedocs.io/>`__      2022.11.0          aws             Amazon S3 access
+============================================ ================== =============== ==========================================================

 Clipboard
 ^^^^^^^^^

 Installable with ``pip install "pandas[clipboard]"``.

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-PyQt4/PyQt5               5.15.9             clipboard       Clipboard I/O
-qtpy                      2.3.0              clipboard       Clipboard I/O
-========================= ================== =============== =============================================================
+======================================================================================== ================== =============== ==============
+Dependency                                                                               Minimum Version    pip extra       Notes
+======================================================================================== ================== =============== ==============
+`PyQt4 <https://pypi.org/project/PyQt4/>`__/`PyQt5 <https://pypi.org/project/PyQt5/>`__  5.15.9             clipboard       Clipboard I/O
+`qtpy <https://github.com/spyder-ide/qtpy>`__                                            2.3.0              clipboard       Clipboard I/O
+======================================================================================== ================== =============== ==============

 .. note::

@@ -358,19 +358,19 @@ Compression
 ^^^^^^^^^^^

 Installable with ``pip install "pandas[compression]"``

-========================= ================== =============== =============================================================
-Dependency                Minimum Version    pip extra       Notes
-========================= ================== =============== =============================================================
-Zstandard                 0.19.0             compression     Zstandard compression
-========================= ================== =============== =============================================================
+================================================= ================== =============== ======================
+Dependency                                        Minimum Version    pip extra       Notes
+================================================= ================== =============== ======================
+`Zstandard <https://facebook.github.io/zstd/>`__  0.19.0             compression     Zstandard compression
+================================================= ================== =============== ======================

 Timezone
 ^^^^^^^^

 Installable with ``pip install "pandas[timezone]"``

-========================= ================== =================== =============================================================
-Dependency                Minimum Version    pip extra           Notes
-========================= ================== =================== =============================================================
-pytz                      2023.4             timezone            Alternative timezone library to ``zoneinfo``.
-========================= ================== =================== =============================================================
+========================================== ================== =================== ==============================================
+Dependency                                 Minimum Version    pip extra           Notes
+========================================== ================== =================== ==============================================
+`pytz <https://pypi.org/project/pytz/>`__  2023.4             timezone            Alternative timezone library to ``zoneinfo``.
+========================================== ================== =================== ==============================================
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
index 7680c8b434866..e701d48a89db7 100644
--- a/doc/source/reference/frame.rst
+++ b/doc/source/reference/frame.rst
@@ -185,7 +185,6 @@ Reindexing / selection / label manipulation
    DataFrame.duplicated
    DataFrame.equals
    DataFrame.filter
-   DataFrame.head
    DataFrame.idxmax
    DataFrame.idxmin
    DataFrame.reindex
@@ -196,7 +195,6 @@ Reindexing / selection / label manipulation
    DataFrame.sample
    DataFrame.set_axis
    DataFrame.set_index
-   DataFrame.tail
    DataFrame.take
    DataFrame.truncate
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
index 1525afcac87f7..b2b5c5cc1014e 100644
--- a/doc/source/user_guide/cookbook.rst
+++ b/doc/source/user_guide/cookbook.rst
@@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to
    df

    # List the size of the animals with the highest weight.
-   df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False)
+   df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()])

 `Using get_group `__

@@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to

        return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"])

-   expected_df = gb.apply(GrowUp, include_groups=False)
+   expected_df = gb.apply(GrowUp)
    expected_df

 `Expanding apply
diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst
index b9c285ca30c96..89981786d60b5 100644
--- a/doc/source/user_guide/dsintro.rst
+++ b/doc/source/user_guide/dsintro.rst
@@ -326,7 +326,7 @@ This case is handled identically to a dict of arrays.

 .. ipython:: python

-   data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")])
+   data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
    data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

    pd.DataFrame(data)
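A note on the ``dsintro.rst`` hunk just above: ``"a10"`` was a legacy NumPy alias for the ``"S10"`` bytestring dtype, and (to my understanding) NumPy 2.x deprecates the ``"a"`` code, hence the docs now spell it directly. A minimal sketch of the equivalence:

    import numpy as np

    # "S10" stores fixed-width 10-byte bytestrings; "a10" was an alias for it.
    data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
    data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
    print(data["C"])  # [b'Hello' b'World'] -- bytes, not str
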
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index acb5a2b7919ac..4a32381a7de47 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -1074,7 +1074,7 @@ missing values with the ``ffill()`` method.
    ).set_index("date")
    df_re

-   df_re.groupby("group").resample("1D", include_groups=False).ffill()
+   df_re.groupby("group").resample("1D").ffill()

 .. _groupby.filter:

@@ -1252,13 +1252,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare

 .. ipython:: python

-   df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False)
+   df.groupby("A", group_keys=True).apply(lambda x: x)

 with

 .. ipython:: python

-   df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False)
+   df.groupby("A", group_keys=False).apply(lambda x: x)


 Numba accelerated routines
@@ -1742,7 +1742,7 @@ column index name will be used as the name of the inserted column:
        result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()}
        return pd.Series(result, name="metrics")

-   result = df.groupby("a").apply(compute_metrics, include_groups=False)
+   result = df.groupby("a").apply(compute_metrics)

    result
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index 3347f3a2534f4..8c5e98791a9ef 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` ar
 .. image:: ../_static/reshaping_melt.png

 The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt`
-are useful to massage a :class:`DataFrame` into a format where one or more columns
+are useful to reshape a :class:`DataFrame` into a format where one or more columns
 are *identifier variables*, while all other columns, considered *measured variables*,
 are "unpivoted" to the row axis, leaving just two non-identifier columns, "variable" and
 "value". The names of those columns can be customized
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
index e25c4c2441920..0581951d5bfad 100644
--- a/doc/source/user_guide/window.rst
+++ b/doc/source/user_guide/window.rst
@@ -567,9 +567,9 @@ One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass
    \alpha =
     \begin{cases}
-        \frac{2}{s + 1},               & \text{for span}\ s \geq 1\\
-        \frac{1}{1 + c},               & \text{for center of mass}\ c \geq 0\\
-        1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0
+        \frac{2}{s + 1},               & \text{for span}\ s \geq 1\\
+        \frac{1}{1 + c},               & \text{for center of mass}\ c \geq 0\\
+        1 - e^{\frac{\log 0.5}{h}},    & \text{for half-life}\ h > 0
     \end{cases}

 One must specify precisely one of **span**, **center of mass**, **half-life**
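The ``window.rst`` hunk above fixes the half-life case of the formula: :math:`\exp` is a function, not a base, so the correct form is :math:`1 - e^{\log(0.5)/h}`, i.e. :math:`1 - 0.5^{1/h}`. A quick numerical sanity check of that identity:

    import math

    # alpha = 1 - e**(ln(0.5) / halflife), so after one half-life the
    # residual weight (1 - alpha)**h equals exactly 0.5.
    def alpha_from_halflife(h: float) -> float:
        return 1 - math.exp(math.log(0.5) / h)

    for h in (1.0, 2.0, 10.0):
        assert math.isclose((1 - alpha_from_halflife(h)) ** h, 0.5)
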
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index da0fcfc2b3f64..b107a5d3ba100 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -118,7 +118,7 @@ Interval

 Indexing
 ^^^^^^^^
--
+- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
 -

 Missing
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index de69166b8c196..92c67865ae88f 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,7 +54,9 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas`, enabling additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
+- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
@@ -552,6 +554,7 @@ Other Removals
 - Removed the ``method`` keyword in ``ExtensionArray.fillna``, implement ``ExtensionArray._pad_or_backfill`` instead (:issue:`53621`)
 - Removed the attribute ``dtypes`` from :class:`.DataFrameGroupBy` (:issue:`51997`)
 - Enforced deprecation of ``argmin``, ``argmax``, ``idxmin``, and ``idxmax`` returning a result when ``skipna=False`` and an NA value is encountered or all values are NA values; these operations will now raise in such cases (:issue:`33941`, :issue:`51276`)
+- Removed specifying ``include_groups=True`` in :class:`.DataFrameGroupBy.apply` and :class:`.Resampler.apply` (:issue:`7155`)

 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.performance:
@@ -625,7 +628,9 @@ Datetimelike
 - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
 - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
 - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
+- Bug in :meth:`to_datetime` on a float32 DataFrame with year, month, day etc. columns leading to precision issues and incorrect results. (:issue:`60506`)
 - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
+- Bug in :meth:`to_datetime` wrongly converting when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

 Timedelta
@@ -665,7 +670,8 @@ Indexing
 ^^^^^^^^
 - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
 - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`)
--
+- Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`)
+- Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`)

 Missing
 ^^^^^^^
@@ -688,6 +694,7 @@ I/O
 - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
+- Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
 - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
@@ -697,11 +704,14 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
 - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
 - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
+- Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`)
+- Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`)

 Period
 ^^^^^^
@@ -726,11 +736,13 @@ Groupby/resample/rolling
 - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
 - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
+- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
 - Bug in :meth:`DataFrameGroupBy.cumsum` and :meth:`DataFrameGroupBy.cumprod` where ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly. (:issue:`58811`)
 - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`)
 - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
+- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
 - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
 - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
@@ -756,11 +768,12 @@ ExtensionArray
 - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
 - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
 - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
+- Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`)
 - Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)

 Styler
 ^^^^^^
--
+- Bug in :meth:`Styler.to_latex` where column headers were not styled correctly when combined with a hidden index or hidden index levels.

 Other
 ^^^^^
@@ -784,9 +797,13 @@ Other
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
 - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
+- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`)
 - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
 - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
+- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
+- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
+- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)

 .. ***DO NOT USE THIS SECTION***
diff --git a/environment.yml b/environment.yml
index 9bf6cf2a92347..69647a436e3ad 100644
--- a/environment.yml
+++ b/environment.yml
@@ -35,6 +35,7 @@ dependencies:
   - hypothesis>=6.84.0
   - gcsfs>=2022.11.0
   - ipython
+  - pickleshare  # Needed for IPython Sphinx directive in the docs GH#60429
   - jinja2>=3.1.2
   - lxml>=4.9.2
   - matplotlib>=3.6.3
@@ -87,7 +88,7 @@ dependencies:
   - google-auth
   - natsort  # DataFrame.sort_values doctest
   - numpydoc
-  - pydata-sphinx-theme=0.14
+  - pydata-sphinx-theme=0.16
   - pytest-cython  # doctest
   - sphinx
   - sphinx-design
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 6c97baa890777..c570fb8d70204 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -235,6 +235,7 @@
 # Pandas is not (yet) a py.typed library: the public API is determined
 # based on the documentation.
 __all__ = [
+    "NA",
     "ArrowDtype",
     "BooleanDtype",
     "Categorical",
@@ -253,15 +254,14 @@
     "HDFStore",
     "Index",
     "IndexSlice",
+    "Int8Dtype",
     "Int16Dtype",
     "Int32Dtype",
     "Int64Dtype",
-    "Int8Dtype",
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
     "MultiIndex",
-    "NA",
     "NaT",
     "NamedAgg",
     "Period",
@@ -274,10 +274,10 @@
     "Timedelta",
     "TimedeltaIndex",
     "Timestamp",
+    "UInt8Dtype",
     "UInt16Dtype",
     "UInt32Dtype",
     "UInt64Dtype",
-    "UInt8Dtype",
     "api",
     "array",
     "arrays",
@@ -290,8 +290,8 @@
     "errors",
     "eval",
     "factorize",
-    "get_dummies",
     "from_dummies",
+    "get_dummies",
     "get_option",
     "infer_freq",
     "interval_range",
diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
index 80d9ea1b364f3..463e8af7cc561 100644
--- a/pandas/_config/__init__.py
+++ b/pandas/_config/__init__.py
@@ -8,13 +8,13 @@
 __all__ = [
     "config",
+    "describe_option",
     "detect_console_encoding",
     "get_option",
-    "set_option",
-    "reset_option",
-    "describe_option",
     "option_context",
     "options",
+    "reset_option",
+    "set_option",
 ]
 from pandas._config import config
 from pandas._config import dates  # pyright: ignore[reportUnusedImport] # noqa: F401
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 1d57aa806e0f1..35139979f92fe 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -188,6 +188,11 @@ def set_option(*args) -> None:
     """
     Set the value of the specified option or options.

+    This method allows fine-grained control over the behavior and display settings
+    of pandas. Options affect various functionalities such as output formatting,
+    display limits, and operational behavior. Settings can be modified at runtime
+    without requiring changes to global configurations or environment variables.
+
     Parameters
     ----------
     *args : str | object
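To illustrate the runtime configuration described by the expanded ``set_option`` docstring above, a short usage sketch with standard pandas options:

    import pandas as pd

    pd.set_option("display.max_rows", 100)    # show up to 100 rows in reprs
    pd.set_option("display.precision", 3)     # 3 decimal places for floats
    print(pd.get_option("display.max_rows"))  # 100
    pd.reset_option("display.max_rows")       # restore the default
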
diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py
index 26a872a90e493..d499f9a6cd75e 100644
--- a/pandas/_libs/__init__.py
+++ b/pandas/_libs/__init__.py
@@ -1,4 +1,5 @@
 __all__ = [
+    "Interval",
     "NaT",
     "NaTType",
     "OutOfBoundsDatetime",
@@ -6,7 +7,6 @@
     "Timedelta",
     "Timestamp",
     "iNaT",
-    "Interval",
 ]
diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
index bf6d8ba8973d3..3af2856d2fbbf 100644
--- a/pandas/_libs/index.pyi
+++ b/pandas/_libs/index.pyi
@@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
 class MaskedUInt8Engine(MaskedIndexEngine): ...
 class MaskedBoolEngine(MaskedUInt8Engine): ...

+class StringObjectEngine(ObjectEngine):
+    def __init__(self, values: object, na_value) -> None: ...
+
 class BaseMultiIndexCodesEngine:
     levels: list[np.ndarray]
     offsets: np.ndarray  # np.ndarray[..., ndim=1]
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 1506a76aa94a6..688f943760d1f 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -557,6 +557,31 @@ cdef class StringEngine(IndexEngine):
             raise KeyError(val)
         return str(val)

+
+cdef class StringObjectEngine(ObjectEngine):
+
+    cdef:
+        object na_value
+        bint uses_na
+
+    def __init__(self, ndarray values, na_value):
+        super().__init__(values)
+        self.na_value = na_value
+        self.uses_na = na_value is C_NA
+
+    cdef bint _checknull(self, object val):
+        if self.uses_na:
+            return val is C_NA
+        else:
+            return util.is_nan(val)
+
+    cdef _check_type(self, object val):
+        if isinstance(val, str):
+            return val
+        elif self._checknull(val):
+            return self.na_value
+        else:
+            raise KeyError(val)
+

 cdef class DatetimeEngine(Int64Engine):
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
index cc65f34d6b6fe..9a022095feee9 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -660,11 +660,12 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base,
     perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000;

     set_datetimestruct_days(extract_unit(&dt, perday), out);
-    out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
-    out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
-    out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000);
-    out->us = (npy_int32)extract_unit(&dt, 1000LL);
-    out->ps = (npy_int32)(dt * 1000);
+    out->hour =
+        (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60 * 60);
+    out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60);
+    out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000);
+    out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000);
+    out->ps = (npy_int32)(dt);
    break;

  case NPY_FR_fs:
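The ``np_datetime.c`` hunk above fixes the picosecond (``NPY_FR_ps``) case: the old divisors were expressed in nanoseconds (e.g. 3.6e12 per hour) although the value being decomposed is in picoseconds (3.6e15 per hour), which is what the linked ``to_datetime`` bug with ``ps``-unit inputs traces back to. A minimal Python sketch mirroring the corrected C logic (names are illustrative, not pandas API):

    # picosecond-denominated divisors, matching the new C constants
    PS_PER_US = 1000 * 1000                 # 1e6
    PS_PER_SEC = PS_PER_US * 1000 * 1000    # 1e12
    PS_PER_MIN = PS_PER_SEC * 60
    PS_PER_HOUR = PS_PER_MIN * 60           # 3.6e15, not the old 3.6e12

    def extract_unit(dt, unit):
        # mimic the C helper: floor-divide out one field, keep the remainder
        return divmod(dt, unit)

    dt = 2 * PS_PER_HOUR + 3 * PS_PER_MIN + 4 * PS_PER_SEC + 5 * PS_PER_US + 6
    hour, dt = extract_unit(dt, PS_PER_HOUR)
    minute, dt = extract_unit(dt, PS_PER_MIN)
    sec, dt = extract_unit(dt, PS_PER_SEC)
    us, dt = extract_unit(dt, PS_PER_US)
    ps = dt                                  # remainder is already picoseconds
    assert (hour, minute, sec, us, ps) == (2, 3, 4, 5, 6)
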
diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py
index 31979b293a940..f433a3acf356f 100644
--- a/pandas/_libs/tslibs/__init__.py
+++ b/pandas/_libs/tslibs/__init__.py
@@ -1,39 +1,39 @@
 __all__ = [
-    "dtypes",
-    "localize_pydatetime",
+    "BaseOffset",
+    "IncompatibleFrequency",
     "NaT",
     "NaTType",
-    "iNaT",
-    "nat_strings",
     "OutOfBoundsDatetime",
     "OutOfBoundsTimedelta",
-    "IncompatibleFrequency",
     "Period",
     "Resolution",
+    "Tick",
     "Timedelta",
-    "normalize_i8_timestamps",
-    "is_date_array_normalized",
-    "dt64arr_to_periodarr",
+    "Timestamp",
+    "add_overflowsafe",
+    "astype_overflowsafe",
     "delta_to_nanoseconds",
+    "dt64arr_to_periodarr",
+    "dtypes",
+    "get_resolution",
+    "get_supported_dtype",
+    "get_unit_from_dtype",
+    "guess_datetime_format",
+    "iNaT",
     "ints_to_pydatetime",
     "ints_to_pytimedelta",
-    "get_resolution",
-    "Timestamp",
-    "tz_convert_from_utc_single",
-    "tz_convert_from_utc",
-    "to_offset",
-    "Tick",
-    "BaseOffset",
-    "tz_compare",
+    "is_date_array_normalized",
+    "is_supported_dtype",
     "is_unitless",
-    "astype_overflowsafe",
-    "get_unit_from_dtype",
+    "localize_pydatetime",
+    "nat_strings",
+    "normalize_i8_timestamps",
     "periods_per_day",
     "periods_per_second",
-    "guess_datetime_format",
-    "add_overflowsafe",
-    "get_supported_dtype",
-    "is_supported_dtype",
+    "to_offset",
+    "tz_compare",
+    "tz_convert_from_utc",
+    "tz_convert_from_utc_single",
 ]

 from pandas._libs.tslibs import dtypes
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx
index 193556b2697a9..1b7f04fe17238 100644
--- a/pandas/_libs/tslibs/np_datetime.pyx
+++ b/pandas/_libs/tslibs/np_datetime.pyx
@@ -201,6 +201,10 @@ class OutOfBoundsTimedelta(ValueError):

     Representation should be within a timedelta64[ns].

+    See Also
+    --------
+    date_range : Return a fixed frequency DatetimeIndex.
+
     Examples
     --------
     >>> pd.date_range(start="1/1/1700", freq="B", periods=100000)
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index e092d65f08dd4..ec9b5098c97c9 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -540,6 +540,25 @@ def shares_memory(left, right) -> bool:
     "ALL_INT_NUMPY_DTYPES",
     "ALL_NUMPY_DTYPES",
     "ALL_REAL_NUMPY_DTYPES",
+    "BOOL_DTYPES",
+    "BYTES_DTYPES",
+    "COMPLEX_DTYPES",
+    "DATETIME64_DTYPES",
+    "ENDIAN",
+    "FLOAT_EA_DTYPES",
+    "FLOAT_NUMPY_DTYPES",
+    "NARROW_NP_DTYPES",
+    "NP_NAT_OBJECTS",
+    "NULL_OBJECTS",
+    "OBJECT_DTYPES",
+    "SIGNED_INT_EA_DTYPES",
+    "SIGNED_INT_NUMPY_DTYPES",
+    "STRING_DTYPES",
+    "TIMEDELTA64_DTYPES",
+    "UNSIGNED_INT_EA_DTYPES",
+    "UNSIGNED_INT_NUMPY_DTYPES",
+    "SubclassedDataFrame",
+    "SubclassedSeries",
     "assert_almost_equal",
     "assert_attr_equal",
     "assert_categorical_equal",
@@ -563,51 +582,32 @@ def shares_memory(left, right) -> bool:
     "assert_sp_array_equal",
     "assert_timedelta_array_equal",
     "at",
-    "BOOL_DTYPES",
     "box_expected",
-    "BYTES_DTYPES",
     "can_set_locale",
-    "COMPLEX_DTYPES",
     "convert_rows_list_to_csv_str",
-    "DATETIME64_DTYPES",
     "decompress_file",
-    "ENDIAN",
     "ensure_clean",
     "external_error_raised",
-    "FLOAT_EA_DTYPES",
-    "FLOAT_NUMPY_DTYPES",
     "get_cython_table_params",
     "get_dtype",
-    "getitem",
-    "get_locales",
     "get_finest_unit",
+    "get_locales",
     "get_obj",
     "get_op_from_name",
+    "getitem",
     "iat",
     "iloc",
     "loc",
     "maybe_produces_warning",
-    "NARROW_NP_DTYPES",
-    "NP_NAT_OBJECTS",
-    "NULL_OBJECTS",
-    "OBJECT_DTYPES",
     "raise_assert_detail",
     "raises_chained_assignment_error",
     "round_trip_pathlib",
     "round_trip_pickle",
-    "setitem",
     "set_locale",
     "set_timezone",
+    "setitem",
     "shares_memory",
-    "SIGNED_INT_EA_DTYPES",
-    "SIGNED_INT_NUMPY_DTYPES",
-    "STRING_DTYPES",
-    "SubclassedDataFrame",
-    "SubclassedSeries",
-    "TIMEDELTA64_DTYPES",
     "to_array",
-    "UNSIGNED_INT_EA_DTYPES",
-    "UNSIGNED_INT_NUMPY_DTYPES",
     "with_csv_dialect",
     "write_to_compressed",
 ]
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
index 01c4dcd92ee40..daa5187cdb636 100644
--- a/pandas/_testing/asserters.py
+++ b/pandas/_testing/asserters.py
@@ -755,11 +755,8 @@ def assert_extension_array_equal(
         and atol is lib.no_default
     ):
         check_exact = (
-            is_numeric_dtype(left.dtype)
-            and not is_float_dtype(left.dtype)
-            or is_numeric_dtype(right.dtype)
-            and not is_float_dtype(right.dtype)
-        )
+            is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)
+        ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
     elif check_exact is lib.no_default:
         check_exact = False

@@ -944,11 +941,8 @@ def assert_series_equal(
         and atol is lib.no_default
     ):
         check_exact = (
-            is_numeric_dtype(left.dtype)
-            and not is_float_dtype(left.dtype)
-            or is_numeric_dtype(right.dtype)
-            and not is_float_dtype(right.dtype)
-        )
+            is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)
+        ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
     left_index_dtypes = (
         [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes
     )
diff --git a/pandas/_typing.py b/pandas/_typing.py
index c1769126a5776..b515305fb6903 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -273,7 +273,7 @@ def mode(self) -> str:
         # for _get_filepath_or_buffer
         ...

-    def seek(self, __offset: int, __whence: int = ...) -> int:
+    def seek(self, offset: int, whence: int = ..., /) -> int:
         # with one argument: gzip.GzipFile, bz2.BZ2File
         # with two arguments: zip.ZipFile, read_sas
         ...
@@ -288,13 +288,13 @@ def tell(self) -> int:


 class ReadBuffer(BaseBuffer, Protocol[AnyStr_co]):
-    def read(self, __n: int = ...) -> AnyStr_co:
+    def read(self, n: int = ..., /) -> AnyStr_co:
         # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File
         ...


 class WriteBuffer(BaseBuffer, Protocol[AnyStr_contra]):
-    def write(self, __b: AnyStr_contra) -> Any:
+    def write(self, b: AnyStr_contra, /) -> Any:
         # for gzip.GzipFile, bz2.BZ2File
         ...
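The ``_typing.py`` hunks above swap the legacy double-underscore naming convention for PEP 570 positional-only syntax; both mark the parameter as positional-only so implementations may use any parameter name. A small self-contained sketch of the two spellings:

    from typing import Protocol

    class OldStyle(Protocol):
        def read(self, __n: int = ...) -> bytes: ...   # dunder prefix marks positional-only

    class NewStyle(Protocol):
        def read(self, n: int = ..., /) -> bytes: ...  # PEP 570: params before "/" are positional-only

    # Both protocols match e.g. gzip.GzipFile.read(self, size=-1),
    # whose first parameter has a different name; "/" is simply clearer.
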
"HAS_PYARROW", + "IS64", + "ISMUSL", + "PY311", + "PY312", + "PYPY", + "WASM", "is_numpy_dev", "pa_version_under10p1", "pa_version_under11p0", @@ -159,11 +166,4 @@ def is_ci_environment() -> bool: "pa_version_under16p0", "pa_version_under17p0", "pa_version_under18p0", - "HAS_PYARROW", - "IS64", - "ISMUSL", - "PY311", - "PY312", - "PYPY", - "WASM", ] diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 2fab8f32b8e71..3306b36d71806 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -47,7 +47,7 @@ __all__ = [ - "np", "_np_version", "is_numpy_dev", + "np", ] diff --git a/pandas/core/_numba/kernels/__init__.py b/pandas/core/_numba/kernels/__init__.py index 1116c61c4ca8e..6983711480455 100644 --- a/pandas/core/_numba/kernels/__init__.py +++ b/pandas/core/_numba/kernels/__init__.py @@ -16,12 +16,12 @@ ) __all__ = [ - "sliding_mean", "grouped_mean", - "sliding_sum", + "grouped_min_max", "grouped_sum", - "sliding_var", "grouped_var", + "sliding_mean", "sliding_min_max", - "grouped_min_max", + "sliding_sum", + "sliding_var", ] diff --git a/pandas/core/api.py b/pandas/core/api.py index c8a4e9d8a23b2..ec12d543d8389 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -80,59 +80,59 @@ from pandas.core.frame import DataFrame # isort:skip __all__ = [ - "array", + "NA", "ArrowDtype", - "bdate_range", "BooleanDtype", "Categorical", "CategoricalDtype", "CategoricalIndex", "DataFrame", "DateOffset", - "date_range", "DatetimeIndex", "DatetimeTZDtype", - "factorize", "Flags", "Float32Dtype", "Float64Dtype", "Grouper", "Index", "IndexSlice", + "Int8Dtype", "Int16Dtype", "Int32Dtype", "Int64Dtype", - "Int8Dtype", "Interval", "IntervalDtype", "IntervalIndex", - "interval_range", - "isna", - "isnull", "MultiIndex", - "NA", - "NamedAgg", "NaT", - "notna", - "notnull", + "NamedAgg", "Period", "PeriodDtype", "PeriodIndex", - "period_range", "RangeIndex", "Series", - "set_eng_float_format", "StringDtype", "Timedelta", "TimedeltaIndex", - "timedelta_range", "Timestamp", - "to_datetime", - "to_numeric", - "to_timedelta", + "UInt8Dtype", "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", - "UInt8Dtype", + "array", + "bdate_range", + "date_range", + "factorize", + "interval_range", + "isna", + "isnull", + "notna", + "notnull", + "period_range", + "set_eng_float_format", + "timedelta_range", + "to_datetime", + "to_numeric", + "to_timedelta", "unique", ] diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index f946c5adcbb0b..a9ad66b7cb2e5 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -151,4 +151,6 @@ def re_replacer(s): if mask is None: values[:] = f(values) else: + if values.ndim != mask.ndim: + mask = np.broadcast_to(mask, values.shape) values[mask] = f(values[mask]) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 245a171fea74b..f183e9236471e 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -23,21 +23,21 @@ __all__ = [ "ArrowExtensionArray", - "ExtensionArray", - "ExtensionOpsMixin", - "ExtensionScalarOpsMixin", "ArrowStringArray", "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", "FloatingArray", "IntegerArray", "IntervalArray", "NumpyExtensionArray", "PeriodArray", - "period_array", "SparseArray", "StringArray", "TimedeltaArray", + "period_array", ] diff --git a/pandas/core/arrays/arrow/__init__.py 
b/pandas/core/arrays/arrow/__init__.py index 5fc50f786fc6a..50274a2de2cc1 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -4,4 +4,4 @@ ) from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] +__all__ = ["ArrowExtensionArray", "ListAccessor", "StructAccessor"] diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 230522846d377..b220a94d032b5 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -117,7 +117,10 @@ def len(self) -> Series: value_lengths = pc.list_value_length(self._pa_array) return Series( - value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index + value_lengths, + dtype=ArrowDtype(value_lengths.type), + index=self._data.index, + name=self._data.name, ) def __getitem__(self, key: int | slice) -> Series: @@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) return Series( - element, dtype=ArrowDtype(element.type), index=self._data.index + element, + dtype=ArrowDtype(element.type), + index=self._data.index, + name=self._data.name, ) elif isinstance(key, slice): if pa_version_under11p0: @@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index) + return Series( + sliced, + dtype=ArrowDtype(sliced.type), + index=self._data.index, + name=self._data.name, + ) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -223,7 +234,12 @@ def flatten(self) -> Series: counts = pa.compute.list_value_length(self._pa_array) flattened = pa.compute.list_flatten(self._pa_array) index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) - return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) + return Series( + flattened, + dtype=ArrowDtype(flattened.type), + index=index, + name=self._data.name, + ) class StructAccessor(ArrowAccessor): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fcc50c5b6b20f..afa219f611992 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1446,8 +1446,7 @@ def to_numpy( pa.types.is_floating(pa_type) and ( na_value is np.nan - or original_na_value is lib.no_default - and is_float_dtype(dtype) + or (original_na_value is lib.no_default and is_float_dtype(dtype)) ) ): result = data._pa_array.to_numpy() @@ -1644,7 +1643,11 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 9c821bf0d184e..c6b6367e347ba 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2073,7 +2073,29 @@ def _creso(self) -> int: @cache_readonly def unit(self) -> str: - # e.g. "ns", "us", "ms" + """ + The precision unit of the datetime data. 
+ + Returns the precision unit for the dtype. + It means the smallest time frame that can be stored within this dtype. + + Returns + ------- + str + Unit string representation (e.g. "ns"). + + See Also + -------- + TimelikeOps.as_unit : Converts to a specific unit. + + Examples + -------- + >>> idx = pd.DatetimeIndex(["2020-01-02 01:02:03.004005006"]) + >>> idx.unit + 'ns' + >>> idx.as_unit("s").unit + 's' + """ # error: Argument 1 to "dtype_to_unit" has incompatible type # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f85fbd062b0c3..afbadd754cdbc 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -105,6 +105,12 @@ class IntegerArray(NumericArray): ------- IntegerArray + See Also + -------- + array : Create an array using the appropriate dtype, including ``IntegerArray``. + Int32Dtype : An ExtensionDtype for int32 integer data. + UInt16Dtype : An ExtensionDtype for uint16 integer data. + Examples -------- Create an IntegerArray with :func:`pandas.array`. diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f47ef095a8409..0bf2089df5f85 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1055,7 +1055,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: from pandas import Index fill_value = Index(self._left, copy=False)._na_value - empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + empty = IntervalArray.from_breaks( + [fill_value] * (empty_len + 1), closed=self.closed + ) else: empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) @@ -1304,6 +1306,20 @@ def length(self) -> Index: """ Return an Index with entries denoting the length of each Interval. + The length of an interval is calculated as the difference between + its `right` and `left` bounds. This property is particularly useful + when working with intervals where the size of the interval is an important + attribute, such as in time-series analysis or spatial data analysis. + + See Also + -------- + arrays.IntervalArray.left : Return the left endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.right : Return the right endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. 
+ Examples -------- diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index adf83963aca39..93d5cb8cc335a 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -12,8 +12,8 @@ __all__ = [ "BlockIndex", "IntIndex", - "make_sparse_index", "SparseAccessor", "SparseArray", "SparseFrameAccessor", + "make_sparse_index", ] diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 4ccfbd71d9ce8..86f83489e71ae 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -371,10 +371,12 @@ def eval( is_extension_array_dtype(parsed_expr.terms.return_type) and not is_string_dtype(parsed_expr.terms.return_type) ) - or getattr(parsed_expr.terms, "operand_types", None) is not None - and any( - (is_extension_array_dtype(elem) and not is_string_dtype(elem)) - for elem in parsed_expr.terms.operand_types + or ( + getattr(parsed_expr.terms, "operand_types", None) is not None + and any( + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) + for elem in parsed_expr.terms.operand_types + ) ) ): warnings.warn( diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 7025d8a72e561..010fad1bbf0b6 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -512,8 +512,7 @@ def _maybe_evaluate_binop( ) if self.engine != "pytables" and ( - res.op in CMP_OPS_SYMS - and getattr(lhs, "is_datetime", False) + (res.op in CMP_OPS_SYMS and getattr(lhs, "is_datetime", False)) or getattr(rhs, "is_datetime", False) ): # all date ops must be done in python bc numexpr doesn't work diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index e2acd9a2c97c2..5a5fad0d83d7a 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None: ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b): +def _evaluate_standard(op, op_str, left_op, right_op): """ Standard evaluation. 
""" if _TEST_MODE: _store_test_result(False) - return op(a, b) + return op(left_op, right_op) -def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: - """return a boolean if we WILL be using numexpr""" +def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool: + """return left_op boolean if we WILL be using numexpr""" if op_str is not None: # required min elements (otherwise we are adding overhead) - if a.size > _MIN_ELEMENTS: + if left_op.size > _MIN_ELEMENTS: # check for dtype compatibility dtypes: set[str] = set() - for o in [a, b]: + for o in [left_op, right_op]: # ndarray and Series Case if hasattr(o, "dtype"): dtypes |= {o.dtype.name} @@ -93,22 +93,22 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: return False -def _evaluate_numexpr(op, op_str, a, b): +def _evaluate_numexpr(op, op_str, left_op, right_op): result = None - if _can_use_numexpr(op, op_str, a, b, "evaluate"): + if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"): is_reversed = op.__name__.strip("_").startswith("r") if is_reversed: # we were originally called by a reversed op method - a, b = b, a + left_op, right_op = right_op, left_op - a_value = a - b_value = b + left_value = left_op + right_value = right_op try: result = ne.evaluate( - f"a_value {op_str} b_value", - local_dict={"a_value": a_value, "b_value": b_value}, + f"left_value {op_str} right_value", + local_dict={"left_value": left_value, "right_value": right_value}, casting="safe", ) except TypeError: @@ -116,20 +116,20 @@ def _evaluate_numexpr(op, op_str, a, b): # (https://github.com/pydata/numexpr/issues/379) pass except NotImplementedError: - if _bool_arith_fallback(op_str, a, b): + if _bool_arith_fallback(op_str, left_op, right_op): pass else: raise if is_reversed: # reverse order to original for fallback - a, b = b, a + left_op, right_op = right_op, left_op if _TEST_MODE: _store_test_result(result is not None) if result is None: - result = _evaluate_standard(op, op_str, a, b) + result = _evaluate_standard(op, op_str, left_op, right_op) return result @@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b): } -def _where_standard(cond, a, b): +def _where_standard(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary - return np.where(cond, a, b) + return np.where(cond, left_op, right_op) -def _where_numexpr(cond, a, b): +def _where_numexpr(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary result = None - if _can_use_numexpr(None, "where", a, b, "where"): + if _can_use_numexpr(None, "where", left_op, right_op, "where"): result = ne.evaluate( "where(cond_value, a_value, b_value)", - local_dict={"cond_value": cond, "a_value": a, "b_value": b}, + local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op}, casting="safe", ) if result is None: - result = _where_standard(cond, a, b) + result = _where_standard(cond, left_op, right_op) return result @@ -206,13 +206,13 @@ def _has_bool_dtype(x): _BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"} -def _bool_arith_fallback(op_str, a, b) -> bool: +def _bool_arith_fallback(op_str, left_op, right_op) -> bool: """ Check if we should fallback to the python `_evaluate_standard` in case of an unsupported operation by numexpr, which is the case for some boolean ops. 
""" - if _has_bool_dtype(a) and _has_bool_dtype(b): + if _has_bool_dtype(left_op) and _has_bool_dtype(right_op): if op_str in _BOOL_OP_UNSUPPORTED: warnings.warn( f"evaluating in Python space because the {op_str!r} " @@ -224,15 +224,15 @@ def _bool_arith_fallback(op_str, a, b) -> bool: return False -def evaluate(op, a, b, use_numexpr: bool = True): +def evaluate(op, left_op, right_op, use_numexpr: bool = True): """ - Evaluate and return the expression of the op on a and b. + Evaluate and return the expression of the op on left_op and right_op. Parameters ---------- op : the actual operand - a : left operand - b : right operand + left_op : left operand + right_op : right operand use_numexpr : bool, default True Whether to try to use numexpr. """ @@ -240,24 +240,27 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: if use_numexpr: # error: "None" not callable - return _evaluate(op, op_str, a, b) # type: ignore[misc] - return _evaluate_standard(op, op_str, a, b) + return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc] + return _evaluate_standard(op, op_str, left_op, right_op) -def where(cond, a, b, use_numexpr: bool = True): +def where(cond, left_op, right_op, use_numexpr: bool = True): """ - Evaluate the where condition cond on a and b. + Evaluate the where condition cond on left_op and right_op. Parameters ---------- cond : np.ndarray[bool] - a : return if cond is True - b : return if cond is False + left_op : return if cond is True + right_op : return if cond is False use_numexpr : bool, default True Whether to try to use numexpr. """ assert _where is not None - return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) + if use_numexpr: + return _where(cond, left_op, right_op) + else: + return _where_standard(cond, left_op, right_op) def set_test_mode(v: bool = True) -> None: diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 39511048abf49..166c9d47294cd 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -205,7 +205,7 @@ def generate(self, v) -> str: val = v.tostring(self.encoding) return f"({self.lhs} {self.op} {val})" - def convert_value(self, v) -> TermValue: + def convert_value(self, conv_val) -> TermValue: """ convert the expression that is in the term to something that is accepted by pytables @@ -219,44 +219,44 @@ def stringify(value): kind = ensure_decoded(self.kind) meta = ensure_decoded(self.meta) if kind == "datetime" or (kind and kind.startswith("datetime64")): - if isinstance(v, (int, float)): - v = stringify(v) - v = ensure_decoded(v) - v = Timestamp(v).as_unit("ns") - if v.tz is not None: - v = v.tz_convert("UTC") - return TermValue(v, v._value, kind) + if isinstance(conv_val, (int, float)): + conv_val = stringify(conv_val) + conv_val = ensure_decoded(conv_val) + conv_val = Timestamp(conv_val).as_unit("ns") + if conv_val.tz is not None: + conv_val = conv_val.tz_convert("UTC") + return TermValue(conv_val, conv_val._value, kind) elif kind in ("timedelta64", "timedelta"): - if isinstance(v, str): - v = Timedelta(v) + if isinstance(conv_val, str): + conv_val = Timedelta(conv_val) else: - v = Timedelta(v, unit="s") - v = v.as_unit("ns")._value - return TermValue(int(v), v, kind) + conv_val = Timedelta(conv_val, unit="s") + conv_val = conv_val.as_unit("ns")._value + return TermValue(int(conv_val), conv_val, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) result: npt.NDArray[np.intp] | np.intp | int - 
if v not in metadata: + if conv_val not in metadata: result = -1 else: - result = metadata.searchsorted(v, side="left") + result = metadata.searchsorted(conv_val, side="left") return TermValue(result, result, "integer") elif kind == "integer": try: - v_dec = Decimal(v) + v_dec = Decimal(conv_val) except InvalidOperation: # GH 54186 # convert v to float to raise float's ValueError - float(v) + float(conv_val) else: - v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) - return TermValue(v, v, kind) + conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + return TermValue(conv_val, conv_val, kind) elif kind == "float": - v = float(v) - return TermValue(v, v, kind) + conv_val = float(conv_val) + return TermValue(conv_val, conv_val, kind) elif kind == "bool": - if isinstance(v, str): - v = v.strip().lower() not in [ + if isinstance(conv_val, str): + conv_val = conv_val.strip().lower() not in [ "false", "f", "no", @@ -268,13 +268,15 @@ def stringify(value): "", ] else: - v = bool(v) - return TermValue(v, v, kind) - elif isinstance(v, str): + conv_val = bool(conv_val) + return TermValue(conv_val, conv_val, kind) + elif isinstance(conv_val, str): # string quoting - return TermValue(v, stringify(v), "string") + return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column") + raise TypeError( + f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column" + ) def convert_values(self) -> None: pass @@ -408,11 +410,12 @@ def prune(self, klass): operand = operand.prune(klass) if operand is not None and ( - issubclass(klass, ConditionBinOp) - and operand.condition is not None - or not issubclass(klass, ConditionBinOp) - and issubclass(klass, FilterBinOp) - and operand.filter is not None + (issubclass(klass, ConditionBinOp) and operand.condition is not None) + or ( + not issubclass(klass, ConditionBinOp) + and issubclass(klass, FilterBinOp) + and operand.filter is not None + ) ): return operand.invert() return None diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 7b31e03e58b4b..336d62b9d9579 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -140,7 +140,7 @@ class Scope: temps : dict """ - __slots__ = ["level", "scope", "target", "resolvers", "temps"] + __slots__ = ["level", "resolvers", "scope", "target", "temps"] level: int scope: DeepChainMap resolvers: DeepChainMap diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 830b84852c704..02b9291da9b31 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -87,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -1162,6 +1162,7 @@ def convert_dtypes( def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], + convert_to_nullable_dtype: bool = False, ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: """ we might have a array (or single object) that is datetime like, @@ -1199,6 +1200,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. 
convert_numeric=False, convert_non_numeric=True, + convert_to_nullable_dtype=convert_to_nullable_dtype, dtype_if_all_nat=np.dtype("M8[s]"), ) @@ -1579,7 +1581,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1597,11 +1599,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype="object") - result[:] = values - return result + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter + return np.fromiter(values, dtype="object", count=len(values)) def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 98c770ec4a8b0..b0c8ec1ffc083 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -430,7 +430,7 @@ def is_period_dtype(arr_or_dtype) -> bool: Check whether an array-like or dtype is of the Period dtype. .. deprecated:: 2.2.0 - Use isinstance(dtype, pd.Period) instead. + Use isinstance(dtype, pd.PeriodDtype) instead. Parameters ---------- @@ -1785,16 +1785,22 @@ def pandas_dtype(dtype) -> DtypeObj: Parameters ---------- - dtype : object to be converted + dtype : object + The object to be converted into a dtype. Returns ------- np.dtype or a pandas dtype + The converted dtype, which can be either a numpy dtype or a pandas dtype. Raises ------ TypeError if not a dtype + See Also + -------- + api.types.is_dtype : Return true if the condition is satisfied for the arr_or_dtype. 
+ Examples -------- >>> pd.api.types.pandas_dtype(int) @@ -1883,13 +1889,14 @@ def is_all_strings(value: ArrayLike) -> bool: __all__ = [ - "classes", "DT64NS_DTYPE", + "INT64_DTYPE", + "TD64NS_DTYPE", + "classes", "ensure_float64", "ensure_python_int", "ensure_str", "infer_dtype_from_object", - "INT64_DTYPE", "is_1d_only_ea_dtype", "is_all_strings", "is_any_real_numeric_dtype", @@ -1934,6 +1941,5 @@ def is_all_strings(value: ArrayLike) -> bool: "is_unsigned_integer_dtype", "needs_i8_conversion", "pandas_dtype", - "TD64NS_DTYPE", "validate_all_hashable", ] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 96b0aa16940a6..1dd1b12d6ae95 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -73,7 +73,7 @@ from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: TCH004 + import pyarrow as pa # noqa: TC004 from pandas._typing import ( Dtype, @@ -1115,10 +1115,8 @@ def construct_from_string(cls, string: str_type) -> PeriodDtype: possible """ if ( - isinstance(string, str) - and (string.startswith(("period[", "Period["))) - or isinstance(string, BaseOffset) - ): + isinstance(string, str) and (string.startswith(("period[", "Period["))) + ) or isinstance(string, BaseOffset): # do not parse string like U as period[U] # avoid tuple to be regarded as freq try: @@ -2344,6 +2342,8 @@ def construct_from_string(cls, string: str) -> ArrowDtype: if string == "string[pyarrow]": # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" try: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 6adb34ff0f777..918d107f2ce6c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -190,12 +190,17 @@ def is_re_compilable(obj: object) -> bool: Parameters ---------- obj : The object to check + The object to check if the object can be compiled into a regex pattern instance. Returns ------- bool Whether `obj` can be compiled as a regex pattern. + See Also + -------- + api.types.is_re : Check if the object is a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re_compilable diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b35e2c8497fb7..34b448a0d8d1c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1018,7 +1018,7 @@ def shape(self) -> tuple[int, int]: See Also -------- - ndarray.shape : Tuple of array dimensions. + numpy.ndarray.shape : Tuple of array dimensions. Examples -------- @@ -2115,8 +2115,8 @@ def from_records( """ Convert structured or record ndarray to DataFrame. - Creates a DataFrame object from a structured ndarray, sequence of - tuples or dicts, or DataFrame. + Creates a DataFrame object from a structured ndarray, or sequence of + tuples or dicts. 
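A minimal sketch of the two input kinds the revised `from_records` summary above still covers (a structured ndarray and a sequence of tuples or dicts); the column names are illustrative:

```python
import numpy as np
import pandas as pd

# Structured ndarray: field names become the columns.
data = np.array([(1, "a"), (2, "b")], dtype=[("col_1", "i4"), ("col_2", "U1")])
print(pd.DataFrame.from_records(data))

# Sequence of dicts: keys become the columns.
print(pd.DataFrame.from_records([{"col_1": 3, "col_2": "c"}, {"col_1": 4, "col_2": "d"}]))
```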
Parameters ---------- @@ -3929,8 +3929,7 @@ def __getitem__(self, key): # GH#45316 Return view if key is not duplicated # Only use drop_duplicates with duplicates for performance if not is_mi and ( - self.columns.is_unique - and key in self.columns + (self.columns.is_unique and key in self.columns) or key in self.columns.drop_duplicates(keep=False) ): return self._get_item(key) @@ -4742,7 +4741,8 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 3 4 4 7 8 0 4 5 2 6 7 3 - For columns with spaces in their name, you can use backtick quoting. + For columns with spaces or other disallowed characters in their name, you can + use backtick quoting. >>> df.eval("B * `C&C`") 0 100 @@ -6775,8 +6775,7 @@ def f(vals) -> tuple[np.ndarray, int]: elif ( not np.iterable(subset) or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self.columns + or (isinstance(subset, tuple) and subset in self.columns) ): subset = (subset,) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 56031f20faa16..de7fb3682fb4f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -640,7 +640,7 @@ def ndim(self) -> int: See Also -------- - ndarray.ndim : Number of array dimensions. + numpy.ndarray.ndim : Number of array dimensions. Examples -------- @@ -665,7 +665,7 @@ def size(self) -> int: See Also -------- - ndarray.size : Number of elements in the array. + numpy.ndarray.size : Number of elements in the array. Examples -------- @@ -838,7 +838,7 @@ def pop(self, item: Hashable) -> Series | Any: return result @final - def squeeze(self, axis: Axis | None = None): + def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: """ Squeeze 1 dimensional axis objects into scalars. @@ -2211,8 +2211,9 @@ def to_excel( via the options ``io.excel.xlsx.writer`` or ``io.excel.xlsm.writer``. - merge_cells : bool, default True - Write MultiIndex and Hierarchical Rows as merged cells. + merge_cells : bool or 'columns', default False + If True, write MultiIndex index and columns as merged cells. + If 'columns', merge MultiIndex column cells only. 
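A small sketch of the three `merge_cells` modes documented above; it assumes an Excel engine such as openpyxl is installed, and writes to in-memory buffers so nothing touches disk:

```python
from io import BytesIO

import pandas as pd

df = pd.DataFrame(
    [[1, 2], [3, 4]],
    columns=pd.MultiIndex.from_product([["A"], ["x", "y"]]),
)

# True: write MultiIndex index and column labels as merged cells.
df.to_excel(BytesIO(), merge_cells=True)

# "columns": merge only the MultiIndex column header cells.
df.to_excel(BytesIO(), merge_cells="columns")

# False: repeat the labels cell-by-cell instead of merging.
df.to_excel(BytesIO(), merge_cells=False)
```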
{encoding_parameter} inf_rep : str, default 'inf' Representation for infinity (there is no native representation for @@ -3877,6 +3878,14 @@ def to_csv( >>> import os # doctest: +SKIP >>> os.makedirs("folder/subfolder", exist_ok=True) # doctest: +SKIP >>> df.to_csv("folder/subfolder/out.csv") # doctest: +SKIP + + Format floats to two decimal places: + + >>> df.to_csv("out1.csv", float_format="%.2f") # doctest: +SKIP + + Format floats using scientific notation: + + >>> df.to_csv("out2.csv", float_format="{{:.2e}}".format) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 8248f378e2c1a..ec477626a098f 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -8,8 +8,8 @@ __all__ = [ "DataFrameGroupBy", - "NamedAgg", - "SeriesGroupBy", "GroupBy", "Grouper", + "NamedAgg", + "SeriesGroupBy", ] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5ba382bf66bb7..f4e3f3e8b1001 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -583,6 +583,8 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index @@ -1321,8 +1323,8 @@ def idxmin(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the minimum value. + Series + Indexes of minima in each group. Raises ------ @@ -1374,8 +1376,8 @@ def idxmax(self, skipna: bool = True) -> Series: Returns ------- - Index - Label of the maximum value. + Series + Indexes of maxima in each group. Raises ------ @@ -1443,6 +1445,11 @@ def is_monotonic_increasing(self) -> Series: ------- Series + See Also + -------- + SeriesGroupBy.is_monotonic_decreasing : Return whether each group's values + are monotonically decreasing. + Examples -------- >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"]) @@ -1462,6 +1469,11 @@ def is_monotonic_decreasing(self) -> Series: ------- Series + See Also + -------- + SeriesGroupBy.is_monotonic_increasing : Return whether each group's values + are monotonically increasing. + Examples -------- >>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"]) @@ -1957,6 +1969,8 @@ def _wrap_applied_output( if is_transform: # GH#47787 see test_group_on_empty_multiindex res_index = data.index + elif not self.group_keys: + res_index = None else: res_index = self._grouper.result_index @@ -2443,6 +2457,10 @@ def nunique(self, dropna: bool = True) -> DataFrame: nunique: DataFrame Counts of unique elements in each position. + See Also + -------- + DataFrame.nunique : Count number of distinct elements in specified axis. + Examples -------- >>> df = pd.DataFrame( @@ -2498,8 +2516,8 @@ def idxmax( Returns ------- - Series - Indexes of maxima in each group. + DataFrame + Indexes of maxima in each column according to the group. Raises ------ @@ -2509,6 +2527,7 @@ def idxmax( See Also -------- Series.idxmax : Return index of the maximum element. + DataFrame.idxmax : Indexes of maxima along the specified axis. Notes ----- @@ -2522,6 +2541,7 @@ def idxmax( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... 
) @@ -2532,12 +2552,14 @@ Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the maximum value in each column. + By default, it returns the index for the maximum value in each column + according to the group. - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object + >>> df.groupby("food_type").idxmax() + consumption co2_emissions + food_type + meat Beef Beef + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmax", numeric_only=numeric_only, skipna=skipna) @@ -2560,8 +2582,8 @@ def idxmin( Returns ------- - Series - Indexes of minima in each group. + DataFrame + Indexes of minima in each column according to the group. Raises ------ @@ -2571,6 +2593,7 @@ def idxmin( See Also -------- Series.idxmin : Return index of the minimum element. + DataFrame.idxmin : Indexes of minima along the specified axis. Notes ----- @@ -2584,6 +2607,7 @@ def idxmin( ... { ... "consumption": [10.51, 103.11, 55.48], ... "co2_emissions": [37.2, 19.66, 1712], + ... "food_type": ["meat", "plant", "meat"], ... }, ... index=["Pork", "Wheat Products", "Beef"], ... ) @@ -2594,12 +2618,14 @@ Wheat Products 103.11 19.66 Beef 55.48 1712.00 - By default, it returns the index for the minimum value in each column. + By default, it returns the index for the minimum value in each column + according to the group. - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object + >>> df.groupby("food_type").idxmin() + consumption co2_emissions + food_type + meat Pork Pork + plant Wheat Products Wheat Products """ return self._idxmax_idxmin("idxmin", numeric_only=numeric_only, skipna=skipna) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9c30132347111..f4ba40e275a8d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -515,6 +515,15 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Dict {group name -> group indices}. + See Also + -------- + core.groupby.DataFrameGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.groupby.SeriesGroupBy.indices : Provides a mapping of group rows to + positions of the elements. + core.resample.Resampler.indices : Provides a mapping of group rows to + positions of the elements. + Examples -------- @@ -706,7 +715,19 @@ def get_group(self, name) -> DataFrame | Series: Returns ------- - DataFrame or Series + Series or DataFrame + Get the respective Series or DataFrame corresponding to the group provided. + + See Also + -------- + DataFrameGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + DataFrameGroupBy.indices: Provides a mapping of group rows to positions + of the elements. + SeriesGroupBy.groups: Dictionary representation of the groupings formed + during a groupby operation. + SeriesGroupBy.indices: Provides a mapping of group rows to positions + of the elements. Examples -------- @@ -1372,7 +1393,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): # ----------------------------------------------------------------- # apply/agg/transform - def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = False, **kwargs) -> NDFrameT: """ Apply function ``func`` group-wise and combine the results together.
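A sketch of `apply` after the default flip above: the grouping columns are always excluded from what ``func`` receives, and the old opt-in now fails with the `ValueError` introduced in this patch:

```python
import pandas as pd

df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
g = df.groupby("A")

# func now sees only the non-grouping columns (here just "B").
print(g.apply(lambda x: x["B"].sum()))

# Passing include_groups=True is no longer a deprecation warning but an error.
try:
    g.apply(lambda x: x.sum(), include_groups=True)
except ValueError as err:
    print(err)  # include_groups=True is no longer allowed.
```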
@@ -1398,7 +1419,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: *args : tuple Optional positional arguments to pass to ``func``. - include_groups : bool, default True + include_groups : bool, default False When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. If this raises a TypeError, the result will be computed with the groupings excluded. @@ -1406,10 +1427,9 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default changed from True to False, and True is no longer allowed. **kwargs : dict Optional keyword arguments to pass to ``func``. @@ -1499,7 +1519,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) + >>> g1.apply(lambda x: x.C.max() - x.B.min()) A a 5 b 2 @@ -1508,11 +1528,13 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: Example 4: The function passed to ``apply`` returns ``None`` for one of the group. This group is filtered from the result: - >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x, include_groups=False) + >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x) B C 0 1 4 1 2 6 """ + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") if isinstance(func, str): if hasattr(self, func): res = getattr(self, func) @@ -1539,33 +1561,7 @@ def f(g): else: f = func - if not include_groups: - return self._python_apply_general(f, self._obj_with_exclusions) - - try: - result = self._python_apply_general(f, self._selected_obj) - if ( - not isinstance(self.obj, Series) - and self._selection is None - and self._selected_obj.shape != self._obj_with_exclusions.shape - ): - warnings.warn( - message=_apply_groupings_depr.format(type(self).__name__, "apply"), - category=DeprecationWarning, - stacklevel=find_stack_level(), - ) - except TypeError: - # gh-20949 - # try again, with .apply acting as a filtering - # operation, by excluding the grouping column - # This would normally not be triggered - # except if the udf is trying an operation that - # fails on *some* columns, e.g. a numeric operation - # on a string grouper column - - return self._python_apply_general(f, self._obj_with_exclusions) - - return result + return self._python_apply_general(f, self._obj_with_exclusions) @final def _python_apply_general( @@ -2649,6 +2645,11 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: Series or DataFrame Standard error of the mean of values within each group. + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. + Examples -------- For SeriesGroupBy: @@ -3398,7 +3399,9 @@ def describe( return result @final - def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: + def resample( + self, rule, *args, include_groups: bool = False, **kwargs + ) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3423,10 +3426,9 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp .. versionadded:: 2.2.0 - .. 
deprecated:: 2.2.0 + .. versionchanged:: 3.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default was changed to False, and True is no longer allowed. **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and @@ -3459,7 +3461,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> df.groupby("a").resample("3min", include_groups=False).sum() + >>> df.groupby("a").resample("3min").sum() b a 0 2000-01-01 00:00:00 2 @@ -3468,7 +3470,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Upsample the series into 30 second bins. - >>> df.groupby("a").resample("30s", include_groups=False).sum() + >>> df.groupby("a").resample("30s").sum() b a 0 2000-01-01 00:00:00 1 @@ -3482,7 +3484,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Resample by month. Values are assigned to the month of the period. - >>> df.groupby("a").resample("ME", include_groups=False).sum() + >>> df.groupby("a").resample("ME").sum() b a 0 2000-01-31 3 @@ -3491,11 +3493,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right").sum()) b a 0 1999-12-31 23:57:00 1 @@ -3506,11 +3504,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp the bin interval, but label each bin using the right edge instead of the left. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", label="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right", label="right").sum()) b a 0 2000-01-01 00:00:00 1 @@ -3519,11 +3513,10 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp """ from pandas.core.resample import get_resampler_for_grouping - # mypy flags that include_groups could be specified via `*args` or `**kwargs` - # GH#54961 would resolve. - return get_resampler_for_grouping( # type: ignore[misc] - self, rule, *args, include_groups=include_groups, **kwargs - ) + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") + + return get_resampler_for_grouping(self, rule, *args, **kwargs) @final def rolling( @@ -3983,19 +3976,6 @@ def nth(self) -> GroupByNthSelector: 'all' or 'any'; this is equivalent to calling dropna(how=dropna) before the groupby. - Parameters - ---------- - n : int, slice or list of ints and slices - A single nth value for the row or a list of nth values or slices. - - .. versionchanged:: 1.4.0 - Added slice and lists containing slices. - Added index notation. - - dropna : {'any', 'all', None}, default None - Apply the specified dropna operation before counting which row is - the nth row. Only supported if n is an int. - Returns ------- Series or DataFrame @@ -5548,13 +5528,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi - - -# GH#7155 -_apply_groupings_depr = ( - "{}.{} operated on the grouping columns. 
This behavior is deprecated, " - "and in a future version of pandas the grouping columns will be excluded " - "from the operation. Either pass `include_groups=False` to exclude the " - "groupings or explicitly select the grouping columns after groupby to silence " - "this warning." -) diff --git a/pandas/core/indexers/__init__.py b/pandas/core/indexers/__init__.py index ba8a4f1d0ee7a..036b32b3feac2 100644 --- a/pandas/core/indexers/__init__.py +++ b/pandas/core/indexers/__init__.py @@ -15,17 +15,17 @@ ) __all__ = [ - "is_valid_positional_slice", + "check_array_indexer", + "check_key_length", + "check_setitem_lengths", + "disallow_ndim_indexing", + "is_empty_indexer", "is_list_like_indexer", "is_scalar_indexer", - "is_empty_indexer", - "check_setitem_lengths", - "validate_indices", - "maybe_convert_indices", + "is_valid_positional_slice", "length_of_indexer", - "disallow_ndim_indexing", + "maybe_convert_indices", "unpack_1tuple", - "check_key_length", - "check_array_indexer", "unpack_tuple_and_ellipses", + "validate_indices", ] diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index e2dc71f68a65b..c404323a1168c 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -373,6 +373,28 @@ def to_pydatetime(self) -> Series: @property def freq(self): + """ + Tries to return a string representing a frequency generated by infer_freq. + + Returns None if it can't autodetect the frequency. + + See Also + -------- + Series.dt.to_period : Cast to PeriodArray/PeriodIndex at a particular + frequency. + + Examples + -------- + >>> ser = pd.Series(["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"]) + >>> ser = pd.to_datetime(ser) + >>> ser.dt.freq + 'D' + + >>> ser = pd.Series(["2022-01-01", "2024-01-01", "2026-01-01", "2028-01-01"]) + >>> ser = pd.to_datetime(ser) + >>> ser.dt.freq + '2YS-JAN' + """ return self._get_values().inferred_freq def isocalendar(self) -> DataFrame: diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 5144e647e73b4..058e584336905 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -37,26 +37,26 @@ __all__ = [ - "Index", - "MultiIndex", "CategoricalIndex", + "DatetimeIndex", + "Index", "IntervalIndex", - "RangeIndex", "InvalidIndexError", - "TimedeltaIndex", + "MultiIndex", + "NaT", "PeriodIndex", - "DatetimeIndex", + "RangeIndex", + "TimedeltaIndex", "_new_Index", - "NaT", + "all_indexes_same", + "default_index", "ensure_index", "ensure_index_from_sequences", "get_objs_combined_axis", - "union_indexes", "get_unanimous_names", - "all_indexes_same", - "default_index", - "safe_sort_index", "maybe_sequence_to_range", + "safe_sort_index", + "union_indexes", ] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4a90b164c89cc..165fe109c4c94 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -876,7 +876,7 @@ def _engine( # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringEngine(target_values) + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], 
npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): @@ -6556,7 +6555,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index c559c529586b5..254bd71ade209 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -110,7 +110,9 @@ def _disabled(self, *args, **kwargs) -> NoReturn: raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") def __str__(self) -> str: - return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) + return pprint_thing( + self, quote_strings=True, escape_chars=("\t", "\r", "\n", "'") + ) def __repr__(self) -> str: return f"{type(self).__name__}({self!s})" diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 36e68465a99d9..dc48cd1ed958e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -4084,11 +4084,10 @@ def insert(self, loc: int, item) -> MultiIndex: # have to insert into level # must insert at end otherwise you have to recompute all the # other codes - if isna(k): # GH 59003 + lev_loc = len(level) + level = level.insert(lev_loc, k) + if isna(level[lev_loc]): # GH 59003, 60388 lev_loc = -1 - else: - lev_loc = len(level) - level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 7eeaab3b0443f..935762d0455c5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1195,7 +1195,7 @@ def _getitem_slice(self, slobj: slice) -> Self: @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): if is_integer(other) and other != 0: - if len(self) == 0 or self.start % other == 0 and self.step % other == 0: + if len(self) == 0 or (self.start % other == 0 and self.step % other == 0): start = self.start // other step = self.step // other stop = start + len(self) * step diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0d6d7e68f58a4..e0bc0a23acd9f 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1239,8 +1239,10 @@ def _validate_key(self, key, axis: Axis) -> None: if isinstance(key, bool) and not ( is_bool_dtype(ax.dtype) or ax.dtype.name == "boolean" - or isinstance(ax, MultiIndex) - and is_bool_dtype(ax.get_level_values(0).dtype) + or ( + isinstance(ax, MultiIndex) + and is_bool_dtype(ax.get_level_values(0).dtype) + ) ): raise KeyError( f"{key}: boolean label can not be used without a boolean index" @@ -2120,7 +2122,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)) - is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0 + is_null_setter = com.is_empty_slice(pi) or (is_array_like(pi) and len(pi) == 0) if is_null_setter: # no-op, don't cast dtype later @@ 
-2744,19 +2746,15 @@ def check_dict_or_set_indexers(key) -> None: """ Check if the indexer is or contains a dict or set, which is no longer allowed. """ - if ( - isinstance(key, set) - or isinstance(key, tuple) - and any(isinstance(x, set) for x in key) + if isinstance(key, set) or ( + isinstance(key, tuple) and any(isinstance(x, set) for x in key) ): raise TypeError( "Passing a set as an indexer is not supported. Use a list instead." ) - if ( - isinstance(key, dict) - or isinstance(key, tuple) - and any(isinstance(x, dict) for x in key) + if isinstance(key, dict) or ( + isinstance(key, tuple) and any(isinstance(x, dict) for x in key) ): raise TypeError( "Passing a dict as an indexer is not supported. Use a list instead." diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 0e5776ae8cdd9..5c9b8ac8ea085 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -9,6 +9,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -147,8 +149,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -347,8 +347,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Add to our list of strings str_list[i] = string - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers + if using_string_dtype(): + res = pd.Series(str_list, dtype="str") + else: + res = np.asarray(str_list, dtype="object") # type: ignore[assignment] + + return res, buffers # type: ignore[return-value] def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 5ab70ba38f9c2..202bebde88c2c 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -7,11 +7,11 @@ __all__ = [ "Block", - "ExtensionBlock", - "make_block", "BlockManager", + "ExtensionBlock", "SingleBlockManager", "concatenate_managers", + "make_block", ] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3264676771d5d..f44ad926dda5c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -108,6 +108,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions @@ -513,9 +514,8 @@ def convert(self) -> list[Block]: convert_non_numeric=True, ) refs = None - if ( - res_values is values - or isinstance(res_values, NumpyExtensionArray) + if res_values is values or ( + isinstance(res_values, NumpyExtensionArray) and res_values._ndarray is values ): refs = self.refs @@ -1336,7 +1336,7 @@ def fillna( return [self.copy(deep=False)] if limit is not None: - mask[mask.cumsum(self.ndim - 1) > limit] = False + mask[mask.cumsum(self.values.ndim - 1) > limit] = False if inplace: nbs = self.putmask(mask.T, value) @@ -1684,9 +1684,16 @@ def where(self, other, cond) -> list[Block]: res_values = arr._where(cond, other).T except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: - if 
isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False) + if ( + self.ndim == 2 + and isinstance(orig_cond, np.ndarray) + and orig_cond.ndim == 1 + and not is_1d_only_ea_dtype(blk.dtype) + ): + orig_cond = orig_cond[:, None] return blk.where(orig_other, orig_cond) elif isinstance(self, NDArrayBackedExtensionBlock): @@ -1854,9 +1861,9 @@ def fillna( limit: int | None = None, inplace: bool = False, ) -> list[Block]: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # Block.fillna handles coercion (test_fillna_interval) - if limit is not None: + if isinstance(self.dtype, IntervalDtype) and limit is not None: raise ValueError("limit must be None") return super().fillna( value=value, diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 0812ba5e6def4..dfff34656f82b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -417,8 +417,7 @@ def dict_to_mgr( else x.copy(deep=True) if ( isinstance(x, Index) - or isinstance(x, ABCSeries) - and is_1d_only_ea_dtype(x.dtype) + or (isinstance(x, ABCSeries) and is_1d_only_ea_dtype(x.dtype)) ) else x for x in arrays @@ -966,8 +965,9 @@ def convert(arr): if dtype is None: if arr.dtype == np.dtype("O"): # i.e. maybe_convert_objects didn't convert - arr = maybe_infer_to_datetimelike(arr) - if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + convert_to_nullable_dtype = dtype_backend != "numpy" + arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype) + if convert_to_nullable_dtype and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 34a0bb1f45e2c..9f9d69a182f72 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -66,15 +66,18 @@ __all__ = [ "ARITHMETIC_BINOPS", "arithmetic_op", - "comparison_op", "comp_method_OBJECT_ARRAY", - "invalid_comparison", + "comparison_op", "fill_binop", + "get_array_op", + "get_op_result_name", + "invalid_comparison", "kleene_and", "kleene_or", "kleene_xor", "logical_op", "make_flex_doc", + "maybe_prepare_scalar_for_op", "radd", "rand_", "rdiv", @@ -88,7 +91,4 @@ "rtruediv", "rxor", "unpack_zerodim_and_defer", - "get_op_result_name", - "maybe_prepare_scalar_for_op", - "get_array_op", ] diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ca4d3fc768efb..27e498683bf8f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -31,10 +31,7 @@ Substitution, doc, ) -from pandas.util._exceptions import ( - find_stack_level, - rewrite_warning, -) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -59,7 +56,6 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, - _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -167,14 +163,15 @@ def __init__( gpr_index: Index, group_keys: bool = False, selection=None, - include_groups: bool = True, + include_groups: bool = False, ) -> None: + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") self._timegrouper = timegrouper self.keys = None self.sort = True self.group_keys = group_keys self.as_index = True - self.include_groups = include_groups self.obj, self.ax, 
self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -465,9 +462,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -479,21 +474,23 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) return self._wrap_result(result) @final def _get_resampler_for_grouping( - self, groupby: GroupBy, key, include_groups: bool = True + self, + groupby: GroupBy, + key, ): """ Return the correct class for resampling with groupby. """ return self._resampler_for_grouping( - groupby=groupby, key=key, parent=self, include_groups=include_groups + groupby=groupby, + key=key, + parent=self, ) def _wrap_result(self, result): @@ -694,7 +691,7 @@ def bfill(self, limit: int | None = None): References ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + .. [1] https://en.wikipedia.org/wiki/Imputation_%28statistics%29 Examples -------- @@ -935,7 +932,7 @@ def interpolate( "supported. If you tried to resample and interpolate on a " "grouped data frame, please use:\n" "`df.groupby(...).apply(lambda x: x.resample(...)." - "interpolate(...), include_groups=False)`" + "interpolate(...))`" "\ninstead, as resampling and interpolation has to be " "performed for each group independently." 
) @@ -1541,7 +1538,6 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, - include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1564,7 +1560,6 @@ def __init__( self.ax = parent.ax self.obj = parent.obj - self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1581,7 +1576,7 @@ def func(x): return x.apply(f, *args, **kwargs) - result = _apply(self._groupby, func, include_groups=self.include_groups) + result = self._groupby.apply(func) return self._wrap_result(result) _upsample = _apply @@ -1937,7 +1932,6 @@ def get_resampler_for_grouping( fill_method=None, limit: int | None = None, on=None, - include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -1946,9 +1940,7 @@ def get_resampler_for_grouping( # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj) - return resampler._get_resampler_for_grouping( - groupby=groupby, include_groups=include_groups, key=tg.key - ) + return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) class TimeGrouper(Grouper): @@ -2002,9 +1994,7 @@ def __init__( raise ValueError(f"Unsupported value {convention} for `convention`") if ( - key is None - and obj is not None - and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + (key is None and obj is not None and isinstance(obj.index, PeriodIndex)) # type: ignore[attr-defined] or ( key is not None and obj is not None @@ -2729,18 +2719,3 @@ def _asfreq_compat(index: FreqIndexT, freq) -> FreqIndexT: else: # pragma: no cover raise TypeError(type(index)) return new_index - - -def _apply( - grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs -) -> DataFrame: - # GH#7155 - rewrite warning to appear as if it came from `.resample` - target_message = "DataFrameGroupBy.apply operated on the grouping columns" - new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") - with rewrite_warning( - target_message=target_message, - target_category=DeprecationWarning, - new_message=new_message, - ): - result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) - return result diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index cfe83111b6e38..e7cb7069bbc26 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -17,6 +17,7 @@ import numpy as np from pandas._libs import lib +from pandas.util._decorators import set_module from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -149,6 +150,7 @@ def concat( ) -> DataFrame | Series: ... +@set_module("pandas") def concat( objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bfd8e3ccd2f7c..f4cb82816bbcf 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -51,9 +51,9 @@ def melt( """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. 
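The wide-to-long reshape that the reworded `melt` summary below describes, as a minimal sketch (column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "height": [150, 160], "weight": [50, 60]})

# "id" stays as the identifier; the measured columns are unpivoted into
# the two non-identifier columns "variable" and "value".
print(pd.melt(df, id_vars=["id"], value_vars=["height", "weight"]))
```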
- This function is useful to massage a DataFrame into a format where one + This function is useful to reshape a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to + columns are considered measured variables (`value_vars`), and are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6f9bb8cb24f43..5fddd9f9aca5b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2746,8 +2746,7 @@ def _factorize_keys( isinstance(lk.dtype, ArrowDtype) and ( is_numeric_dtype(lk.dtype.numpy_dtype) - or is_string_dtype(lk.dtype) - and not sort + or (is_string_dtype(lk.dtype) and not sort) ) ): lk, _ = lk._values_for_factorize() diff --git a/pandas/core/series.py b/pandas/core/series.py index 35b576da87ed7..4fa8b86fa4c16 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -567,7 +567,7 @@ def __arrow_c_stream__(self, requested_schema=None): Export the pandas Series as an Arrow C stream PyCapsule. This relies on pyarrow to convert the pandas Series to the Arrow - format (and follows the default behaviour of ``pyarrow.Array.from_pandas`` + format (and follows the default behavior of ``pyarrow.Array.from_pandas`` in its handling of the index, i.e. to ignore it). This conversion is not necessarily zero-copy. @@ -2226,7 +2226,7 @@ def drop_duplicates( 5 hippo Name: animal, dtype: object - With the 'keep' parameter, the selection behaviour of duplicated values + With the 'keep' parameter, the selection behavior of duplicated values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. @@ -3451,7 +3451,7 @@ def sort_values( 4 5.0 dtype: float64 - Sort values ascending order (default behaviour) + Sort values ascending order (default behavior) >>> s.sort_values(ascending=True) 1 1.0 @@ -4098,7 +4098,7 @@ def swaplevel( In the following example, we will swap the levels of the indices. Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. + in a similar manner. Note that column-wise is the default behavior. By not supplying any arguments for i and j, we swap the last and second to last indices. diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 05e1a36877e06..c68b6303661b9 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1374,6 +1374,11 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. + Determines whether each string in the Series or Index starts with a + match to a specified regular expression. This function is especially + useful for validating prefixes, such as ensuring that codes, tags, or + identifiers begin with a specific pattern. + Parameters ---------- pat : str @@ -1419,6 +1424,11 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. + Checks if each string in the Series or Index fully matches the + specified regular expression pattern. 
This function is useful when the + requirement is for an entire string to conform to a pattern, such as + validating formats like phone numbers or email addresses. + Parameters ---------- pat : str @@ -1647,6 +1657,10 @@ def repeat(self, repeats): """ Duplicate each string in the Series or Index. + Duplicates each string in the Series or Index, either by applying the + same repeat count to all elements or by using different repeat values + for each element. + Parameters ---------- repeats : int or sequence of int @@ -1710,6 +1724,12 @@ def pad( """ Pad strings in the Series/Index up to width. + This function pads strings in a Series or Index to a specified width, + filling the extra space with a character of your choice. It provides + flexibility in positioning the padding, allowing it to be added to the + left, right, or both sides. This is useful for formatting strings to + align text or ensure consistent string lengths in data processing. + Parameters ---------- width : int @@ -1920,6 +1940,11 @@ def slice(self, start=None, stop=None, step=None): """ Slice substrings from each element in the Series or Index. + Slicing substrings from strings in a Series or Index helps extract + specific portions of data, making it easier to analyze or manipulate + text. This is useful for tasks like parsing structured text fields or + isolating parts of strings with a consistent format. + Parameters ---------- start : int, optional @@ -1996,6 +2021,11 @@ def slice_replace(self, start=None, stop=None, repl=None): """ Replace a positional slice of a string with another value. + This function allows replacing specific parts of a string in a Series + or Index by specifying start and stop positions. It is useful for + modifying substrings in a controlled way, such as updating sections of + text based on their positions or patterns. 
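# NOTE: hedged sketch (mine, not from this patch) contrasting the behaviors the
# expanded string-accessor docstrings above describe: ``match`` anchors only at
# the start, ``fullmatch`` requires the entire string, and ``pad`` /
# ``slice_replace`` reshape strings positionally. Sample data is invented.
import pandas as pd

s = pd.Series(["AB-12", "AB-12-x", "zz"])
s.str.match(r"AB-\d{2}")      # [True, True, False]  -> prefix match only
s.str.fullmatch(r"AB-\d{2}")  # [True, False, False] -> whole string must match
s.str.pad(8, side="left", fillchar="0")  # pad each string to width 8
s.str.slice_replace(0, 2, "XY")          # replace positions [0, 2) in place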
+ Parameters ---------- start : int, optional diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4680a63bf57a1..30487de7bafd5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -44,6 +44,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_float, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -1153,6 +1154,10 @@ def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) + # prevent precision issues in case of float32 # GH#60506 + if is_float_dtype(values.dtype): + values = values.astype("float64") + # prevent overflow in case of int8 or int16 if is_integer_dtype(values.dtype): values = values.astype("int64") diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index f159babb7e018..bc45343d6e2d3 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -226,19 +226,18 @@ def to_numeric( set(), coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA, + or ( + isinstance(values_dtype, StringDtype) + and values_dtype.na_value is libmissing.NA + ), ) if new_mask is not None: # Remove unnecessary values, is expected later anyway and enables # downcasting values = values[~new_mask] - elif ( - dtype_backend is not lib.no_default - and new_mask is None - or isinstance(values_dtype, StringDtype) - and values_dtype.na_value is libmissing.NA + elif (dtype_backend is not lib.no_default and new_mask is None) or ( + isinstance(values_dtype, StringDtype) and values_dtype.na_value is libmissing.NA ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 43a3c03b6cef9..73e4de6ea6208 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -490,7 +490,7 @@ def online( klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -981,7 +981,7 @@ def reset(self) -> None: """ self._mean.reset() - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): raise NotImplementedError("aggregate is not implemented.") def std(self, bias: bool = False, *args, **kwargs): diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 4bf77b3d38689..bff3a1660eba9 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -167,7 +167,7 @@ def _get_window_indexer(self) -> BaseIndexer: klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b1c37ab48fa57..385ffb901acf0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -44,7 +44,10 @@ from pandas.core._numba import executor from pandas.core.algorithms import factorize -from pandas.core.apply import ResamplerWindowApply +from pandas.core.apply import ( + ResamplerWindowApply, + reconstruct_func, +) from pandas.core.arrays import ExtensionArray from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -269,7 +272,7 @@ def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: """ #
filter out the on from the object if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2: - obj = obj.reindex(columns=obj.columns.difference([self.on])) + obj = obj.reindex(columns=obj.columns.difference([self.on], sort=False)) if obj.ndim > 1 and numeric_only: obj = self._make_numeric_only(obj) return obj @@ -646,8 +649,12 @@ def _numba_apply( out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): + relabeling, func, columns, order = reconstruct_func(func, **kwargs) result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if isinstance(result, ABCDataFrame) and relabeling: + result = result.iloc[:, order] + result.columns = columns # type: ignore[union-attr] if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -1239,7 +1246,7 @@ def calc(x): klass="Series/DataFrame", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: # these must apply directly @@ -1951,7 +1958,7 @@ def _raise_monotonic_error(self, msg: str): klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index cacbfb49c311f..f150de3d217f2 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -20,6 +20,16 @@ class IntCastingNaNError(ValueError): """ Exception raised when converting (``astype``) an array with NaN to an integer type. + This error occurs when attempting to cast a data structure containing non-finite + values (such as NaN or infinity) to an integer data type. Integer types do not + support non-finite values, so such conversions are explicitly disallowed to + prevent silent data corruption or unexpected behavior. + + See Also + -------- + DataFrame.astype : Method to cast a pandas DataFrame object to a specified dtype. + Series.astype : Method to cast a pandas Series object to a specified dtype. + Examples -------- >>> pd.DataFrame(np.array([[1, np.nan], [2, 3]]), dtype="i8") @@ -35,6 +45,11 @@ class NullFrequencyError(ValueError): Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``, ``PeriodIndex.shift``. + See Also + -------- + Index.shift : Shift values of Index. + Series.shift : Shift values of Series. + Examples -------- >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) @@ -48,6 +63,12 @@ class PerformanceWarning(Warning): """ Warning raised when there is a possible performance impact. + See Also + -------- + DataFrame.set_index : Set the DataFrame index using existing columns. + DataFrame.loc : Access a group of rows and columns by label(s) \ + or a boolean array. + Examples -------- >>> df = pd.DataFrame( @@ -100,6 +121,11 @@ class UnsortedIndexError(KeyError): Subclass of `KeyError`. + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by its index. + DataFrame.set_index : Set the DataFrame index using existing columns. + Examples -------- >>> df = pd.DataFrame( @@ -370,6 +396,13 @@ class NumbaUtilError(Exception): """ Error raised for unsupported Numba engine routines. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. 
+ Series.groupby : Group Series using a mapper or by a Series of columns. + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- >>> df = pd.DataFrame( @@ -388,6 +421,19 @@ class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. + This error is typically encountered when performing operations on objects + with `allows_duplicate_labels=False` and the operation would result in + duplicate labels in the index. Duplicate labels can lead to ambiguities + in indexing and reduce data integrity. + + See Also + -------- + Series.set_flags : Return a new ``Series`` object with updated flags. + DataFrame.set_flags : Return a new ``DataFrame`` object with updated flags. + Series.reindex : Conform ``Series`` object to new index with optional filling logic. + DataFrame.reindex : Conform ``DataFrame`` object to new index with optional filling + logic. + Examples -------- >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags( @@ -407,6 +453,16 @@ class InvalidIndexError(Exception): """ Exception raised when attempting to use an invalid index key. + This exception is triggered when a user attempts to access or manipulate + data in a pandas DataFrame or Series using an index key that is not valid + for the given object. This may occur in cases such as using a malformed + slice, a mismatched key for a ``MultiIndex``, or attempting to access an index + element that does not exist. + + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Examples -------- >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]]) @@ -487,6 +543,11 @@ class ChainedAssignmentError(Warning): For more information on Copy-on-Write, see :ref:`the user guide`. + See Also + -------- + options.mode.copy_on_write : Global setting for enabling or disabling + Copy-on-Write behavior. + Examples -------- >>> pd.options.mode.copy_on_write = True @@ -505,6 +566,11 @@ class NumExprClobberingError(NameError): to 'numexpr'. 'numexpr' is the default engine value for these methods if the numexpr package is installed. + See Also + -------- + eval : Evaluate a Python expression as a string using various backends. + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + Examples -------- >>> df = pd.DataFrame({"abs": [1, 1, 1]}) @@ -522,6 +588,20 @@ class UndefinedVariableError(NameError): It will also specify whether the undefined variable is local or not. + Parameters + ---------- + name : str + The name of the undefined variable. + is_local : bool or None, optional + Indicates whether the undefined variable is considered a local variable. + If ``True``, the error message specifies it as a local variable. + If ``False`` or ``None``, the variable is treated as a non-local name. + + See Also + -------- + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + DataFrame.eval : Evaluate a string describing operations on DataFrame columns. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1]}) @@ -628,6 +708,15 @@ class PossibleDataLossError(Exception): """ Exception raised when trying to open a HDFStore file when already opened. + This error is triggered when there is a potential risk of data loss due to + conflicting operations on an HDFStore file. It serves to prevent unintended + overwrites or data corruption by enforcing exclusive access to the file. 
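# NOTE: small sketch, not part of this patch, of the scenario the expanded
# DuplicateLabelError docstring above describes: an operation on an object
# flagged with ``allows_duplicate_labels=False`` that would create duplicate
# labels raises instead of silently producing an ambiguous index.
import pandas as pd

s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags(
    allows_duplicate_labels=False
)
try:
    s.reindex(["a", "a", "b"])  # would duplicate label "a"
except pd.errors.DuplicateLabelError:
    pass  # duplicate labels are rejected when the flag is False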
+ + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + HDFStore.open : Open an HDFStore file in the specified mode. + Examples -------- >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP @@ -672,6 +761,12 @@ class AttributeConflictWarning(Warning): name than the existing index on an HDFStore or attempting to append an index with a different frequency than the existing index on an HDFStore. + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + DataFrame.to_hdf : Write the contained data to an HDF5 file using HDFStore. + read_hdf : Read from an HDF5 file into a DataFrame. + Examples -------- >>> idx1 = pd.Index(["a", "b"], name="name1") @@ -802,28 +897,28 @@ class InvalidComparison(Exception): __all__ = [ "AbstractMethodError", "AttributeConflictWarning", + "CSSWarning", "CategoricalConversionWarning", "ChainedAssignmentError", "ClosedFileError", - "CSSWarning", - "DatabaseError", "DataError", + "DatabaseError", "DtypeWarning", "DuplicateLabelError", "EmptyDataError", "IncompatibilityWarning", + "IndexingError", "IntCastingNaNError", "InvalidColumnName", "InvalidComparison", "InvalidIndexError", "InvalidVersion", - "IndexingError", "LossySetitemError", "MergeError", "NoBufferPresent", "NullFrequencyError", - "NumbaUtilError", "NumExprClobberingError", + "NumbaUtilError", "OptionError", "OutOfBoundsDatetime", "OutOfBoundsTimedelta", diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py index c804b81c49e7c..1c7e531debb14 100644 --- a/pandas/io/__init__.py +++ b/pandas/io/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 9a8c87a738d4c..9778a404e23e0 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,9 +1,15 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np +from pandas._config import using_string_dtype + +from pandas._libs import lib from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency @@ -12,6 +18,10 @@ if TYPE_CHECKING: from collections.abc import Callable + import pyarrow + + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -33,7 +43,7 @@ def _arrow_dtype_mapping() -> dict: } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") mapping = { @@ -44,3 +54,34 @@ def arrow_string_types_mapper() -> Callable: mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) return mapping.get + + +def arrow_table_to_pandas( + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, +) -> pd.DataFrame: + pa = import_optional_dependency("pyarrow") + + to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs + + types_mapper: type[pd.ArrowDtype] | None | Callable + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + 
types_mapper = _arrow_string_types_mapper() + elif dtype_backend is lib.no_default or dtype_backend == "numpy": + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return df diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index 275cbf0148f94..f13d7afa63d84 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -8,7 +8,7 @@ from pandas.io.excel._util import register_writer from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter -__all__ = ["read_excel", "ExcelWriter", "ExcelFile"] +__all__ = ["ExcelFile", "ExcelWriter", "read_excel"] register_writer(_OpenpyxlWriter) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index aaae9857b4fae..7b4c81853eba3 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -15,11 +15,10 @@ from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -147,16 +146,4 @@ def read_feather( pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/formats/__init__.py b/pandas/io/formats/__init__.py index 5e56b1bc7ba43..895669c342f97 100644 --- a/pandas/io/formats/__init__.py +++ b/pandas/io/formats/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 52b5755558900..5fde6577e9f95 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -37,6 +37,7 @@ DataFrame, Index, MultiIndex, + Period, PeriodIndex, ) import pandas.core.common as com @@ -48,7 +49,6 @@ CSSWarning, ) from pandas.io.formats.format import get_level_lengths -from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: from pandas._typing import ( @@ -620,9 +620,8 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: return columns = self.columns - level_strs = columns._format_multi( - sparsify=self.merge_cells in {True, "columns"}, include_names=False - ) + merge_columns = self.merge_cells in {True, "columns"} + level_strs = columns._format_multi(sparsify=merge_columns, include_names=False) level_lengths = get_level_lengths(level_strs) coloffset = 0 lnum = 0 @@ -630,51 +629,34 @@ def _format_header_mi(self) -> Iterable[ExcelCell]: if self.index and isinstance(self.df.index, MultiIndex): coloffset = self.df.index.nlevels - 1 - if self.merge_cells in {True, "columns"}: - # Format multi-index as a merged cells. 
- for lnum, name in enumerate(columns.names): - yield ExcelCell( - row=lnum, - col=coloffset, - val=name, - style=None, - ) + for lnum, name in enumerate(columns.names): + yield ExcelCell( + row=lnum, + col=coloffset, + val=name, + style=None, + ) - for lnum, (spans, levels, level_codes) in enumerate( - zip(level_lengths, columns.levels, columns.codes) - ): - values = levels.take(level_codes) - for i, span_val in spans.items(): - mergestart, mergeend = None, None - if span_val > 1: - mergestart, mergeend = lnum, coloffset + i + span_val - yield CssExcelCell( - row=lnum, - col=coloffset + i + 1, - val=values[i], - style=None, - css_styles=getattr(self.styler, "ctx_columns", None), - css_row=lnum, - css_col=i, - css_converter=self.style_converter, - mergestart=mergestart, - mergeend=mergeend, - ) - else: - # Format in legacy format with dots to indicate levels. - for i, values in enumerate(zip(*level_strs)): - v = ".".join(map(pprint_thing, values)) + for lnum, (spans, levels, level_codes) in enumerate( + zip(level_lengths, columns.levels, columns.codes) + ): + values = levels.take(level_codes) + for i, span_val in spans.items(): + mergestart, mergeend = None, None + if merge_columns and span_val > 1: + mergestart, mergeend = lnum, coloffset + i + span_val yield CssExcelCell( row=lnum, col=coloffset + i + 1, - val=v, + val=values[i], style=None, css_styles=getattr(self.styler, "ctx_columns", None), css_row=lnum, css_col=i, css_converter=self.style_converter, + mergestart=mergestart, + mergeend=mergeend, ) - self.rowcounter = lnum def _format_header_regular(self) -> Iterable[ExcelCell]: @@ -798,11 +780,8 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # MultiIndex columns require an extra row # with index names (blank if None) for - # unambiguous round-trip, unless not merging, - # in which case the names all go on one row Issue #11328 - if isinstance(self.columns, MultiIndex) and ( - self.merge_cells in {True, "columns"} - ): + # unambiguous round-trip, Issue #11328 + if isinstance(self.columns, MultiIndex): self.rowcounter += 1 # if index labels are not empty go ahead and dump @@ -825,6 +804,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: allow_fill=levels._can_hold_na, fill_value=levels._na_value, ) + # GH#60099 + if isinstance(values[0], Period): + values = values.to_timestamp() for i, span_val in spans.items(): mergestart, mergeend = None, None @@ -849,6 +831,10 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # Format hierarchical rows with non-merged values. 
for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): + # GH#60099 + if isinstance(indexcolval, Period): + indexcolval = indexcolval.to_timestamp() + yield CssExcelCell( row=self.rowcounter + idx, col=gcolidx, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 861f5885f80c6..46ecb2b9a8f12 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -78,7 +78,6 @@ ) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.reshape.concat import concat from pandas.io.common import ( check_parent_directory, @@ -245,7 +244,11 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + _len = len(series) + _slice = np.hstack( + [np.arange(row_num), np.arange(_len - row_num, _len)] + ) + series = series.iloc[_slice] self.tr_row_num = row_num else: self.tr_row_num = None @@ -669,9 +672,9 @@ def _truncate_horizontally(self) -> None: assert self.max_cols_fitted is not None col_num = self.max_cols_fitted // 2 if col_num >= 1: - left = self.tr_frame.iloc[:, :col_num] - right = self.tr_frame.iloc[:, -col_num:] - self.tr_frame = concat((left, right), axis=1) + _len = len(self.tr_frame.columns) + _slice = np.hstack([np.arange(col_num), np.arange(_len - col_num, _len)]) + self.tr_frame = self.tr_frame.iloc[:, _slice] # truncate formatter if isinstance(self.formatters, (list, tuple)): @@ -682,7 +685,7 @@ def _truncate_horizontally(self) -> None: else: col_num = cast(int, self.max_cols) self.tr_frame = self.tr_frame.iloc[:, :col_num] - self.tr_col_num = col_num + self.tr_col_num: int = col_num def _truncate_vertically(self) -> None: """Remove rows, which are not to be displayed. @@ -1749,7 +1752,7 @@ def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[st # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""] # Therefore, the imaginary part is the 4th and 3rd last elements, # and the real part is everything before the imaginary part - trimmed = re.split(r"([j+-])", x) + trimmed = re.split(r"(? 
None: use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: _classes.append("tex2jax_ignore") + _classes.append("mathjax_ignore") if self.classes is not None: if isinstance(self.classes, str): self.classes = self.classes.split() diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 67b5eb6f5ee5b..a9936ba8c8f2c 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -203,7 +203,7 @@ def pprint_thing( def as_escaped_string( thing: Any, escape_chars: EscapeChars | None = escape_chars ) -> str: - translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r", "'": r"\'"} if isinstance(escape_chars, Mapping): if default_escapes: translate.update(escape_chars) diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 08d9fd938c873..c0f0608f1ab32 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -366,9 +366,11 @@ def _translate( if not get_option("styler.html.mathjax"): table_attr = table_attr or "" if 'class="' in table_attr: - table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ') + table_attr = table_attr.replace( + 'class="', 'class="tex2jax_ignore mathjax_ignore ' + ) else: - table_attr += ' class="tex2jax_ignore"' + table_attr += ' class="tex2jax_ignore mathjax_ignore"' d.update({"table_attributes": table_attr}) if self.tooltips: @@ -866,7 +868,8 @@ def _translate_latex(self, d: dict, clines: str | None) -> None: or multirow sparsification (so that \multirow and \multicol work correctly). """ index_levels = self.index.nlevels - visible_index_level_n = index_levels - sum(self.hide_index_) + # GH 52218 + visible_index_level_n = max(1, index_levels - sum(self.hide_index_)) d["head"] = [ [ {**col, "cellstyle": self.ctx_columns[r, c - visible_index_level_n]} diff --git a/pandas/io/html.py b/pandas/io/html.py index c9897f628fdc9..183af3a03221b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -454,15 +454,26 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, section="header") - body = self._expand_colspan_rowspan(body_rows, section="body") - footer = self._expand_colspan_rowspan(footer_rows, section="footer") + header, rem = self._expand_colspan_rowspan(header_rows, section="header") + body, rem = self._expand_colspan_rowspan( + body_rows, + section="body", + remainder=rem, + overflow=len(footer_rows) > 0, + ) + footer, _ = self._expand_colspan_rowspan( + footer_rows, section="footer", remainder=rem, overflow=False + ) return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "footer", "body"] - ) -> list[list]: + self, + rows, + section: Literal["header", "footer", "body"], + remainder: list[tuple[int, str | tuple, int]] | None = None, + overflow: bool = True, + ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]: """ Given a list of s, return a list of text rows. @@ -471,12 +482,20 @@ def _expand_colspan_rowspan( rows : list of node-like List of s section : the section that the rows belong to (header, body or footer). + remainder: list[tuple[int, str | tuple, int]] | None + Any remainder from the expansion of previous section + overflow: bool + If true, return any partial rows as 'remainder'. If not, use up any + partial rows. True by default. 
Returns ------- list of list Each returned row is a list of str text, or tuple (text, link) if extract_links is not None. + remainder + Remaining partial rows if any. If overflow is False, an empty list + is returned. Notes ----- @@ -485,9 +504,7 @@ def _expand_colspan_rowspan( """ all_texts = [] # list of rows, each a list of str text: str | tuple - remainder: list[ - tuple[int, str | tuple, int] - ] = [] # list of (index, text, nrows) + remainder = remainder if remainder is not None else [] for tr in rows: texts = [] # the output for this row @@ -528,19 +545,20 @@ def _expand_colspan_rowspan( all_texts.append(texts) remainder = next_remainder - # Append rows that only appear because the previous row had non-1 - # rowspan - while remainder: - next_remainder = [] - texts = [] - for prev_i, prev_text, prev_rowspan in remainder: - texts.append(prev_text) - if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) - all_texts.append(texts) - remainder = next_remainder + if not overflow: + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder - return all_texts + return all_texts, remainder def _handle_hidden_tables(self, tbl_list, attr_name: str): """ diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index 8f4e7a62834b5..39f78e26d6041 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -7,9 +7,9 @@ from pandas.io.json._table_schema import build_table_schema __all__ = [ - "ujson_dumps", - "ujson_loads", + "build_table_schema", "read_json", "to_json", - "build_table_schema", + "ujson_dumps", + "ujson_loads", ] diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e9c9f5ba225a5..237518b3c8d92 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -36,7 +36,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -48,6 +47,7 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, @@ -364,10 +364,8 @@ def __init__( ) # TODO: Do this timedelta properly in objToJSON.c See GH #15137 - if ( - (obj.ndim == 1) - and (obj.name in set(obj.index.names)) - or len(obj.columns.intersection(obj.index.names)) + if ((obj.ndim == 1) and (obj.name in set(obj.index.names))) or len( + obj.columns.intersection(obj.index.names) ): msg = "Overlapping names between the index and columns" raise ValueError(msg) @@ -940,18 +938,7 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - return pa_table.to_pandas(types_mapper=mapping) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/json/_table_schema.py 
b/pandas/io/json/_table_schema.py index 9d250ee5c08ce..7879be18b52c9 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -239,9 +239,16 @@ def build_table_schema( """ Create a Table schema from ``data``. + This method is a utility to generate a JSON-serializable schema + representation of a pandas Series or DataFrame, compatible with the + Table Schema specification. It enables structured data to be shared + and validated in various applications, ensuring consistency and + interoperability. + Parameters ---------- - data : Series, DataFrame + data : Series or DataFrame + The input data for which the table schema is to be created. index : bool, default True Whether to include ``data.index`` in the schema. primary_key : bool or None, default True @@ -256,6 +263,12 @@ def build_table_schema( Returns ------- dict + A dictionary representing the Table schema. + + See Also + -------- + DataFrame.to_json : Convert the object to a JSON string. + read_json : Convert a JSON string to pandas object. Notes ----- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index f179dafc919e5..a945f3dc38d35 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,16 +9,13 @@ Literal, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -127,21 +124,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if using_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 24415299e799b..6a5a83088e986 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -15,22 +15,19 @@ filterwarnings, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -245,21 +242,11 @@ def read( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend 
== "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() - path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -280,7 +267,11 @@ def read( "make_block is deprecated", DeprecationWarning, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: @@ -361,6 +352,7 @@ def read( filters=None, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: parquet_kwargs: dict[str, Any] = {} @@ -376,6 +368,10 @@ def read( raise NotImplementedError( "filesystem is not implemented for the fastparquet engine." ) + if to_pandas_kwargs is not None: + raise NotImplementedError( + "to_pandas_kwargs is not implemented for the fastparquet engine." + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -466,7 +462,7 @@ def to_parquet( .. versionadded:: 2.1.0 kwargs - Additional keyword arguments passed to the engine + Additional keyword arguments passed to the engine. Returns ------- @@ -505,6 +501,7 @@ def read_parquet( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem: Any = None, filters: list[tuple] | list[list[tuple]] | None = None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: """ @@ -578,6 +575,12 @@ def read_parquet( .. versionadded:: 2.1.0 + to_pandas_kwargs : dict | None, default None + Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas` + when ``engine="pyarrow"``. + + .. versionadded:: 3.0.0 + **kwargs Any additional kwargs are passed to the engine. 
@@ -650,5 +653,6 @@ def read_parquet( storage_options=storage_options, dtype_backend=dtype_backend, filesystem=filesystem, + to_pandas_kwargs=to_pandas_kwargs, **kwargs, ) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 86bb5f190e403..672672490996d 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -293,17 +287,8 @@ def read(self) -> DataFrame: "make_block is deprecated", DeprecationWarning, ) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 7294efe843cce..e263c69376d05 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -368,7 +368,7 @@ def _agg_index(self, index) -> Index: index_converter = converters.get(self.index_names[i]) is not None try_num_bool = not ( - cast_type and is_string_dtype(cast_type) or index_converter + (cast_type and is_string_dtype(cast_type)) or index_converter ) arr, _ = self._infer_types( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 99d584db61755..db9547a18b600 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1052,8 +1052,9 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]: for line in lines if ( len(line) > 1 - or len(line) == 1 - and (not isinstance(line[0], str) or line[0].strip()) + or ( + len(line) == 1 and (not isinstance(line[0], str) or line[0].strip()) + ) ) ] return ret diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9aff5600cf49b..5652d7fab0c7c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -45,11 +45,10 @@ from pandas.core.dtypes.common import ( is_dict_like, is_list_like, + is_object_dtype, + is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - DatetimeTZDtype, -) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas import get_option @@ -58,12 +57,15 @@ Series, ) from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common 
as com from pandas.core.common import maybe_make_list from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime +from pandas.io._util import arrow_table_to_pandas + if TYPE_CHECKING: from collections.abc import ( Callable, @@ -239,7 +241,7 @@ def read_sql_table( # pyright: ignore[reportOverlappingOverload] schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -253,7 +255,7 @@ def read_sql_table( schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: int = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -266,7 +268,7 @@ def read_sql_table( schema: str | None = None, index_col: str | list[str] | None = None, coerce_float: bool = True, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, columns: list[str] | None = None, chunksize: int | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -370,7 +372,7 @@ def read_sql_query( # pyright: ignore[reportOverlappingOverload] index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: None = ..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -384,7 +386,7 @@ def read_sql_query( index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: int = ..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -397,7 +399,7 @@ def read_sql_query( index_col: str | list[str] | None = None, coerce_float: bool = True, params: list[Any] | Mapping[str, Any] | None = None, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -1316,7 +1318,12 @@ def _harmonize_columns( elif dtype_backend == "numpy" and col_type is float: # floats support NA, can always convert! 
self.frame[col_name] = df_col.astype(col_type) - + elif ( + using_string_dtype() + and is_string_dtype(col_type) + and is_object_dtype(self.frame[col_name]) + ): + self.frame[col_name] = df_col.astype(col_type) elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: @@ -1403,6 +1410,7 @@ def _get_dtype(self, sqltype): DateTime, Float, Integer, + String, ) if isinstance(sqltype, Float): @@ -1422,6 +1430,10 @@ def _get_dtype(self, sqltype): return date elif isinstance(sqltype, Boolean): return bool + elif isinstance(sqltype, String): + if using_string_dtype(): + return StringDtype(na_value=np.nan) + return object @@ -2195,23 +2207,10 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(stmt) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2279,19 +2278,10 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - with self.con.cursor() as cur: cur.execute(sql) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ed89d5766c306..34d95fb59a21c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2206,15 +2206,15 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict: new_dict = {} - for key in convert_dates: + for key, value in convert_dates.items(): if not convert_dates[key].startswith("%"): # make sure proper fmts - convert_dates[key] = "%" + convert_dates[key] + convert_dates[key] = "%" + value if key in varlist: - new_dict.update({varlist.index(key): convert_dates[key]}) + new_dict[varlist.index(key)] = convert_dates[key] else: if not isinstance(key, int): raise ValueError("convert_dates key must be a column or an integer") - new_dict.update({key: convert_dates[key]}) + new_dict[key] = convert_dates[key] return new_dict @@ -2748,6 +2748,18 @@ def write_file(self) -> None: """ Export DataFrame object to Stata dta format. + This method writes the contents of a pandas DataFrame to a `.dta` file + compatible with Stata. It includes features for handling value labels, + variable types, and metadata like timestamps and data labels. The output + file can then be read and used in Stata or other compatible statistical + tools. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. 
+ io.stata.StataWriter : A class for writing Stata binary dta files. + Examples -------- >>> df = pd.DataFrame( @@ -2867,7 +2879,7 @@ def _write_header( # ds_format - just use 114 self._write_bytes(struct.pack("b", 114)) # byteorder - self._write(byteorder == ">" and "\x01" or "\x02") + self._write((byteorder == ">" and "\x01") or "\x02") # filetype self._write("\x01") # unused @@ -3413,7 +3425,7 @@ def _write_header( # ds_format - 117 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder - bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) + bio.write(self._tag((byteorder == ">" and "MSF") or "LSF", "byteorder")) # number of vars, 2 bytes in 117 and 118, 4 byte in 119 nvar_type = "H" if self._dta_version <= 118 else "I" bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index c7a4c1eacfcae..837bfaf82ca27 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -80,20 +80,20 @@ __all__ = [ "PlotAccessor", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", "boxplot", "boxplot_frame", "boxplot_frame_groupby", + "deregister_matplotlib_converters", "hist_frame", "hist_series", - "scatter_matrix", - "radviz", - "andrews_curves", - "bootstrap_plot", - "parallel_coordinates", "lag_plot", - "autocorrelation_plot", - "table", + "parallel_coordinates", "plot_params", + "radviz", "register_matplotlib_converters", - "deregister_matplotlib_converters", + "scatter_matrix", + "table", ] diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index fbf9009cedc40..aee872f9ae50a 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -570,18 +570,23 @@ def boxplot_frame_groupby( Parameters ---------- - grouped : Grouped DataFrame + grouped : DataFrameGroupBy + The grouped DataFrame object over which to create the box plots. subplots : bool * ``False`` - no subplots will be used * ``True`` - create a subplot for each group. - column : column name or list of names, or vector Can be any valid input to groupby. fontsize : float or str - rot : label rotation angle - grid : Setting this to True will show the grid + Font size for the labels. + rot : float + Rotation angle of labels (in degrees) on the x-axis. + grid : bool + Whether to show grid lines on the plot. ax : Matplotlib axis object, default None - figsize : A tuple (width, height) in inches + The axes on which to draw the plots. If None, uses the current axes. + figsize : tuple of (float, float) + The figure size in inches (width, height). layout : tuple (optional) The layout of the plot: (rows, columns). sharex : bool, default False @@ -599,8 +604,15 @@ def boxplot_frame_groupby( Returns ------- - dict of key/value = group key/DataFrame.boxplot return value - or DataFrame.boxplot return value in case subplots=figures=False + dict or DataFrame.boxplot return value + If ``subplots=True``, returns a dictionary of group keys to the boxplot + return values. If ``subplots=False``, returns the boxplot return value + of a single DataFrame. + + See Also + -------- + DataFrame.boxplot : Create a box plot from a DataFrame. + Series.plot : Plot a Series. 
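# NOTE: sketch (not from this diff) of the call the clarified
# boxplot_frame_groupby docstring documents: one box plot per group, returned
# as a mapping of group key to boxplot result when ``subplots=True``.
# Data and column names are invented; matplotlib must be installed.
import numpy as np
import pandas as pd

df = pd.DataFrame({"g": list("aabb"), "x": np.random.default_rng(0).normal(size=4)})
axes = df.groupby("g").boxplot(subplots=True)  # one subplot per group key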
Examples -------- diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 87f3ca09ad346..ff28868aa0033 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -74,20 +74,20 @@ def plot(data, kind, **kwargs): __all__ = [ - "plot", - "hist_series", - "hist_frame", - "boxplot", - "boxplot_frame", - "boxplot_frame_groupby", - "table", "andrews_curves", "autocorrelation_plot", "bootstrap_plot", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "deregister", + "hist_frame", + "hist_series", "lag_plot", "parallel_coordinates", + "plot", "radviz", - "scatter_matrix", "register", - "deregister", + "scatter_matrix", + "table", ] diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 68682344f98ca..5ad30a68ae3c9 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -20,6 +20,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +55,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index d5624aecd1215..8ee75e7fe553e 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -56,7 +56,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7face74dcbc89..b20f8ac5f4796 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -178,14 +178,21 @@ def scatter_matrix( """ Draw a matrix of scatter plots. + Each pair of numeric columns in the DataFrame is plotted against each other, + resulting in a matrix of scatter plots. The diagonal plots can display either + histograms or Kernel Density Estimation (KDE) plots for each variable. + Parameters ---------- frame : DataFrame + The data to be plotted. alpha : float, optional Amount of transparency applied. figsize : (float,float), optional A tuple (width, height) in inches. ax : Matplotlib axis object, optional + An existing Matplotlib axis object for the plots. If None, a new axis is + created. grid : bool, optional Setting this to True will show the grid. diagonal : {'hist', 'kde'} @@ -208,6 +215,14 @@ def scatter_matrix( numpy.ndarray A matrix of scatter plots. + See Also + -------- + plotting.parallel_coordinates : Plots parallel coordinates for multivariate data. + plotting.andrews_curves : Generates Andrews curves for visualizing clusters of + multivariate data. + plotting.radviz : Creates a RadViz visualization. + plotting.bootstrap_plot : Visualizes uncertainty in data via bootstrap sampling. 
+ Examples -------- diff --git a/pandas/testing.py b/pandas/testing.py index 0445fa5b5efc0..433b22bf1107e 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -12,6 +12,6 @@ __all__ = [ "assert_extension_array_equal", "assert_frame_equal", - "assert_series_equal", "assert_index_equal", + "assert_series_equal", ] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 75f9958b16286..c1d9f5ea4d25c 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -417,6 +417,7 @@ def test_set_module(): assert pd.Period.__module__ == "pandas" assert pd.Timestamp.__module__ == "pandas" assert pd.Timedelta.__module__ == "pandas" + assert pd.concat.__module__ == "pandas" assert pd.isna.__module__ == "pandas" assert pd.notna.__module__ == "pandas" assert pd.merge.__module__ == "pandas" diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index e19c21f81b3e1..0503bf9166ec7 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -218,18 +218,12 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - if df.dtypes.iloc[0].storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) + expected = (expected, NotImplementedError) msg = ( - "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + "can't multiply sequence by non-int of type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - if series.dtype.storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) - msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" + if using_infer_string and func == "cumprod": + expected = (expected, NotImplementedError) + + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 732652f24e2eb..c52168ae48ca8 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import WASM from pandas.core.dtypes.common import is_number @@ -81,7 +79,6 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -140,7 +137,6 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected 
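# NOTE: quick sketch (mine, not from this patch) of the behavior the updated
# tests below encode for the string dtype: cumulative reductions like
# "cumprod" are not defined for strings, so ``agg`` raises a clear error
# instead of leaking a low-level pyarrow exception. ``dtype="str"`` assumes
# the pandas 3.0 string dtype; older versions fall back to object.
import pandas as pd

s = pd.Series(["a", "b", "c"], dtype="str")
try:
    s.agg("cumprod")
except (TypeError, NotImplementedError):
    pass  # e.g. "operation 'cumprod' not supported for dtype 'str'"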
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -163,10 +159,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func == "cumsum": + request.applymarker( + pytest.mark.xfail( + raises=(TypeError, NotImplementedError), + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 888e8628f8664..e3a821519c638 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 @@ -392,9 +390,6 @@ def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): assert np.may_share_memory(result_nocopy1, result_nocopy2) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] @@ -406,13 +401,13 @@ def test_to_numpy_copy(arr, as_series, using_infer_string): # no copy by default result = obj.to_numpy() - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True result = obj.to_numpy(copy=False) - if using_infer_string and arr.dtype == object: + if using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow": assert np.shares_memory(arr, result) is False else: assert np.shares_memory(arr, result) is True diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e338fb1331734..5a59617ce5bd3 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -835,3 +835,10 @@ def test_pandas_dtype_string_dtypes(string_storage): with pd.option_context("string_storage", string_storage): result = pandas_dtype("string") assert result == pd.StringDtype(string_storage, na_value=pd.NA) + + +@td.skip_if_installed("pyarrow") +def test_construct_from_string_without_pyarrow_installed(): + # GH 57928 + with pytest.raises(ImportError, match="pyarrow>=10.0.1 is required"): + pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]") diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index bab8566a06dc2..60cade97ab528 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -113,13 +113,9 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index 34727b43a7b0f..47b1c7c57a47a 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -5,4 +5,4 @@ to_decimal, ) -__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] +__all__ = ["DecimalArray", "DecimalDtype", "make_data", "to_decimal"] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f0ff11e5fa3f7..6dd1f3f15bc15 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -441,7 +441,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) @@ -896,9 +896,7 @@ def _is_temporal_supported(self, opname, pa_dtype): ) ) and pa.types.is_duration(pa_dtype) - or opname in ("__sub__", "__rsub__") - and pa.types.is_temporal(pa_dtype) - ) + ) or (opname in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype)) def _get_expected_exception( self, op_name: str, obj, other @@ -1649,7 +1647,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index ec979ac6d22dc..011bf0b2016b2 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -101,6 +101,31 @@ def test_fillna_limit_series(self, data_missing): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval 
or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 27621193a9b8d..e19351b2ad058 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -187,9 +187,8 @@ def _get_expected_exception( return None def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return ( - op_name in ["min", "max", "sum"] - or ser.dtype.na_value is np.nan # type: ignore[union-attr] + return op_name in ["min", "max", "sum"] or ( + ser.dtype.na_value is np.nan # type: ignore[union-attr] and op_name in ("any", "all") ) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 86b39ddd19ec1..d6570fcda2ee8 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -1025,15 +1025,9 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string): +def test_where_int_overflow(replacement): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) - if using_infer_string and replacement not in (None, "snake"): - with pytest.raises( - TypeError, match=f"Invalid value '{replacement}' for dtype 'str'" - ): - df.where(pd.notnull(df), replacement) - return result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index ad1a37916e381..67d1d45af1cb3 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -65,15 +63,20 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(TypeError, match=msg): datetime_frame.fillna() - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") - def test_fillna_mixed_type(self, float_string_frame): + def test_fillna_mixed_type(self, float_string_frame, using_infer_string): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan mf.loc[mf.index[-10:], "A"] = np.nan - # TODO: make stronger assertion here, GH 25640 - mf.fillna(value=0) - mf.ffill() + + result = mf.ffill() + assert ( + result.loc[result.index[-10:], "A"] == result.loc[result.index[-11], "A"] + ).all() + assert (result.loc[result.index[5:20], "foo"] == "bar").all() + + result = mf.fillna(value=0) + assert (result.loc[result.index[-10:], "A"] == 0).all() + assert (result.loc[result.index[5:20], "foo"] == 0).all() def test_fillna_mixed_float(self, mixed_float_frame): # mixed numeric (but no float16) @@ -84,28 +87,21 @@ def test_fillna_mixed_float(self, mixed_float_frame): result = mf.ffill() _check_mixed_float(result, dtype={"C": None}) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_fillna_different_dtype(self, using_infer_string): + def test_fillna_different_dtype(self): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result 
= df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + # column is originally float (all-NaN) -> filling with string gives object dtype + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -276,8 +272,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_fillna_dtype_conversion(self, using_infer_string): + def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes @@ -292,7 +287,7 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + expected = DataFrame("nan", dtype="object", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) @@ -540,18 +535,10 @@ def test_fillna_col_reordering(self): filled = df.ffill() assert df.columns.tolist() == filled.columns.tolist() - # TODO(infer_string) test as actual error instead of xfail - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string") - def test_fill_corner(self, float_frame, float_string_frame): - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - - filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], "foo"] == 0).all() - del float_string_frame["foo"] - - float_frame.reindex(columns=[]).fillna(value=0) + def test_fill_empty(self, float_frame): + df = float_frame.reindex(columns=[]) + result = df.fillna(value=0) + tm.assert_frame_equal(result, df) def test_fillna_with_columns_and_limit(self): # GH40989 diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 52e871cc795b4..c6e5304ae3cb4 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -159,7 +159,7 @@ def test_nlargest_n_duplicate_index(self, n, order, request): result = df.nlargest(n, order) expected = df.sort_values(order, ascending=False).head(n) if Version(np.__version__) >= Version("1.25") and ( - (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5 + (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): request.applymarker( pytest.mark.xfail( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 73f44bcc6657e..b2320798ea9a2 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: - 
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame): mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, 0) - expected = float_string_frame.fillna(value=0) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=0) tm.assert_frame_equal(result, expected) tsframe = datetime_frame.copy() @@ -291,22 +290,20 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): - df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) - expected = DataFrame( - {"Type": Series([0, 1, 0, 0, 1], dtype=df.Type.dtype), "tmp": 2} + df = DataFrame( + {"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2} ) + expected = DataFrame({"Type": Series([0, 1, 0, 0, 1], dtype=object), "tmp": 2}) result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( { "a": mix_abc["a"], - "b": np.array([np.nan] * 4, dtype=object), + "b": Series([np.nan] * 4, dtype="str"), "c": [np.nan, np.nan, np.nan, "d"], } ) @@ -326,7 +323,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? 
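The dtype pins added throughout test_replace.py and test_fillna.py all follow one rule: once a string column is filled or replaced with a non-string value, the result can no longer keep a string dtype and falls back to object, with no implicit downcast afterwards. A minimal sketch of that rule, assuming the pandas 3.0 behavior these tests encode:

import pandas as pd

df = pd.DataFrame({"Type": pd.Series(["Q", "T", "Q"], dtype=object), "tmp": 2})
result = df.replace({"Type": {"Q": 0, "T": 1}})
assert result["Type"].dtype == object  # stays object; no downcast to int64
assert result["Type"].tolist() == [0, 1, 0]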
df = DataFrame(mix_abc) @@ -338,11 +334,12 @@ def test_regex_replace_str_to_numeric(self, mix_abc): return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) + # TODO(infer_string) + expec["c"] = expec["c"].astype(object) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -535,31 +532,37 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_convert(self): - # gh 3907 - df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + def test_replace_convert(self, any_string_dtype): + # gh 3907 (pandas >= 3.0 no longer converts dtypes) + df = DataFrame( + [["foo", "bar", "bah"], ["bar", "foo", "bah"]], dtype=any_string_dtype + ) m = {"foo": 1, "bar": 2, "bah": 3} rep = df.replace(m) - expec = df.dtypes - res = rep.dtypes - tm.assert_series_equal(expec, res) + assert (rep.dtypes == object).all() - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = float_string_frame.replace(np.nan, -18) - expected = float_string_frame.fillna(value=-18) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-18) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-18, np.nan), expected2) result = float_string_frame.replace(np.nan, -1e8) - expected = float_string_frame.fillna(value=-1e8) + expected = float_string_frame.copy() + expected["foo"] = expected["foo"].astype(object) + expected = expected.fillna(value=-1e8) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) + expected2 = float_string_frame.copy() + expected2["foo"] = expected2["foo"].astype(object) + tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2) def test_replace_mixed_int_block_upcasting(self): # int block upcasting @@ -601,8 +604,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_replace_mixed2(self, using_infer_string): + def test_replace_mixed2(self): # to object block upcasting df = DataFrame( { @@ -621,7 +623,7 @@ def test_replace_mixed2(self, using_infer_string): expected = DataFrame( { - "A": Series(["foo", "bar"]), + "A": Series(["foo", "bar"], dtype="object"), "B": Series([0, "foo"], dtype="object"), } ) @@ -917,8 +919,7 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_dict_no_regex(self): + def test_replace_dict_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -926,7 +927,8 @@ def 
test_replace_dict_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = { "Agree": 4, @@ -935,11 +937,11 @@ def test_replace_dict_no_regex(self): "Strongly Agree": 5, "Strongly Disagree": 1, } - expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=answer.dtype) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}, dtype=object) result = answer.replace(weights) tm.assert_series_equal(result, expected) - def test_replace_series_no_regex(self): + def test_replace_series_no_regex(self, any_string_dtype): answer = Series( { 0: "Strongly Agree", @@ -947,7 +949,8 @@ def test_replace_series_no_regex(self): 2: "Neutral", 3: "Disagree", 4: "Strongly Disagree", - } + }, + dtype=any_string_dtype, ) weights = Series( { @@ -1043,16 +1046,15 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") - def test_replace_swapping_bug(self, using_infer_string): + def test_replace_swapping_bug(self): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) df = DataFrame({"a": [0, 1, 0]}) res = df.replace({"a": {0: "Y", 1: "N"}}) - expect = DataFrame({"a": ["Y", "N", "Y"]}) + expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object) tm.assert_frame_equal(res, expect) def test_replace_datetimetz(self): @@ -1186,7 +1188,7 @@ def test_replace_commutative(self, df, to_replace, exp): ) def test_replace_replacer_dtype(self, replacer): # GH26632 - df = DataFrame(["a"]) + df = DataFrame(["a"], dtype=object) result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer], dtype=object) tm.assert_frame_equal(result, expected) @@ -1266,7 +1268,6 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): assert return_value is None tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1322,7 +1323,7 @@ def test_replace_value_category_type(self): lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) + result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1363,12 +1364,11 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_intervals(self): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) result = df.replace({"a": {pd.Interval(0, 1): "x"}}) - expected = DataFrame({"a": ["x", "x"]}) + expected = DataFrame({"a": ["x", "x"]}, dtype=object) tm.assert_frame_equal(result, expected) def test_replace_unicode(self): @@ -1468,17 +1468,21 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def 
test_replace_regex_dtype_frame(self, regex): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) - expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=df1.dtypes.iloc[0]) + expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=object) result_df1 = df1.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) - expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=df2.dtypes.iloc[0]) + if regex: + # TODO(infer_string): both string columns get cast to object, + # while only needed for column A + expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=object) + else: + expected_df2 = DataFrame({"A": Series([1], dtype=object), "B": ["1"]}) result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index a0f96ff111444..b52240c208493 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -757,3 +757,12 @@ def test_shift_with_offsets_freq_empty(self): df_shifted = DataFrame(index=shifted_dates) result = df.shift(freq=offset) tm.assert_frame_equal(result, df_shifted) + + def test_series_shift_interval_preserves_closed(self): + # GH#60389 + ser = Series( + [pd.Interval(1, 2, closed="right"), pd.Interval(2, 3, closed="right")] + ) + result = ser.shift(1) + expected = Series([np.nan, pd.Interval(1, 2, closed="right")]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index d38bc06260a0e..36088cceb13f1 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -31,6 +32,9 @@ def test_to_numpy_copy(self): # and that can be respected because we are already numpy-float assert df.to_numpy(copy=False).base is df.values.base + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3d8213cb3d11a..9b6080603f0c9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2404,6 +2404,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 57c803c23b001..dae7fe2575c22 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1858,10 +1858,7 @@ def test_unstack_bug(self, future_stack): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) - + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, 
result.reindex(restacked.index).astype(float)) diff --git a/pandas/tests/groupby/__init__.py b/pandas/tests/groupby/__init__.py index 446d9da437771..79046cd7ed415 100644 --- a/pandas/tests/groupby/__init__.py +++ b/pandas/tests/groupby/__init__.py @@ -2,7 +2,7 @@ def get_groupby_method_args(name, obj): """ Get required arguments for a groupby method. - When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"), + When parametrizing a test over groupby methods (e.g. "sum", "mean"), it is often the case that arguments are required for certain methods. Parameters @@ -16,7 +16,7 @@ def get_groupby_method_args(name, obj): ------- A tuple of required arguments for the method. """ - if name in ("nth", "fillna", "take"): + if name in ("nth", "take"): return (0,) if name == "quantile": return (0.5,) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 46c27849356b5..b7e6e55739c17 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -161,6 +159,7 @@ def test_agg_apply_corner(ts, tsframe): tm.assert_frame_equal(grouped.agg("sum"), exp_df) res = grouped.apply(np.sum, axis=0) + exp_df = exp_df.reset_index(drop=True) tm.assert_frame_equal(res, exp_df) @@ -296,12 +295,11 @@ def aggfun_1(ser): assert len(result) == 0 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype == object or ser.dtype == "string": raise TypeError("Test error message") return ser.sum() @@ -1117,7 +1115,6 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( @@ -1129,7 +1126,7 @@ def test_aggregate_mixed_types(): expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index b937e7dcc8136..a706ea795a0e2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -92,7 +90,6 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -108,7 +105,9 @@ def test_cython_agg_nothing_to_agg(): result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 
835cad0d13078..1c016143d50c3 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -308,7 +306,6 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { @@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) @@ -501,17 +499,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[0])["B"].iloc[0] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[-1])["B"].iloc[0] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 4a8ad65200caa..28cb25b515ed2 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -158,11 +156,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - msg = "dtype 'object' does not support operation 'quantile'" + msg = "dtype '(object|str)' does not support operation 'quantile'" with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index 91200f53e36bd..2dc89bc75746f 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -76,6 +76,8 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +# TODO(infer_string) in case the column is object dtype, it should preserve that dtype +# for the result's index @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_size_strings(any_string_dtype): # GH#55627 diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8ca6593a19f20..1050f8154572a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -324,12 +324,9 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - warn = DeprecationWarning if groupby == "column" else 
None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index 945c3e421a132..4625c5c27a803 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -22,7 +22,7 @@ def test_multiindex_group_all_columns_when_empty(groupby_func): # GH 32464 df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) - gb = df.groupby(["a", "b", "c"], group_keys=False) + gb = df.groupby(["a", "b", "c"], group_keys=True) method = getattr(gb, groupby_func) args = get_groupby_method_args(groupby_func, df) if groupby_func == "corrwith": diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a4127ab49b0e..fd1c82932f57f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -27,12 +27,9 @@ def test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("index").apply(store) - expected_value = DataFrame( - {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) - ) + df.groupby("index").apply(store) + expected_value = DataFrame({0: [1] * 10}, index=pd.RangeIndex(0, 100, 10)) + expected_value.columns = expected_value.columns.astype(object) tm.assert_frame_equal(groups[0], expected_value) @@ -111,11 +108,7 @@ def test_apply_index_date_object(): ] exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) tm.assert_series_equal(result, expected) @@ -189,9 +182,7 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("a", group_keys=False).apply(func) + df.groupby("a").apply(func) assert names == group_names @@ -209,11 +200,9 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -233,12 +222,8 @@ def slow(group): def fast(group): return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - fast_df = df.groupby("A", 
group_keys=False).apply(fast) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - slow_df = df.groupby("A", group_keys=False).apply(slow) - + fast_df = df.groupby("A", group_keys=False).apply(fast) + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -258,11 +243,8 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): # transparent to the user df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("g", group_keys=False).apply(func) - tm.assert_frame_equal(result, df) + result = df.groupby("g", group_keys=False).apply(func) + tm.assert_frame_equal(result, df[["a", "b"]]) def test_apply_with_mixed_dtype(): @@ -304,11 +286,8 @@ def test_groupby_as_index_apply(): tm.assert_index_equal(res_as, exp) tm.assert_index_equal(res_not_as, exp) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + res_as_apply = g_as.apply(lambda x: x.head(2)).index + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here @@ -323,9 +302,7 @@ def test_groupby_as_index_apply(): def test_groupby_as_index_apply_str(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -354,19 +331,13 @@ def desc3(group): # weirdo return result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(desc) + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = grouped.apply(desc2) + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result3 = grouped.apply(desc3) + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -396,9 +367,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["A", "B"]).apply(len) + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -409,9 +378,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = 
grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -420,9 +387,7 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -445,9 +410,7 @@ def trans2(group): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(trans) + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -476,10 +439,8 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) - expected = df.take([0, 1, 3, 4, 6, 7]) + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + expected = df[["value"]].take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None] @@ -499,9 +460,7 @@ def test_apply_no_name_column_conflict(): # it works! 
#2605 grouped = df.groupby(["name", "name2"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -518,11 +477,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) @@ -544,13 +501,10 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) - tm.assert_frame_equal(result, expected) @@ -584,11 +538,8 @@ def filt2(x): else: return x[x.category == "c"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = data.groupby("id_field").apply(filt2) + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -601,19 +552,11 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): if test_series: ser = df.set_index("Y")["X"] result = ser.groupby(level=0, group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_index() - expected = ser.sort_index() + expected = ser tm.assert_series_equal(result, expected) else: - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("Y", group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_values("Y") - expected = df.sort_values("Y") + result = df.groupby("Y", group_keys=False).apply(lambda x: x) + expected = df[["X"]] tm.assert_frame_equal(result, expected) @@ -654,9 +597,7 @@ def f(g): g["value3"] = g["value1"] * 2 return g - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert "value3" in result @@ -670,13 +611,9 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["Number"]).apply(lambda 
x: x.iloc[0]) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) @@ -689,9 +626,7 @@ def test_apply_numeric_coercion_when_datetime_getitem(): def get_B(g): return g.iloc[0][["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(get_B)["B"] + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -718,11 +653,8 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df2.groupby("Key").apply(predictions).p1 + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -737,13 +669,11 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} - ) + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} ) + ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -786,15 +716,11 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1] ) @@ -838,11 +764,8 @@ def test_groupby_apply_all_none(): def test_func(x): pass - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = test_df.groupby("groups").apply(test_func) - expected = DataFrame(columns=test_df.columns) - expected = expected.astype(test_df.dtypes) + result = test_df.groupby("groups").apply(test_func) + expected = DataFrame(columns=["random_vars"], dtype="int64") tm.assert_frame_equal(result, expected) @@ -852,12 +775,12 @@ def test_func(x): [ {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}, [[1, 1], [0, 2]], - {"groups": [1, 1], "vars": [0, 2]}, + {"vars": [0, 2]}, ], [ {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}, [[2, 2], [1, 3]], - {"groups": [2, 2], "vars": [1, 3]}, + {"vars": [1, 3]}, ], ], ) @@ -870,9 +793,7 @@ def test_func(x): return None return x.iloc[[0, -1]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, 
match=msg): - result1 = test_df1.groupby("groups").apply(test_func) + result1 = test_df1.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays(out_idx, names=["groups", None]) expected1 = DataFrame(out_data, index=index1) tm.assert_frame_equal(result1, expected1) @@ -882,9 +803,7 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = groups.apply(lambda group: group[group.value != 1]["value"]) + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -909,9 +828,7 @@ def test_apply_with_mixed_types(meth): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda g: g.index) + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -928,9 +845,7 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -967,9 +882,7 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = tdf.groupby("day").apply(most_common_values)["userId"] + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -1010,13 +923,11 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): ], columns=["observation", "color", "mood", "intensity", "score"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( - [np.dtype("datetime64[us]"), dtype, dtype, np.int64, dtype], - index=["observation", "color", "mood", "intensity", "score"], + [np.dtype("datetime64[us]"), dtype, np.int64, dtype], + index=["observation", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -1033,10 +944,8 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("group", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["value"]]) @pytest.mark.parametrize( @@ -1058,9 +967,7 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("groups").apply(function) + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1072,9 +979,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(fct) + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -1085,9 +990,7 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("id").apply(function) + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1123,9 +1026,7 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. 
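All of the DeprecationWarning scaffolding removed above traces back to one behavior change: DataFrameGroupBy.apply now operates only on the non-grouping columns, so the grouping column no longer appears in what the UDF sees nor in the result (hence the expectations rewritten as df[["value"]] and the like). A minimal sketch, assuming pandas >= 3.0:

import pandas as pd

df = pd.DataFrame({"g": [1, 2, 2], "v": [10, 20, 30]})
result = df.groupby("g", group_keys=False).apply(lambda x: x)
assert list(result.columns) == ["v"]  # the grouping column "g" is excluded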
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1140,11 +1041,8 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + result = df.groupby("A", group_keys=False).apply(lambda x: x) + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1156,15 +1054,8 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df1.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) + result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) + result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) tm.assert_frame_equal(result1, result2) @@ -1187,7 +1078,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1201,7 +1092,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): msg = "" with tm.assert_produces_warning(warn, match=msg): _ = getattr(grp, reduction_func)(*args) - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) @@ -1223,14 +1114,12 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grp.apply(lambda x: x.head(1)) + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) - expected = expected.drop(columns=["idx"]) + expected = expected.drop(columns=["A", "B", "idx"]) tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: @@ -1247,10 +1136,8 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", 
dropna=dropna, group_keys=False).apply(lambda x: x) - expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + expected = df.dropna()[["col"]] if dropna else df[["col"]].iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1274,9 +1161,7 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1286,9 +1171,7 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1308,12 +1191,10 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) - tm.assert_frame_equal(result, expected) + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) + tm.assert_frame_equal(result, expected[["date", "vals"]]) def test_groupby_apply_shape_cache_safety(): @@ -1354,32 +1235,27 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } - ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) + ) expected = DataFrame( - [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], - columns=["a", "b", "c"], + [[pd.to_datetime(2, unit="s")], [pd.to_datetime(4, unit="s")]], + columns=["c"], index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]), ) tm.assert_frame_equal(result, expected) @@ -1401,11 +1277,9 @@ def 
test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1452,10 +1326,9 @@ def test_apply_index_key_error_bug(index_values): ) def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 - expected = DataFrame({"col": arg}, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + df = DataFrame({"grp": arg, "col": arg}, index=idx) + result = df.groupby("grp", group_keys=False).apply(lambda x: x) + expected = df[["col"]] tm.assert_frame_equal(result, expected) @@ -1502,19 +1375,12 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("include_groups", [True, False]) -def test_include_groups(include_groups): +def test_include_groups(): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = DeprecationWarning if include_groups else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - result = gb.apply(lambda x: x.sum(), include_groups=include_groups) - expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) - if not include_groups: - expected = expected[["b"]] - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match="include_groups=True is no longer allowed"): + gb.apply(lambda x: x.sum(), include_groups=True) @pytest.mark.parametrize("func, value", [(max, 2), (min, 1), (sum, 3)]) @@ -1523,7 +1389,7 @@ def test_builtins_apply(func, value): # Builtins act as e.g. 
sum(group), which sums the column labels of group df = DataFrame({0: [1, 1, 2], 1: [3, 4, 5], 2: [3, 4, 5]}) gb = df.groupby(0) - result = gb.apply(func, include_groups=False) + result = gb.apply(func) expected = Series([value, value], index=Index([1, 2], name=0)) tm.assert_series_equal(result, expected) @@ -1544,9 +1410,7 @@ def f_0(grp): return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_0)[["B"]] + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -1554,9 +1418,7 @@ def f_1(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_1)[["B"]] + result = df.groupby("A").apply(f_1)[["B"]] e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -1566,9 +1428,7 @@ def f_2(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_2)[["B"]] + result = df.groupby("A").apply(f_2)[["B"]] e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -1579,9 +1439,7 @@ def f_3(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_3)[["C"]] + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -1592,9 +1450,7 @@ def f_4(grp): return None return grp.iloc[0].loc["C"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_4) + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index fa20efad4da77..970334917faab 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -13,16 +13,10 @@ def test_group_by_copy(): } ).set_index("name") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_same_value = df.groupby(["age"], group_keys=False).apply( - lambda group: group - ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) + grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) tm.assert_frame_equal(grp_by_same_value, grp_by_copy) @@ -53,11 +47,8 @@ def f_no_copy(x): x["rank"] = x.val.rank(method="min") return x.groupby("cat2")["rank"].min() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) 
+ grpby_copy = df.groupby("cat1").apply(f_copy) + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -67,11 +58,8 @@ def test_no_mutate_but_looks_like(): # second does not, but should yield the same results df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].value) + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.value) tm.assert_series_equal(result1, result2) @@ -85,9 +73,7 @@ def fn(x): x.loc[x.index[-1], "col2"] = 0 return x.col2 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["col1"], as_index=False).apply(fn) + result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], index=range(6), diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1e86b5401ee09..656a61de5d105 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -129,10 +127,8 @@ def test_basic_string(using_infer_string): def f(x): return x.drop_duplicates("person_name").iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() + result = g.apply(f) + expected = x[["person_name"]].iloc[[0, 1]] expected.index = Index([1, 2], name="person_id") dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) @@ -316,14 +312,11 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(lambda x: 1) + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_observed(observed): +def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -331,6 +324,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) @@ -1356,11 +1353,7 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = DataFrame({"var": ["a", "a", "b", 
"b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var").apply( - lambda rows: DataFrame( - {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} - ) - ) + df.groupby("var").apply(lambda rows: DataFrame({"val": [rows.iloc[-1]["vau"]]})) def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): @@ -1962,10 +1955,7 @@ def test_category_order_transformer( df = df.set_index(keys) args = get_groupby_method_args(transformation_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = FutureWarning if transformation_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, transformation_func)(*args) + op_result = getattr(gb, transformation_func)(*args) result = op_result.index.get_level_values("a").categories expected = Index([1, 4, 3, 2]) tm.assert_index_equal(result, expected) @@ -2036,10 +2026,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = DeprecationWarning if method == "apply" and index_kind == "range" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 47ad18c9ad2c8..679f7eb7f7f11 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,9 +289,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + right = df.groupby(key).apply(DataFrame.count) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3305b48a4dcdc..c4c1e7bd9ac4f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -66,11 +66,9 @@ def test_groupby_nonobject_dtype_mixed(): def max_value(group): return group.loc[group["value"].idxmax()] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - applied = df.groupby("A").apply(max_value) + applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = df.dtypes + expected = df.drop(columns="A").dtypes tm.assert_series_equal(result, expected) @@ -229,11 +227,8 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("a").apply(f1) + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -1055,17 +1050,13 @@ def 
summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. - return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) + return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["C"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize) + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize, "metrics") + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize_random_name) + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1281,7 +1272,6 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -1290,7 +1280,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -1362,10 +1352,8 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("key", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["name"]]) def test_skip_group_keys(): @@ -1442,9 +1430,7 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(f) + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1465,10 +1451,7 @@ def f(group): names.append(group.name) return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("a", sort=False, group_keys=False).apply(f) - + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1673,9 +1656,7 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - g.apply(test_sort) + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -1861,10 +1842,8 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x) - assert (res.dtypes == df.dtypes).all() + res = gb.apply(lambda x: x) + assert (res.dtypes == df.drop(columns=1).dtypes).all() def test_tuple_as_grouping(): @@ -2099,36 +2078,14 @@ def test_group_on_empty_multiindex(transformation_func, request): df["col_3"] = df["col_3"].astype(int) df["col_4"] = df["col_4"].astype(int) df = df.set_index(["col_1", "col_2"]) - if transformation_func == "fillna": - args = ("ffill",) - else: - args = () - warn = FutureWarning if transformation_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - with tm.assert_produces_warning(warn, match=warn_msg): - expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df.groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = ( - df["col_3"] - .iloc[:0] - .groupby(["col_1"]) - .transform(transformation_func, *args) - ) - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = ( - df["col_3"] - .groupby(["col_1"]) - .transform(transformation_func, *args) - .iloc[:0] - ) + result = df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df["col_3"].groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) @@ -2345,7 +2302,6 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, numeric_only): # GH 15106 & GH 41998 @@ -2354,7 +2310,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + columns = ["C"] if not numeric_only else Index([], dtype="str") else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2362,7 +2318,6 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2373,7 +2328,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d42aa06d6bbfe..8c4ab42b7be7a 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import 
na_value_for_dtype @@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ @@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a")) tm.assert_frame_equal(grouped, expected) @@ -326,9 +323,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index a1f4627475bab..3ee9c9ea0c7fd 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -36,11 +36,11 @@ def test_groupby_preserves_subclass(obj, groupby_func): args = get_groupby_method_args(groupby_func, obj) - warn = FutureWarning if groupby_func == "fillna" else None - msg = f"{type(grouped).__name__}.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + warn = FutureWarning if groupby_func == "corrwith" else None + msg = f"{type(grouped).__name__}.corrwith is deprecated" + with tm.assert_produces_warning(warn, match=msg): result1 = getattr(grouped, groupby_func)(*args) - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + with tm.assert_produces_warning(warn, match=msg): result2 = grouped.agg(groupby_func, *args) # Reduction or transformation kernels should preserve type @@ -72,18 +72,11 @@ def func(group): assert group.testattr == "hello" return group.testattr - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = custom_df.groupby("c").apply(func) + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) - result = custom_df.groupby("c").apply(func, include_groups=False) + result = custom_df.groupby("c").apply(func) tm.assert_series_equal(result, expected) # https://github.com/pandas-dev/pandas/pull/56761 @@ -124,12 +117,5 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = df.groupby("Buyer").resample("5D").sum() + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py 
b/pandas/tests/groupby/test_grouping.py index 6bb2eaf89b5d7..53e9c53efebf7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -235,11 +233,7 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.sum()) - expected["A"] = [0, 2, 4] - expected = expected.loc[:, ["A", "B"]] + result = g.apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) def test_grouper_creation_bug2(self): @@ -779,10 +773,21 @@ def test_evaluate_with_empty_groups(self, func, expected): # (not testing other agg fns, because they return # different index objects. df = DataFrame({1: [], 2: []}) - g = df.groupby(1, group_keys=False) + g = df.groupby(1, group_keys=True) result = getattr(g[2], func)(lambda x: x) tm.assert_series_equal(result, expected) + def test_groupby_apply_empty_with_group_keys_false(self): + # GH#60471 + # test applying empty groups with group_keys False + # (not testing other agg fns, because they return + # different index objects.) + df = DataFrame({"A": [], "B": [], "C": []}) + g = df.groupby("A", group_keys=False) + result = g.apply(lambda x: x / x.sum()) + expected = DataFrame({"B": [], "C": []}, index=None) + tm.assert_frame_equal(result, expected) + def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 s = Series([], name="name", dtype="float64") @@ -807,7 +812,6 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -817,7 +821,7 @@ def test_groupby_level_index_value_all_na(self): expected = DataFrame( data=[], index=MultiIndex( - levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + levels=[Index(["x"], dtype="str"), Index([], dtype="float64")], codes=[[], []], names=["A", "B"], ), @@ -864,9 +868,7 @@ def test_groupby_tuple_keys_handle_multiindex(self): } ) expected = df.sort_values(by=["category_tuple", "num1"]) - result = df.groupby("category_tuple").apply( - lambda x: x.sort_values(by="num1"), include_groups=False - ) + result = df.groupby("category_tuple").apply(lambda x: x.sort_values(by="num1")) expected = expected[result.columns] tm.assert_frame_equal(result.reset_index(drop=True), expected) @@ -981,12 +983,13 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str") + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index cb4569812f600..0779faa8d8975 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ 
-278,14 +278,11 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): kernel in ("first", "last") or ( # kernels that work on any dtype and don't have numeric_only arg - kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") + kernel in ("any", "all", "bfill", "ffill", "nth", "nunique") and numeric_only is lib.no_default ) ): - warn = FutureWarning if kernel == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = method(*args, **kwargs) + result = method(*args, **kwargs) assert "b" in result.columns elif has_arg: assert numeric_only is not True diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 1044c83e3e56b..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -11,7 +8,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 @@ -39,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 1e0a15d0ba796..789105c275625 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -144,7 +144,6 @@ def test_groupby_raises_string( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -211,10 +210,7 @@ def test_groupby_raises_string( elif groupby_func == "corrwith": msg = "Cannot perform reduction 'mean' with string dtype" - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -301,7 +297,6 @@ def test_groupby_raises_datetime( "cumsum": (TypeError, "datetime64 type does not support operation 'cumsum'"), "diff": (None, ""), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -333,10 +328,7 @@ def test_groupby_raises_datetime( "var": (TypeError, "datetime64 type does not support operation 'var'"), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -457,7 +449,6 @@ def test_groupby_raises_category( r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'", ), "ffill": (None, ""), - "fillna": (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -532,10 +523,7 @@ def test_groupby_raises_category( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if 
groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -650,7 +638,6 @@ def test_groupby_raises_category_on_category( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups @@ -710,10 +697,7 @@ def test_groupby_raises_category_on_category( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index a6ea1502103c5..51c7eab2bfa82 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -470,8 +468,7 @@ def test_max_min_non_numeric(): assert "ss" in result -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_max_min_object_multiple_columns(): +def test_max_min_object_multiple_columns(using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -484,7 +481,7 @@ def test_max_min_object_multiple_columns(): } ) df._consolidate_inplace() # should already be consolidate, but double-check - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 3 if using_infer_string else 2 gb = df.groupby("A") diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index ee4973cbf18af..550efe9187fe8 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -76,6 +76,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + # TODO(infer_string) resample sum introduces 0's + # https://github.com/pandas-dev/pandas/issues/60229 @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 @@ -481,12 +483,8 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -502,11 +500,8 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(DeprecationWarning, 
match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -932,9 +927,7 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj.index.nlevels == 1 # function that returns a Series - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x["Quantity"] * 2) + res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") expected = DataFrame( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 5b8fa96291c9f..888b97f2e0206 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -331,9 +329,6 @@ def test_transform_transformation_func(transformation_func): if transformation_func == "cumcount": test_op = lambda x: x.transform("cumcount") mock_op = lambda x: Series(range(len(x)), x.index) - elif transformation_func == "fillna": - test_op = lambda x: x.transform("fillna", value=0) - mock_op = lambda x: x.fillna(value=0) elif transformation_func == "ngroup": test_op = lambda x: x.transform("ngroup") counter = -1 @@ -536,15 +531,13 @@ def f(group): return group[:1] grouped = df.groupby("c") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert result["d"].dtype == np.float64 # this is by definition a mutating operation! 
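The group.drop(columns="c") adjustment in the loop that follows exists because iteration and apply now see different frames: iterating a GroupBy still yields the grouping column, while .apply hands the UDF a frame without it. A short sketch of the asymmetry (data is illustrative, not from the diff):

    import pandas as pd

    df = pd.DataFrame({"c": ["x", "x", "y"], "d": [1.0, 2.0, 3.0]})
    gb = df.groupby("c")

    # .apply sees each group without the grouping column "c" ...
    applied = gb.apply(lambda g: g[:1])
    assert "c" not in applied.columns

    # ... while plain iteration still includes it, so comparisons
    # against the apply result must drop "c" first.
    for key, group in gb:
        assert "c" in group.columns
        pd.testing.assert_frame_equal(group.drop(columns="c")[:1], applied.loc[key])
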
for key, group in grouped: - res = f(group) + res = f(group.drop(columns="c")) tm.assert_frame_equal(res, result.loc[key]) @@ -690,18 +683,14 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): - warn = None - else: - warn = DeprecationWarning - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gb.apply(targop) + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": expected["string_missing"] = expected["string_missing"].fillna(np.nan) - expected["string"] = expected["string"].fillna(np.nan) + by = gb_target.get("by") + if not isinstance(by, (str, list)) or (by != "string" and "string" not in by): + expected["string"] = expected["string"].fillna(np.nan) result = gb[expected.columns].transform(op, *args).sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -1034,20 +1023,19 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + expected1 = Series(["+1.35"], name="val") tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val") tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object) @@ -1439,11 +1427,7 @@ def test_null_group_str_transformer_series(dropna, transformation_func): dtype = object if transformation_func in ("any", "all") else None buffer.append(Series([np.nan], index=[3], dtype=dtype)) expected = concat(buffer) - - warn = FutureWarning if transformation_func == "fillna" else None - msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = gb.transform(transformation_func, *args) + result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f858ae137ca4e..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -59,6 +59,9 @@ def test_repr_floats(self): expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result 
== expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 787461b944bd0..5783a16e81d37 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -340,6 +340,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index e85091aaae608..f7544cf62e5fa 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( CategoricalIndex, @@ -754,13 +752,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): tm.assert_index_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_union_with_na_when_constructing_dataframe(): # GH43222 series1 = Series( (1,), index=MultiIndex.from_arrays( - [Series([None], dtype="string"), Series([None], dtype="string")] + [Series([None], dtype="str"), Series([None], dtype="str")] ), ) series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 89648bc316c16..2c5968314e5cf 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: def test_get_indexer_non_unique_nas(self, nulls_fixture): diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,51 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + 
def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 
3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 19b46d9b2c15f..06df8902f319c 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -8,12 +8,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import ( - HAS_PYARROW, - IS64, -) +from pandas.compat import IS64 from pandas.errors import InvalidIndexError import pandas.util._test_decorators as td @@ -823,11 +818,6 @@ def test_isin(self, values, index, expected): expected = np.array(expected, dtype=bool) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_isin_nan_common_object( self, nulls_fixture, nulls_fixture2, using_infer_string ): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 5f934ca3e6e83..58b69d79c65ce 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -525,6 +525,7 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 29ce9d0c03111..b80b4b923c247 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -401,7 +399,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) @@ -410,13 +407,12 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") result = pd.api.interchange.from_dataframe(df.__dataframe__()) - expected = pd.DataFrame({"a": ["x"]}, dtype="object") + expected = pd.DataFrame({"a": ["x"]}, dtype="str") tm.assert_frame_equal(result, expected) @@ -427,7 +423,6 @@ def test_non_str_names(): assert names == ["0"] -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) @@ -438,7 +433,7 @@ def test_non_str_names_w_duplicates(): "Expected a Series, got a DataFrame. 
This likely happened because you " "called __dataframe__ on a DataFrame which, after converting column " r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " - r"dtype='object'\). Please rename these columns before using the " + r"dtype='(str|object)'\). Please rename these columns before using the " "interchange protocol." ), ): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 3989e022dbbd2..34824f0a67985 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -625,7 +623,6 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): expected = DataFrame(expected) tm.assert_frame_equal(actual, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel): # GH#36712 if read_ext in (".xlsb", ".xls"): diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index f70e65e34c584..71ef1201e523f 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -9,6 +9,9 @@ from pandas import ( DataFrame, + MultiIndex, + Timestamp, + period_range, read_excel, ) import pandas._testing as tm @@ -333,3 +336,26 @@ def test_styler_to_s3(s3_public_bucket, s3so): f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so ) tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("merge_cells", [True, False, "columns"]) +def test_format_hierarchical_rows_periodindex(merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + formatter = ExcelFormatter(df, merge_cells=merge_cells) + formatted_cells = formatter._format_hierarchical_rows() + + for cell in formatted_cells: + if cell.row != 0 and cell.col == 0: + assert isinstance( + cell.val, Timestamp + ), "Period should be converted to Timestamp" diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 81aa0be24bffc..ced4feb9e7eb9 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -13,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -25,6 +23,7 @@ MultiIndex, date_range, option_context, + period_range, ) import pandas._testing as tm @@ -337,6 +336,43 @@ def test_multiindex_interval_datetimes(self, tmp_excel): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("merge_cells", [True, False, "columns"]) + def test_excel_round_trip_with_periodindex(self, tmp_excel, merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + df.to_excel(tmp_excel, merge_cells=merge_cells) + result = pd.read_excel(tmp_excel, index_col=[0, 1]) + expected = DataFrame( + {"A": [1, 2]}, + MultiIndex.from_arrays( + [ + [ + pd.to_datetime("2006-10-06 00:00:00"), + pd.to_datetime("2006-10-07 00:00:00"), + ], + ["X", "Y"], + ], + names=["date", "category"], + ), + 
) + time_format = ( + "datetime64[s]" if tmp_excel.endswith(".ods") else "datetime64[us]" + ) + expected.index = expected.index.set_levels( + expected.index.levels[0].astype(time_format), level=0 + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "engine,ext", @@ -764,6 +800,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, tmp_excel, using_infer_string): # see gh-19242 # @@ -870,27 +909,49 @@ def test_to_excel_multiindex_nan_label(self, merge_cells, tmp_excel): # Test for Issue 11328. If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells - def test_to_excel_multiindex_cols(self, merge_cells, frame, tmp_excel): + def test_to_excel_multiindex_cols(self, merge_cells, tmp_excel): + # GH#11328 + frame = DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + "C": [7, 8, 9], + } + ) arrays = np.arange(len(frame.index) * 2, dtype=np.int64).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1)]) frame.columns = new_cols_index - header = [0, 1] - if not merge_cells: - header = 0 - - # round trip frame.to_excel(tmp_excel, sheet_name="test1", merge_cells=merge_cells) + + # Check round trip with ExcelFile(tmp_excel) as reader: - df = pd.read_excel( - reader, sheet_name="test1", header=header, index_col=[0, 1] + result = pd.read_excel( + reader, sheet_name="test1", header=[0, 1], index_col=[0, 1] ) + tm.assert_frame_equal(result, frame) + + # GH#60274 + # Check with header/index_col None to determine which cells were merged + with ExcelFile(tmp_excel) as reader: + result = pd.read_excel( + reader, sheet_name="test1", header=None, index_col=None + ) + expected = DataFrame( + { + 0: [np.nan, np.nan, "first", 0, 1, 2], + 1: [np.nan, np.nan, "second", 3, 4, 5], + 2: [40.0, 1.0, np.nan, 1.0, 2.0, 3.0], + 3: [np.nan, 2.0, np.nan, 4.0, 5.0, 6.0], + 4: [50.0, 1.0, np.nan, 7.0, 8.0, 9.0], + } + ) if not merge_cells: - fm = frame.columns._format_multi(sparsify=False, include_names=False) - frame.columns = [".".join(map(str, q)) for q in zip(*fm)] - tm.assert_frame_equal(frame, df) + # MultiIndex column value is repeated + expected.loc[0, 3] = 40.0 + tm.assert_frame_equal(result, expected) def test_to_excel_multiindex_dates(self, merge_cells, tmp_excel): # try multiindex with dates @@ -1365,12 +1426,11 @@ def test_freeze_panes(self, tmp_excel): result = pd.read_excel(tmp_excel, index_col=0) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_path_path_lib(self, engine, ext): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD")), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + index=Index([f"i-{i}" for i in range(30)]), ) writer = partial(df.to_excel, engine=engine) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index e9fc2b2d27afd..ff8a1b9f570ab 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -488,9 +488,11 @@ def test_repr_html_ok(self, styler): def 
test_repr_html_mathjax(self, styler): # gh-19824 / 41395 assert "tex2jax_ignore" not in styler._repr_html_() + assert "mathjax_ignore" not in styler._repr_html_() with option_context("styler.html.mathjax", False): assert "tex2jax_ignore" in styler._repr_html_() + assert "mathjax_ignore" in styler._repr_html_() def test_update_ctx(self, styler): styler._update_ctx(DataFrame({"A": ["color: red", "color: blue"]})) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0dc16e1ebc723..86682e8160762 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -129,6 +129,20 @@ def test_repr_truncation_preserves_na(self): with option_context("display.max_rows", 2, "display.show_dimensions", False): assert repr(df) == " a\n0 \n.. ...\n9 " + def test_repr_truncation_dataframe_attrs(self): + # GH#60455 + df = DataFrame([[0] * 10]) + df.attrs["b"] = DataFrame([]) + with option_context("display.max_columns", 2, "display.show_dimensions", False): + assert repr(df) == " 0 ... 9\n0 0 ... 0" + + def test_repr_truncation_series_with_dataframe_attrs(self): + # GH#60568 + ser = Series([0] * 10) + ser.attrs["b"] = DataFrame([]) + with option_context("display.max_rows", 2, "display.show_dimensions", False): + assert repr(ser) == "0 0\n ..\n9 0\ndtype: int64" + def test_max_colwidth_negative_int_raises(self): # Deprecation enforced from: # https://github.com/pandas-dev/pandas/issues/31532 diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 1009dfec53218..3b63011bf862e 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -3,11 +3,33 @@ from collections.abc import Mapping import string +import pytest + import pandas._config.config as cf +import pandas as pd + from pandas.io.formats import printing +@pytest.mark.parametrize( + "input_names, expected_names", + [ + (["'a b"], "['\\'a b']"), # Escape leading quote + (["test's b"], "['test\\'s b']"), # Escape apostrophe + (["'test' b"], "['\\'test\\' b']"), # Escape surrounding quotes + (["test b'"], "['test b\\'']"), # Escape single quote + (["test\n' b"], "['test\\n\\' b']"), # Escape quotes, preserve newline + ], +) +def test_formatted_index_names(input_names, expected_names): + # GH#60190 + df = pd.DataFrame({name: [1, 2, 3] for name in input_names}).set_index(input_names) + formatted_names = str(df.index.names) + + assert formatted_names == expected_names + + def test_adjoin(): data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] expected = "a dd ggg\nb ee hhh\nc ff iii" diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 8031f67cd0567..b1a437bfdbd8a 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -934,9 +934,11 @@ def test_repr_html(self, float_frame): def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) assert "tex2jax_ignore" not in df._repr_html_() + assert "mathjax_ignore" not in df._repr_html_() with option_context("display.html.use_mathjax", False): assert "tex2jax_ignore" in df._repr_html_() + assert "mathjax_ignore" in df._repr_html_() def test_repr_html_wide(self): max_cols = 20 diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 1de53993fe646..8d46442611719 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -1405,3 +1405,88 @@ def 
test_to_latex_multiindex_multirow(self): """ ) assert result == expected + + def test_to_latex_multiindex_format_single_index_hidden(self): + # GH 52218 + df = DataFrame( + { + "A": [1, 2], + "B": [4, 5], + } + ) + result = ( + df.style.hide(axis="index") + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rr} + \textbf{A} & \textbf{B} \\ + 1 & 4 \\ + 2 & 5 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_two_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], + ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{lrrr} + & \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + Level 2 & & & \\ + x & 0 & 0 & 0 \\ + x & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + y & 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected + + def test_to_latex_multiindex_format_triple_index_all_hidden(self): + # GH 52218 + arrays = [ + ["A", "A", "B", "B"], + ["one", "two", "one", "two"], + ["x", "x", "y", "y"], + ] + index = pd.MultiIndex.from_arrays( + arrays, names=["Level 0", "Level 1", "Level 2"] + ) + df = DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=index, + columns=["C1", "C2", "C3"], + ) + result = ( + df.style.hide(axis="index", level=[0, 1, 2]) + .map_index(lambda v: "textbf:--rwrap;", axis="columns") + .to_latex() + ) + expected = _dedent(r""" + \begin{tabular}{rrr} + \textbf{C1} & \textbf{C2} & \textbf{C3} \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + 0 & 0 & 0 \\ + \end{tabular} + """) + assert result == expected diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 5731f74a03852..af3cdf2d44af3 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -422,6 +422,24 @@ def test_to_string_complex_float_formatting(self): ) assert result == expected + def test_to_string_complex_float_formatting_with_exponents(self): + # GH #60393 + with option_context("display.precision", 6): + df = DataFrame( + { + "x": [ + (1.8816e-09 + 0j), + (1.8816e-09 + 3.39676e-09j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 1.881600e-09+0.000000e+00j\n" + "1 1.881600e-09+3.396760e-09j" + ) + assert result == expected + def test_to_string_format_inf(self): # GH#24861 df = DataFrame( diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 7f367ded39863..7936982e4a055 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -27,10 +25,6 @@ set_default_names, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def df_schema(): @@ -127,7 +121,7 @@ def test_multiindex(self, df_schema, using_infer_string): expected["fields"][0] = { "name": "level_0", "type": "any", - "extDtype": "string", + "extDtype": "str", } 
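# With the future string option enabled, build_table_schema reports
# inferred-string columns with extDtype "str" rather than "string", so both
# the string index level above and column "B" below expect "str".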
expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d3328d1dfcaef..ad9dbf7554a8b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -84,7 +84,7 @@ def datetime_frame(self): # since that doesn't round-trip, see GH#33711 df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=30, freq="B"), ) df.index = df.index._with_freq(None) @@ -184,7 +184,6 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -270,7 +269,6 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -698,7 +696,6 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -710,6 +707,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series): if orient != "split": expected.name = None + if using_string_dtype(): + expected = expected.astype("str") + tm.assert_series_equal(result, expected) def test_series_roundtrip_empty(self, orient): @@ -808,7 +808,6 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -821,7 +820,6 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -912,7 +910,6 @@ def test_convert_dates_infer(self, infer_word): result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -973,7 +970,6 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_date_unit(self, unit, datetime_frame): df = datetime_frame df["date"] = Timestamp("20130101 20:43:42").as_unit("ns") @@ -1114,7 +1110,6 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan) tm.assert_frame_equal(res, df) - 
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -1555,7 +1550,6 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index a6504473fb55f..65ad7273666e5 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -231,8 +229,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -260,8 +257,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): "Specify dtype option on import or set low_memory=False.", buf, ) - - assert df.a.dtype == object + if parser.engine == "c" and parser.low_memory: + assert df.a.dtype == object + elif using_infer_string: + assert df.a.dtype == "str" + else: + assert df.a.dtype == object @pytest.mark.parametrize("iterator", [True, False]) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 511db2c6a33d8..3680273f5e98a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index d8b8f24abcedd..cef57318195ec 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -15,8 +15,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import WASM from pandas.errors import ( EmptyDataError, @@ -71,14 +69,13 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) 
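# With string inference on by default, list("ABCD") and the f-string labels
# infer the str dtype at construction, so the explicit dtype=object (and the
# xfail that guarded it) are no longer needed for the round trip to match.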
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 54b59ac4e25ed..8352cc80f5e62 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -9,8 +9,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -88,9 +86,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): +def test_multi_index_no_level_names( + request, all_parsers, index_col, using_infer_string +): + if using_infer_string and all_parsers.engine == "pyarrow": + # result should have string columns instead of object dtype + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e02562ac8d93d..75b7cf0d42cb8 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas as pd @@ -57,7 +55,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -71,7 +68,6 @@ def test_dtype_per_column(all_parsers): [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] ) expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 9226f265ca2b3..11a30a26f91ef 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import WASM from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( @@ -184,8 +182,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -210,8 +207,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): 
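The fork on `using_infer_string` above is the recurring pattern in these test updates: instead of xfailing whole tests under the future string dtype, assertions now branch on the active default. A minimal sketch of the behavior being asserted, assuming the `future.infer_string` option available in pandas 2.x builds:

    from io import StringIO

    import pandas as pd

    # Assumption for illustration: opt in to the future default explicitly;
    # the pandas CI jobs enable this globally rather than per test.
    pd.set_option("future.infer_string", True)

    df = pd.read_csv(StringIO("a,b\nx,1\ny,2"))

    # Text columns now come back as the new str-backed string dtype rather
    # than object, which is what the using_infer_string branches assert.
    print(df["a"].dtype)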
diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 0423327c7333c..c6ba2213033ea 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -188,7 +186,6 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 @@ -207,7 +204,7 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index ce2ed5e9764bd..9977e2b8e1a1d 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -345,7 +343,6 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 @@ -356,7 +353,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine") ) result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) - expected = DataFrame({"b": [2]}, index=Index([val], name="a")) + expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 6a2ae3bffdc74..d3789cd387c05 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -8,9 +8,10 @@ import pytest -from pandas._config import using_string_dtype - -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -121,7 +122,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 @@ -133,7 +133,7 @@ def test_mangled_unnamed_placeholders(all_parsers): # This test recursively updates `df`. 
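# Under string inference, read_csv produces a str-dtype column Index, so the
# initially empty `expected` seeds its columns with dtype="str" to match.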
for i in range(3): - expected = DataFrame() + expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 89645b526f2ee..3a68d38cc0bde 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -261,7 +259,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -299,7 +296,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -317,8 +316,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) expected = DataFrame(expected) @@ -429,8 +429,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -438,14 +436,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) data = """\ A,B 1,A nan,B 3,C """ - parser = all_parsers result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) @@ -536,7 +541,6 @@ def test_na_values_dict_aliasing(all_parsers): tm.assert_dict_equal(na_values, na_values_copy) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_na_values_dict_null_column_name(all_parsers): # see gh-57547 parser = all_parsers @@ -560,11 +564,10 @@ def test_na_values_dict_null_column_name(all_parsers): return expected = DataFrame( - {None: ["MA", "NA", "OA"], "x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]} + {"x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]}, + index=Index(["MA", "NA", "OA"], dtype=object), ) - expected = expected.set_index(None) - result = parser.read_csv( StringIO(data), index_col=0, diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index d0ce0b5b2af72..5688e3ce4a243 100644 --- 
a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -13,8 +13,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -421,7 +419,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -429,7 +426,7 @@ def test_parse_timezone(all_parsers): ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame({0: [date_string]}, dtype="str") result = parser.read_csv( StringIO(date_string), header=None, @@ -609,7 +606,6 @@ def test_date_parser_usecols_thousands(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -642,7 +638,7 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") # A. use dayfirst=True res5 = read_csv( @@ -752,7 +748,6 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers @@ -762,7 +757,7 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="str", name="a", ) warn = None diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 01e576ba40f26..bc4c4c2e24e9c 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 541cc39606047..b5e97314caf03 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -26,10 +24,6 @@ init_qt_clipboard, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def build_kwargs(sep, excel): kwargs = {} @@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine, using_infer_string ): # GH#50502 if dtype_backend == "pyarrow": @@ -396,6 +390,11 @@ def 
test_read_clipboard_dtype_backend( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 4f3f613f71542..70422a0ea6edc 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -140,7 +140,6 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -155,6 +154,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() + # TODO will have to update this when pyarrow' to_pandas() is fixed + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed @@ -338,7 +339,6 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in ["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -365,7 +365,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll @@ -438,14 +438,13 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. 
@@ -455,8 +454,8 @@ def test_binary_mode(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -473,8 +472,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"): @@ -504,15 +503,14 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: @@ -525,13 +523,12 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with open(path, "wb") as handle: @@ -556,8 +553,8 @@ def test_explicit_encoding(io_class, mode, msg): # wrong mode is requested expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): @@ -674,3 +671,17 @@ def test_pickle_reader(reader): # GH 22265 with BytesIO() as buffer: pickle.dump(reader, buffer) + + +@td.skip_if_no("pyarrow") +def test_pyarrow_read_csv_datetime_dtype(): + # GH 59904 + data = '"date"\n"20/12/2025"\n""\n"31/12/2020"' + result = pd.read_csv( + StringIO(data), parse_dates=["date"], dayfirst=True, dtype_backend="pyarrow" + ) + + expect_data = pd.to_datetime(["20/12/2025", pd.NaT, "31/12/2020"], dayfirst=True) + expect = pd.DataFrame({"date": expect_data}) + + tm.assert_frame_equal(expect, result) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 5eb202dd5aa24..fd1e9b4fdf211 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,8 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas as pd @@ -139,7 +137,6 @@ def test_compression_warning(compression_only): df.to_csv(handles.handle, 
compression=compression_only) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. @@ -148,8 +145,8 @@ def test_compression_binary(compression_only): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) # with a file @@ -180,8 +177,8 @@ def test_gzip_reproducibility_file_name(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -203,8 +200,8 @@ def test_gzip_reproducibility_file_object(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index aa9c47ea0e63c..5340560884afe 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,6 +5,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, date_range, @@ -176,7 +178,9 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" +) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a9e7b2da03a4d..f68ef5fa2e0e5 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -158,7 +156,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension @@ -171,8 +168,8 @@ def test_to_csv_compression_encoding_gcs( """ df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # reference of compressed and encoded file @@ -208,7 +205,6 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 
73e9933e3681b..bef28c4f027da 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html): tm.assert_frame_equal(result, expected) + def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html): + # GH60210 + + result = flavor_read_html( + StringIO( + """ + + + + + + + + + + + + +
+            <table>
+                <tr>
+                    <th rowspan="2">A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>1</td>
+                </tr>
+                <tr>
+                    <td>C</td>
+                    <td>2</td>
+                </tr>
+            </table>
+ """ + ) + )[0] + + expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 result = flavor_read_html( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6ef7105cf5ccc..7919bb956dc7a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1172,11 +1172,33 @@ def test_non_nanosecond_timestamps(self, temp_file): ) tm.assert_frame_equal(result, expected) + def test_maps_as_pydicts(self, pa): + pyarrow = pytest.importorskip("pyarrow", "13.0.0") + + schema = pyarrow.schema( + [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))] + ) + df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}]) + check_round_trip( + df, + pa, + write_kwargs={"schema": schema}, + read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}}, + ) + class TestParquetFastParquet(Base): - @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") - def test_basic(self, fp, df_full): + def test_basic(self, fp, df_full, request): pytz = pytest.importorskip("pytz") + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=("datetime_with_nat gets incorrect values"), + ) + ) + tz = pytz.timezone("US/Eastern") df = df_full @@ -1213,11 +1235,17 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - @pytest.mark.xfail( - Version(np.__version__) >= Version("2.0.0"), - reason="fastparquet uses np.float_ in numpy2", - ) - def test_bool_with_none(self, fp): + def test_bool_with_none(self, fp, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0") and Version( + np.__version__ + ) >= Version("2.0.0"): + request.applymarker( + pytest.mark.xfail( + reason=("fastparquet uses np.float_ in numpy2"), + ) + ) df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes @@ -1331,10 +1359,19 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.xfail( - reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929" - ) - def test_timezone_aware_index(self, fp, timezone_aware_date_list): + def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=( + "fastparquet bug, see " + "https://github.com/dask/fastparquet/issues/929" + ), + ) + ) + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index beca8dea9407d..7e1220ecee218 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -60,7 +60,7 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), + pytest.mark.single_cpu, ] @@ -685,6 +685,7 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture def postgresql_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_postgresql") from adbc_driver_postgresql 
import dbapi @@ -817,6 +818,7 @@ def sqlite_conn_types(sqlite_engine_types): @pytest.fixture def sqlite_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi @@ -957,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data): adbc_connectable_iris = [ pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), - pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), + "sqlite_adbc_iris", ] adbc_connectable_types = [ pytest.param("postgresql_adbc_types", marks=pytest.mark.db), - pytest.param("sqlite_adbc_types", marks=pytest.mark.db), + "sqlite_adbc_types", ] @@ -986,13 +988,13 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): - if conn == "postgresql_adbc_conn": + if conn == "postgresql_adbc_conn" and not using_string_dtype(): request.node.add_marker( pytest.mark.xfail( - reason="postgres ADBC driver cannot insert index with null type", - strict=True, + reason="postgres ADBC driver < 1.2 cannot insert index with null type", ) ) + # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -3557,7 +3559,8 @@ def test_read_sql_dtype_backend( result = getattr(pd, func)( f"Select * from {table}", conn, dtype_backend=dtype_backend ) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3607,7 +3610,7 @@ def test_read_sql_dtype_backend_table( with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -4123,7 +4126,7 @@ def tquery(query, con=None): def test_xsqlite_basic(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10 @@ -4150,7 +4153,7 @@ def test_xsqlite_basic(sqlite_buildin): def test_xsqlite_write_row_by_row(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) frame.iloc[0, 0] = np.nan @@ -4173,7 +4176,7 @@ def test_xsqlite_write_row_by_row(sqlite_buildin): def test_xsqlite_execute(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4194,7 +4197,7 @@ def test_xsqlite_execute(sqlite_buildin): def test_xsqlite_schema(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 
8fa85d13bbdb5..9288b98d79fbe 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1719,7 +1719,6 @@ def test_date_parsing_ignores_format_details(self, column, datapath): formatted = df.loc[0, column + "_fmt"] assert unformatted == formatted - # @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("byteorder", ["little", "big"]) def test_writer_117(self, byteorder, temp_file, using_infer_string): original = DataFrame( diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 845f369d3090f..d18f098267599 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1070,28 +1070,43 @@ def test_boxplot_series_positions(self, hist_df): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(rot=50, fontsize=8, **kwargs) _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) _check_text_labels(ax.get_yticklabels(), labels) assert len(ax.lines) == 7 * len(numeric_cols) - @pytest.mark.filterwarnings("ignore:Attempt:UserWarning") + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib version 3.10", + ) def test_boxplot_vertical_subplots(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) axes = _check_plot_works( - df.plot.box, - default_axes=True, - subplots=True, - vert=False, - logx=True, + df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs ) _check_axes_shape(axes, axes_num=3, layout=(1, 3)) _check_ax_scales(axes, xaxis="log") @@ -1099,12 +1114,22 @@ def test_boxplot_vertical_subplots(self, hist_df): _check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == 7 + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical_positions(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(positions=positions, **kwargs) _check_text_labels(ax.get_yticklabels(), labels) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 4916963ab7c87..2267b6197cd80 100644 --- 
a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,7 @@ """Test cases for .boxplot method""" +from __future__ import annotations + import itertools import string @@ -22,6 +24,7 @@ _check_ticks_props, _check_visible, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -35,6 +38,17 @@ def _check_ax_limits(col, ax): assert y_max >= col.max() +if Version(mpl.__version__) < Version("3.10"): + verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}] +else: + verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}] + + +@pytest.fixture(params=verts) +def vert(request): + return request.param + + class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 @@ -312,7 +326,7 @@ def test_specified_props_kwd(self, props, expected): assert result[expected][0].get_color() == "C1" - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -322,11 +336,11 @@ def test_plot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) @@ -335,13 +349,13 @@ def test_plot_box(self, vert): xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) - df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) - df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert) + df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert) for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -351,11 +365,11 @@ def test_boxplot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_group_xlabel_ylabel(self, vert): df = DataFrame( { @@ -365,13 +379,19 @@ def test_boxplot_group_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert) for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) - def test_boxplot_group_no_xlabel_ylabel(self, vert): + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + def test_boxplot_group_no_xlabel_ylabel(self, vert, request): + if Version(mpl.__version__) >= Version("3.10") and vert == { + "orientation": "horizontal" + }: + request.applymarker( + pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10") + ) df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -379,9 
+399,13 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): "group": np.random.default_rng(2).choice(["group1", "group2"], 10), } ) - ax = df.boxplot(by="group", vert=vert) + ax = df.boxplot(by="group", **vert) for subplot in ax: - target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() + target_label = ( + subplot.get_xlabel() + if vert == {"vert": True} or vert == {"orientation": "vertical"} + else subplot.get_ylabel() + ) assert target_label == pprint_thing(["group"]) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 52ca66c218862..9675b936c171e 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -958,3 +958,16 @@ def test_plot_no_warning(self, ts): # TODO(3.0): this can be removed once Period[B] deprecation is enforced with tm.assert_produces_warning(False): _ = ts.plot() + + def test_secondary_y_subplot_axis_labels(self): + # GH#14102 + s1 = Series([5, 7, 6, 8, 7], index=[1, 2, 3, 4, 5]) + s2 = Series([6, 4, 5, 3, 4], index=[1, 2, 3, 4, 5]) + + ax = plt.subplot(2, 1, 1) + s1.plot(ax=ax) + s2.plot(ax=ax, secondary_y=True) + ax2 = plt.subplot(2, 1, 2) + s1.plot(ax=ax2) + assert len(ax.xaxis.get_minor_ticks()) == 0 + assert len(ax.get_xticklabels()) > 0 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 179f2c0e6cfa9..3a7fd548ca961 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1022,12 +1022,8 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("ID").resample("5min").sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + result = df.groupby("ID").resample("5min").sum() + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1046,9 +1042,7 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1821,12 +1815,8 @@ def f(data, add_arg): multiplier = 10 df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index b7b80b5e427ff..da1774cf22587 100644 --- 
a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -76,9 +76,7 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index ff1b82210e20d..e7850f96b3b0f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -71,12 +71,8 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("id").apply(f_0) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.set_index("date").groupby("id").resample("D").asfreq() + expected = df.groupby("id").apply(f_0) + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -90,12 +86,8 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("group").apply(f_1) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + expected = df.groupby("group").apply(f_1) + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -110,9 +102,7 @@ def test_getitem(test_frame): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.resample("2s").mean().B + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -236,12 +226,8 @@ def test_methods(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -258,12 +244,8 @@ def test_methods_nunique(test_frame): def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - 
expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -272,24 +254,18 @@ def test_apply(test_frame): r = g.resample("2s") # reduction - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.resample("2s").sum() + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = r.apply(f_0) + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(f_1) + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -357,9 +333,7 @@ def test_resample_groupby_with_label(unit): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("col0").resample("1W", label="left").sum() + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -369,9 +343,7 @@ def test_resample_groupby_with_label(unit): ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) - expected = DataFrame( - data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex - ) + expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex) tm.assert_frame_equal(result, expected) @@ -380,9 +352,7 @@ def test_consistency_with_window(test_frame): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("2s").mean() + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -479,13 +449,12 @@ def test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) - .set_index(TimedeltaIndex([]), append=True) + .set_index(TimedeltaIndex([]), append=True)[expected_columns] ) if len(keys) == 1: expected.index.name = keys[0] @@ -505,9 +474,7 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["key"]).resample("W", 
on="date").min() + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -519,7 +486,6 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): ) expected = DataFrame( { - "key": ["A"] * 3 + ["B"] * 3, "col1": [0, 5, 12] * 2, "col_object": ["val"] * 3 + [np.nan] * 3, }, @@ -557,12 +523,11 @@ def test_resample_no_index(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) - expected = expected.set_index("date", append=True, drop=True) + expected = expected.set_index("date", append=True, drop=True)[expected_columns] if len(keys) == 1: expected.index.name = keys[0] @@ -606,9 +571,7 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="h", periods=12), ) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("D").size() + result = df.groupby("A").resample("D").size() mi_exp = pd.MultiIndex.from_arrays( [ diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index f694b90a707c7..30e2c9dfe3d30 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -351,14 +351,11 @@ def test_groupby_resample_interpolate_raises(groupy_test_df): dfs = [groupy_test_df, groupy_test_df_without_index_name] for df in dfs: - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - with pytest.raises( - NotImplementedError, - match="Direct interpolation of MultiIndex data frames is " - "not supported", - ): - df.groupby("volume").resample("1D").interpolate(method="linear") + with pytest.raises( + NotImplementedError, + match="Direct interpolation of MultiIndex data frames is " "not supported", + ): + df.groupby("volume").resample("1D").interpolate(method="linear") def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): @@ -373,7 +370,6 @@ def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): for df in dfs: result = df.groupby("volume").apply( lambda x: x.resample("1D").interpolate(method="linear"), - include_groups=False, ) volume = [50] * 15 + [60] @@ -417,7 +413,7 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) See GH#21351.""" # GH#21351 result = groupy_test_df.groupby("volume").apply( - lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False + lambda x: x.resample("265h").interpolate(method="linear") ) volume = [50, 50, 60] diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index d8bb4fba1e1fe..63332fe4658e5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -733,6 +733,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, 
expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d8a9acdc561fd..f42f7f8232229 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2376,9 +2376,13 @@ def test_pivot_table_with_margins_and_numeric_columns(self): tm.assert_frame_equal(result, expected) - def test_pivot_ea_dtype_dropna(self, dropna): + @pytest.mark.parametrize( + "dtype,expected_dtype", [("Int64", "Float64"), ("int64", "float64")] + ) + def test_pivot_ea_dtype_dropna(self, dropna, dtype, expected_dtype): # GH#47477 - df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype="Int64")}) + # GH#47971 + df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype=dtype)}) result = df.pivot_table( index="x", columns="y", values="age", aggfunc="mean", dropna=dropna ) @@ -2386,7 +2390,7 @@ def test_pivot_ea_dtype_dropna(self, dropna): [[30]], index=Index(["a"], name="x"), columns=Index(["b"], name="y"), - dtype="Float64", + dtype=expected_dtype, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 1d5d16f39e648..081feae6fc43f 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.concat import union_categoricals import pandas as pd @@ -124,12 +122,15 @@ def test_union_categoricals_nan(self): exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [[], ["1"]]) def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 if using_infer_string and val == ["1"]: - request.applymarker(pytest.mark.xfail("object and strings dont match")) + request.applymarker( + pytest.mark.xfail( + reason="TODO(infer_string) object and strings don't match" + ) + ) res = union_categoricals([Categorical([]), Categorical(val)]) exp = Categorical(val) tm.assert_categorical_equal(res, exp) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 7aa6c6c0496a9..d65d425620c84 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -314,6 +314,17 @@ def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): assert result == expected + def test_timestamp_dst_transition(self): + # GH 60084 + dt_str = "2023-11-05 01:00-08:00" + tz_str = "America/Los_Angeles" + + ts1 = Timestamp(dt_str, tz=tz_str) + ts2 = ts1 + Timedelta(hours=0) + + assert ts1 == ts2 + assert hash(ts1) == hash(ts2) + class SubDatetime(datetime): pass diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index c153e800cb534..bec8ca13a2f5f 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -25,9 +25,10 @@ def test_list_getitem(list_dtype): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(list_dtype), + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]") + expected =
Series([2, None, None], dtype="int64[pyarrow]", name="a") tm.assert_series_equal(actual, expected) @@ -37,9 +38,15 @@ def test_list_getitem_index(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]", index=[1, 3, 7]) + expected = Series( + [2, None, None], + dtype="int64[pyarrow]", + index=[1, 3, 7], + name="a", + ) tm.assert_series_equal(actual, expected) @@ -48,6 +55,7 @@ def test_list_getitem_slice(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) if pa_version_under11p0: with pytest.raises( @@ -60,6 +68,7 @@ def test_list_getitem_slice(): [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) tm.assert_series_equal(actual, expected) @@ -68,9 +77,10 @@ def test_list_len(): ser = Series( [[1, 2, 3], [4, None], None], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.len() - expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32()), name="a") tm.assert_series_equal(actual, expected) @@ -78,12 +88,14 @@ def test_list_flatten(): ser = Series( [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.flatten() expected = Series( [1, 2, 3, 4, None, 7, 8], dtype=ArrowDtype(pa.int64()), index=[0, 0, 0, 2, 2, 4, 4], + name="a", ) tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 0d62317893326..158198239ba75 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -839,11 +839,6 @@ def test_series_where(self, obj, key, expected, raises, val, is_inplace): obj = obj.copy() arr = obj._values - if raises and obj.dtype == "string": - with pytest.raises(TypeError, match="Invalid value"): - obj.where(~mask, val) - return - res = obj.where(~mask, val) if val is NA and res.dtype == object: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3d1177c23c612..611b92eb022d6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1254,7 +1254,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts_internal(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") result_td = algos.value_counts_internal(td) tm.assert_series_equal(result_td, exp_td) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index e87498742061b..a23e6d9b3973a 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -295,6 +295,29 @@ def test_multiindex_insert_level_with_na(self, na): df[na, "B"] = 1 tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"])) + def test_multiindex_dt_with_nan(self): + # GH#60388 + df = DataFrame( + [ + [1, np.nan, 5, np.nan], + [2, np.nan, 6, np.nan], + [np.nan, 3, np.nan, 7], + [np.nan, 4, np.nan, 8], + ], + index=Series(["a", "b", "c", "d"], dtype=object, name="sub"), + columns=MultiIndex.from_product( + [ + ["value1", "value2"], + [datetime.datetime(2024, 11, 1), datetime.datetime(2024, 11, 2)], + ], + names=[None, "Date"], + ), + ) + df = df.reset_index() + result = df[df.columns[0]] + expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) + 
tm.assert_series_equal(result, expected) + class TestSorted: """everything you wanted to test about sorting""" diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ce41f1e76de79..e7ed8e855a762 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -537,11 +537,8 @@ def _argminmax_wrap(self, value, axis=None, func=None): nullnan = isna(nans) if res.ndim: res[nullnan] = -1 - elif ( - hasattr(nullnan, "all") - and nullnan.all() - or not hasattr(nullnan, "all") - and nullnan + elif (hasattr(nullnan, "all") and nullnan.all()) or ( + not hasattr(nullnan, "all") and nullnan ): res = -1 return res diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a9d3c235f63f6..74b051aec71a4 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2084,6 +2084,18 @@ def test_dataframe_str_dtype(self, df, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_float32_dtype(self, df, cache): + # GH#60506 + # coerce to float64 + result = to_datetime(df.astype(np.float32), cache=cache) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) + tm.assert_series_equal(result, expected) + def test_dataframe_coerce(self, cache): # passing coerce df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) @@ -3668,3 +3680,12 @@ def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_fir to_datetime(vec, format="mixed") with pytest.raises(ValueError, match=msg): DatetimeIndex(vec) + + +def test_to_datetime_wrapped_datetime64_ps(): + # GH#60341 + result = to_datetime([np.datetime64(1901901901901, "ps")]) + expected = DatetimeIndex( + ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 4d37c6d57f788..f53250378e33c 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -6,6 +6,7 @@ DatetimeIndex, Index, MultiIndex, + NamedAgg, Series, Timestamp, date_range, @@ -100,11 +101,7 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -116,11 +113,7 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -134,13 +127,9 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated 
on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -181,9 +170,7 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -199,9 +186,7 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -246,11 +231,7 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -489,6 +470,36 @@ def test_groupby_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .rolling(2) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = DataFrame( + { + "total_weight": [np.nan, 17.8, 19.9, np.nan, 205.5, 240.0], + "min_height": [np.nan, 9.1, 9.5, np.nan, 6.0, 8.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( @@ -795,13 +806,9 @@ def test_groupby_rolling_resulting_multiindex3(self): def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).sum()).index + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.rolling(4).sum()).index 
+ result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -977,13 +984,11 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -1002,13 +1007,9 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("B") - .groupby("A") - .apply(lambda x: x.rolling("4s")["C"].mean()) - ) + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1038,11 +1039,7 @@ def test_expanding(self, f, frame): r = g.expanding() result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1054,11 +1051,7 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1072,13 +1065,9 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1094,9 +1083,7 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_0) + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1111,9 +1098,7 @@ def func_0(x): def func_1(x): return 
getattr(x.B.expanding(), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_1) + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1122,18 +1107,42 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) + def test_groupby_expanding_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .expanding(1) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = DataFrame( + { + "total_weight": [7.9, 17.8, 27.8, 7.5, 205.5, 247.5], + "min_height": [9.1, 9.1, 9.1, 6.0, 6.0, 6.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + class TestEWM: @pytest.mark.parametrize( @@ -1162,6 +1171,41 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) + def test_groupby_ewm_agg_namedagg(self): + # GH#28333 + df = DataFrame({"A": ["a"] * 4, "B": range(4)}) + result = ( + df.groupby("A") + .ewm(com=1.0) + .agg( + B_mean=NamedAgg(column="B", aggfunc="mean"), + B_std=NamedAgg(column="B", aggfunc="std"), + B_var=NamedAgg(column="B", aggfunc="var"), + ) + ) + expected = DataFrame( + { + "B_mean": [ + 0.0, + 0.6666666666666666, + 1.4285714285714286, + 2.2666666666666666, + ], + "B_std": [np.nan, 0.707107, 0.963624, 1.177164], + "B_var": [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857], + }, + index=MultiIndex.from_tuples( + [ + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ], + names=["A", None], + ), + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "method, expected_data", [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]], diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index d9ab4723a8f2c..120dbe788a23f 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -459,6 +459,38 @@ def f(x): ) tm.assert_frame_equal(result, expected) + def test_table_method_rolling_apply_col_order(self): + # GH#59666 + def f(x): + return np.nanmean(x[:, 0] - x[:, 1]) + + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [6, 7, 8, 5, 6, 7], + } + ) + result = df.rolling(3, method="table", min_periods=0)[["a", "b"]].apply( + f, raw=True, engine="numba" + ) + expected = DataFrame( + { + "a": [-5, -5, -5, -3.66667, -2.33333, -1], + "b": [-5, -5, -5, -3.66667, -2.33333, -1], + } + ) + tm.assert_almost_equal(result, expected) + result = df.rolling(3, method="table", min_periods=0)[["b", "a"]].apply( + f, 
raw=True, engine="numba" ) + expected = DataFrame( { "b": [5, 5, 5, 3.66667, 2.33333, 1], "a": [5, 5, 5, 3.66667, 2.33333, 1], } ) + tm.assert_almost_equal(result, expected) + def test_table_method_rolling_weighted_mean(self, step): def weighted_mean(x): arr = np.ones((1, x.shape[1])) diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py index e361726dc6f80..c00843ecac418 100644 --- a/pandas/tseries/__init__.py +++ b/pandas/tseries/__init__.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH004 +# ruff: noqa: TC004 from typing import TYPE_CHECKING if TYPE_CHECKING: diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index ec2d7d2304839..5ea899f1610a7 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -7,4 +7,4 @@ from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq -__all__ = ["infer_freq", "offsets", "guess_datetime_format"] +__all__ = ["guess_datetime_format", "infer_freq", "offsets"] diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 534bee5fede44..9a01568971af8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -89,6 +89,11 @@ def infer_freq( """ Infer the most likely frequency given the input index. + This method attempts to deduce the most probable frequency (e.g., 'D' for daily, + 'h' for hourly) from a sequence of datetime-like objects. It is particularly useful + when the frequency of a time series is not explicitly set or known but can be + inferred from its values. + Parameters ---------- index : DatetimeIndex, TimedeltaIndex, Series or array-like @@ -106,6 +111,13 @@ def infer_freq( ValueError If there are fewer than three values. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + timedelta_range : Return a fixed frequency TimedeltaIndex with day as the default. + period_range : Return a fixed frequency PeriodIndex. + DatetimeIndex.freq : Return the frequency object if it is set, otherwise None.
+ Examples -------- >>> idx = pd.date_range(start="2020/12/01", end="2020/12/30", periods=30) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index bf4ec2e551f01..2d195fbbc4e84 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -636,12 +636,17 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal __all__ = [ + "FR", + "MO", + "SA", + "SU", + "TH", + "TU", + "WE", + "HolidayCalendarFactory", "after_nearest_workday", "before_nearest_workday", - "FR", "get_calendar", - "HolidayCalendarFactory", - "MO", "nearest_workday", "next_monday", "next_monday_or_tuesday", @@ -649,11 +654,6 @@ def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCal "previous_friday", "previous_workday", "register", - "SA", - "SU", "sunday_to_monday", - "TH", - "TU", - "WE", "weekend_to_monday", ] diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 169c9cc18a7fd..a065137e6971c 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -46,46 +46,46 @@ ) __all__ = [ - "Day", + "FY5253", + "BDay", + "BMonthBegin", + "BMonthEnd", + "BQuarterBegin", + "BQuarterEnd", + "BYearBegin", + "BYearEnd", "BaseOffset", "BusinessDay", + "BusinessHour", "BusinessMonthBegin", "BusinessMonthEnd", - "BDay", + "CBMonthBegin", + "CBMonthEnd", + "CDay", "CustomBusinessDay", + "CustomBusinessHour", "CustomBusinessMonthBegin", "CustomBusinessMonthEnd", - "CDay", - "CBMonthEnd", - "CBMonthBegin", + "DateOffset", + "Day", + "Easter", + "FY5253Quarter", + "Hour", + "LastWeekOfMonth", + "Micro", + "Milli", + "Minute", "MonthBegin", - "BMonthBegin", "MonthEnd", - "BMonthEnd", - "SemiMonthEnd", - "SemiMonthBegin", - "BusinessHour", - "CustomBusinessHour", - "YearBegin", - "BYearBegin", - "YearEnd", - "BYearEnd", + "Nano", "QuarterBegin", - "BQuarterBegin", "QuarterEnd", - "BQuarterEnd", - "LastWeekOfMonth", - "FY5253Quarter", - "FY5253", + "Second", + "SemiMonthBegin", + "SemiMonthEnd", + "Tick", "Week", "WeekOfMonth", - "Easter", - "Tick", - "Hour", - "Minute", - "Second", - "Milli", - "Micro", - "Nano", - "DateOffset", + "YearBegin", + "YearEnd", ] diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 165824bec131f..a1a0d51a7c72b 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -83,7 +83,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: if alternative.__doc__.count("\n") < 3: raise AssertionError(doc_error_msg) empty1, summary, empty2, doc_string = alternative.__doc__.split("\n", 3) - if empty1 or empty2 and not summary: + if empty1 or (empty2 and not summary): raise AssertionError(doc_error_msg) wrapper.__doc__ = dedent( f""" @@ -497,13 +497,13 @@ def indent(text: str | None, indents: int = 1) -> str: __all__ = [ "Appender", + "Substitution", "cache_readonly", "deprecate", "deprecate_kwarg", "deprecate_nonkeyword_arguments", "doc", "future_version_msg", - "Substitution", ] diff --git a/pyproject.toml b/pyproject.toml index 0c76ecd0b15b4..7ab9cd2c17669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -304,10 +304,6 @@ ignore = [ "PERF102", # try-except-in-loop, becomes useless in Python 3.11 "PERF203", - # pytest-missing-fixture-name-underscore - "PT004", - # pytest-incorrect-fixture-name-underscore - "PT005", # pytest-parametrize-names-wrong-type "PT006", # pytest-parametrize-values-wrong-type diff --git a/requirements-dev.txt b/requirements-dev.txt index 69568cf661241..fb4d9cdb589ca 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -24,6 +24,7 @@ 
html5lib>=1.1 hypothesis>=6.84.0 gcsfs>=2022.11.0 ipython +pickleshare jinja2>=3.1.2 lxml>=4.9.2 matplotlib>=3.6.3 @@ -62,7 +63,7 @@ gitdb google-auth natsort numpydoc -pydata-sphinx-theme==0.14 +pydata-sphinx-theme==0.16 pytest-cython sphinx sphinx-design diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 076acc359f933..d804e15f6d48f 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -319,10 +319,10 @@ def nodefault_used_not_only_for_typing(file_obj: IO[str]) -> Iterable[tuple[int, while nodes: in_annotation, node = nodes.pop() if not in_annotation and ( - isinstance(node, ast.Name) # Case `NoDefault` - and node.id == "NoDefault" - or isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` - and node.attr == "NoDefault" + (isinstance(node, ast.Name) # Case `NoDefault` + and node.id == "NoDefault") + or (isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` + and node.attr == "NoDefault") ): yield (node.lineno, "NoDefault is used not only for typing") diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index 4c66f28818abd..c26b093b0c4ba 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -73,12 +73,12 @@
  • - +
  • - +
  • diff --git a/web/pandas/about/citing.md b/web/pandas/about/citing.md index 4ce1fdb207865..a3c470d05e55f 100644 --- a/web/pandas/about/citing.md +++ b/web/pandas/about/citing.md @@ -20,7 +20,7 @@ following paper: url = {https://doi.org/10.5281/zenodo.3509134} } -- [Data structures for statistical computing in python](https://conference.scipy.org/proceedings/scipy2010/pdfs/mckinney.pdf), +- [Data structures for statistical computing in python](https://pub.curvenote.com/01908378-3686-7168-a380-d82bbf21c799/public/mckinney-57fc0d4e8a08cd7f26a4b8bf468a71f4.pdf), McKinney, Proceedings of the 9th Python in Science Conference, Volume 445, 2010. @InProceedings{ mckinney-proc-scipy-2010, diff --git a/web/pandas/index.html b/web/pandas/index.html index 63bc11d3ed5d8..98628b856edb6 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -83,8 +83,8 @@

    Follow us

  • - - + +