From 37221468b5ed118595606d4b0e67321b163a8b09 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Mon, 25 Sep 2023 23:38:06 +0530 Subject: [PATCH 1/7] Updated `pre-commit` hooks versions. (#55277) Updated pre-commit hooks version and checked all the files accordingly. --- .pre-commit-config.yaml | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c01bf65818167..b0b511e1048c6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,11 +20,11 @@ ci: repos: - repo: https://github.com/hauntsaninja/black-pre-commit-mirror # black compiled with mypyc - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.287 + rev: v0.0.291 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -107,7 +107,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.13.0 hooks: - id: pyupgrade args: [--py39-plus] diff --git a/pyproject.toml b/pyproject.toml index 4e1c77413efda..df89bd129901f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -187,7 +187,7 @@ environment = {CFLAGS="-g0"} [tool.black] target-version = ['py39', 'py310'] -required-version = '23.7.0' +required-version = '23.9.1' exclude = ''' ( asv_bench/env From f9ade667e2ebc6cc1731065ccbf40d4e70d9c3b5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 11:08:55 -0700 Subject: [PATCH 2/7] Bump pypa/cibuildwheel from 2.15.0 to 2.16.0 (#55276) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.15.0 to 2.16.0. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.15.0...v2.16.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 83d14b51092e6..4c7a7b329777b 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.15.0 + uses: pypa/cibuildwheel@v2.16.0 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From bdc2cd4c6423c23e8796d80b589dea937fffc2aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Lucas=20Mayer?= Date: Mon, 25 Sep 2023 15:20:52 -0300 Subject: [PATCH 3/7] ASV: add GroupBy.sum benchmark with timedelta and integer types (#55273) add SumTimeDelta asv benchmark Co-authored-by: Willian Wang --- asv_bench/benchmarks/groupby.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index b206523dfe851..4567b5b414301 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -841,6 +841,23 @@ def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() +class SumTimeDelta: + # GH 20660 + def setup(self): + N = 10**4 + self.df = DataFrame( + np.random.randint(1000, 100000, (N, 100)), + index=np.random.randint(200, size=(N,)), + ).astype("timedelta64[ns]") + self.df_int = self.df.copy().astype("int64") + + def time_groupby_sum_timedelta(self): + self.df.groupby(lambda x: x).sum() + + def time_groupby_sum_int(self): + self.df_int.groupby(lambda x: x).sum() + + class Transform: def setup(self): n1 = 400 From ce814c211c787216f10d251081d44ea8996ca9f7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Sep 2023 08:22:33 -1000 Subject: [PATCH 4/7] TST: Clean up autouse fixtures (#55269) * TST: Clean up autouse fixtures * Use another flavor_read_html --- pandas/tests/arithmetic/conftest.py | 18 - pandas/tests/arithmetic/test_numeric.py | 7 + pandas/tests/frame/test_arithmetic.py | 11 +- pandas/tests/io/formats/test_format.py | 15 +- pandas/tests/io/test_html.py | 444 +++++++++++++----------- pandas/tests/series/test_arithmetic.py | 9 +- pandas/tests/test_expressions.py | 118 +++---- 7 files changed, 313 insertions(+), 309 deletions(-) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 7dd5169202ba4..7ec77e5b65b7e 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -7,23 +7,8 @@ RangeIndex, ) import pandas._testing as tm -from pandas.core.computation import expressions as expr -@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS - - -# ------------------------------------------------------------------ - - -# doctest with +SKIP for one fixture fails during setup with -# 'DoctestItem' object has no attribute 'callspec' -# due to switch_numexpr_min_elements fixture @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): """ @@ -58,9 +43,6 @@ def one(request): zeros.extend([0, 0.0, -0.0]) -# doctest with +SKIP for zero fixture fails during setup with -# 'DoctestItem' object has no attribute 'callspec' -# due to switch_numexpr_min_elements fixture @pytest.fixture(params=zeros) def zero(request): """ diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index fa17c24fffb26..c93a03ad0f479 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -29,6 +29,13 @@ ) +@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield request.param + + @pytest.fixture(params=[Index, Series, tm.to_array]) def box_pandas_1d_array(request): """ diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 1488fa65fabc0..24a70e55e2f0e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -33,11 +33,10 @@ @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield request.param class DummyElement: @@ -1074,7 +1073,7 @@ def test_frame_with_frame_reindex(self): ], ids=lambda x: x.__name__, ) - def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements, request): + def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): skip = { (operator.truediv, "bool"), (operator.pow, "bool"), diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f3a1ebe23b568..53ee449c2dc0c 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -22,8 +22,6 @@ import pytest import pytz -from pandas._config import config - import pandas as pd from pandas import ( DataFrame, @@ -51,17 +49,6 @@ def get_local_am_pm(): return am_local, pm_local -@pytest.fixture(autouse=True) -def clean_config(): - curr_deprecated_options = config._deprecated_options.copy() - curr_registered_options = config._registered_options.copy() - curr_global_config = config._global_config.copy() - yield - config._deprecated_options = curr_deprecated_options - config._registered_options = curr_registered_options - config._global_config = curr_global_config - - @pytest.fixture(params=["string", "pathlike", "buffer"]) def filepath_or_buffer_id(request): """ @@ -3604,7 +3591,7 @@ def test_repr_html_ipython_config(ip): df._repr_html_() """ ) - result = ip.run_cell(code) + result = ip.run_cell(code, silent=True) assert not result.error_in_exec diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6cf90749e5b30..dcee52011a691 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -99,15 +99,18 @@ def test_same_ordering(datapath): assert_framelist_equal(dfs_lxml, dfs_bs4) -@pytest.mark.parametrize( - "flavor", - [ +@pytest.fixture( + params=[ pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]), pytest.param("lxml", marks=td.skip_if_no("lxml")), ], ) +def flavor_read_html(request): + return partial(read_html, flavor=request.param) + + class TestReadHtml: - def test_literal_html_deprecation(self): + def test_literal_html_deprecation(self, flavor_read_html): # GH 53785 msg = ( "Passing literal html to 'read_html' is deprecated and " @@ -116,7 +119,7 @@ def test_literal_html_deprecation(self): ) with tm.assert_produces_warning(FutureWarning, match=msg): - self.read_html( + flavor_read_html( """ @@ -147,12 +150,7 @@ def spam_data(self, datapath): def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") - @pytest.fixture(autouse=True) - def set_defaults(self, flavor): - self.read_html = partial(read_html, flavor=flavor) - yield - - def test_to_html_compat(self): + def test_to_html_compat(self, flavor_read_html): df = ( tm.makeCustomDataframe( 4, @@ -165,12 +163,12 @@ def test_to_html_compat(self): .map("{:.3f}".format).astype(float) ) out = df.to_html() - res = self.read_html(StringIO(out), attrs={"class": "dataframe"}, index_col=0)[ - 0 - ] + res = flavor_read_html( + StringIO(out), attrs={"class": "dataframe"}, index_col=0 + )[0] tm.assert_frame_equal(res, df) - def test_dtype_backend(self, string_storage, dtype_backend): + def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): # GH#50286 df = DataFrame( { @@ -196,7 +194,7 @@ def test_dtype_backend(self, string_storage, dtype_backend): out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): - result = self.read_html(StringIO(out), dtype_backend=dtype_backend)[0] + result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] expected = DataFrame( { @@ -227,16 +225,16 @@ def test_dtype_backend(self, string_storage, dtype_backend): @pytest.mark.network @pytest.mark.single_cpu - def test_banklist_url(self, httpserver, banklist_data): + def test_banklist_url(self, httpserver, banklist_data, flavor_read_html): with open(banklist_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) - df1 = self.read_html( + df1 = flavor_read_html( # lxml cannot find attrs leave out for now httpserver.url, match="First Federal Bank of Florida", # attrs={"class": "dataTable"} ) # lxml cannot find attrs leave out for now - df2 = self.read_html( + df2 = flavor_read_html( httpserver.url, match="Metcalf Bank", ) # attrs={"class": "dataTable"}) @@ -245,165 +243,169 @@ def test_banklist_url(self, httpserver, banklist_data): @pytest.mark.network @pytest.mark.single_cpu - def test_spam_url(self, httpserver, spam_data): + def test_spam_url(self, httpserver, spam_data, flavor_read_html): with open(spam_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) - df1 = self.read_html(httpserver.url, match=".*Water.*") - df2 = self.read_html(httpserver.url, match="Unit") + df1 = flavor_read_html(httpserver.url, match=".*Water.*") + df2 = flavor_read_html(httpserver.url, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.slow - def test_banklist(self, banklist_data): - df1 = self.read_html(banklist_data, match=".*Florida.*", attrs={"id": "table"}) - df2 = self.read_html(banklist_data, match="Metcalf Bank", attrs={"id": "table"}) + def test_banklist(self, banklist_data, flavor_read_html): + df1 = flavor_read_html( + banklist_data, match=".*Florida.*", attrs={"id": "table"} + ) + df2 = flavor_read_html( + banklist_data, match="Metcalf Bank", attrs={"id": "table"} + ) assert_framelist_equal(df1, df2) - def test_spam(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*") - df2 = self.read_html(spam_data, match="Unit") + def test_spam(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*") + df2 = flavor_read_html(spam_data, match="Unit") assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == "Proximates" assert df1[0].columns[0] == "Nutrient" - def test_spam_no_match(self, spam_data): - dfs = self.read_html(spam_data) + def test_spam_no_match(self, spam_data, flavor_read_html): + dfs = flavor_read_html(spam_data) for df in dfs: assert isinstance(df, DataFrame) - def test_banklist_no_match(self, banklist_data): - dfs = self.read_html(banklist_data, attrs={"id": "table"}) + def test_banklist_no_match(self, banklist_data, flavor_read_html): + dfs = flavor_read_html(banklist_data, attrs={"id": "table"}) for df in dfs: assert isinstance(df, DataFrame) - def test_spam_header(self, spam_data): - df = self.read_html(spam_data, match=".*Water.*", header=2)[0] + def test_spam_header(self, spam_data, flavor_read_html): + df = flavor_read_html(spam_data, match=".*Water.*", header=2)[0] assert df.columns[0] == "Proximates" assert not df.empty - def test_skiprows_int(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(spam_data, match="Unit", skiprows=1) + def test_skiprows_int(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_range(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=range(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=range(2)) + def test_skiprows_range(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=range(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=range(2)) assert_framelist_equal(df1, df2) - def test_skiprows_list(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) - df2 = self.read_html(spam_data, match="Unit", skiprows=[2, 1]) + def test_skiprows_list(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) - def test_skiprows_set(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows={1, 2}) - df2 = self.read_html(spam_data, match="Unit", skiprows={2, 1}) + def test_skiprows_set(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows={1, 2}) + df2 = flavor_read_html(spam_data, match="Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) - def test_skiprows_slice(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(spam_data, match="Unit", skiprows=1) + def test_skiprows_slice(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_slice_short(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=slice(2)) + def test_skiprows_slice_short(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) - def test_skiprows_slice_long(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) - df2 = self.read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) + def test_skiprows_slice_long(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) - def test_skiprows_ndarray(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=np.arange(2)) + def test_skiprows_ndarray(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) - def test_skiprows_invalid(self, spam_data): + def test_skiprows_invalid(self, spam_data, flavor_read_html): with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): - self.read_html(spam_data, match=".*Water.*", skiprows="asdf") + flavor_read_html(spam_data, match=".*Water.*", skiprows="asdf") - def test_index(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(spam_data, match="Unit", index_col=0) + def test_index(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_no_types(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_no_types(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_with_types(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_with_types(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_infer_types(self, spam_data): + def test_infer_types(self, spam_data, flavor_read_html): # 10892 infer_types removed - df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(spam_data, match="Unit", index_col=0) + df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_string_io(self, spam_data): + def test_string_io(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data1 = StringIO(f.read()) with open(spam_data, encoding="UTF-8") as f: data2 = StringIO(f.read()) - df1 = self.read_html(data1, match=".*Water.*") - df2 = self.read_html(data2, match="Unit") + df1 = flavor_read_html(data1, match=".*Water.*") + df2 = flavor_read_html(data2, match="Unit") assert_framelist_equal(df1, df2) - def test_string(self, spam_data): + def test_string(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data = f.read() - df1 = self.read_html(StringIO(data), match=".*Water.*") - df2 = self.read_html(StringIO(data), match="Unit") + df1 = flavor_read_html(StringIO(data), match=".*Water.*") + df2 = flavor_read_html(StringIO(data), match="Unit") assert_framelist_equal(df1, df2) - def test_file_like(self, spam_data): + def test_file_like(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: - df1 = self.read_html(f, match=".*Water.*") + df1 = flavor_read_html(f, match=".*Water.*") with open(spam_data, encoding="UTF-8") as f: - df2 = self.read_html(f, match="Unit") + df2 = flavor_read_html(f, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.network @pytest.mark.single_cpu - def test_bad_url_protocol(self, httpserver): + def test_bad_url_protocol(self, httpserver, flavor_read_html): httpserver.serve_content("urlopen error unknown url type: git", code=404) with pytest.raises(URLError, match="urlopen error unknown url type: git"): - self.read_html("git://github.com", match=".*Water.*") + flavor_read_html("git://github.com", match=".*Water.*") @pytest.mark.slow @pytest.mark.network @pytest.mark.single_cpu - def test_invalid_url(self, httpserver): + def test_invalid_url(self, httpserver, flavor_read_html): httpserver.serve_content("Name or service not known", code=404) with pytest.raises((URLError, ValueError), match="HTTP Error 404: NOT FOUND"): - self.read_html(httpserver.url, match=".*Water.*") + flavor_read_html(httpserver.url, match=".*Water.*") @pytest.mark.slow - def test_file_url(self, banklist_data): + def test_file_url(self, banklist_data, flavor_read_html): url = banklist_data - dfs = self.read_html( + dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"} ) assert isinstance(dfs, list) @@ -411,54 +413,78 @@ def test_file_url(self, banklist_data): assert isinstance(df, DataFrame) @pytest.mark.slow - def test_invalid_table_attrs(self, banklist_data): + def test_invalid_table_attrs(self, banklist_data, flavor_read_html): url = banklist_data with pytest.raises(ValueError, match="No tables found"): - self.read_html( + flavor_read_html( url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"} ) - def _bank_data(self, path, **kwargs): - return self.read_html(path, match="Metcalf", attrs={"id": "table"}, **kwargs) - @pytest.mark.slow - def test_multiindex_header(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1])[0] + def test_multiindex_header(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1] + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_index(self, banklist_data): - df = self._bank_data(banklist_data, index_col=[0, 1])[0] + def test_multiindex_index(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, match="Metcalf", attrs={"id": "table"}, index_col=[0, 1] + )[0] assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], index_col=[0, 1])[0] + def test_multiindex_header_index(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + index_col=[0, 1], + )[0] assert isinstance(df.columns, MultiIndex) assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows_tuples(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows_tuples(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + skiprows=1, + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + skiprows=1, + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index_skiprows(self, banklist_data): - df = self._bank_data( - banklist_data, header=[0, 1], index_col=[0, 1], skiprows=1 + def test_multiindex_header_index_skiprows(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + index_col=[0, 1], + skiprows=1, )[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_regex_idempotency(self, banklist_data): + def test_regex_idempotency(self, banklist_data, flavor_read_html): url = banklist_data - dfs = self.read_html( + dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile("Florida")), attrs={"id": "table"}, @@ -467,10 +493,10 @@ def test_regex_idempotency(self, banklist_data): for df in dfs: assert isinstance(df, DataFrame) - def test_negative_skiprows(self, spam_data): + def test_negative_skiprows(self, spam_data, flavor_read_html): msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): - self.read_html(spam_data, match="Water", skiprows=-1) + flavor_read_html(spam_data, match="Water", skiprows=-1) @pytest.fixture def python_docs(self): @@ -523,20 +549,20 @@ def python_docs(self): @pytest.mark.network @pytest.mark.single_cpu - def test_multiple_matches(self, python_docs, httpserver): + def test_multiple_matches(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) - dfs = self.read_html(httpserver.url, match="Python") + dfs = flavor_read_html(httpserver.url, match="Python") assert len(dfs) > 1 @pytest.mark.network @pytest.mark.single_cpu - def test_python_docs_table(self, python_docs, httpserver): + def test_python_docs_table(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) - dfs = self.read_html(httpserver.url, match="Python") + dfs = flavor_read_html(httpserver.url, match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] assert sorted(zz) == ["Pyth", "What"] - def test_empty_tables(self): + def test_empty_tables(self, flavor_read_html): """ Make sure that read_html ignores empty tables. """ @@ -560,13 +586,13 @@ def test_empty_tables(self):
""" - result = self.read_html(StringIO(html)) + result = flavor_read_html(StringIO(html)) assert len(result) == 1 - def test_multiple_tbody(self): + def test_multiple_tbody(self, flavor_read_html): # GH-20690 # Read all tbody tags within a single table. - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -595,12 +621,12 @@ def test_multiple_tbody(self): tm.assert_frame_equal(result, expected) - def test_header_and_one_column(self): + def test_header_and_one_column(self, flavor_read_html): """ Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -621,11 +647,11 @@ def test_header_and_one_column(self): tm.assert_frame_equal(result, expected) - def test_thead_without_tr(self): + def test_thead_without_tr(self, flavor_read_html): """ Ensure parser adds within on malformed HTML. """ - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -653,7 +679,7 @@ def test_thead_without_tr(self): tm.assert_frame_equal(result, expected) - def test_tfoot_read(self): + def test_tfoot_read(self, flavor_read_html): """ Make sure that read_html reads tfoot, containing td or th. Ignores empty tfoot @@ -685,16 +711,16 @@ def test_tfoot_read(self): data1 = data_template.format(footer="") data2 = data_template.format(footer="") - result1 = self.read_html(StringIO(data1))[0] - result2 = self.read_html(StringIO(data2))[0] + result1 = flavor_read_html(StringIO(data1))[0] + result2 = flavor_read_html(StringIO(data2))[0] tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) - def test_parse_header_of_non_string_column(self): + def test_parse_header_of_non_string_column(self, flavor_read_html): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str - result = self.read_html( + result = flavor_read_html( StringIO( """
footAfootB
@@ -717,7 +743,7 @@ def test_parse_header_of_non_string_column(self): tm.assert_frame_equal(result, expected) @pytest.mark.slow - def test_banklist_header(self, banklist_data, datapath): + def test_banklist_header(self, banklist_data, datapath, flavor_read_html): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -726,7 +752,7 @@ def try_remove_ws(x): except AttributeError: return x - df = self.read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] + df = flavor_read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] ground_truth = read_csv( datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, @@ -765,19 +791,19 @@ def try_remove_ws(x): tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow - def test_gold_canyon(self, banklist_data): + def test_gold_canyon(self, banklist_data, flavor_read_html): gc = "Gold Canyon" with open(banklist_data, encoding="utf-8") as f: raw_text = f.read() assert gc in raw_text - df = self.read_html(banklist_data, match="Gold Canyon", attrs={"id": "table"})[ - 0 - ] + df = flavor_read_html( + banklist_data, match="Gold Canyon", attrs={"id": "table"} + )[0] assert gc in df.to_string() - def test_different_number_of_cols(self): - expected = self.read_html( + def test_different_number_of_cols(self, flavor_read_html): + expected = flavor_read_html( StringIO( """
@@ -813,7 +839,7 @@ def test_different_number_of_cols(self): index_col=0, )[0] - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -848,9 +874,9 @@ def test_different_number_of_cols(self): tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_1(self): + def test_colspan_rowspan_1(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -873,7 +899,7 @@ def test_colspan_rowspan_1(self): tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_copy_values(self): + def test_colspan_rowspan_copy_values(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -881,7 +907,7 @@ def test_colspan_rowspan_copy_values(self): # X x Y Z W # A B b z C - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -908,7 +934,7 @@ def test_colspan_rowspan_copy_values(self): tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_both_not_1(self): + def test_colspan_rowspan_both_not_1(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -916,7 +942,7 @@ def test_colspan_rowspan_both_not_1(self): # A B b b C # a b b b D - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -940,7 +966,7 @@ def test_colspan_rowspan_both_not_1(self): tm.assert_frame_equal(result, expected) - def test_rowspan_at_end_of_row(self): + def test_rowspan_at_end_of_row(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -948,7 +974,7 @@ def test_rowspan_at_end_of_row(self): # A B # C b - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -969,10 +995,10 @@ def test_rowspan_at_end_of_row(self): tm.assert_frame_equal(result, expected) - def test_rowspan_only_rows(self): + def test_rowspan_only_rows(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -990,9 +1016,9 @@ def test_rowspan_only_rows(self): tm.assert_frame_equal(result, expected) - def test_header_inferred_from_rows_with_only_th(self): + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1018,15 +1044,15 @@ def test_header_inferred_from_rows_with_only_th(self): tm.assert_frame_equal(result, expected) - def test_parse_dates_list(self): + def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() - res = self.read_html(StringIO(expected), parse_dates=[1], index_col=0) + res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) - res = self.read_html(StringIO(expected), parse_dates=["date"], index_col=0) + res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) - def test_parse_dates_combine(self): + def test_parse_dates_combine(self, flavor_read_html): raw_dates = Series(date_range("1/1/2001", periods=10)) df = DataFrame( { @@ -1034,32 +1060,32 @@ def test_parse_dates_combine(self): "time": raw_dates.map(lambda x: str(x.time())), } ) - res = self.read_html( + res = flavor_read_html( StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 ) newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_wikipedia_states_table(self, datapath): + def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), f"{repr(data)} is not a file" assert os.path.getsize(data), f"{repr(data)} is an empty file" - result = self.read_html(data, match="Arizona", header=1)[0] + result = flavor_read_html(data, match="Arizona", header=1)[0] assert result.shape == (60, 12) assert "Unnamed" in result.columns[-1] assert result["sq mi"].dtype == np.dtype("float64") assert np.allclose(result.loc[0, "sq mi"], 665384.04) - def test_wikipedia_states_multiindex(self, datapath): + def test_wikipedia_states_multiindex(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") - result = self.read_html(data, match="Arizona", index_col=0)[0] + result = flavor_read_html(data, match="Arizona", index_col=0)[0] assert result.shape == (60, 11) assert "Unnamed" in result.columns[-1][1] assert result.columns.nlevels == 2 assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) - def test_parser_error_on_empty_header_row(self): - result = self.read_html( + def test_parser_error_on_empty_header_row(self, flavor_read_html): + result = flavor_read_html( StringIO( """
@@ -1083,9 +1109,9 @@ def test_parser_error_on_empty_header_row(self): ) tm.assert_frame_equal(result[0], expected) - def test_decimal_rows(self): + def test_decimal_rows(self, flavor_read_html): # GH 12907 - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -1113,7 +1139,7 @@ def test_decimal_rows(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) - def test_bool_header_arg(self, spam_data, arg): + def test_bool_header_arg(self, spam_data, arg, flavor_read_html): # GH 6114 msg = re.escape( "Passing a bool to header is invalid. Use header=None for no header or " @@ -1121,11 +1147,11 @@ def test_bool_header_arg(self, spam_data, arg): "column names" ) with pytest.raises(TypeError, match=msg): - self.read_html(spam_data, header=arg) + flavor_read_html(spam_data, header=arg) - def test_converters(self): + def test_converters(self, flavor_read_html): # GH 13461 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1150,9 +1176,9 @@ def test_converters(self): tm.assert_frame_equal(result, expected) - def test_na_values(self): + def test_na_values(self, flavor_read_html): # GH 13461 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1177,7 +1203,7 @@ def test_na_values(self): tm.assert_frame_equal(result, expected) - def test_keep_default_na(self): + def test_keep_default_na(self, flavor_read_html): html_data = """
@@ -1195,15 +1221,15 @@ def test_keep_default_na(self):
""" expected_df = DataFrame({"a": ["N/A", "NA"]}) - html_df = self.read_html(StringIO(html_data), keep_default_na=False)[0] + html_df = flavor_read_html(StringIO(html_data), keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({"a": [np.nan, np.nan]}) - html_df = self.read_html(StringIO(html_data), keep_default_na=True)[0] + html_df = flavor_read_html(StringIO(html_data), keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) - def test_preserve_empty_rows(self): - result = self.read_html( + def test_preserve_empty_rows(self, flavor_read_html): + result = flavor_read_html( StringIO( """ @@ -1228,8 +1254,8 @@ def test_preserve_empty_rows(self): tm.assert_frame_equal(result, expected) - def test_ignore_empty_rows_when_inferring_header(self): - result = self.read_html( + def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html): + result = flavor_read_html( StringIO( """
@@ -1251,7 +1277,7 @@ def test_ignore_empty_rows_when_inferring_header(self): tm.assert_frame_equal(result, expected) - def test_multiple_header_rows(self): + def test_multiple_header_rows(self, flavor_read_html): # Issue #13434 expected_df = DataFrame( data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")] @@ -1261,20 +1287,20 @@ def test_multiple_header_rows(self): ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], ] html = expected_df.to_html(index=False) - html_df = self.read_html(StringIO(html))[0] + html_df = flavor_read_html(StringIO(html))[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self, datapath): + def test_works_on_valid_markup(self, datapath, flavor_read_html): filename = datapath("io", "data", "html", "valid_markup.html") - dfs = self.read_html(filename, index_col=0) + dfs = flavor_read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self, datapath): + def test_fallback_success(self, datapath, flavor_read_html): banklist_data = datapath("io", "data", "html", "banklist.html") - self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) + flavor_read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): rng = date_range("2000-01-01", periods=10) @@ -1309,7 +1335,7 @@ def test_to_html_borderless(self): (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), ], ) - def test_displayed_only(self, displayed_only, exp0, exp1): + def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html): # GH 20027 data = """ @@ -1331,7 +1357,7 @@ def test_displayed_only(self, displayed_only, exp0, exp1): """ - dfs = self.read_html(StringIO(data), displayed_only=displayed_only) + dfs = flavor_read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) if exp1 is not None: @@ -1340,7 +1366,7 @@ def test_displayed_only(self, displayed_only, exp0, exp1): assert len(dfs) == 1 # Should not parse hidden table @pytest.mark.parametrize("displayed_only", [True, False]) - def test_displayed_only_with_many_elements(self, displayed_only): + def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_html): html_table = """
@@ -1357,7 +1383,9 @@ def test_displayed_only_with_many_elements(self, displayed_only):
""" - result = read_html(StringIO(html_table), displayed_only=displayed_only)[0] + result = flavor_read_html(StringIO(html_table), displayed_only=displayed_only)[ + 0 + ] expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) @@ -1365,23 +1393,23 @@ def test_displayed_only_with_many_elements(self, displayed_only): "ignore:You provided Unicode markup but also provided a value for " "from_encoding.*:UserWarning" ) - def test_encode(self, html_encoding_file): + def test_encode(self, html_encoding_file, flavor_read_html): base_path = os.path.basename(html_encoding_file) root = os.path.splitext(base_path)[0] _, encoding = root.split("_") try: with open(html_encoding_file, "rb") as fobj: - from_string = self.read_html( + from_string = flavor_read_html( fobj.read(), encoding=encoding, index_col=0 ).pop() with open(html_encoding_file, "rb") as fobj: - from_file_like = self.read_html( + from_file_like = flavor_read_html( BytesIO(fobj.read()), encoding=encoding, index_col=0 ).pop() - from_filename = self.read_html( + from_filename = flavor_read_html( html_encoding_file, encoding=encoding, index_col=0 ).pop() tm.assert_frame_equal(from_string, from_file_like) @@ -1393,10 +1421,10 @@ def test_encode(self, html_encoding_file): pytest.skip() raise - def test_parse_failure_unseekable(self): + def test_parse_failure_unseekable(self, flavor_read_html): # Issue #17975 - if self.read_html.keywords.get("flavor") == "lxml": + if flavor_read_html.keywords.get("flavor") == "lxml": pytest.skip("Not applicable for lxml") class UnseekableStringIO(StringIO): @@ -1408,12 +1436,12 @@ def seekable(self):
spameggs
""" ) - assert self.read_html(bad) + assert flavor_read_html(bad) with pytest.raises(ValueError, match="passed a non-rewindable file object"): - self.read_html(bad) + flavor_read_html(bad) - def test_parse_failure_rewinds(self): + def test_parse_failure_rewinds(self, flavor_read_html): # Issue #17975 class MockFile: @@ -1444,12 +1472,12 @@ def __iter__(self) -> Iterator: good = MockFile("
spam
eggs
") bad = MockFile("
spameggs
") - assert self.read_html(good) - assert self.read_html(bad) + assert flavor_read_html(good) + assert flavor_read_html(bad) @pytest.mark.slow @pytest.mark.single_cpu - def test_importcheck_thread_safety(self, datapath): + def test_importcheck_thread_safety(self, datapath, flavor_read_html): # see gh-16928 class ErrorThread(threading.Thread): @@ -1462,8 +1490,8 @@ def run(self): self.err = None filename = datapath("io", "data", "html", "valid_markup.html") - helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) - helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) + helper_thread1 = ErrorThread(target=flavor_read_html, args=(filename,)) + helper_thread2 = ErrorThread(target=flavor_read_html, args=(filename,)) helper_thread1.start() helper_thread2.start() @@ -1472,17 +1500,17 @@ def run(self): pass assert None is helper_thread1.err is helper_thread2.err - def test_parse_path_object(self, datapath): + def test_parse_path_object(self, datapath, flavor_read_html): # GH 37705 file_path_string = datapath("io", "data", "html", "spam.html") file_path = Path(file_path_string) - df1 = self.read_html(file_path_string)[0] - df2 = self.read_html(file_path)[0] + df1 = flavor_read_html(file_path_string)[0] + df2 = flavor_read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_parse_br_as_space(self): + def test_parse_br_as_space(self, flavor_read_html): # GH 29528: pd.read_html() convert
to space - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -1502,7 +1530,7 @@ def test_parse_br_as_space(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) - def test_extract_links(self, arg): + def test_extract_links(self, arg, flavor_read_html): gh_13141_data = """
@@ -1565,7 +1593,7 @@ def test_extract_links(self, arg): elif arg == "header": head_exp = gh_13141_expected["head_extract"] - result = self.read_html(StringIO(gh_13141_data), extract_links=arg)[0] + result = flavor_read_html(StringIO(gh_13141_data), extract_links=arg)[0] expected = DataFrame([data_exp, foot_exp], columns=head_exp) expected = expected.fillna(np.nan) tm.assert_frame_equal(result, expected) @@ -1578,7 +1606,7 @@ def test_extract_links_bad(self, spam_data): with pytest.raises(ValueError, match=msg): read_html(spam_data, extract_links="incorrect") - def test_extract_links_all_no_header(self): + def test_extract_links_all_no_header(self, flavor_read_html): # GH 48316 data = """
@@ -1589,7 +1617,7 @@ def test_extract_links_all_no_header(self):
""" - result = self.read_html(StringIO(data), extract_links="all")[0] + result = flavor_read_html(StringIO(data), extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) @@ -1601,7 +1629,7 @@ def test_invalid_dtype_backend(self): with pytest.raises(ValueError, match=msg): read_html("test", dtype_backend="numpy") - def test_style_tag(self): + def test_style_tag(self, flavor_read_html): # GH 48316 data = """ @@ -1622,6 +1650,6 @@ def test_style_tag(self):
""" - result = self.read_html(StringIO(data))[0] + result = flavor_read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 55fc77fb5705f..8547fd6988791 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -30,11 +30,10 @@ @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield def _permute(obj): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 1e66cefbcfdd0..dfec99f0786eb 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -102,12 +102,6 @@ def _array_mixed2(_mixed2): @pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr") class TestExpressions: - @pytest.fixture(autouse=True) - def save_min_elements(self): - min_elements = expr._MIN_ELEMENTS - yield - expr._MIN_ELEMENTS = min_elements - @staticmethod def call_op(df, other, flex: bool, opname: str): if flex: @@ -140,21 +134,24 @@ def call_op(df, other, flex: bool, opname: str): @pytest.mark.parametrize( "arith", ["add", "sub", "mul", "mod", "truediv", "floordiv"] ) - def test_run_arithmetic(self, request, fixture, flex, arith): + def test_run_arithmetic(self, request, fixture, flex, arith, monkeypatch): df = request.getfixturevalue(fixture) - expr._MIN_ELEMENTS = 0 - result, expected = self.call_op(df, df, flex, arith) - - if arith == "truediv": - assert all(x.kind == "f" for x in expected.dtypes.values) - tm.assert_equal(expected, result) + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + result, expected = self.call_op(df, df, flex, arith) - for i in range(len(df.columns)): - result, expected = self.call_op(df.iloc[:, i], df.iloc[:, i], flex, arith) if arith == "truediv": - assert expected.dtype.kind == "f" + assert all(x.kind == "f" for x in expected.dtypes.values) tm.assert_equal(expected, result) + for i in range(len(df.columns)): + result, expected = self.call_op( + df.iloc[:, i], df.iloc[:, i], flex, arith + ) + if arith == "truediv": + assert expected.dtype.kind == "f" + tm.assert_equal(expected, result) + @pytest.mark.parametrize( "fixture", [ @@ -168,7 +165,7 @@ def test_run_arithmetic(self, request, fixture, flex, arith): ], ) @pytest.mark.parametrize("flex", [True, False]) - def test_run_binary(self, request, fixture, flex, comparison_op): + def test_run_binary(self, request, fixture, flex, comparison_op, monkeypatch): """ tests solely that the result is the same whether or not numexpr is enabled. Need to test whether the function does the correct thing @@ -179,18 +176,19 @@ def test_run_binary(self, request, fixture, flex, comparison_op): with option_context("compute.use_numexpr", False): other = df.copy() + 1 - expr._MIN_ELEMENTS = 0 - expr.set_test_mode(True) + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + expr.set_test_mode(True) - result, expected = self.call_op(df, other, flex, arith) + result, expected = self.call_op(df, other, flex, arith) - used_numexpr = expr.get_test_result() - assert used_numexpr, "Did not use numexpr as expected." - tm.assert_equal(expected, result) + used_numexpr = expr.get_test_result() + assert used_numexpr, "Did not use numexpr as expected." + tm.assert_equal(expected, result) - for i in range(len(df.columns)): - binary_comp = other.iloc[:, i] + 1 - self.call_op(df.iloc[:, i], binary_comp, flex, "add") + for i in range(len(df.columns)): + binary_comp = other.iloc[:, i] + 1 + self.call_op(df.iloc[:, i], binary_comp, flex, "add") def test_invalid(self): array = np.random.default_rng(2).standard_normal(1_000_001) @@ -406,7 +404,7 @@ def test_bool_ops_column_name_dtype(self, test_input, expected): "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv") ) @pytest.mark.parametrize("axis", (0, 1)) - def test_frame_series_axis(self, axis, arith, _frame): + def test_frame_series_axis(self, axis, arith, _frame, monkeypatch): # GH#26736 Dataframe.floordiv(Series, axis=1) fails df = _frame @@ -415,15 +413,16 @@ def test_frame_series_axis(self, axis, arith, _frame): else: other = df.iloc[:, 0] - expr._MIN_ELEMENTS = 0 + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) - op_func = getattr(df, arith) + op_func = getattr(df, arith) - with option_context("compute.use_numexpr", False): - expected = op_func(other, axis=axis) + with option_context("compute.use_numexpr", False): + expected = op_func(other, axis=axis) - result = op_func(other, axis=axis) - tm.assert_frame_equal(expected, result) + result = op_func(other, axis=axis) + tm.assert_frame_equal(expected, result) @pytest.mark.parametrize( "op", @@ -436,29 +435,32 @@ def test_frame_series_axis(self, axis, arith, _frame): ) @pytest.mark.parametrize("box", [DataFrame, Series, Index]) @pytest.mark.parametrize("scalar", [-5, 5]) - def test_python_semantics_with_numexpr_installed(self, op, box, scalar): + def test_python_semantics_with_numexpr_installed( + self, op, box, scalar, monkeypatch + ): # https://github.com/pandas-dev/pandas/issues/36047 - expr._MIN_ELEMENTS = 0 - data = np.arange(-50, 50) - obj = box(data) - method = getattr(obj, op) - result = method(scalar) - - # compare result with numpy - with option_context("compute.use_numexpr", False): - expected = method(scalar) - - tm.assert_equal(result, expected) - - # compare result element-wise with Python - for i, elem in enumerate(data): - if box == DataFrame: - scalar_result = result.iloc[i, 0] - else: - scalar_result = result[i] - try: - expected = getattr(int(elem), op)(scalar) - except ZeroDivisionError: - pass - else: - assert scalar_result == expected + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + data = np.arange(-50, 50) + obj = box(data) + method = getattr(obj, op) + result = method(scalar) + + # compare result with numpy + with option_context("compute.use_numexpr", False): + expected = method(scalar) + + tm.assert_equal(result, expected) + + # compare result element-wise with Python + for i, elem in enumerate(data): + if box == DataFrame: + scalar_result = result.iloc[i, 0] + else: + scalar_result = result[i] + try: + expected = getattr(int(elem), op)(scalar) + except ZeroDivisionError: + pass + else: + assert scalar_result == expected From 2e3cb7685887f94997b9b83d0662aeb7a406e665 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 25 Sep 2023 14:42:45 -0400 Subject: [PATCH 5/7] BUG: DatetimeIndex.union returning object dtype for indexes with the same tz but different units (#55259) * add DatetimeTZDtype._get_common_type * mypy --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/dtypes/dtypes.py | 7 +++++++ pandas/tests/indexes/datetimes/test_setops.py | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 930e03ae7d75a..ca09a69f603ab 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -259,7 +259,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- +- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) - Timedelta diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index beff71d5e9dd9..196c95c3673e9 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -928,6 +928,13 @@ def __setstate__(self, state) -> None: self._tz = state["tz"] self._unit = state["unit"] + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + if all(isinstance(t, DatetimeTZDtype) and t.tz == self.tz for t in dtypes): + np_dtype = np.max([cast(DatetimeTZDtype, t).base for t in [self, *dtypes]]) + unit = np.datetime_data(np_dtype)[0] + return type(self)(unit=unit, tz=self.tz) + return super()._get_common_dtype(dtypes) + @cache_readonly def index_class(self) -> type_t[DatetimeIndex]: from pandas import DatetimeIndex diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index b56bad7f2e833..ca784948a5d29 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -189,6 +189,14 @@ def test_union_with_DatetimeIndex(self, sort): # Fails with "AttributeError: can't set attribute" i2.union(i1, sort=sort) + def test_union_same_timezone_different_units(self): + # GH 55238 + idx1 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("ms") + idx2 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + result = idx1.union(idx2) + expected = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + tm.assert_index_equal(result, expected) + # TODO: moved from test_datetimelike; de-duplicate with version below def test_intersection2(self): first = tm.makeDateIndex(10) From 764c460cdd1d14dfa9a28d9367ab9874ac8037ba Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 25 Sep 2023 11:43:40 -0700 Subject: [PATCH 6/7] BUG: Series[slc]=foo raising with IntervalIndex (#55258) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/indexes/base.py | 12 +++------ .../tests/indexing/interval/test_interval.py | 27 +++++++++++++++++++ 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ca09a69f603ab..0760840f9950a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -292,6 +292,7 @@ Interval - Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. Additionally the hour, minute and second components will now be shown. (:issue:`55015`) - Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) - Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) +- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) - Indexing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8703fef1e5940..e23887159c9c6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4204,15 +4204,9 @@ def _convert_slice_indexer(self, key: slice, kind: Literal["loc", "getitem"]): self._validate_indexer("slice", key.step, "getitem") return key - # convert the slice to an indexer here - - # special case for interval_dtype bc we do not do partial-indexing - # on integer Intervals when slicing - # TODO: write this in terms of e.g. should_partial_index? - ints_are_positional = self._should_fallback_to_positional or isinstance( - self.dtype, IntervalDtype - ) - is_positional = is_index_slice and ints_are_positional + # convert the slice to an indexer here; checking that the user didn't + # pass a positional slice to loc + is_positional = is_index_slice and self._should_fallback_to_positional # if we are mixed and have integers if is_positional: diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 52a1d433712ff..ae25724972fde 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -134,6 +134,33 @@ def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl): tm.assert_equal(result, expected) + def test_setitem_interval_with_slice(self): + # GH#54722 + ii = IntervalIndex.from_breaks(range(4, 15)) + ser = Series(range(10), index=ii) + + orig = ser.copy() + + # This should be a no-op (used to raise) + ser.loc[1:3] = 20 + tm.assert_series_equal(ser, orig) + + ser.loc[6:8] = 19 + orig.iloc[1:4] = 19 + tm.assert_series_equal(ser, orig) + + ser2 = Series(range(5), index=ii[::2]) + orig2 = ser2.copy() + + # this used to raise + ser2.loc[6:8] = 22 # <- raises on main, sets on branch + orig2.iloc[1] = 22 + tm.assert_series_equal(ser2, orig2) + + ser2.loc[5:7] = 21 + orig2.iloc[:2] = 21 + tm.assert_series_equal(ser2, orig2) + class TestIntervalIndexInsideMultiIndex: def test_mi_intervalindex_slicing_with_scalar(self): From 89bd5695df3ccf5b9186b4645118656656831593 Mon Sep 17 00:00:00 2001 From: Artur Barseghyan Date: Tue, 26 Sep 2023 02:11:06 +0200 Subject: [PATCH 7/7] Update pyproject.toml - replace `output_formatting` with `output-formatting` (#55275) * Update pyproject.toml * Update v2.1.2.rst * Update v2.1.2.rst * Update doc/source/whatsnew/v2.1.2.rst * Update install.rst * Update package-checks.yml --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/package-checks.yml | 2 +- doc/source/getting_started/install.rst | 6 +++--- doc/source/whatsnew/v2.1.2.rst | 2 +- pyproject.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 64a94d7fde5a9..a2c42af53c3a8 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 2c0787397e047..4a0ad4ae658c3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -247,14 +247,14 @@ Dependency Minimum Version pip ext Visualization ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[plot, output_formatting]"``. +Installable with ``pip install "pandas[plot, output-formatting]"``. ========================= ================== ================== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== ================== ============================================================= matplotlib 3.6.1 plot Plotting library -Jinja2 3.1.2 output_formatting Conditional formatting with DataFrame.style -tabulate 0.8.10 output_formatting Printing in Markdown-friendly format (see `tabulate`_) +Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style +tabulate 0.8.10 output-formatting Printing in Markdown-friendly format (see `tabulate`_) ========================= ================== ================== ============================================================= Computation diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97aeb56924e65..6fec66ec8d556 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -29,7 +29,7 @@ Bug fixes Other ~~~~~ -- +- Fixed non-working installation of optional dependency group ``output_formatting``. Replacing underscore ``_`` with a dash ``-`` fixes broken dependency resolution. A correct way to use now is ``pip install pandas[output-formatting]``. - .. --------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index df89bd129901f..89432c2353ea8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ sql-other = ['SQLAlchemy>=1.4.36'] html = ['beautifulsoup4>=4.11.1', 'html5lib>=1.1', 'lxml>=4.8.0'] xml = ['lxml>=4.8.0'] plot = ['matplotlib>=3.6.1'] -output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] +output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] compression = ['zstandard>=0.17.0'] consortium-standard = ['dataframe-api-compat>=0.1.7']