From ce9f3ed1a827a61b98e7973e8ee7e1993c12ef06 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sun, 1 Dec 2024 18:28:53 -0800 Subject: [PATCH 1/4] BUG: Fix pd.read_html handling of rowspan in table header --- pandas/io/html.py | 53 +++++++++++++++++++++++------------- pandas/tests/io/test_html.py | 27 ++++++++++++++++++ 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index c9897f628fdc9..bf39321533e1b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -454,14 +454,25 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, section="header") - body = self._expand_colspan_rowspan(body_rows, section="body") - footer = self._expand_colspan_rowspan(footer_rows, section="footer") + header, rem = self._expand_colspan_rowspan(header_rows, section="header") + body, rem = self._expand_colspan_rowspan( + body_rows, + section="body", + remainder=rem, + overflow=True if len(footer_rows) > 0 else False, + ) + footer, _ = self._expand_colspan_rowspan( + footer_rows, section="footer", remainder=rem, overflow=False + ) return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "footer", "body"] + self, + rows, + section: Literal["header", "footer", "body"], + remainder: list[int, tuple[str | tuple, int]] | None = None, + overflow: bool = True, ) -> list[list]: """ Given a list of <tr>s, return a list of text rows. @@ -471,6 +482,11 @@ def _expand_colspan_rowspan( rows : list of node-like List of <tr>s section : the section that the rows belong to (header, body or footer). + remainder: list[int, tuple[str | tuple, int]] | None + Any remainder from the expansion of previous section + overflow: bool + If true, return any partial rows as 'remainder'. If not, use up any + partial rows. True by default.
Returns ------- @@ -485,9 +501,7 @@ def _expand_colspan_rowspan( """ all_texts = [] # list of rows, each a list of str text: str | tuple - remainder: list[ - tuple[int, str | tuple, int] - ] = [] # list of (index, text, nrows) + remainder = remainder if remainder is not None else [] for tr in rows: texts = [] # the output for this row @@ -528,19 +542,20 @@ def _expand_colspan_rowspan( all_texts.append(texts) remainder = next_remainder - # Append rows that only appear because the previous row had non-1 - # rowspan - while remainder: - next_remainder = [] - texts = [] - for prev_i, prev_text, prev_rowspan in remainder: - texts.append(prev_text) - if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) - all_texts.append(texts) - remainder = next_remainder + if not overflow: + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder - return all_texts + return all_texts, remainder def _handle_hidden_tables(self, tbl_list, attr_name: str): """ diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 73e9933e3681b..bef28c4f027da 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html): tm.assert_frame_equal(result, expected) + def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html): + # GH60210 + + result = flavor_read_html( + StringIO( + """ + <table> + <tr> + <th rowspan="2">A</th> + <th>B</th> + </tr> + <tr> + <td>1</td> + </tr> + <tr> + <td>C</td> + <td>2</td> + </tr> + </table> + """ + ) + )[0] + + expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 result = flavor_read_html( From 36e09fe3350a3e492efc46e23745ef008cc9cd21 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sun, 1 Dec 2024 18:52:06 -0800 Subject: [PATCH 2/4] BUG: Fix docstring error in _expand_colspan_rowspan --- pandas/io/html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index bf39321533e1b..fa23876a16ee5 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -471,7 +471,7 @@ def _expand_colspan_rowspan( self, rows, section: Literal["header", "footer", "body"], - remainder: list[int, tuple[str | tuple, int]] | None = None, + remainder: list[tuple[int, str | tuple, int]] | None = None, overflow: bool = True, ) -> list[list]: """ @@ -482,7 +482,7 @@ def _expand_colspan_rowspan( rows : list of node-like List of <tr>s section : the section that the rows belong to (header, body or footer). - remainder: list[int, tuple[str | tuple, int]] | None + remainder: list[tuple[int, str | tuple, int]] | None Any remainder from the expansion of previous section overflow: bool If true, return any partial rows as 'remainder'.
If not, use up any From b22ca47dee082a06f0fad8ca09dd048cb064af60 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sun, 1 Dec 2024 19:12:16 -0800 Subject: [PATCH 3/4] BUG: Update return type for _expand_colspan_rowspan --- pandas/io/html.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index fa23876a16ee5..e5ae9d90eb36f 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -473,7 +473,7 @@ def _expand_colspan_rowspan( section: Literal["header", "footer", "body"], remainder: list[tuple[int, str | tuple, int]] | None = None, overflow: bool = True, - ) -> list[list]: + ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]: """ Given a list of <tr>s, return a list of text rows. @@ -493,6 +493,9 @@ def _expand_colspan_rowspan( list of list Each returned row is a list of str text, or tuple (text, link) if extract_links is not None. + remainder + Remaining partial rows if any. If overflow is False, an empty list + is returned. Notes ----- From 1f0f4d81014689afc87f718bd3806ad7ccdb7b5a Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 2 Dec 2024 10:45:21 -0800 Subject: [PATCH 4/4] BUG: Address review and add note to whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/html.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e74bd2f745b94..3eaf1b9e8ebba 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -700,6 +700,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``.
(:issue:`58159`) +- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) diff --git a/pandas/io/html.py b/pandas/io/html.py index e5ae9d90eb36f..183af3a03221b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -459,7 +459,7 @@ def row_is_all_th(row): body_rows, section="body", remainder=rem, - overflow=True if len(footer_rows) > 0 else False, + overflow=len(footer_rows) > 0, ) footer, _ = self._expand_colspan_rowspan( footer_rows, section="footer", remainder=rem, overflow=False