diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e74bd2f745b94..3eaf1b9e8ebba 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -700,6 +700,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_html` where a ``rowspan`` in a header row that extends into the body rows caused incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index c9897f628fdc9..183af3a03221b 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -454,15 +454,26 @@ def row_is_all_th(row):
while body_rows and row_is_all_th(body_rows[0]):
header_rows.append(body_rows.pop(0))
- header = self._expand_colspan_rowspan(header_rows, section="header")
- body = self._expand_colspan_rowspan(body_rows, section="body")
- footer = self._expand_colspan_rowspan(footer_rows, section="footer")
+ header, rem = self._expand_colspan_rowspan(header_rows, section="header")
+ body, rem = self._expand_colspan_rowspan(
+ body_rows,
+ section="body",
+ remainder=rem,
+ overflow=len(footer_rows) > 0,
+ )
+ footer, _ = self._expand_colspan_rowspan(
+ footer_rows, section="footer", remainder=rem, overflow=False
+ )
return header, body, footer
def _expand_colspan_rowspan(
- self, rows, section: Literal["header", "footer", "body"]
- ) -> list[list]:
+ self,
+ rows,
+ section: Literal["header", "footer", "body"],
+ remainder: list[tuple[int, str | tuple, int]] | None = None,
+ overflow: bool = True,
+ ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]:
"""
Given a list of <tr>s, return a list of text rows.
@@ -471,12 +482,20 @@ def _expand_colspan_rowspan(
rows : list of node-like
List of <tr>s
section : the section that the rows belong to (header, body or footer).
+ remainder : list[tuple[int, str | tuple, int]] | None, default None
+ Leftover (index, text, nrows) cells from the previous section.
+ overflow : bool, default True
+ If True, return any partial rows as 'remainder'. If False, use up any
+ partial rows by appending them as extra rows.
Returns
-------
list of list
Each returned row is a list of str text, or tuple (text, link)
if extract_links is not None.
+ remainder
+ Remaining partial rows if any. If overflow is False, an empty list
+ is returned.
Notes
-----
@@ -485,9 +504,7 @@ def _expand_colspan_rowspan(
"""
all_texts = [] # list of rows, each a list of str
text: str | tuple
- remainder: list[
- tuple[int, str | tuple, int]
- ] = [] # list of (index, text, nrows)
+ remainder = remainder if remainder is not None else []
for tr in rows:
texts = [] # the output for this row
@@ -528,19 +545,20 @@ def _expand_colspan_rowspan(
all_texts.append(texts)
remainder = next_remainder
- # Append rows that only appear because the previous row had non-1
- # rowspan
- while remainder:
- next_remainder = []
- texts = []
- for prev_i, prev_text, prev_rowspan in remainder:
- texts.append(prev_text)
- if prev_rowspan > 1:
- next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
- all_texts.append(texts)
- remainder = next_remainder
+ if not overflow:
+ # Append rows that only appear because the previous row had non-1
+ # rowspan
+ while remainder:
+ next_remainder = []
+ texts = []
+ for prev_i, prev_text, prev_rowspan in remainder:
+ texts.append(prev_text)
+ if prev_rowspan > 1:
+ next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+ all_texts.append(texts)
+ remainder = next_remainder
- return all_texts
+ return all_texts, remainder
def _handle_hidden_tables(self, tbl_list, attr_name: str):
"""
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 73e9933e3681b..bef28c4f027da 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html):
tm.assert_frame_equal(result, expected)
+ def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html):
+ # GH60210
+
+ result = flavor_read_html(
+ StringIO(
+ """
+ <table>
+     <tr>
+         <th rowspan="2">A</th>
+         <th>B</th>
+     </tr>
+     <tr>
+         <td>1</td>
+     </tr>
+     <tr>
+         <td>C</td>
+         <td>2</td>
+     </tr>
+ </table>
+ """
+ )
+ )[0]
+
+ expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"])
+
+ tm.assert_frame_equal(result, expected)
+
def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
# GH17054
result = flavor_read_html(