From ce9f3ed1a827a61b98e7973e8ee7e1993c12ef06 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu <snitish.iitk@gmail.com>
Date: Sun, 1 Dec 2024 18:28:53 -0800
Subject: [PATCH 1/4] BUG: Fix pd.read_html handling of rowspan in table header

---
 pandas/io/html.py            | 53 +++++++++++++++++++++++-------------
 pandas/tests/io/test_html.py | 27 ++++++++++++++++++
 2 files changed, 61 insertions(+), 19 deletions(-)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index c9897f628fdc9..bf39321533e1b 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -454,14 +454,25 @@ def row_is_all_th(row):
             while body_rows and row_is_all_th(body_rows[0]):
                 header_rows.append(body_rows.pop(0))
 
-        header = self._expand_colspan_rowspan(header_rows, section="header")
-        body = self._expand_colspan_rowspan(body_rows, section="body")
-        footer = self._expand_colspan_rowspan(footer_rows, section="footer")
+        header, rem = self._expand_colspan_rowspan(header_rows, section="header")
+        body, rem = self._expand_colspan_rowspan(
+            body_rows,
+            section="body",
+            remainder=rem,
+            overflow=True if len(footer_rows) > 0 else False,
+        )
+        footer, _ = self._expand_colspan_rowspan(
+            footer_rows, section="footer", remainder=rem, overflow=False
+        )
 
         return header, body, footer
 
     def _expand_colspan_rowspan(
-        self, rows, section: Literal["header", "footer", "body"]
+        self,
+        rows,
+        section: Literal["header", "footer", "body"],
+        remainder: list[int, tuple[str | tuple, int]] | None = None,
+        overflow: bool = True,
     ) -> list[list]:
         """
         Given a list of <tr>s, return a list of text rows.
@@ -471,6 +482,11 @@ def _expand_colspan_rowspan(
         rows : list of node-like
             List of <tr>s
         section : the section that the rows belong to (header, body or footer).
+        remainder: list[int, tuple[str | tuple, int]] | None
+            Any remainder from the expansion of previous section
+        overflow: bool
+            If true, return any partial rows as 'remainder'. If not, use up any
+            partial rows. True by default.
 
         Returns
         -------
@@ -485,9 +501,7 @@ def _expand_colspan_rowspan(
         """
         all_texts = []  # list of rows, each a list of str
         text: str | tuple
-        remainder: list[
-            tuple[int, str | tuple, int]
-        ] = []  # list of (index, text, nrows)
+        remainder = remainder if remainder is not None else []
 
         for tr in rows:
             texts = []  # the output for this row
@@ -528,19 +542,20 @@ def _expand_colspan_rowspan(
             all_texts.append(texts)
             remainder = next_remainder
 
-        # Append rows that only appear because the previous row had non-1
-        # rowspan
-        while remainder:
-            next_remainder = []
-            texts = []
-            for prev_i, prev_text, prev_rowspan in remainder:
-                texts.append(prev_text)
-                if prev_rowspan > 1:
-                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
-            all_texts.append(texts)
-            remainder = next_remainder
+        if not overflow:
+            # Append rows that only appear because the previous row had non-1
+            # rowspan
+            while remainder:
+                next_remainder = []
+                texts = []
+                for prev_i, prev_text, prev_rowspan in remainder:
+                    texts.append(prev_text)
+                    if prev_rowspan > 1:
+                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+                all_texts.append(texts)
+                remainder = next_remainder
 
-        return all_texts
+        return all_texts, remainder
 
     def _handle_hidden_tables(self, tbl_list, attr_name: str):
         """
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 73e9933e3681b..bef28c4f027da 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html):
+        # GH60210
+
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th rowspan="2">A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>1</td>
+                </tr>
+                <tr>
+                    <td>C</td>
+                    <td>2</td>
+                </tr>
+            </table>
+        """
+            )
+        )[0]
+
+        expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
     def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
         # GH17054
         result = flavor_read_html(

From 36e09fe3350a3e492efc46e23745ef008cc9cd21 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu <snitish.iitk@gmail.com>
Date: Sun, 1 Dec 2024 18:52:06 -0800
Subject: [PATCH 2/4] BUG: Fix docstring error in _expand_colspan_rowspan

---
 pandas/io/html.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index bf39321533e1b..fa23876a16ee5 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -471,7 +471,7 @@ def _expand_colspan_rowspan(
         self,
         rows,
         section: Literal["header", "footer", "body"],
-        remainder: list[int, tuple[str | tuple, int]] | None = None,
+        remainder: list[tuple[int, str | tuple, int]] | None = None,
         overflow: bool = True,
     ) -> list[list]:
         """
@@ -482,7 +482,7 @@ def _expand_colspan_rowspan(
         rows : list of node-like
             List of <tr>s
         section : the section that the rows belong to (header, body or footer).
-        remainder: list[int, tuple[str | tuple, int]] | None
+        remainder: list[tuple[int, str | tuple, int]] | None
             Any remainder from the expansion of previous section
         overflow: bool
             If true, return any partial rows as 'remainder'. If not, use up any

From b22ca47dee082a06f0fad8ca09dd048cb064af60 Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu <snitish.iitk@gmail.com>
Date: Sun, 1 Dec 2024 19:12:16 -0800
Subject: [PATCH 3/4] BUG: Update return type for _expand_colspan_rowspan

---
 pandas/io/html.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index fa23876a16ee5..e5ae9d90eb36f 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -473,7 +473,7 @@ def _expand_colspan_rowspan(
         section: Literal["header", "footer", "body"],
         remainder: list[tuple[int, str | tuple, int]] | None = None,
         overflow: bool = True,
-    ) -> list[list]:
+    ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]:
         """
         Given a list of <tr>s, return a list of text rows.
 
@@ -493,6 +493,9 @@ def _expand_colspan_rowspan(
         list of list
             Each returned row is a list of str text, or tuple (text, link)
             if extract_links is not None.
+        remainder
+            Remaining partial rows if any. If overflow is False, an empty list
+            is returned.
 
         Notes
         -----

From 1f0f4d81014689afc87f718bd3806ad7ccdb7b5a Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu <snitish.iitk@gmail.com>
Date: Mon, 2 Dec 2024 10:45:21 -0800
Subject: [PATCH 4/4] BUG: Address review and add note to whatsnew

---
 doc/source/whatsnew/v3.0.0.rst | 1 +
 pandas/io/html.py              | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e74bd2f745b94..3eaf1b9e8ebba 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -700,6 +700,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
+- Bug in :meth:`read_html` where a ``rowspan`` in a header row causes incorrect conversion to a ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index e5ae9d90eb36f..183af3a03221b 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -459,7 +459,7 @@ def row_is_all_th(row):
             body_rows,
             section="body",
             remainder=rem,
-            overflow=True if len(footer_rows) > 0 else False,
+            overflow=len(footer_rows) > 0,
         )
         footer, _ = self._expand_colspan_rowspan(
             footer_rows, section="footer", remainder=rem, overflow=False