Skip to content

Commit d9dfaa9

Browse files
authored
BUG: Fix pd.read_html handling of rowspan in table header (pandas-dev#60464)
* BUG: Fix pd.read_html handling of rowspan in table header * BUG: Fix docstring error in _expand_colspan_rowspan * BUG: Update return type for _expand_colspan_rowspan * BUG: Address review and add note to whatsnew
1 parent e631442 commit d9dfaa9

File tree

3 files changed

+66
-20
lines changed

3 files changed

+66
-20
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,7 @@ I/O
701701
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
702702
- Bug in :meth:`read_csv` where the order of ``na_values`` produces inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
703703
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
704+
- Bug in :meth:`read_html` where a ``rowspan`` in the header row causes incorrect conversion to a ``DataFrame``. (:issue:`60210`)
704705
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
705706
- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`)
706707
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

pandas/io/html.py

+38-20
Original file line numberDiff line numberDiff line change
@@ -454,15 +454,26 @@ def row_is_all_th(row):
454454
while body_rows and row_is_all_th(body_rows[0]):
455455
header_rows.append(body_rows.pop(0))
456456

457-
header = self._expand_colspan_rowspan(header_rows, section="header")
458-
body = self._expand_colspan_rowspan(body_rows, section="body")
459-
footer = self._expand_colspan_rowspan(footer_rows, section="footer")
457+
header, rem = self._expand_colspan_rowspan(header_rows, section="header")
458+
body, rem = self._expand_colspan_rowspan(
459+
body_rows,
460+
section="body",
461+
remainder=rem,
462+
overflow=len(footer_rows) > 0,
463+
)
464+
footer, _ = self._expand_colspan_rowspan(
465+
footer_rows, section="footer", remainder=rem, overflow=False
466+
)
460467

461468
return header, body, footer
462469

463470
def _expand_colspan_rowspan(
464-
self, rows, section: Literal["header", "footer", "body"]
465-
) -> list[list]:
471+
self,
472+
rows,
473+
section: Literal["header", "footer", "body"],
474+
remainder: list[tuple[int, str | tuple, int]] | None = None,
475+
overflow: bool = True,
476+
) -> tuple[list[list], list[tuple[int, str | tuple, int]]]:
466477
"""
467478
Given a list of <tr>s, return a list of text rows.
468479
@@ -471,12 +482,20 @@ def _expand_colspan_rowspan(
471482
rows : list of node-like
472483
List of <tr>s
473484
section : the section that the rows belong to (header, body or footer).
485+
remainder: list[tuple[int, str | tuple, int]] | None
486+
Any remainder from the expansion of previous section
487+
overflow: bool
488+
If true, return any partial rows as 'remainder'. If not, use up any
489+
partial rows. True by default.
474490
475491
Returns
476492
-------
477493
list of list
478494
Each returned row is a list of str text, or tuple (text, link)
479495
if extract_links is not None.
496+
remainder
497+
Remaining partial rows if any. If overflow is False, an empty list
498+
is returned.
480499
481500
Notes
482501
-----
@@ -485,9 +504,7 @@ def _expand_colspan_rowspan(
485504
"""
486505
all_texts = [] # list of rows, each a list of str
487506
text: str | tuple
488-
remainder: list[
489-
tuple[int, str | tuple, int]
490-
] = [] # list of (index, text, nrows)
507+
remainder = remainder if remainder is not None else []
491508

492509
for tr in rows:
493510
texts = [] # the output for this row
@@ -528,19 +545,20 @@ def _expand_colspan_rowspan(
528545
all_texts.append(texts)
529546
remainder = next_remainder
530547

531-
# Append rows that only appear because the previous row had non-1
532-
# rowspan
533-
while remainder:
534-
next_remainder = []
535-
texts = []
536-
for prev_i, prev_text, prev_rowspan in remainder:
537-
texts.append(prev_text)
538-
if prev_rowspan > 1:
539-
next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
540-
all_texts.append(texts)
541-
remainder = next_remainder
548+
if not overflow:
549+
# Append rows that only appear because the previous row had non-1
550+
# rowspan
551+
while remainder:
552+
next_remainder = []
553+
texts = []
554+
for prev_i, prev_text, prev_rowspan in remainder:
555+
texts.append(prev_text)
556+
if prev_rowspan > 1:
557+
next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
558+
all_texts.append(texts)
559+
remainder = next_remainder
542560

543-
return all_texts
561+
return all_texts, remainder
544562

545563
def _handle_hidden_tables(self, tbl_list, attr_name: str):
546564
"""

pandas/tests/io/test_html.py

+27
Original file line numberDiff line numberDiff line change
@@ -1004,6 +1004,33 @@ def test_rowspan_only_rows(self, flavor_read_html):
10041004

10051005
tm.assert_frame_equal(result, expected)
10061006

1007+
def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html):
1008+
# GH60210
1009+
1010+
result = flavor_read_html(
1011+
StringIO(
1012+
"""
1013+
<table>
1014+
<tr>
1015+
<th rowspan="2">A</th>
1016+
<th>B</th>
1017+
</tr>
1018+
<tr>
1019+
<td>1</td>
1020+
</tr>
1021+
<tr>
1022+
<td>C</td>
1023+
<td>2</td>
1024+
</tr>
1025+
</table>
1026+
"""
1027+
)
1028+
)[0]
1029+
1030+
expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"])
1031+
1032+
tm.assert_frame_equal(result, expected)
1033+
10071034
def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
10081035
# GH17054
10091036
result = flavor_read_html(

0 commit comments

Comments
 (0)