Skip to content

Commit 926f241

Browse files
adamhooper authored and TomAugspurger committed
Read from multiple <tbody> within a <table> (#20891)
* Read from multiple <tbody> within a <table> refs #20690
1 parent 7b683f4 commit 926f241

File tree

3 files changed

+41
-9
lines changed

3 files changed

+41
-9
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ Other Enhancements
495495
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
496496
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
497497
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
498+
- :func:`read_html` now reads all ``<tbody>`` elements in a ``<table>``, not just the first. (:issue:`20690`)
498499
- :meth:`~pandas.core.window.Rolling.quantile` and :meth:`~pandas.core.window.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`)
499500
- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
500501
- :class:`pandas.tseries.api.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`).

pandas/io/html.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ def _parse_thead(self, table):
324324
raise com.AbstractMethodError(self)
325325

326326
def _parse_tbody(self, table):
327-
"""Return the body of the table.
327+
"""Return the list of tbody elements from the parsed table element.
328328
329329
Parameters
330330
----------
@@ -333,8 +333,8 @@ def _parse_tbody(self, table):
333333
334334
Returns
335335
-------
336-
tbody : node-like
337-
A <tbody>...</tbody> element.
336+
tbodies : list of node-like
337+
A list of <tbody>...</tbody> elements
338338
"""
339339
raise com.AbstractMethodError(self)
340340

@@ -388,13 +388,17 @@ def _parse_raw_tfoot(self, table):
388388
np.array(res).squeeze()) if res and len(res) == 1 else res
389389

390390
def _parse_raw_tbody(self, table):
391-
tbody = self._parse_tbody(table)
391+
tbodies = self._parse_tbody(table)
392392

393-
try:
394-
res = self._parse_tr(tbody[0])
395-
except IndexError:
396-
res = self._parse_tr(table)
397-
return self._parse_raw_data(res)
393+
raw_data = []
394+
395+
if tbodies:
396+
for tbody in tbodies:
397+
raw_data.extend(self._parse_tr(tbody))
398+
else:
399+
raw_data.extend(self._parse_tr(table))
400+
401+
return self._parse_raw_data(raw_data)
398402

399403
def _handle_hidden_tables(self, tbl_list, attr_name):
400404
"""Returns list of tables, potentially removing hidden elements

pandas/tests/io/test_html.py

+27
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,33 @@ def test_empty_tables(self):
396396
res2 = self.read_html(StringIO(data2))
397397
assert_framelist_equal(res1, res2)
398398

399+
def test_multiple_tbody(self):
400+
# GH-20690
401+
# Read all tbody tags within a single table.
402+
data = '''<table>
403+
<thead>
404+
<tr>
405+
<th>A</th>
406+
<th>B</th>
407+
</tr>
408+
</thead>
409+
<tbody>
410+
<tr>
411+
<td>1</td>
412+
<td>2</td>
413+
</tr>
414+
</tbody>
415+
<tbody>
416+
<tr>
417+
<td>3</td>
418+
<td>4</td>
419+
</tr>
420+
</tbody>
421+
</table>'''
422+
expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
423+
result = self.read_html(StringIO(data))[0]
424+
tm.assert_frame_equal(result, expected)
425+
399426
def test_header_and_one_column(self):
400427
"""
401428
Don't fail with bs4 when there is a header and only one column

0 commit comments

Comments
 (0)