Skip to content

Commit 7587bf1

Browse files
committed
Added ability to read footer to read_html, and a test
1 parent 0a2ea0a commit 7587bf1

File tree

3 files changed

+38
-2
lines changed

3 files changed

+38
-2
lines changed

doc/source/whatsnew/v0.15.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ Enhancements
6666
- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`).
6767
- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here<remote_data.ga>`.
6868
- Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here <io.stata-categorical>` for more information on importing categorical variables from Stata data files.
69+
- Added ability to read the table footer (``<tfoot>``) in ``read_html`` (:issue:`8552`).
6970

7071
.. _whatsnew_0152.performance:
7172

pandas/io/html.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,7 @@ def _parse_raw_thead(self, table):
577577
table.xpath(expr)]
578578

579579
def _parse_raw_tfoot(self, table):
580-
expr = './/tfoot//th'
580+
expr = './/tfoot//th|//tfoot//td'
581581
return [_remove_whitespace(x.text_content()) for x in
582582
table.xpath(expr)]
583583

@@ -594,14 +594,17 @@ def _expand_elements(body):
594594

595595
def _data_to_frame(data, header, index_col, skiprows, infer_types,
596596
parse_dates, tupleize_cols, thousands):
597-
head, body, _ = data # _ is footer which is rarely used: ignore for now
597+
head, body, foot = data
598598

599599
if head:
600600
body = [head] + body
601601

602602
if header is None: # special case when a table has <th> elements
603603
header = 0
604604

605+
if foot:
606+
body += [foot]
607+
605608
# fill out elements of body that are "ragged"
606609
_expand_elements(body)
607610

pandas/io/tests/test_html.py

+32
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,38 @@ def test_empty_tables(self):
426426
res1 = self.read_html(StringIO(data1))
427427
res2 = self.read_html(StringIO(data2))
428428
assert_framelist_equal(res1, res2)
429+
430+
def test_tfoot_read(self):
431+
"""
432+
Make sure that read_html reads tfoot, containing td or th.
433+
Ignores empty tfoot
434+
"""
435+
data_template = '''<table>
436+
<thead>
437+
<tr>
438+
<th>A</th>
439+
<th>B</th>
440+
</tr>
441+
</thead>
442+
<tbody>
443+
<tr>
444+
<td>bodyA</td>
445+
<td>bodyB</td>
446+
</tr>
447+
</tbody>
448+
<tfoot>
449+
{footer}
450+
</tfoot>
451+
</table>'''
452+
453+
data1 = data_template.format(footer = "")
454+
data2 = data_template.format(footer ="<tr><td>footA</td><th>footB</th></tr>")
455+
456+
d1 = {'A': ['bodyA'], 'B': ['bodyB']}
457+
d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']}
458+
459+
tm.assert_frame_equal(self.read_html(data1)[0], DataFrame(d1))
460+
tm.assert_frame_equal(self.read_html(data2)[0], DataFrame(d2))
429461

430462
def test_countries_municipalities(self):
431463
# GH5048

0 commit comments

Comments
 (0)