Skip to content

Commit 47ece9d

Browse files
committed
ENH: read_html() handles tables with multiple header rows (pandas-dev#13434)
1 parent 0c8442c commit 47ece9d

File tree

2 files changed

+42
-10
lines changed

2 files changed

+42
-10
lines changed

pandas/io/html.py

+20-10
Original file line numberDiff line numberDiff line change
@@ -355,9 +355,12 @@ def _parse_raw_thead(self, table):
355355
thead = self._parse_thead(table)
356356
res = []
357357
if thead:
358-
res = lmap(self._text_getter, self._parse_th(thead[0]))
359-
return np.atleast_1d(
360-
np.array(res).squeeze()) if res and len(res) == 1 else res
358+
trs = self._parse_tr(thead[0])
359+
for tr in trs:
360+
cols = lmap(self._text_getter, self._parse_td(tr))
361+
if any([col != '' for col in cols]):
362+
res.append(cols)
363+
return res
361364

362365
def _parse_raw_tfoot(self, table):
363366
tfoot = self._parse_tfoot(table)
@@ -591,9 +594,17 @@ def _parse_tfoot(self, table):
591594
return table.xpath('.//tfoot')
592595

593596
def _parse_raw_thead(self, table):
594-
expr = './/thead//th'
595-
return [_remove_whitespace(x.text_content()) for x in
596-
table.xpath(expr)]
597+
expr = './/thead'
598+
thead = table.xpath(expr)
599+
res = []
600+
if thead:
601+
trs = self._parse_tr(thead[0])
602+
for tr in trs:
603+
cols = [_remove_whitespace(x.text_content()) for x in
604+
self._parse_td(tr)]
605+
if any([col != '' for col in cols]):
606+
res.append(cols)
607+
return res
597608

598609
def _parse_raw_tfoot(self, table):
599610
expr = './/tfoot//th|//tfoot//td'
@@ -615,12 +626,11 @@ def _data_to_frame(**kwargs):
615626
head, body, foot = kwargs.pop('data')
616627
header = kwargs.pop('header')
617628
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
618-
619629
if head:
620-
body = [head] + body
621-
630+
rows = range(len(head))
631+
body = head + body
622632
if header is None: # special case when a table has <th> elements
623-
header = 0
633+
header = 0 if rows == [0] else rows
624634

625635
if foot:
626636
body += [foot]

pandas/io/tests/test_html.py

+22
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,17 @@ def test_keep_default_na(self):
760760
html_df = read_html(html_data, keep_default_na=True)[0]
761761
tm.assert_frame_equal(expected_df, html_df)
762762

763+
def test_multiple_header_rows(self):
764+
expected_df = DataFrame(data=[("Hillary", 68, "D"),
765+
("Bernie", 74, "D"),
766+
("Donald", 69, "R")])
767+
expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
768+
["Name", "Unnamed: 1_level_1",
769+
"Unnamed: 2_level_1"]]
770+
html = expected_df.to_html(index=False)
771+
html_df = read_html(html, )[0]
772+
tm.assert_frame_equal(expected_df, html_df)
773+
763774

764775
def _lang_enc(filename):
765776
return os.path.splitext(os.path.basename(filename))[0].split('_')
@@ -869,6 +880,17 @@ def test_computer_sales_page(self):
869880
data = os.path.join(DATA_PATH, 'computer_sales_page.html')
870881
self.read_html(data, header=[0, 1])
871882

883+
def test_multiple_header_rows(self):
884+
expected_df = DataFrame(data=[("Hillary", 68, "D"),
885+
("Bernie", 74, "D"),
886+
("Donald", 69, "R")])
887+
expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"],
888+
["Name", "Unnamed: 1_level_1",
889+
"Unnamed: 2_level_1"]]
890+
html = expected_df.to_html(index=False)
891+
html_df = read_html(html, )[0]
892+
tm.assert_frame_equal(expected_df, html_df)
893+
872894

873895
def test_invalid_flavor():
874896
url = 'google.com'

0 commit comments

Comments
 (0)