Skip to content

Commit 35582e0

Browse files
committed
ENH: read_html() handles tables with multiple header rows #13434
1 parent 1e61aed commit 35582e0

File tree

2 files changed

+32
-2
lines changed

2 files changed

+32
-2
lines changed

pandas/io/html.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -355,7 +355,8 @@ def _parse_raw_thead(self, table):
355355
thead = self._parse_thead(table)
356356
res = []
357357
if thead:
358-
res = lmap(self._text_getter, self._parse_th(thead[0]))
358+
row = self._parse_th(thead[0])[0].find_parent('tr')
359+
res = lmap(self._text_getter, self._parse_th(row))
359360
return np.atleast_1d(
360361
np.array(res).squeeze()) if res and len(res) == 1 else res
361362

@@ -591,7 +592,7 @@ def _parse_tfoot(self, table):
591592
return table.xpath('.//tfoot')
592593

593594
def _parse_raw_thead(self, table):
594-
expr = './/thead//th'
595+
expr = './/thead//tr[th][1]//th'
595596
return [_remove_whitespace(x.text_content()) for x in
596597
table.xpath(expr)]
597598

pandas/io/tests/test_html.py

+29
Original file line number | Diff line number | Diff line change
@@ -694,6 +694,7 @@ def test_bool_header_arg(self):
694694
with tm.assertRaises(TypeError):
695695
read_html(self.spam_data, header=arg)
696696

697+
697698
def test_converters(self):
698699
# GH 13461
699700
html_data = """<table>
@@ -760,6 +761,34 @@ def test_keep_default_na(self):
760761
html_df = read_html(html_data, keep_default_na=True)[0]
761762
tm.assert_frame_equal(expected_df, html_df)
762763

764+
def test_multiple_header(self):
765+
data = StringIO('''<table border="1" class="dataframe">
766+
<thead>
767+
<tr style="text-align: right;">
768+
<th>Name</th>
769+
<th>Age</th>
770+
<th>Party</th>
771+
</tr>
772+
<tr>
773+
<th></th>
774+
<th>Gender</th>
775+
<th></th>
776+
</tr>
777+
</thead>
778+
<tbody>
779+
<tr>
780+
<th>Hillary</th>
781+
<td>68</td>
782+
<td>D</td>
783+
</tr>
784+
</tbody>
785+
</table>''')
786+
expected = DataFrame(columns=["Name", "Age", "Party"],
787+
data=[("Hillary", 68, "D")])
788+
result = self.read_html(data)[0]
789+
tm.assert_frame_equal(expected, result)
790+
791+
763792

764793
def _lang_enc(filename):
765794
return os.path.splitext(os.path.basename(filename))[0].split('_')

0 commit comments

Comments
 (0)