diff --git a/pandas/io/html.py b/pandas/io/html.py index 3c38dae91eb89..83344f2f6992e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -355,7 +355,8 @@ def _parse_raw_thead(self, table): thead = self._parse_thead(table) res = [] if thead: - res = lmap(self._text_getter, self._parse_th(thead[0])) + row = self._parse_th(thead[0])[0].find_parent('tr') + res = lmap(self._text_getter, self._parse_th(row)) return np.atleast_1d( np.array(res).squeeze()) if res and len(res) == 1 else res @@ -591,7 +592,7 @@ def _parse_tfoot(self, table): return table.xpath('.//tfoot') def _parse_raw_thead(self, table): - expr = './/thead//th' + expr = './/thead//tr[th][1]//th' return [_remove_whitespace(x.text_content()) for x in table.xpath(expr)] diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 7b4e775db9476..030444d7c807a 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -694,6 +694,7 @@ def test_bool_header_arg(self): with tm.assertRaises(TypeError): read_html(self.spam_data, header=arg) + def test_converters(self): # GH 13461 html_data = """ @@ -760,6 +761,34 @@ def test_keep_default_na(self): html_df = read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) + def test_multiple_header(self): + data = StringIO('''
+ + + + + + + + + + + + + + + + + + + +
NameAgeParty
Gender
Hillary68D
''') + expected = DataFrame(columns=["Name", "Age", "Party"], + data=[("Hillary", 68, "D")]) + result = self.read_html(data)[0] + tm.assert_frame_equal(expected, result) + + def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_')