Skip to content

Commit 6fa0489

Browse files
committed
Do not drop rows whose cells are all empty
... but _ignore_ all-empty rows when inferring the header (column) rows. This changes the behavior of test_spam_header, which previously skipped an empty row even when the user explicitly stated the row number to use as header.
1 parent ad6e869 commit 6fa0489

File tree

2 files changed

+92
-51
lines changed

2 files changed

+92
-51
lines changed

pandas/io/html.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -496,11 +496,7 @@ def _expand_colspan_rowspan(self, rows):
496496
all_texts.append(texts)
497497
remainder = next_remainder
498498

499-
# ignore all-empty-text rows
500-
no_empty = [row for row in all_texts
501-
if any(text for text in row)]
502-
503-
return no_empty
499+
return all_texts
504500

505501
def _handle_hidden_tables(self, tbl_list, attr_name):
506502
"""
@@ -785,10 +781,16 @@ def _data_to_frame(**kwargs):
785781
header = kwargs.pop('header')
786782
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
787783
if head:
788-
rows = lrange(len(head))
789784
body = head + body
790-
if header is None: # special case when a table has <th> elements
791-
header = 0 if rows == [0] else rows
785+
786+
# Infer header when there is a <thead> or top <th>-only rows
787+
if header is None:
788+
if len(head) == 1:
789+
header = 0
790+
else:
791+
# ignore all-empty-text rows
792+
header = [i for i, row in enumerate(head)
793+
if any(text for text in row)]
792794

793795
if foot:
794796
body += foot

pandas/tests/io/test_html.py

+82-43
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
date_range, Series)
1616
from pandas.compat import (map, zip, StringIO, BytesIO,
1717
is_platform_windows, PY3, reload)
18+
from pandas.errors import ParserError
1819
from pandas.io.common import URLError, file_path_to_url
1920
import pandas.io.html
2021
from pandas.io.html import read_html
@@ -147,7 +148,7 @@ def test_banklist_no_match(self):
147148
assert isinstance(df, DataFrame)
148149

149150
def test_spam_header(self):
150-
df = self.read_html(self.spam_data, '.*Water.*', header=1)[0]
151+
df = self.read_html(self.spam_data, '.*Water.*', header=2)[0]
151152
assert df.columns[0] == 'Proximates'
152153
assert not df.empty
153154

@@ -424,7 +425,7 @@ def test_multiple_tbody(self):
424425
</tbody>
425426
</table>''')[0]
426427

427-
expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
428+
expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])
428429

429430
tm.assert_frame_equal(result, expected)
430431

@@ -471,11 +472,8 @@ def test_thead_without_tr(self):
471472
</tbody>
472473
</table>''')[0]
473474

474-
expected = DataFrame(data={
475-
'Country': ['Ukraine'],
476-
'Municipality': ['Odessa'],
477-
'Year': [1944],
478-
})
475+
expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]],
476+
columns=['Country', 'Municipality', 'Year'])
479477

480478
tm.assert_frame_equal(result, expected)
481479

@@ -502,9 +500,10 @@ def test_tfoot_read(self):
502500
</tfoot>
503501
</table>'''
504502

505-
expected1 = DataFrame({'A': ['bodyA'], 'B': ['bodyB']})
506-
expected2 = DataFrame({'A': ['bodyA', 'footA'],
507-
'B': ['bodyB', 'footB']})
503+
expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B'])
504+
505+
expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']],
506+
columns=['A', 'B'])
508507

509508
data1 = data_template.format(footer="")
510509
data2 = data_template.format(
@@ -532,7 +531,7 @@ def test_parse_header_of_non_string_column(self):
532531
</table>
533532
''', header=0)[0]
534533

535-
expected = DataFrame(data={'S': ['text'], 'I': [1944]})
534+
expected = DataFrame([['text', 1944]], columns=('S', 'I'))
536535

537536
tm.assert_frame_equal(result, expected)
538537

@@ -676,11 +675,7 @@ def test_colspan_rowspan_1(self):
676675
</table>
677676
""")[0]
678677

679-
expected = DataFrame(data={
680-
'A': ['a'],
681-
'B': ['b'],
682-
'C': ['c'],
683-
})
678+
expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C'])
684679

685680
tm.assert_frame_equal(result, expected)
686681

@@ -708,13 +703,8 @@ def test_colspan_rowspan_copy_values(self):
708703
</table>
709704
""", header=0)[0]
710705

711-
expected = DataFrame(data={
712-
'X': ['A'],
713-
'X.1': ['B'],
714-
'Y': ['B'],
715-
'Z': ['Z'],
716-
'W': ['C'],
717-
})
706+
expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']],
707+
columns=['X', 'X.1', 'Y', 'Z', 'W'])
718708

719709
tm.assert_frame_equal(result, expected)
720710

@@ -739,13 +729,8 @@ def test_colspan_rowspan_both_not_1(self):
739729
</table>
740730
""", header=0)[0]
741731

742-
expected = DataFrame(data={
743-
'A': ['A'],
744-
'B': ['B'],
745-
'B.1': ['B'],
746-
'B.2': ['B'],
747-
'C': ['D'],
748-
})
732+
expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']],
733+
columns=['A', 'B', 'B.1', 'B.2', 'C'])
749734

750735
tm.assert_frame_equal(result, expected)
751736

@@ -769,10 +754,7 @@ def test_rowspan_at_end_of_row(self):
769754
</table>
770755
""", header=0)[0]
771756

772-
expected = DataFrame(data={
773-
'A': ['C'],
774-
'B': ['B']
775-
})
757+
expected = DataFrame(data=[['C', 'B']], columns=['A', 'B'])
776758

777759
tm.assert_frame_equal(result, expected)
778760

@@ -788,14 +770,12 @@ def test_rowspan_only_rows(self):
788770
</table>
789771
""", header=0)[0]
790772

791-
expected = DataFrame(data={
792-
'A': ['A', 'A'],
793-
'B': ['B', 'B'],
794-
})
773+
expected = DataFrame(data=[['A', 'B'], ['A', 'B']],
774+
columns=['A', 'B'])
795775

796776
tm.assert_frame_equal(result, expected)
797777

798-
def test_header_inferred_from_th_elements(self):
778+
def test_header_inferred_from_rows_with_only_th(self):
799779
# GH17054
800780
result = self.read_html("""
801781
<table>
@@ -814,10 +794,9 @@ def test_header_inferred_from_th_elements(self):
814794
</table>
815795
""")[0]
816796

817-
expected = DataFrame(data={
818-
('A', 'a'): [1],
819-
('B', 'b'): [2],
820-
})
797+
columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
798+
labels=[[0, 1], [0, 1]])
799+
expected = DataFrame(data=[[1, 2]], columns=columns)
821800

822801
tm.assert_frame_equal(result, expected)
823802

@@ -856,6 +835,23 @@ def test_wikipedia_states_table(self, datapath):
856835
result = self.read_html(data, 'Arizona', header=1)[0]
857836
assert result['sq mi'].dtype == np.dtype('float64')
858837

838+
def test_parser_error_on_empty_header_row(self):
839+
with tm.assert_raises_regex(ParserError,
840+
r"Passed header=\[0,1\] are "
841+
r"too many rows for this "
842+
r"multi_index of columns"):
843+
self.read_html("""
844+
<table>
845+
<thead>
846+
<tr><th></th><th></tr>
847+
<tr><th>A</th><th>B</th></tr>
848+
</thead>
849+
<tbody>
850+
<tr><td>a</td><td>b</td></tr>
851+
</tbody>
852+
</table>
853+
""", header=[0, 1])
854+
859855
def test_decimal_rows(self):
860856
# GH 12907
861857
result = self.read_html('''<html>
@@ -960,6 +956,49 @@ def test_keep_default_na(self):
960956
html_df = self.read_html(html_data, keep_default_na=True)[0]
961957
tm.assert_frame_equal(expected_df, html_df)
962958

959+
def test_preserve_empty_rows(self):
960+
result = self.read_html("""
961+
<table>
962+
<tr>
963+
<th>A</th>
964+
<th>B</th>
965+
</tr>
966+
<tr>
967+
<td>a</td>
968+
<td>b</td>
969+
</tr>
970+
<tr>
971+
<td></td>
972+
<td></td>
973+
</tr>
974+
</table>
975+
""")[0]
976+
977+
expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]],
978+
columns=['A', 'B'])
979+
980+
tm.assert_frame_equal(result, expected)
981+
982+
def test_ignore_empty_rows_when_inferring_header(self):
983+
result = self.read_html("""
984+
<table>
985+
<thead>
986+
<tr><th></th><th></tr>
987+
<tr><th>A</th><th>B</th></tr>
988+
<tr><th>a</th><th>b</th></tr>
989+
</thead>
990+
<tbody>
991+
<tr><td>1</td><td>2</td></tr>
992+
</tbody>
993+
</table>
994+
""")[0]
995+
996+
columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
997+
labels=[[0, 1], [0, 1]])
998+
expected = DataFrame(data=[[1, 2]], columns=columns)
999+
1000+
tm.assert_frame_equal(result, expected)
1001+
9631002
def test_multiple_header_rows(self):
9641003
# Issue #13434
9651004
expected_df = DataFrame(data=[("Hillary", 68, "D"),

0 commit comments

Comments
 (0)