Skip to content

Commit 1773983

Browse files
committed
Do not nix rows of empty
... but _ignore_ empty rows when inferring columns. This changes the behavior of test_spam_header, which previously ignored an empty row even when the user explicitly stated the row number to use as the header.
1 parent 6f91abf commit 1773983

File tree

2 files changed

+93
-56
lines changed

2 files changed

+93
-56
lines changed

pandas/io/html.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -496,11 +496,7 @@ def _expand_colspan_rowspan(self, rows):
496496
all_texts.append(texts)
497497
remainder = next_remainder
498498

499-
# ignore all-empty-text rows
500-
no_empty = [row for row in all_texts
501-
if any(text for text in row)]
502-
503-
return no_empty
499+
return all_texts
504500

505501
def _handle_hidden_tables(self, tbl_list, attr_name):
506502
"""
@@ -785,10 +781,16 @@ def _data_to_frame(**kwargs):
785781
header = kwargs.pop('header')
786782
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
787783
if head:
788-
rows = lrange(len(head))
789784
body = head + body
790-
if header is None: # special case when a table has <th> elements
791-
header = 0 if rows == [0] else rows
785+
786+
# Infer header when there is a <thead> or there are top <th>-only rows
787+
if header is None:
788+
if len(head) == 1:
789+
header = 0
790+
else:
791+
# ignore all-empty-text rows
792+
header = [i for i, row in enumerate(head)
793+
if any(text for text in row)]
792794

793795
if foot:
794796
body += foot

pandas/tests/io/test_html.py

+83-48
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
date_range, Series)
1717
from pandas.compat import (map, zip, u, StringIO, BytesIO,
1818
is_platform_windows, PY3, reload)
19+
from pandas.errors import ParserError
1920
from pandas.io.common import URLError, file_path_to_url
2021
import pandas.io.html
2122
from pandas.io.html import read_html
@@ -135,7 +136,7 @@ def test_banklist_no_match(self):
135136
assert isinstance(df, DataFrame)
136137

137138
def test_spam_header(self):
138-
df = self.read_html(self.spam_data, '.*Water.*', header=1)[0]
139+
df = self.read_html(self.spam_data, '.*Water.*', header=2)[0]
139140
assert df.columns[0] == 'Proximates'
140141
assert not df.empty
141142

@@ -411,7 +412,7 @@ def test_multiple_tbody(self):
411412
</tbody>
412413
</table>''')[0]
413414

414-
expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
415+
expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])
415416

416417
tm.assert_frame_equal(result, expected)
417418

@@ -458,11 +459,8 @@ def test_thead_without_tr(self):
458459
</tbody>
459460
</table>''')[0]
460461

461-
expected = DataFrame(data={
462-
'Country': ['Ukraine'],
463-
'Municipality': ['Odessa'],
464-
'Year': [1944],
465-
})
462+
expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]],
463+
columns=['Country', 'Municipality', 'Year'])
466464

467465
tm.assert_frame_equal(result, expected)
468466

@@ -489,9 +487,10 @@ def test_tfoot_read(self):
489487
</tfoot>
490488
</table>'''
491489

492-
expected1 = DataFrame({'A': ['bodyA'], 'B': ['bodyB']})
493-
expected2 = DataFrame({'A': ['bodyA', 'footA'],
494-
'B': ['bodyB', 'footB']})
490+
expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B'])
491+
492+
expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']],
493+
columns=['A', 'B'])
495494

496495
data1 = data_template.format(footer="")
497496
data2 = data_template.format(
@@ -519,7 +518,7 @@ def test_parse_header_of_non_string_column(self):
519518
</table>
520519
''', header=0)[0]
521520

522-
expected = DataFrame(data={'S': ['text'], 'I': [1944]})
521+
expected = DataFrame([['text', 1944]], columns=('S', 'I'))
523522

524523
tm.assert_frame_equal(result, expected)
525524

@@ -663,11 +662,7 @@ def test_colspan_rowspan_1(self):
663662
</table>
664663
""")[0]
665664

666-
expected = DataFrame(data={
667-
'A': ['a'],
668-
'B': ['b'],
669-
'C': ['c'],
670-
})
665+
expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C'])
671666

672667
tm.assert_frame_equal(result, expected)
673668

@@ -695,13 +690,8 @@ def test_colspan_rowspan_copy_values(self):
695690
</table>
696691
""", header=0)[0]
697692

698-
expected = DataFrame(data={
699-
'X': ['A'],
700-
'X.1': ['B'],
701-
'Y': ['B'],
702-
'Z': ['Z'],
703-
'W': ['C'],
704-
})
693+
expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']],
694+
columns=['X', 'X.1', 'Y', 'Z', 'W'])
705695

706696
tm.assert_frame_equal(result, expected)
707697

@@ -726,13 +716,8 @@ def test_colspan_rowspan_both_not_1(self):
726716
</table>
727717
""", header=0)[0]
728718

729-
expected = DataFrame(data={
730-
'A': ['A'],
731-
'B': ['B'],
732-
'B.1': ['B'],
733-
'B.2': ['B'],
734-
'C': ['D'],
735-
})
719+
expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']],
720+
columns=['A', 'B', 'B.1', 'B.2', 'C'])
736721

737722
tm.assert_frame_equal(result, expected)
738723

@@ -756,10 +741,7 @@ def test_rowspan_at_end_of_row(self):
756741
</table>
757742
""", header=0)[0]
758743

759-
expected = DataFrame(data={
760-
'A': ['C'],
761-
'B': ['B']
762-
})
744+
expected = DataFrame(data=[['C', 'B']], columns=['A', 'B'])
763745

764746
tm.assert_frame_equal(result, expected)
765747

@@ -775,14 +757,12 @@ def test_rowspan_only_rows(self):
775757
</table>
776758
""", header=0)[0]
777759

778-
expected = DataFrame(data={
779-
'A': ['A', 'A'],
780-
'B': ['B', 'B'],
781-
})
760+
expected = DataFrame(data=[['A', 'B'], ['A', 'B']],
761+
columns=['A', 'B'])
782762

783763
tm.assert_frame_equal(result, expected)
784764

785-
def test_header_inferred_from_th_elements(self):
765+
def test_header_inferred_from_rows_with_only_th(self):
786766
# GH17054
787767
result = self.read_html("""
788768
<table>
@@ -801,10 +781,9 @@ def test_header_inferred_from_th_elements(self):
801781
</table>
802782
""")[0]
803783

804-
expected = DataFrame(data={
805-
('A', 'a'): [1],
806-
('B', 'b'): [2],
807-
})
784+
columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
785+
labels=[[0, 1], [0, 1]])
786+
expected = DataFrame(data=[[1, 2]], columns=columns)
808787

809788
tm.assert_frame_equal(result, expected)
810789

@@ -827,11 +806,7 @@ def test_parse_dates_combine(self):
827806

828807
def test_computer_sales_page(self):
829808
data = os.path.join(DATA_PATH, 'computer_sales_page.html')
830-
# This table is unique because it has no <thead>, and its <th>-only
831-
# rows are underneath an initial <td>-only row that has no content.
832-
# After skipping the empty row, header=[0,1] picks the two <th>-only
833-
# rows.
834-
df = self.read_html(data, header=[0, 1])[0]
809+
df = self.read_html(data, header=[1, 2])[0]
835810

836811
three_months = u('Three months ended April\xa030')
837812
assert df.columns[3] == (three_months, '2013.1')
@@ -844,6 +819,23 @@ def test_wikipedia_states_table(self):
844819
result = self.read_html(data, 'Arizona', header=1)[0]
845820
assert result['sq mi'].dtype == np.dtype('float64')
846821

822+
def test_parser_error_on_empty_header_row(self):
823+
with tm.assert_raises_regex(ParserError,
824+
r"Passed header=\[0,1\] are "
825+
r"too many rows for this "
826+
r"multi_index of columns"):
827+
self.read_html("""
828+
<table>
829+
<thead>
830+
<tr><th></th><th></tr>
831+
<tr><th>A</th><th>B</th></tr>
832+
</thead>
833+
<tbody>
834+
<tr><td>a</td><td>b</td></tr>
835+
</tbody>
836+
</table>
837+
""", header=[0, 1])
838+
847839
def test_decimal_rows(self):
848840
# GH 12907
849841
result = self.read_html('''<html>
@@ -948,6 +940,49 @@ def test_keep_default_na(self):
948940
html_df = self.read_html(html_data, keep_default_na=True)[0]
949941
tm.assert_frame_equal(expected_df, html_df)
950942

943+
def test_preserve_empty_rows(self):
944+
result = self.read_html("""
945+
<table>
946+
<tr>
947+
<th>A</th>
948+
<th>B</th>
949+
</tr>
950+
<tr>
951+
<td>a</td>
952+
<td>b</td>
953+
</tr>
954+
<tr>
955+
<td></td>
956+
<td></td>
957+
</tr>
958+
</table>
959+
""")[0]
960+
961+
expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]],
962+
columns=['A', 'B'])
963+
964+
tm.assert_frame_equal(result, expected)
965+
966+
def test_ignore_empty_rows_when_inferring_header(self):
967+
result = self.read_html("""
968+
<table>
969+
<thead>
970+
<tr><th></th><th></tr>
971+
<tr><th>A</th><th>B</th></tr>
972+
<tr><th>a</th><th>b</th></tr>
973+
</thead>
974+
<tbody>
975+
<tr><td>1</td><td>2</td></tr>
976+
</tbody>
977+
</table>
978+
""")[0]
979+
980+
columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
981+
labels=[[0, 1], [0, 1]])
982+
expected = DataFrame(data=[[1, 2]], columns=columns)
983+
984+
tm.assert_frame_equal(result, expected)
985+
951986
def test_multiple_header_rows(self):
952987
# Issue #13434
953988
expected_df = DataFrame(data=[("Hillary", 68, "D"),

0 commit comments

Comments
 (0)