16
16
date_range , Series )
17
17
from pandas .compat import (map , zip , u , StringIO , BytesIO ,
18
18
is_platform_windows , PY3 , reload )
19
+ from pandas .errors import ParserError
19
20
from pandas .io .common import URLError , file_path_to_url
20
21
import pandas .io .html
21
22
from pandas .io .html import read_html
@@ -135,7 +136,7 @@ def test_banklist_no_match(self):
135
136
assert isinstance (df , DataFrame )
136
137
137
138
def test_spam_header (self ):
138
- df = self .read_html (self .spam_data , '.*Water.*' , header = 1 )[0 ]
139
+ df = self .read_html (self .spam_data , '.*Water.*' , header = 2 )[0 ]
139
140
assert df .columns [0 ] == 'Proximates'
140
141
assert not df .empty
141
142
@@ -411,7 +412,7 @@ def test_multiple_tbody(self):
411
412
</tbody>
412
413
</table>''' )[0 ]
413
414
414
- expected = DataFrame ({ 'A' : [ 1 , 3 ], 'B' : [ 2 , 4 ]} )
415
+ expected = DataFrame (data = [[ 1 , 2 ], [ 3 , 4 ]], columns = [ 'A' , 'B' ] )
415
416
416
417
tm .assert_frame_equal (result , expected )
417
418
@@ -458,11 +459,8 @@ def test_thead_without_tr(self):
458
459
</tbody>
459
460
</table>''' )[0 ]
460
461
461
- expected = DataFrame (data = {
462
- 'Country' : ['Ukraine' ],
463
- 'Municipality' : ['Odessa' ],
464
- 'Year' : [1944 ],
465
- })
462
+ expected = DataFrame (data = [['Ukraine' , 'Odessa' , 1944 ]],
463
+ columns = ['Country' , 'Municipality' , 'Year' ])
466
464
467
465
tm .assert_frame_equal (result , expected )
468
466
@@ -489,9 +487,10 @@ def test_tfoot_read(self):
489
487
</tfoot>
490
488
</table>'''
491
489
492
- expected1 = DataFrame ({'A' : ['bodyA' ], 'B' : ['bodyB' ]})
493
- expected2 = DataFrame ({'A' : ['bodyA' , 'footA' ],
494
- 'B' : ['bodyB' , 'footB' ]})
490
+ expected1 = DataFrame (data = [['bodyA' , 'bodyB' ]], columns = ['A' , 'B' ])
491
+
492
+ expected2 = DataFrame (data = [['bodyA' , 'bodyB' ], ['footA' , 'footB' ]],
493
+ columns = ['A' , 'B' ])
495
494
496
495
data1 = data_template .format (footer = "" )
497
496
data2 = data_template .format (
@@ -519,7 +518,7 @@ def test_parse_header_of_non_string_column(self):
519
518
</table>
520
519
''' , header = 0 )[0 ]
521
520
522
- expected = DataFrame (data = { 'S' : [ 'text' ], 'I' : [ 1944 ]} )
521
+ expected = DataFrame ([[ 'text' , 1944 ]], columns = ( 'S' , 'I' ) )
523
522
524
523
tm .assert_frame_equal (result , expected )
525
524
@@ -663,11 +662,7 @@ def test_colspan_rowspan_1(self):
663
662
</table>
664
663
""" )[0 ]
665
664
666
- expected = DataFrame (data = {
667
- 'A' : ['a' ],
668
- 'B' : ['b' ],
669
- 'C' : ['c' ],
670
- })
665
+ expected = DataFrame ([['a' , 'b' , 'c' ]], columns = ['A' , 'B' , 'C' ])
671
666
672
667
tm .assert_frame_equal (result , expected )
673
668
@@ -695,13 +690,8 @@ def test_colspan_rowspan_copy_values(self):
695
690
</table>
696
691
""" , header = 0 )[0 ]
697
692
698
- expected = DataFrame (data = {
699
- 'X' : ['A' ],
700
- 'X.1' : ['B' ],
701
- 'Y' : ['B' ],
702
- 'Z' : ['Z' ],
703
- 'W' : ['C' ],
704
- })
693
+ expected = DataFrame (data = [['A' , 'B' , 'B' , 'Z' , 'C' ]],
694
+ columns = ['X' , 'X.1' , 'Y' , 'Z' , 'W' ])
705
695
706
696
tm .assert_frame_equal (result , expected )
707
697
@@ -726,13 +716,8 @@ def test_colspan_rowspan_both_not_1(self):
726
716
</table>
727
717
""" , header = 0 )[0 ]
728
718
729
- expected = DataFrame (data = {
730
- 'A' : ['A' ],
731
- 'B' : ['B' ],
732
- 'B.1' : ['B' ],
733
- 'B.2' : ['B' ],
734
- 'C' : ['D' ],
735
- })
719
+ expected = DataFrame (data = [['A' , 'B' , 'B' , 'B' , 'D' ]],
720
+ columns = ['A' , 'B' , 'B.1' , 'B.2' , 'C' ])
736
721
737
722
tm .assert_frame_equal (result , expected )
738
723
@@ -756,10 +741,7 @@ def test_rowspan_at_end_of_row(self):
756
741
</table>
757
742
""" , header = 0 )[0 ]
758
743
759
- expected = DataFrame (data = {
760
- 'A' : ['C' ],
761
- 'B' : ['B' ]
762
- })
744
+ expected = DataFrame (data = [['C' , 'B' ]], columns = ['A' , 'B' ])
763
745
764
746
tm .assert_frame_equal (result , expected )
765
747
@@ -775,14 +757,12 @@ def test_rowspan_only_rows(self):
775
757
</table>
776
758
""" , header = 0 )[0 ]
777
759
778
- expected = DataFrame (data = {
779
- 'A' : ['A' , 'A' ],
780
- 'B' : ['B' , 'B' ],
781
- })
760
+ expected = DataFrame (data = [['A' , 'B' ], ['A' , 'B' ]],
761
+ columns = ['A' , 'B' ])
782
762
783
763
tm .assert_frame_equal (result , expected )
784
764
785
- def test_header_inferred_from_th_elements (self ):
765
+ def test_header_inferred_from_rows_with_only_th (self ):
786
766
# GH17054
787
767
result = self .read_html ("""
788
768
<table>
@@ -801,10 +781,9 @@ def test_header_inferred_from_th_elements(self):
801
781
</table>
802
782
""" )[0 ]
803
783
804
- expected = DataFrame (data = {
805
- ('A' , 'a' ): [1 ],
806
- ('B' , 'b' ): [2 ],
807
- })
784
+ columns = MultiIndex (levels = [['A' , 'B' ], ['a' , 'b' ]],
785
+ labels = [[0 , 1 ], [0 , 1 ]])
786
+ expected = DataFrame (data = [[1 , 2 ]], columns = columns )
808
787
809
788
tm .assert_frame_equal (result , expected )
810
789
@@ -827,11 +806,7 @@ def test_parse_dates_combine(self):
827
806
828
807
def test_computer_sales_page (self ):
829
808
data = os .path .join (DATA_PATH , 'computer_sales_page.html' )
830
- # This table is unique because it has no <thead>, and its <th>-only
831
- # rows are underneath an initial <td>-only row that has no content.
832
- # After skipping the empty row, header=[0,1] picks the two <th>-only
833
- # rows.
834
- df = self .read_html (data , header = [0 , 1 ])[0 ]
809
+ df = self .read_html (data , header = [1 , 2 ])[0 ]
835
810
836
811
three_months = u ('Three months ended April\xa0 30' )
837
812
assert df .columns [3 ] == (three_months , '2013.1' )
@@ -844,6 +819,23 @@ def test_wikipedia_states_table(self):
844
819
result = self .read_html (data , 'Arizona' , header = 1 )[0 ]
845
820
assert result ['sq mi' ].dtype == np .dtype ('float64' )
846
821
822
def test_parser_error_on_empty_header_row(self):
    """Check that ``header=[0, 1]`` raises ``ParserError`` for this table.

    The first ``<thead>`` row of the fixture contains only empty ``<th>``
    cells; the second carries the labels ``A`` and ``B``.
    """
    # Regex matched against the error message; the brackets are escaped
    # because they are literal characters in the message.
    expected_msg = (r"Passed header=\[0,1\] are "
                    r"too many rows for this "
                    r"multi_index of columns")
    html = """
        <table>
            <thead>
                <tr><th></th><th></tr>
                <tr><th>A</th><th>B</th></tr>
            </thead>
            <tbody>
                <tr><td>a</td><td>b</td></tr>
            </tbody>
        </table>
    """
    with tm.assert_raises_regex(ParserError, expected_msg):
        self.read_html(html, header=[0, 1])
838
+
847
839
def test_decimal_rows (self ):
848
840
# GH 12907
849
841
result = self .read_html ('''<html>
@@ -948,6 +940,49 @@ def test_keep_default_na(self):
948
940
html_df = self .read_html (html_data , keep_default_na = True )[0 ]
949
941
tm .assert_frame_equal (expected_df , html_df )
950
942
943
def test_preserve_empty_rows(self):
    """Check that a body row of empty ``<td>`` cells is kept as NaN.

    The fixture has one ``<th>`` row (``A``, ``B``), one populated row
    and one row whose cells are all empty; the expected frame keeps both
    data rows.
    """
    html = """
        <table>
            <tr>
                <th>A</th>
                <th>B</th>
            </tr>
            <tr>
                <td>a</td>
                <td>b</td>
            </tr>
            <tr>
                <td></td>
                <td></td>
            </tr>
        </table>
    """
    result = self.read_html(html)[0]

    # Rows are given as a list of lists so the column order is explicit.
    rows = [['a', 'b'], [np.nan, np.nan]]
    expected = DataFrame(data=rows, columns=['A', 'B'])

    tm.assert_frame_equal(result, expected)
965
+
966
def test_ignore_empty_rows_when_inferring_header(self):
    """Check header inference when the first ``<thead>`` row is empty.

    No ``header`` argument is passed. The fixture's ``<thead>`` holds a
    row of empty ``<th>`` cells followed by the rows ``A, B`` and
    ``a, b``; the expected result has two-level columns ``('A', 'a')``
    and ``('B', 'b')`` over the single data row ``[1, 2]``.
    """
    html = """
        <table>
            <thead>
                <tr><th></th><th></tr>
                <tr><th>A</th><th>B</th></tr>
                <tr><th>a</th><th>b</th></tr>
            </thead>
            <tbody>
                <tr><td>1</td><td>2</td></tr>
            </tbody>
        </table>
    """
    result = self.read_html(html)[0]

    # from_tuples yields the same levels/codes as the explicit
    # MultiIndex(levels=..., labels=...) call, without depending on the
    # ``labels`` keyword (renamed to ``codes`` in later pandas releases).
    columns = MultiIndex.from_tuples([('A', 'a'), ('B', 'b')])
    expected = DataFrame(data=[[1, 2]], columns=columns)

    tm.assert_frame_equal(result, expected)
985
+
951
986
def test_multiple_header_rows (self ):
952
987
# Issue #13434
953
988
expected_df = DataFrame (data = [("Hillary" , 68 , "D" ),
0 commit comments