15
15
date_range , Series )
16
16
from pandas .compat import (map , zip , StringIO , BytesIO ,
17
17
is_platform_windows , PY3 , reload )
18
+ from pandas .errors import ParserError
18
19
from pandas .io .common import URLError , file_path_to_url
19
20
import pandas .io .html
20
21
from pandas .io .html import read_html
@@ -147,7 +148,7 @@ def test_banklist_no_match(self):
147
148
assert isinstance (df , DataFrame )
148
149
149
150
def test_spam_header (self ):
150
- df = self .read_html (self .spam_data , '.*Water.*' , header = 1 )[0 ]
151
+ df = self .read_html (self .spam_data , '.*Water.*' , header = 2 )[0 ]
151
152
assert df .columns [0 ] == 'Proximates'
152
153
assert not df .empty
153
154
@@ -424,7 +425,7 @@ def test_multiple_tbody(self):
424
425
</tbody>
425
426
</table>''' )[0 ]
426
427
427
- expected = DataFrame ({ 'A' : [ 1 , 3 ], 'B' : [ 2 , 4 ]} )
428
+ expected = DataFrame (data = [[ 1 , 2 ], [ 3 , 4 ]], columns = [ 'A' , 'B' ] )
428
429
429
430
tm .assert_frame_equal (result , expected )
430
431
@@ -471,11 +472,8 @@ def test_thead_without_tr(self):
471
472
</tbody>
472
473
</table>''' )[0 ]
473
474
474
- expected = DataFrame (data = {
475
- 'Country' : ['Ukraine' ],
476
- 'Municipality' : ['Odessa' ],
477
- 'Year' : [1944 ],
478
- })
475
+ expected = DataFrame (data = [['Ukraine' , 'Odessa' , 1944 ]],
476
+ columns = ['Country' , 'Municipality' , 'Year' ])
479
477
480
478
tm .assert_frame_equal (result , expected )
481
479
@@ -502,9 +500,10 @@ def test_tfoot_read(self):
502
500
</tfoot>
503
501
</table>'''
504
502
505
- expected1 = DataFrame ({'A' : ['bodyA' ], 'B' : ['bodyB' ]})
506
- expected2 = DataFrame ({'A' : ['bodyA' , 'footA' ],
507
- 'B' : ['bodyB' , 'footB' ]})
503
+ expected1 = DataFrame (data = [['bodyA' , 'bodyB' ]], columns = ['A' , 'B' ])
504
+
505
+ expected2 = DataFrame (data = [['bodyA' , 'bodyB' ], ['footA' , 'footB' ]],
506
+ columns = ['A' , 'B' ])
508
507
509
508
data1 = data_template .format (footer = "" )
510
509
data2 = data_template .format (
@@ -532,7 +531,7 @@ def test_parse_header_of_non_string_column(self):
532
531
</table>
533
532
''' , header = 0 )[0 ]
534
533
535
- expected = DataFrame (data = { 'S' : [ 'text' ], 'I' : [ 1944 ]} )
534
+ expected = DataFrame ([[ 'text' , 1944 ]], columns = ( 'S' , 'I' ) )
536
535
537
536
tm .assert_frame_equal (result , expected )
538
537
@@ -676,11 +675,7 @@ def test_colspan_rowspan_1(self):
676
675
</table>
677
676
""" )[0 ]
678
677
679
- expected = DataFrame (data = {
680
- 'A' : ['a' ],
681
- 'B' : ['b' ],
682
- 'C' : ['c' ],
683
- })
678
+ expected = DataFrame ([['a' , 'b' , 'c' ]], columns = ['A' , 'B' , 'C' ])
684
679
685
680
tm .assert_frame_equal (result , expected )
686
681
@@ -708,13 +703,8 @@ def test_colspan_rowspan_copy_values(self):
708
703
</table>
709
704
""" , header = 0 )[0 ]
710
705
711
- expected = DataFrame (data = {
712
- 'X' : ['A' ],
713
- 'X.1' : ['B' ],
714
- 'Y' : ['B' ],
715
- 'Z' : ['Z' ],
716
- 'W' : ['C' ],
717
- })
706
+ expected = DataFrame (data = [['A' , 'B' , 'B' , 'Z' , 'C' ]],
707
+ columns = ['X' , 'X.1' , 'Y' , 'Z' , 'W' ])
718
708
719
709
tm .assert_frame_equal (result , expected )
720
710
@@ -739,13 +729,8 @@ def test_colspan_rowspan_both_not_1(self):
739
729
</table>
740
730
""" , header = 0 )[0 ]
741
731
742
- expected = DataFrame (data = {
743
- 'A' : ['A' ],
744
- 'B' : ['B' ],
745
- 'B.1' : ['B' ],
746
- 'B.2' : ['B' ],
747
- 'C' : ['D' ],
748
- })
732
+ expected = DataFrame (data = [['A' , 'B' , 'B' , 'B' , 'D' ]],
733
+ columns = ['A' , 'B' , 'B.1' , 'B.2' , 'C' ])
749
734
750
735
tm .assert_frame_equal (result , expected )
751
736
@@ -769,10 +754,7 @@ def test_rowspan_at_end_of_row(self):
769
754
</table>
770
755
""" , header = 0 )[0 ]
771
756
772
- expected = DataFrame (data = {
773
- 'A' : ['C' ],
774
- 'B' : ['B' ]
775
- })
757
+ expected = DataFrame (data = [['C' , 'B' ]], columns = ['A' , 'B' ])
776
758
777
759
tm .assert_frame_equal (result , expected )
778
760
@@ -788,14 +770,12 @@ def test_rowspan_only_rows(self):
788
770
</table>
789
771
""" , header = 0 )[0 ]
790
772
791
- expected = DataFrame (data = {
792
- 'A' : ['A' , 'A' ],
793
- 'B' : ['B' , 'B' ],
794
- })
773
+ expected = DataFrame (data = [['A' , 'B' ], ['A' , 'B' ]],
774
+ columns = ['A' , 'B' ])
795
775
796
776
tm .assert_frame_equal (result , expected )
797
777
798
- def test_header_inferred_from_th_elements (self ):
778
+ def test_header_inferred_from_rows_with_only_th (self ):
799
779
# GH17054
800
780
result = self .read_html ("""
801
781
<table>
@@ -814,10 +794,9 @@ def test_header_inferred_from_th_elements(self):
814
794
</table>
815
795
""" )[0 ]
816
796
817
- expected = DataFrame (data = {
818
- ('A' , 'a' ): [1 ],
819
- ('B' , 'b' ): [2 ],
820
- })
797
+ columns = MultiIndex (levels = [['A' , 'B' ], ['a' , 'b' ]],
798
+ labels = [[0 , 1 ], [0 , 1 ]])
799
+ expected = DataFrame (data = [[1 , 2 ]], columns = columns )
821
800
822
801
tm .assert_frame_equal (result , expected )
823
802
@@ -856,6 +835,23 @@ def test_wikipedia_states_table(self, datapath):
856
835
result = self .read_html (data , 'Arizona' , header = 1 )[0 ]
857
836
assert result ['sq mi' ].dtype == np .dtype ('float64' )
858
837
838
+ def test_parser_error_on_empty_header_row (self ):
839
+ with tm .assert_raises_regex (ParserError ,
840
+ r"Passed header=\[0,1\] are "
841
+ r"too many rows for this "
842
+ r"multi_index of columns" ):
843
+ self .read_html ("""
844
+ <table>
845
+ <thead>
846
+ <tr><th></th><th></tr>
847
+ <tr><th>A</th><th>B</th></tr>
848
+ </thead>
849
+ <tbody>
850
+ <tr><td>a</td><td>b</td></tr>
851
+ </tbody>
852
+ </table>
853
+ """ , header = [0 , 1 ])
854
+
859
855
def test_decimal_rows (self ):
860
856
# GH 12907
861
857
result = self .read_html ('''<html>
@@ -960,6 +956,49 @@ def test_keep_default_na(self):
960
956
html_df = self .read_html (html_data , keep_default_na = True )[0 ]
961
957
tm .assert_frame_equal (expected_df , html_df )
962
958
959
+ def test_preserve_empty_rows (self ):
960
+ result = self .read_html ("""
961
+ <table>
962
+ <tr>
963
+ <th>A</th>
964
+ <th>B</th>
965
+ </tr>
966
+ <tr>
967
+ <td>a</td>
968
+ <td>b</td>
969
+ </tr>
970
+ <tr>
971
+ <td></td>
972
+ <td></td>
973
+ </tr>
974
+ </table>
975
+ """ )[0 ]
976
+
977
+ expected = DataFrame (data = [['a' , 'b' ], [np .nan , np .nan ]],
978
+ columns = ['A' , 'B' ])
979
+
980
+ tm .assert_frame_equal (result , expected )
981
+
982
+ def test_ignore_empty_rows_when_inferring_header (self ):
983
+ result = self .read_html ("""
984
+ <table>
985
+ <thead>
986
+ <tr><th></th><th></tr>
987
+ <tr><th>A</th><th>B</th></tr>
988
+ <tr><th>a</th><th>b</th></tr>
989
+ </thead>
990
+ <tbody>
991
+ <tr><td>1</td><td>2</td></tr>
992
+ </tbody>
993
+ </table>
994
+ """ )[0 ]
995
+
996
+ columns = MultiIndex (levels = [['A' , 'B' ], ['a' , 'b' ]],
997
+ labels = [[0 , 1 ], [0 , 1 ]])
998
+ expected = DataFrame (data = [[1 , 2 ]], columns = columns )
999
+
1000
+ tm .assert_frame_equal (result , expected )
1001
+
963
1002
def test_multiple_header_rows (self ):
964
1003
# Issue #13434
965
1004
expected_df = DataFrame (data = [("Hillary" , 68 , "D" ),
0 commit comments