16
16
cast ,
17
17
)
18
18
19
+ import numpy as np
20
+
19
21
from pandas ._typing import (
20
22
FilePath ,
21
23
ReadBuffer ,
32
34
from pandas .core .construction import create_series_with_explicit_dtype
33
35
from pandas .core .frame import DataFrame
34
36
from pandas .core .indexes .base import Index
35
- from pandas .core .indexes .multi import MultiIndex
36
37
37
38
from pandas .io .common import (
38
39
file_exists ,
@@ -185,8 +186,7 @@ class _HtmlFrameParser:
185
186
186
187
extract_links : {None, "all", "header", "body", "footer"}
187
188
Table elements in the specified section(s) with <a> tags will have their
188
- href extracted. Note that specifying "header" will result in a
189
- :class:`~pandas.MultiIndex`.
189
+ href extracted.
190
190
191
191
.. versionadded:: 1.5.0
192
192
@@ -210,8 +210,7 @@ class _HtmlFrameParser:
210
210
211
211
extract_links : {None, "all", "header", "body", "footer"}
212
212
Table elements in the specified section(s) with <a> tags will have their
213
- href extracted. Note that specifying "header" will result in a
214
- :class:`~pandas.MultiIndex`.
213
+ href extracted.
215
214
216
215
.. versionadded:: 1.5.0
217
216
@@ -875,13 +874,7 @@ def _data_to_frame(**kwargs):
875
874
# fill out elements of body that are "ragged"
876
875
_expand_elements (body )
877
876
with TextParser (body , header = header , ** kwargs ) as tp :
878
- df = tp .read ()
879
-
880
- # Cast MultiIndex header to an Index of tuples.
881
- # This maintains consistency of selection (e.g. df.columns.str[1])
882
- if isinstance (df .columns , MultiIndex ):
883
- df .columns = Index (df .columns )
884
- return df
877
+ return tp .read ()
885
878
886
879
887
880
_valid_parsers = {
@@ -1001,7 +994,19 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **
1001
994
ret = []
1002
995
for table in tables :
1003
996
try :
1004
- ret .append (_data_to_frame (data = table , ** kwargs ))
997
+ df = _data_to_frame (data = table , ** kwargs )
998
+ # Cast MultiIndex header to an Index of tuples when extracting header
999
+ # links and replace np.nan with None.
1000
+ # This maintains consistency of selection (e.g. df.columns.str[1])
1001
+ if extract_links in ("all" , "header" ):
1002
+ idx = df .columns .values
1003
+ idx [:] = np .vectorize (
1004
+ lambda cols : tuple (None if col is np .nan else col for col in cols ),
1005
+ otypes = ["object" ],
1006
+ )(idx )
1007
+ df .columns = Index (df .columns )
1008
+
1009
+ ret .append (df )
1005
1010
except EmptyDataError : # empty table
1006
1011
continue
1007
1012
return ret
@@ -1121,8 +1126,7 @@ def read_html(
1121
1126
1122
1127
extract_links : {None, "all", "header", "body", "footer"}
1123
1128
Table elements in the specified section(s) with <a> tags will have their
1124
- href extracted. Note that specifying "header" will result in a
1125
- :class:`~pandas.MultiIndex`.
1129
+ href extracted.
1126
1130
1127
1131
.. versionadded:: 1.5.0
1128
1132
0 commit comments