Skip to content

Commit ffdcf8a

Browse files
committed
Fix all tests; both the MultiIndex -> Index and the np.nan -> None conversions are now resolved
1 parent 20e24e9 commit ffdcf8a

File tree

2 files changed

+21
-17
lines changed

2 files changed

+21
-17
lines changed

pandas/io/html.py

+19-15
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
cast,
1717
)
1818

19+
import numpy as np
20+
1921
from pandas._typing import (
2022
FilePath,
2123
ReadBuffer,
@@ -32,7 +34,6 @@
3234
from pandas.core.construction import create_series_with_explicit_dtype
3335
from pandas.core.frame import DataFrame
3436
from pandas.core.indexes.base import Index
35-
from pandas.core.indexes.multi import MultiIndex
3637

3738
from pandas.io.common import (
3839
file_exists,
@@ -185,8 +186,7 @@ class _HtmlFrameParser:
185186
186187
extract_links : {None, "all", "header", "body", "footer"}
187188
Table elements in the specified section(s) with <a> tags will have their
188-
href extracted. Note that specifying "header" will result in a
189-
:class:`~pandas.MultiIndex`.
189+
href extracted.
190190
191191
.. versionadded:: 1.5.0
192192
@@ -210,8 +210,7 @@ class _HtmlFrameParser:
210210
211211
extract_links : {None, "all", "header", "body", "footer"}
212212
Table elements in the specified section(s) with <a> tags will have their
213-
href extracted. Note that specifying "header" will result in a
214-
:class:`~pandas.MultiIndex`.
213+
href extracted.
215214
216215
.. versionadded:: 1.5.0
217216
@@ -875,13 +874,7 @@ def _data_to_frame(**kwargs):
875874
# fill out elements of body that are "ragged"
876875
_expand_elements(body)
877876
with TextParser(body, header=header, **kwargs) as tp:
878-
df = tp.read()
879-
880-
# Cast MultiIndex header to an Index of tuples.
881-
# This maintains consistency of selection (e.g. df.columns.str[1])
882-
if isinstance(df.columns, MultiIndex):
883-
df.columns = Index(df.columns)
884-
return df
877+
return tp.read()
885878

886879

887880
_valid_parsers = {
@@ -1001,7 +994,19 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **
1001994
ret = []
1002995
for table in tables:
1003996
try:
1004-
ret.append(_data_to_frame(data=table, **kwargs))
997+
df = _data_to_frame(data=table, **kwargs)
998+
# Cast MultiIndex header to an Index of tuples when extracting header
999+
# links and replace np.nan with None.
1000+
# This maintains consistency of selection (e.g. df.columns.str[1])
1001+
if extract_links in ("all", "header"):
1002+
idx = df.columns.values
1003+
idx[:] = np.vectorize(
1004+
lambda cols: tuple(None if col is np.nan else col for col in cols),
1005+
otypes=["object"],
1006+
)(idx)
1007+
df.columns = Index(df.columns)
1008+
1009+
ret.append(df)
10051010
except EmptyDataError: # empty table
10061011
continue
10071012
return ret
@@ -1121,8 +1126,7 @@ def read_html(
11211126
11221127
extract_links : {None, "all", "header", "body", "footer"}
11231128
Table elements in the specified section(s) with <a> tags will have their
1124-
href extracted. Note that specifying "header" will result in a
1125-
:class:`~pandas.MultiIndex`.
1129+
href extracted.
11261130
11271131
.. versionadded:: 1.5.0
11281132

pandas/tests/io/test_html.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,8 @@ def gh_13141_expected(self):
143143
return {
144144
"head_ignore": ["HTTP", "FTP", "Linkless"],
145145
"head_extract": [
146-
("HTTP", np.nan),
147-
("FTP", np.nan),
146+
("HTTP", None),
147+
("FTP", None),
148148
("Linkless", "https://en.wiktionary.org/wiki/linkless"),
149149
],
150150
"body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"],

0 commit comments

Comments
 (0)