List of <tr>s
+ section : str
+     The section that the rows belong to (header, body or footer).
Returns
-------
list of list
- Each returned row is a list of str text.
+ Each returned row is a list of str text, or tuple (text, link)
+ if extract_links is not None.
Notes
-----
@@ -465,7 +503,10 @@ def _expand_colspan_rowspan(self, rows):
to subsequent cells.
"""
all_texts = [] # list of rows, each a list of str
- remainder: list[tuple[int, str, int]] = [] # list of (index, text, nrows)
+ text: str | tuple
+ remainder: list[
+ tuple[int, str | tuple, int]
+ ] = [] # list of (index, text, nrows)
for tr in rows:
texts = [] # the output for this row
@@ -485,6 +526,9 @@ def _expand_colspan_rowspan(self, rows):
# Append the text from this <td>, colspan times
text = _remove_whitespace(self._text_getter(td))
+ if self.extract_links in ("all", section):
+ href = self._href_getter(td)
+ text = (text, href)
rowspan = int(self._attr_getter(td, "rowspan") or 1)
colspan = int(self._attr_getter(td, "colspan") or 1)
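When extract_links covers the current section, the cell text is wrapped into a (text, href) pair, keeping the first href found in the cell. A minimal standalone sketch of that transform, assuming only that BeautifulSoup is available (not part of the patch's parser classes):

    from bs4 import BeautifulSoup

    td = BeautifulSoup(
        '<td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>',
        "html.parser",
    ).td

    extract_links, section = "body", "body"
    text = " ".join(td.text.split())  # rough stand-in for _remove_whitespace
    if extract_links in ("all", section):
        a = td.find("a", href=True)   # same lookup the bs4 _href_getter uses
        text = (text, None if not a else a["href"])
    print(text)  # ('SURROUNDING Debian TEXT', 'ftp://ftp.us.debian.org/')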
@@ -589,6 +633,10 @@ def _parse_tables(self, doc, match, attrs):
raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
return result
+ def _href_getter(self, obj) -> str | None:
+ a = obj.find("a", href=True)
+ return None if not a else a["href"]
+
def _text_getter(self, obj):
return obj.text
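For the BeautifulSoup backend, find("a", href=True) considers only anchors that actually carry an href attribute and returns the first match, so a cell with several links yields a single href. A quick sketch of that behavior:

    from bs4 import BeautifulSoup

    cell = BeautifulSoup(
        '<td><a>no href</a> <a href="1">first</a> <a href="2">second</a></td>',
        "html.parser",
    ).td

    a = cell.find("a", href=True)
    print(None if not a else a["href"])  # '1' -- href-less anchors are skipped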
@@ -680,6 +728,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
:class:`_HtmlFrameParser`.
"""
+ def _href_getter(self, obj) -> str | None:
+ href = obj.xpath(".//a/@href")
+ return None if not href else href[0]
+
def _text_getter(self, obj):
return obj.text_content()
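The lxml backend reaches the same semantics through XPath: .//a/@href collects every href beneath the cell, and the getter keeps only the first. A standalone sketch:

    from lxml import html

    doc = html.fromstring(
        '<table><tr><td><a href="1">first</a> <a href="2">second</a></td></tr></table>'
    )
    cell = doc.xpath(".//td")[0]

    hrefs = cell.xpath(".//a/@href")        # ['1', '2']
    print(None if not hrefs else hrefs[0])  # '1'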
@@ -920,14 +972,14 @@ def _validate_flavor(flavor):
return flavor
-def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
+def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
- p = parser(io, compiled_match, attrs, encoding, displayed_only)
+ p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
try:
tables = p.parse_tables()
@@ -955,7 +1007,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
ret = []
for table in tables:
try:
- ret.append(_data_to_frame(data=table, **kwargs))
+ df = _data_to_frame(data=table, **kwargs)
+ # Cast MultiIndex header to an Index of tuples when extracting header
+ # links and replace nan with None.
+ # This maintains consistency of selection (e.g. df.columns.str[1])
+ if extract_links in ("all", "header"):
+ df.columns = Index(
+ ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
+ tupleize_cols=False,
+ )
+
+ ret.append(df)
except EmptyDataError: # empty table
continue
return ret
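The cast above is needed because tuple column labels would otherwise be absorbed into a MultiIndex; a flat Index of tuples (tupleize_cols=False) keeps positional access to the text and link halves uniform. A sketch of the resulting selection behavior:

    from pandas import Index

    cols = Index(
        [("HTTP", None), ("Linkless", "https://en.wiktionary.org/wiki/linkless")],
        tupleize_cols=False,
    )
    print(cols.str[0])  # Index(['HTTP', 'Linkless'], dtype='object')
    print(cols.str[1])  # the link half; None where no link was present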
@@ -978,6 +1040,7 @@ def read_html(
na_values: Iterable[object] | None = None,
keep_default_na: bool = True,
displayed_only: bool = True,
+ extract_links: Literal[None, "header", "footer", "body", "all"] = None,
) -> list[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1072,6 +1135,12 @@ def read_html(
displayed_only : bool, default True
Whether elements with "display: none" should be parsed.
+ extract_links : {None, "all", "header", "body", "footer"}
+ Table elements in the specified section(s) with <a> tags will have their
+ href extracted.
+
+ .. versionadded:: 1.5.0
+
Returns
-------
dfs
@@ -1120,6 +1189,12 @@ def read_html(
"cannot skip rows starting from the end of the "
"data (you passed a negative value)"
)
+ if extract_links not in [None, "header", "footer", "body", "all"]:
+ raise ValueError(
+ "`extract_links` must be one of "
+ '{None, "header", "footer", "body", "all"}, got '
+ f'"{extract_links}"'
+ )
validate_header_arg(header)
io = stringify_path(io)
@@ -1140,4 +1215,5 @@ def read_html(
na_values=na_values,
keep_default_na=keep_default_na,
displayed_only=displayed_only,
+ extract_links=extract_links,
)
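From the user side the new keyword then behaves as below, sketched with a hypothetical inline table (any pandas-parsable HTML source works):

    from io import StringIO
    import pandas as pd

    buf = StringIO(
        '<table><tr><th>Name</th></tr>'
        '<tr><td><a href="https://pandas.pydata.org">pandas</a></td></tr></table>'
    )

    (df,) = pd.read_html(buf, extract_links="body")
    print(df.iloc[0, 0])  # ('pandas', 'https://pandas.pydata.org')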
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 20bcfe202f68a..045c22f106105 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1340,3 +1340,79 @@ def test_parse_br_as_space(self):
expected = DataFrame(data=[["word1 word2"]], columns=["A"])
tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
+ def test_extract_links(self, arg):
+ gh_13141_data = """
+ <table>
+   <tr>
+     <th>HTTP</th>
+     <th>FTP</th>
+     <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+   </tr>
+   <tr>
+     <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+     <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
+     <td>Linkless</td>
+   </tr>
+   <tfoot>
+     <tr>
+       <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+       <td>Multiple <a href="1">links:</a> Only <a href="2">first</a> captured.</td>
+     </tr>
+   </tfoot>
+ </table>
+ """
+
+ gh_13141_expected = {
+ "head_ignore": ["HTTP", "FTP", "Linkless"],
+ "head_extract": [
+ ("HTTP", None),
+ ("FTP", None),
+ ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
+ ],
+ "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"],
+ "body_extract": [
+ ("Wikipedia", "https://en.wikipedia.org/"),
+ ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"),
+ ("Linkless", None),
+ ],
+ "footer_ignore": [
+ "Footer",
+ "Multiple links: Only first captured.",
+ None,
+ ],
+ "footer_extract": [
+ ("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
+ ("Multiple links: Only first captured.", "1"),
+ None,
+ ],
+ }
+
+ data_exp = gh_13141_expected["body_ignore"]
+ foot_exp = gh_13141_expected["footer_ignore"]
+ head_exp = gh_13141_expected["head_ignore"]
+ if arg == "all":
+ data_exp = gh_13141_expected["body_extract"]
+ foot_exp = gh_13141_expected["footer_extract"]
+ head_exp = gh_13141_expected["head_extract"]
+ elif arg == "body":
+ data_exp = gh_13141_expected["body_extract"]
+ elif arg == "footer":
+ foot_exp = gh_13141_expected["footer_extract"]
+ elif arg == "header":
+ head_exp = gh_13141_expected["head_extract"]
+
+ result = self.read_html(gh_13141_data, extract_links=arg)[0]
+ expected = DataFrame([data_exp, foot_exp], columns=head_exp)
+ tm.assert_frame_equal(result, expected)
+
+ def test_extract_links_bad(self, spam_data):
+ msg = (
+ "`extract_links` must be one of "
+ '{None, "header", "footer", "body", "all"}, got "incorrect"'
+ )
+ with pytest.raises(ValueError, match=msg):
+ read_html(spam_data, extract_links="incorrect")