ENH: pd.read_html argument to extract hrefs along with text from cells (pandas-dev#45973)

abmyii · attack68 · YYYasin19 · commit 697fb9b06e0f · 2022-08-23T23:41:16.000+02:00
* ENH: pd.read_html argument to extract hrefs along with text from cells * Fix typing error * Simplify tests * Fix still incorrect typing * Summarise whatsnew entry and move detailed explanation into user guide * More flexible link extraction * Suggested changes * extract_hrefs -> extract_links * Move versionadded to correct place and improve docstring for extract_links (@attack68) * Test for invalid extract_links value * Test all extract_link options * Fix for MultiIndex headers (also fixes tests) * Test that text surrounding <a> tag is still captured * Test for multiple <a> tags in cell * Fix all tests, with both MultiIndex -> Index and np.nan -> None conversions resolved * Add back EOF newline to test_html.py * Correct user guide example * Update pandas/io/html.py * Update pandas/io/html.py * Update pandas/io/html.py * Simplify MultiIndex -> Index conversion * Move unnecessary fixtures into test body * Simplify statement * Fix code checks Co-authored-by: JHM Darbyshire <24256554+attack68@users.noreply.github.com>
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -2743,6 +2743,30 @@ succeeds, the function will return*.
 
    dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"])
 
+Links can be extracted from cells along with the text using ``extract_links="all"``.
+
+.. ipython:: python
+
+    html_table = """
+    <table>
+      <tr>
+        <th>GitHub</th>
+      </tr>
+      <tr>
+        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
+      </tr>
+    </table>
+    """
+
+    df = pd.read_html(
+        html_table,
+        extract_links="all"
+    )[0]
+    df
+    df[("GitHub", None)]
+    df[("GitHub", None)].str[1]
+
+.. versionadded:: 1.5.0
 
 .. _io.html:
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -12,6 +12,7 @@
 from typing import (
     TYPE_CHECKING,
     Iterable,
+    Literal,
     Pattern,
     Sequence,
     cast,
@@ -30,7 +31,9 @@
 
 from pandas.core.dtypes.common import is_list_like
 
+from pandas import isna
 from pandas.core.construction import create_series_with_explicit_dtype
+from pandas.core.indexes.base import Index
 
 from pandas.io.common import (
     file_exists,
@@ -184,6 +187,12 @@ class _HtmlFrameParser:
     displayed_only : bool
         Whether or not items with "display:none" should be ignored
 
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
     Attributes
     ----------
     io : str or file-like
@@ -202,11 +211,18 @@ class _HtmlFrameParser:
     displayed_only : bool
         Whether or not items with "display:none" should be ignored
 
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
     Notes
     -----
     To subclass this class effectively you must override the following methods:
         * :func:`_build_doc`
         * :func:`_attr_getter`
+        * :func:`_href_getter`
         * :func:`_text_getter`
         * :func:`_parse_td`
         * :func:`_parse_thead_tr`
@@ -225,12 +241,14 @@ def __init__(
         attrs: dict[str, str] | None,
         encoding: str,
         displayed_only: bool,
+        extract_links: Literal[None, "header", "footer", "body", "all"],
     ) -> None:
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
         self.displayed_only = displayed_only
+        self.extract_links = extract_links
 
     def parse_tables(self):
         """
@@ -263,6 +281,22 @@ def _attr_getter(self, obj, attr):
         # Both lxml and BeautifulSoup have the same implementation:
         return obj.get(attr)
 
+    def _href_getter(self, obj):
+        """
+        Return a href if the DOM node contains a child <a> or None.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        href : str or unicode
+            The href from the <a> child of the DOM node.
+        """
+        raise AbstractMethodError(self)
+
     def _text_getter(self, obj):
         """
         Return the text of an individual DOM node.
@@ -439,33 +473,40 @@ def row_is_all_th(row):
             while body_rows and row_is_all_th(body_rows[0]):
                 header_rows.append(body_rows.pop(0))
 
-        header = self._expand_colspan_rowspan(header_rows)
-        body = self._expand_colspan_rowspan(body_rows)
-        footer = self._expand_colspan_rowspan(footer_rows)
+        header = self._expand_colspan_rowspan(header_rows, section="header")
+        body = self._expand_colspan_rowspan(body_rows, section="body")
+        footer = self._expand_colspan_rowspan(footer_rows, section="footer")
 
         return header, body, footer
 
-    def _expand_colspan_rowspan(self, rows):
+    def _expand_colspan_rowspan(
+        self, rows, section: Literal["header", "footer", "body"]
+    ):
         """
         Given a list of <tr>s, return a list of text rows.
 
         Parameters
         ----------
         rows : list of node-like
             List of <tr>s
+        section : the section that the rows belong to (header, body or footer).
 
         Returns
         -------
         list of list
-            Each returned row is a list of str text.
+            Each returned row is a list of str text, or tuple (text, link)
+            if extract_links is not None.
 
         Notes
         -----
         Any cell with ``rowspan`` or ``colspan`` will have its contents copied
         to subsequent cells.
         """
         all_texts = []  # list of rows, each a list of str
-        remainder: list[tuple[int, str, int]] = []  # list of (index, text, nrows)
+        text: str | tuple
+        remainder: list[
+            tuple[int, str | tuple, int]
+        ] = []  # list of (index, text, nrows)
 
         for tr in rows:
             texts = []  # the output for this row
@@ -485,6 +526,9 @@ def _expand_colspan_rowspan(self, rows):
 
                 # Append the text from this <td>, colspan times
                 text = _remove_whitespace(self._text_getter(td))
+                if self.extract_links == "all" or self.extract_links == section:
+                    href = self._href_getter(td)
+                    text = (text, href)
                 rowspan = int(self._attr_getter(td, "rowspan") or 1)
                 colspan = int(self._attr_getter(td, "colspan") or 1)
 
@@ -589,6 +633,10 @@ def _parse_tables(self, doc, match, attrs):
             raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
         return result
 
+    def _href_getter(self, obj) -> str | None:
+        a = obj.find("a", href=True)
+        return None if not a else a["href"]
+
     def _text_getter(self, obj):
         return obj.text
 
@@ -680,6 +728,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
     :class:`_HtmlFrameParser`.
     """
 
+    def _href_getter(self, obj) -> str | None:
+        href = obj.xpath(".//a/@href")
+        return None if not href else href[0]
+
     def _text_getter(self, obj):
         return obj.text_content()
 
@@ -920,14 +972,14 @@ def _validate_flavor(flavor):
     return flavor
 
 
-def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
+def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
 
         try:
             tables = p.parse_tables()
@@ -955,7 +1007,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
     ret = []
     for table in tables:
         try:
-            ret.append(_data_to_frame(data=table, **kwargs))
+            df = _data_to_frame(data=table, **kwargs)
+            # Cast MultiIndex header to an Index of tuples when extracting header
+            # links and replace nan with None.
+            # This maintains consistency of selection (e.g. df.columns.str[1])
+            if extract_links in ("all", "header"):
+                df.columns = Index(
+                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
+                    tupleize_cols=False,
+                )
+
+            ret.append(df)
         except EmptyDataError:  # empty table
             continue
     return ret
@@ -978,6 +1040,7 @@ def read_html(
     na_values: Iterable[object] | None = None,
     keep_default_na: bool = True,
     displayed_only: bool = True,
+    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
 ) -> list[DataFrame]:
     r"""
     Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1072,6 +1135,12 @@ def read_html(
     displayed_only : bool, default True
         Whether elements with "display: none" should be parsed.
 
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
     Returns
     -------
     dfs
@@ -1120,6 +1189,12 @@ def read_html(
             "cannot skip rows starting from the end of the "
             "data (you passed a negative value)"
         )
+    if extract_links not in [None, "header", "footer", "body", "all"]:
+        raise ValueError(
+            "`extract_links` must be one of "
+            '{None, "header", "footer", "body", "all"}, got '
+            f'"{extract_links}"'
+        )
     validate_header_arg(header)
 
     io = stringify_path(io)
@@ -1140,4 +1215,5 @@ def read_html(
         na_values=na_values,
         keep_default_na=keep_default_na,
         displayed_only=displayed_only,
+        extract_links=extract_links,
     )
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1340,3 +1340,79 @@ def test_parse_br_as_space(self):
         expected = DataFrame(data=[["word1 word2"]], columns=["A"])
 
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
+    def test_extract_links(self, arg):
+        gh_13141_data = """
+          <table>
+            <tr>
+              <th>HTTP</th>
+              <th>FTP</th>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+              <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
+              <td>Linkless</td>
+            </tr>
+            <tfoot>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+                <td>
+                  Multiple <a href="1">links:</a> <a href="2">Only first captured.</a>
+                </td>
+              </tr>
+            </tfoot>
+          </table>
+          """
+
+        gh_13141_expected = {
+            "head_ignore": ["HTTP", "FTP", "Linkless"],
+            "head_extract": [
+                ("HTTP", None),
+                ("FTP", None),
+                ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
+            ],
+            "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"],
+            "body_extract": [
+                ("Wikipedia", "https://en.wikipedia.org/"),
+                ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"),
+                ("Linkless", None),
+            ],
+            "footer_ignore": [
+                "Footer",
+                "Multiple links: Only first captured.",
+                None,
+            ],
+            "footer_extract": [
+                ("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
+                ("Multiple links: Only first captured.", "1"),
+                None,
+            ],
+        }
+
+        data_exp = gh_13141_expected["body_ignore"]
+        foot_exp = gh_13141_expected["footer_ignore"]
+        head_exp = gh_13141_expected["head_ignore"]
+        if arg == "all":
+            data_exp = gh_13141_expected["body_extract"]
+            foot_exp = gh_13141_expected["footer_extract"]
+            head_exp = gh_13141_expected["head_extract"]
+        elif arg == "body":
+            data_exp = gh_13141_expected["body_extract"]
+        elif arg == "footer":
+            foot_exp = gh_13141_expected["footer_extract"]
+        elif arg == "header":
+            head_exp = gh_13141_expected["head_extract"]
+
+        result = self.read_html(gh_13141_data, extract_links=arg)[0]
+        expected = DataFrame([data_exp, foot_exp], columns=head_exp)
+        tm.assert_frame_equal(result, expected)
+
+    def test_extract_links_bad(self, spam_data):
+        msg = (
+            "`extract_links` must be one of "
+            '{None, "header", "footer", "body", "all"}, got "incorrect"'
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_html(spam_data, extract_links="incorrect")