
ENH: pd.read_html argument to extract hrefs along with text from cells #45973


Merged
merged 27 commits on Aug 16, 2022
Changes from 5 commits
Commits (27)
d69ce74
ENH: pd.read_html argument to extract hrefs along with text from cells
abmyii Feb 13, 2022
ac86888
Fix typing error
abmyii Feb 14, 2022
b33dc9e
Simplify tests
abmyii Feb 15, 2022
a13c5f0
Fix still incorrect typing
abmyii Feb 15, 2022
76ebe35
Summarise whatsnew entry and move detailed explanation into user guide
abmyii Feb 17, 2022
cd352e7
More flexible link extraction
abmyii Feb 23, 2022
1de1324
Suggested changes
abmyii Feb 26, 2022
1190ea7
extract_hrefs -> extract_links
abmyii Feb 28, 2022
db8b6db
Move versionadded to correct place and improve docstring for extract_…
abmyii Mar 20, 2022
1c8c891
Test for invalid extract_links value
abmyii Mar 20, 2022
1555fbd
Test all extract_link options
abmyii Apr 2, 2022
0935696
Fix for MultiIndex headers (also fixes tests)
abmyii Apr 25, 2022
afaad1a
Test that text surrounding <a> tag is still captured
abmyii Apr 25, 2022
20e24e9
Test for multiple <a> tags in cell
abmyii Apr 25, 2022
ffdcf8a
Fix all tests, with both MultiIndex -> Index and np.nan -> None conve…
abmyii May 15, 2022
dbd4580
Merge branch 'main' into read_html-extract-hrefs
abmyii Jun 18, 2022
490005a
Add back EOF newline to test_html.py
abmyii Jun 18, 2022
a5ff5c1
Correct user guide example
abmyii Jun 18, 2022
85a183d
Merge branch 'main' into read_html-extract-hrefs
attack68 Jul 29, 2022
58fdb0c
Update pandas/io/html.py
attack68 Jul 29, 2022
c34d8ff
Update pandas/io/html.py
attack68 Jul 29, 2022
7389b84
Update pandas/io/html.py
attack68 Jul 29, 2022
ba7caab
Simplify MultiIndex -> Index conversion
abmyii Jul 30, 2022
4c7f532
Move unnecessary fixtures into test body
abmyii Jul 30, 2022
98a46e2
Simplify statement
abmyii Aug 16, 2022
fd41935
Merge branch 'main' into read_html-extract-hrefs
abmyii Aug 16, 2022
614c636
Fix code checks
abmyii Aug 16, 2022
22 changes: 22 additions & 0 deletions doc/source/user_guide/io.rst
@@ -2729,6 +2729,28 @@ succeeds, the function will return*.

dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"])

Links can be extracted from cells along with the text using ``extract_hrefs=True``.

.. ipython:: python

html_table = """
<table>
<tr>
<th>GitHub</th>
</tr>
<tr>
        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
</tr>
</table>
"""

    df = pd.read_html(
        html_table,
        extract_hrefs=True
    )[0]
df
df["GitHub"]
df["GitHub"].str[1]

.. _io.html:

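An aside on the behaviour the user-guide example above relies on: with ``extract_hrefs=True``, linked body cells come back as ``(text, href)`` tuples and linkless cells as 1-tuples, so positional ``.str`` indexing splits text from links. A small sketch under that assumption (not part of the diff):

    import pandas as pd

    html_table = """
    <table>
      <tr><th>GitHub</th></tr>
      <tr><td><a href="https://github.com/pandas-dev/pandas">pandas</a></td></tr>
      <tr><td>linkless cell</td></tr>
    </table>
    """

    # extract_hrefs is the keyword name at this point in the branch
    # (a later commit in this PR renames it to extract_links)
    df = pd.read_html(html_table, extract_hrefs=True)[0]
    df["GitHub"].str[0]  # cell text: "pandas", "linkless cell"
    df["GitHub"].str[1]  # href for linked cells, NaN for the linkless cell
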
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -39,6 +39,8 @@ Other enhancements
- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
- Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`)
- Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`)
- :func:`pandas.read_html` now supports extracting hrefs from table cells (:issue:`13141`).


-

58 changes: 53 additions & 5 deletions pandas/io/html.py
@@ -180,6 +180,9 @@ class _HtmlFrameParser:
displayed_only : bool
Whether or not items with "display:none" should be ignored

extract_hrefs : bool, default False
Whether table elements with <a> tags should have the href extracted.

Attributes
----------
io : str or file-like
@@ -198,11 +201,15 @@ class _HtmlFrameParser:
displayed_only : bool
Whether or not items with "display:none" should be ignored

extract_hrefs : bool, default False
Whether table elements with <a> tags should have the href extracted.

Notes
-----
To subclass this class effectively you must override the following methods:
* :func:`_build_doc`
* :func:`_attr_getter`
* :func:`_href_getter`
* :func:`_text_getter`
* :func:`_parse_td`
* :func:`_parse_thead_tr`
@@ -221,12 +228,14 @@ def __init__(
attrs: dict[str, str] | None,
encoding: str,
displayed_only: bool,
extract_hrefs: bool,
):
self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding
self.displayed_only = displayed_only
self.extract_hrefs = extract_hrefs

def parse_tables(self):
"""
@@ -259,6 +268,22 @@ def _attr_getter(self, obj, attr):
# Both lxml and BeautifulSoup have the same implementation:
return obj.get(attr)

def _href_getter(self, obj):
"""
Return the href from a child <a> of the DOM node, or None if there is no <a>.

Parameters
----------
obj : node-like
A DOM node.

Returns
-------
href : str or None
The href from the <a> child of the DOM node, or None if there is no <a>.
"""
raise AbstractMethodError(self)

def _text_getter(self, obj):
"""
Return the text of an individual DOM node.
@@ -435,20 +460,22 @@ def row_is_all_th(row):
while body_rows and row_is_all_th(body_rows[0]):
header_rows.append(body_rows.pop(0))

header = self._expand_colspan_rowspan(header_rows)
header = self._expand_colspan_rowspan(header_rows, header=True)
body = self._expand_colspan_rowspan(body_rows)
footer = self._expand_colspan_rowspan(footer_rows)

return header, body, footer

def _expand_colspan_rowspan(self, rows):
def _expand_colspan_rowspan(self, rows, header=False):
"""
Given a list of <tr>s, return a list of text rows.

Parameters
----------
rows : list of node-like
List of <tr>s
header : bool, default False
Whether the rows are header rows. Links are not captured for header rows,
since that would result in an undesirable MultiIndex.

Returns
-------
@@ -461,7 +488,10 @@ def _expand_colspan_rowspan(self, rows):
to subsequent cells.
"""
all_texts = [] # list of rows, each a list of str
remainder: list[tuple[int, str, int]] = [] # list of (index, text, nrows)
text: str | tuple
remainder: list[
tuple[int, str | tuple, int]
] = [] # list of (index, text, nrows)

for tr in rows:
texts = [] # the output for this row
@@ -481,6 +511,11 @@

# Append the text from this <td>, colspan times
text = _remove_whitespace(self._text_getter(td))
if not header and self.extract_hrefs:
# All cells will be tuples except for the headers for
# consistency in selection (e.g. using .str indexing)
href = self._href_getter(td)
text = (text, href) if href else (text,)
rowspan = int(self._attr_getter(td, "rowspan") or 1)
colspan = int(self._attr_getter(td, "colspan") or 1)

@@ -585,6 +620,10 @@ def _parse_tables(self, doc, match, attrs):
raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
return result

def _href_getter(self, obj):
Review comments on this line:

Contributor
can you type the args and returns of all of the added code

Contributor Author (@abmyii, Feb 26, 2022)
I've typed the returns, but won't lxml/bs4 be required to type the args?

Contributor Author (@abmyii)
@attack68 What shall I do about this?

Contributor Author (@abmyii)
@jreback Sorry to bother you, but I haven't been able to come up with a solution for this. Could you please suggest how I should do it?

To elaborate a bit on my first comment: the requirements may not be installed, and in that case typing with the custom types defined in those libraries would fail (as far as I understand), so that doesn't seem like a viable solution.

Contributor Author (@abmyii)
@mroeschke Would you be able to enlighten me regarding this request? I'm still at a loss as to how to approach it.

Member
At the top of the file you can do:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        from bs4/lxml import ...

Then type obj. The CI checks have all the optional dependencies installed, so these checks should be available.

    a = obj.find("a", href=True)
    return None if not a else a["href"]
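
A minimal sketch of the TYPE_CHECKING approach suggested in the review thread above, applied to the BeautifulSoup _href_getter here and the lxml one further down. The standalone-function form, the Tag/HtmlElement annotations, and the helper names are illustrative assumptions, not necessarily the typing the PR ultimately settled on:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # annotation-only imports; bs4/lxml need not be importable at runtime
        from bs4 import Tag
        from lxml.html import HtmlElement

    def _bs4_href_getter(obj: Tag) -> str | None:
        # hypothetical standalone version of the BeautifulSoup parser's method
        a = obj.find("a", href=True)
        return None if not a else a["href"]

    def _lxml_href_getter(obj: HtmlElement) -> str | None:
        # hypothetical standalone version of the lxml parser's method
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]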

def _text_getter(self, obj):
    return obj.text

@@ -670,6 +709,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
:class:`_HtmlFrameParser`.
"""

def _href_getter(self, obj):
    href = obj.xpath(".//a/@href")
    return None if not href else href[0]

def _text_getter(self, obj):
    return obj.text_content()

@@ -906,14 +949,14 @@ def _validate_flavor(flavor):
return flavor


def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs, encoding, displayed_only)
p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs)

try:
tables = p.parse_tables()
@@ -964,6 +1007,7 @@ def read_html(
na_values=None,
keep_default_na: bool = True,
displayed_only: bool = True,
extract_hrefs: bool = False,
) -> list[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1058,6 +1102,9 @@
displayed_only : bool, default True
Whether elements with "display: none" should be parsed.

extract_hrefs : bool, default False
Whether table elements with <a> tags should have the href extracted.

Returns
-------
dfs
@@ -1126,4 +1173,5 @@ def read_html(
na_values=na_values,
keep_default_na=keep_default_na,
displayed_only=displayed_only,
extract_hrefs=extract_hrefs,
)
38 changes: 38 additions & 0 deletions pandas/tests/io/test_html.py
@@ -1286,3 +1286,41 @@ def test_parse_path_object(self, datapath):
    df1 = self.read_html(file_path_string)[0]
    df2 = self.read_html(file_path)[0]
    tm.assert_frame_equal(df1, df2)

def test_extract_hrefs(self):
    # GH 13141:
    # read_html argument to interpret hyperlinks as links (not merely text)
    result = self.read_html(
        """
        <table>
          <tr>
            <th>HTTP</th>
            <th>FTP</th>
            <th><a href="https://en.wiktionary.org/wiki/linkless">None</a></th>
          </tr>
          <tr>
            <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
            <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
            <td>Linkless</td>
          </tr>
        </table>
        """,
        extract_hrefs=True,
    )[0]

    expected = DataFrame(
        [
            [
                ("Wikipedia", "https://en.wikipedia.org/"),
                ("Debian", "ftp://ftp.us.debian.org/"),
                ("Linkless",),
            ]
        ],
        columns=(
            "HTTP",
            "FTP",
            "None",
        ),
    )

    tm.assert_frame_equal(result, expected)