ENH: pd.read_html argument to extract hrefs along with text from cells

abmyii · abmyii · commit d69ce74944f9 · 2022-02-14T12:31:51.000Z
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -109,6 +109,32 @@ apply converter methods, and parse dates (:issue:`43567`).
     df
     df.dtypes
 
+read_html now supports ``extract_hrefs``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`pandas.read_html` can now extract hrefs from table cells (:issue:`13141`).
+
+.. ipython:: python
+
+    html_table = """
+    <table>
+      <tr>
+        <th>GitHub</th>
+      </tr>
+      <tr>
+        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
+      </tr>
+    </table>
+    """
+
+    df = pd.read_html(
+        html_table,
+        extract_hrefs=True
+    )[0]
+    df
+    df["GitHub"]
+    df["GitHub"].str[1]
+
 .. _whatsnew_150.api_breaking.api_breaking2:
 
 api_breaking_change2
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -180,6 +180,9 @@ class _HtmlFrameParser:
     displayed_only : bool
         Whether or not items with "display:none" should be ignored
 
+    extract_hrefs : bool, default False
+        Whether to also extract the href of the first <a> tag in each non-header cell.
+
     Attributes
     ----------
     io : str or file-like
@@ -198,11 +201,15 @@ class _HtmlFrameParser:
     displayed_only : bool
         Whether or not items with "display:none" should be ignored
 
+    extract_hrefs : bool, default False
+        Whether to also extract the href of the first <a> tag in each non-header cell.
+
     Notes
     -----
     To subclass this class effectively you must override the following methods:
         * :func:`_build_doc`
         * :func:`_attr_getter`
+        * :func:`_href_getter`
         * :func:`_text_getter`
         * :func:`_parse_td`
         * :func:`_parse_thead_tr`
@@ -221,12 +228,14 @@ def __init__(
         attrs: dict[str, str] | None,
         encoding: str,
         displayed_only: bool,
+        extract_hrefs: bool,
     ):
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
         self.displayed_only = displayed_only
+        self.extract_hrefs = extract_hrefs
 
     def parse_tables(self):
         """
@@ -259,6 +268,22 @@ def _attr_getter(self, obj, attr):
         # Both lxml and BeautifulSoup have the same implementation:
         return obj.get(attr)
 
+    def _href_getter(self, obj):
+        """
+        Return the href of the first <a> within the DOM node, or None if it has none.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        href : str or None
+            The href of the <a> descendant of the DOM node, or None if absent.
+        """
+        raise AbstractMethodError(self)
+
     def _text_getter(self, obj):
         """
         Return the text of an individual DOM node.
@@ -435,20 +460,22 @@ def row_is_all_th(row):
             while body_rows and row_is_all_th(body_rows[0]):
                 header_rows.append(body_rows.pop(0))
 
-        header = self._expand_colspan_rowspan(header_rows)
+        header = self._expand_colspan_rowspan(header_rows, header=True)
         body = self._expand_colspan_rowspan(body_rows)
         footer = self._expand_colspan_rowspan(footer_rows)
 
         return header, body, footer
 
-    def _expand_colspan_rowspan(self, rows):
+    def _expand_colspan_rowspan(self, rows, header=False):
         """
         Given a list of <tr>s, return a list of text rows.
 
         Parameters
         ----------
         rows : list of node-like
             List of <tr>s
+        header : bool, default False
+            Whether ``rows`` is the header; if so, skip hrefs to avoid a MultiIndex.
 
         Returns
         -------
@@ -481,6 +508,11 @@ def _expand_colspan_rowspan(self, rows):
 
                 # Append the text from this <td>, colspan times
                 text = _remove_whitespace(self._text_getter(td))
+                if not header and self.extract_hrefs:
+                    # Every non-header cell becomes a tuple, even without a link,
+                    # so that selection is consistent (e.g. using .str indexing)
+                    href = self._href_getter(td)
+                    text = (text, href) if href else (text,)
                 rowspan = int(self._attr_getter(td, "rowspan") or 1)
                 colspan = int(self._attr_getter(td, "colspan") or 1)
 
@@ -585,6 +617,10 @@ def _parse_tables(self, doc, match, attrs):
             raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
         return result
 
+    def _href_getter(self, obj):
+        a = obj.find("a", href=True)
+        return None if not a else a["href"]
+
     def _text_getter(self, obj):
         return obj.text
 
@@ -670,6 +706,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
     :class:`_HtmlFrameParser`.
     """
 
+    def _href_getter(self, obj):
+        href = obj.xpath(".//a/@href")
+        return None if not href else href[0]
+
     def _text_getter(self, obj):
         return obj.text_content()
 
@@ -906,14 +946,14 @@ def _validate_flavor(flavor):
     return flavor
 
 
-def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
+def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs)
 
         try:
             tables = p.parse_tables()
@@ -964,6 +1004,7 @@ def read_html(
     na_values=None,
     keep_default_na: bool = True,
     displayed_only: bool = True,
+    extract_hrefs: bool = False,
 ) -> list[DataFrame]:
     r"""
     Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1058,6 +1099,9 @@ def read_html(
     displayed_only : bool, default True
         Whether elements with "display: none" should be parsed.
 
+    extract_hrefs : bool, default False
+        Whether to also extract the href of the first <a> tag in each non-header cell.
+
     Returns
     -------
     dfs
@@ -1126,4 +1170,5 @@ def read_html(
         na_values=na_values,
         keep_default_na=keep_default_na,
         displayed_only=displayed_only,
+        extract_hrefs=extract_hrefs,
     )
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1286,3 +1286,57 @@ def test_parse_path_object(self, datapath):
         df1 = self.read_html(file_path_string)[0]
         df2 = self.read_html(file_path)[0]
         tm.assert_frame_equal(df1, df2)
+
+    def test_extract_hrefs(self):
+        # GH 13141:
+        # read_html argument to interpret hyperlinks as links (not merely text)
+        result = self.read_html(
+            """
+          <table>
+            <tr>
+              <th>Kingdom</th>
+              <th>Phylum</th>
+              <th>Class</th>
+              <th>Order</th>
+              <th>Family</th>
+              <th>Genus</th>
+              <th>Species</th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/wiki/Animal">Animalia</a></td>
+              <td><a href="https://en.wikipedia.org/wiki/Chordate">Chordata</a></td>
+              <td><a href="https://en.wikipedia.org/wiki/Mammal">Mammalia</a></td>
+              <td><a href="https://en.wikipedia.org/wiki/Carnivora">Carnivora</a></td>
+              <td><a href="https://en.wikipedia.org/wiki/Bear">Ursidae</a></td>
+              <td><a href="https://en.wikipedia.org/wiki/Ailuropoda">Ailuropoda</a></td>
+              <td>A. melanoleuca</td>
+            </tr>
+          </table>
+          """,
+            extract_hrefs=True,
+        )[0]
+
+        expected = DataFrame(
+            [
+                [
+                    ("Animalia", "https://en.wikipedia.org/wiki/Animal"),
+                    ("Chordata", "https://en.wikipedia.org/wiki/Chordate"),
+                    ("Mammalia", "https://en.wikipedia.org/wiki/Mammal"),
+                    ("Carnivora", "https://en.wikipedia.org/wiki/Carnivora"),
+                    ("Ursidae", "https://en.wikipedia.org/wiki/Bear"),
+                    ("Ailuropoda", "https://en.wikipedia.org/wiki/Ailuropoda"),
+                    ("A. melanoleuca",),
+                ]
+            ],
+            columns=(
+                "Kingdom",
+                "Phylum",
+                "Class",
+                "Order",
+                "Family",
+                "Genus",
+                "Species",
+            ),
+        )
+
+        tm.assert_frame_equal(result, expected)