From d69ce74944f96baf422ab16fc36581a4b6fc7327 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 13 Feb 2022 14:44:05 +0000 Subject: [PATCH 01/24] ENH: pd.read_html argument to extract hrefs along with text from cells --- doc/source/whatsnew/v1.5.0.rst | 26 ++++++++++++++++ pandas/io/html.py | 53 ++++++++++++++++++++++++++++++--- pandas/tests/io/test_html.py | 54 ++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a35ca589065d8..dd71907b1b3dd 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -109,6 +109,32 @@ apply converter methods, and parse dates (:issue:`43567`). df df.dtypes +read_html now supports ``extract_hrefs`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_html` can now extract hrefs from table cells (:issue:`13141`). + +.. ipython:: python + + html_table = """ + + + + + + + +
+    <table>
+      <tr>
+        <th>GitHub</th>
+      </tr>
+      <tr>
+        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
+      </tr>
+    </table>
+ """ + + df = pd.read_html( + html_table, + extract_hrefs=True + )[0] + df + df["GitHub"] + df["GitHub"].str[1] + .. _whatsnew_150.api_breaking.api_breaking2: api_breaking_change2 diff --git a/pandas/io/html.py b/pandas/io/html.py index 05d7c2998ef27..002a238808526 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -180,6 +180,9 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored + extract_hrefs : bool, default False + Whether table elements with tags should have the href extracted. + Attributes ---------- io : str or file-like @@ -198,11 +201,15 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored + extract_hrefs : bool, default False + Whether table elements with tags should have the href extracted. + Notes ----- To subclass this class effectively you must override the following methods: * :func:`_build_doc` * :func:`_attr_getter` + * :func:`_href_getter` * :func:`_text_getter` * :func:`_parse_td` * :func:`_parse_thead_tr` @@ -221,12 +228,14 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, + extract_hrefs: bool, ): self.io = io self.match = match self.attrs = attrs self.encoding = encoding self.displayed_only = displayed_only + self.extract_hrefs = extract_hrefs def parse_tables(self): """ @@ -259,6 +268,22 @@ def _attr_getter(self, obj, attr): # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) + def _href_getter(self, obj): + """ + Return a href if the DOM node contains a child or None. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + href : str or unicode + The href from the child of the DOM node. + """ + raise AbstractMethodError(self) + def _text_getter(self, obj): """ Return the text of an individual DOM node. @@ -435,13 +460,13 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows) + header = self._expand_colspan_rowspan(header_rows, header=True) body = self._expand_colspan_rowspan(body_rows) footer = self._expand_colspan_rowspan(footer_rows) return header, body, footer - def _expand_colspan_rowspan(self, rows): + def _expand_colspan_rowspan(self, rows, header=False): """ Given a list of s, return a list of text rows. @@ -449,6 +474,8 @@ def _expand_colspan_rowspan(self, rows): ---------- rows : list of node-like List of s + header : whether the current row is the header - don't capture links if so, + as this results in a MultiIndex which is undesirable. Returns ------- @@ -481,6 +508,11 @@ def _expand_colspan_rowspan(self, rows): # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) + if not header and self.extract_hrefs: + # All cells will be tuples except for the headers for + # consistency in selection (e.g. using .str indexing) + href = self._href_getter(td) + text = (text, href) if href else (text,) rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) @@ -585,6 +617,10 @@ def _parse_tables(self, doc, match, attrs): raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") return result + def _href_getter(self, obj): + a = obj.find("a", href=True) + return None if not a else a["href"] + def _text_getter(self, obj): return obj.text @@ -670,6 +706,10 @@ class _LxmlFrameParser(_HtmlFrameParser): :class:`_HtmlFrameParser`. 
""" + def _href_getter(self, obj): + href = obj.xpath(".//a/@href") + return None if not href else href[0] + def _text_getter(self, obj): return obj.text_content() @@ -906,14 +946,14 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): +def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding, displayed_only) + p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs) try: tables = p.parse_tables() @@ -964,6 +1004,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, + extract_hrefs: bool = False, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1058,6 +1099,9 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. + extract_hrefs : bool, default False + Whether table elements with tags should have the href extracted. + Returns ------- dfs @@ -1126,4 +1170,5 @@ def read_html( na_values=na_values, keep_default_na=keep_default_na, displayed_only=displayed_only, + extract_hrefs=extract_hrefs, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index eeebb9a638afb..cfa90237b52f4 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1286,3 +1286,57 @@ def test_parse_path_object(self, datapath): df1 = self.read_html(file_path_string)[0] df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) + + def test_extract_hrefs(self): + # GH 13141: + # read_html argument to interpret hyperlinks as links (not merely text) + result = self.read_html( + """ + + + + + + + + + + + + + + + + + + + +
+            <table>
+              <tr>
+                <th>Kingdom</th>
+                <th>Phylum</th>
+                <th>Class</th>
+                <th>Order</th>
+                <th>Family</th>
+                <th>Genus</th>
+                <th>Species</th>
+              </tr>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Animal">Animalia</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Chordate">Chordata</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Mammal">Mammalia</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Carnivora">Carnivora</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Bear">Ursidae</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Ailuropoda">Ailuropoda</a></td>
+                <td>A. melanoleuca</td>
+              </tr>
+            </table>
+ """, + extract_hrefs=True, + )[0] + + expected = DataFrame( + [ + [ + ("Animalia", "https://en.wikipedia.org/wiki/Animal"), + ("Chordata", "https://en.wikipedia.org/wiki/Chordate"), + ("Mammalia", "https://en.wikipedia.org/wiki/Mammal"), + ("Carnivora", "https://en.wikipedia.org/wiki/Carnivora"), + ("Ursidae", "https://en.wikipedia.org/wiki/Bear"), + ("Ailuropoda", "https://en.wikipedia.org/wiki/Ailuropoda"), + ("A. melanoleuca",), + ] + ], + columns=( + "Kingdom", + "Phylum", + "Class", + "Order", + "Family", + "Genus", + "Species", + ), + ) + + tm.assert_frame_equal(result, expected) From ac86888d2aaa33cb0897d94b659f6e77ae26f224 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 14 Feb 2022 15:40:59 +0000 Subject: [PATCH 02/24] Fix typing error --- pandas/io/html.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 002a238808526..6c4a51833b916 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -488,7 +488,9 @@ def _expand_colspan_rowspan(self, rows, header=False): to subsequent cells. """ all_texts = [] # list of rows, each a list of str - remainder: list[tuple[int, str, int]] = [] # list of (index, text, nrows) + remainder: list[ + tuple[int, str | tuple, int] + ] = [] # list of (index, text, nrows) for tr in rows: texts = [] # the output for this row From b33dc9ee8b54ff46059b1d9b71da0e20d444185e Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 15 Feb 2022 10:05:34 +0000 Subject: [PATCH 03/24] Simplify tests --- pandas/tests/io/test_html.py | 40 +++++++++++------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index cfa90237b52f4..7fd05d3f342cb 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1294,22 +1294,14 @@ def test_extract_hrefs(self): """ - - - - - - - + + + - - - - - - - + + +
-                <th>Kingdom</th>
-                <th>Phylum</th>
-                <th>Class</th>
-                <th>Order</th>
-                <th>Family</th>
-                <th>Genus</th>
-                <th>Species</th>
+                <th>HTTP</th>
+                <th>FTP</th>
+                <th>None</th>
               </tr>
               <tr>
-                <td><a href="https://en.wikipedia.org/wiki/Animal">Animalia</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Chordate">Chordata</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Mammal">Mammalia</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Carnivora">Carnivora</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Bear">Ursidae</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Ailuropoda">Ailuropoda</a></td>
-                <td>A. melanoleuca</td>
+                <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+                <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
+                <td>Linkless</td>
               </tr>
""", @@ -1319,23 +1311,15 @@ def test_extract_hrefs(self): expected = DataFrame( [ [ - ("Animalia", "https://en.wikipedia.org/wiki/Animal"), - ("Chordata", "https://en.wikipedia.org/wiki/Chordate"), - ("Mammalia", "https://en.wikipedia.org/wiki/Mammal"), - ("Carnivora", "https://en.wikipedia.org/wiki/Carnivora"), - ("Ursidae", "https://en.wikipedia.org/wiki/Bear"), - ("Ailuropoda", "https://en.wikipedia.org/wiki/Ailuropoda"), - ("A. melanoleuca",), + ("Wikipedia", "https://en.wikipedia.org/"), + ("Debian", "ftp://ftp.us.debian.org/"), + ("Linkless",), ] ], columns=( - "Kingdom", - "Phylum", - "Class", - "Order", - "Family", - "Genus", - "Species", + "HTTP", + "FTP", + "None", ), ) From a13c5f070d3f937c1947699eb86d83600aeb71b8 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 15 Feb 2022 14:22:37 +0000 Subject: [PATCH 04/24] Fix still incorrect typing --- pandas/io/html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6c4a51833b916..e8fccefa68546 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -488,6 +488,7 @@ def _expand_colspan_rowspan(self, rows, header=False): to subsequent cells. """ all_texts = [] # list of rows, each a list of str + text: str | tuple remainder: list[ tuple[int, str | tuple, int] ] = [] # list of (index, text, nrows) From 76ebe35e6a626cedfa295ddd53c5a2b8fe49a3ff Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Thu, 17 Feb 2022 13:55:22 +0000 Subject: [PATCH 05/24] Summarise whatsnew entry and move detailed explanation into user guide --- doc/source/user_guide/io.rst | 22 ++++++++++++++++++++++ doc/source/whatsnew/v1.5.0.rst | 28 ++-------------------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 34f10c1b3ec28..6bf25b9e9bb46 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2729,6 +2729,28 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) +Links can be extracted from cells along with the text using ``extract_hrefs=True``. + +.. ipython:: python + + html_table = """ + + + + + + + +
+    <table>
+      <tr>
+        <th>GitHub</th>
+      </tr>
+      <tr>
+        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
+      </tr>
+    </table>
+ """ + + df = pd.read_html( + html_table, + extract_hrefs=True + )[0] + df + df["GitHub"] + df["GitHub"].str[1] .. _io.html: diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index dd71907b1b3dd..013e6668482f2 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -39,6 +39,8 @@ Other enhancements - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) +- :func:`pandas.read_html` now supports extracting hrefs from table cells (:issue:`13141`). + - @@ -109,32 +111,6 @@ apply converter methods, and parse dates (:issue:`43567`). df df.dtypes -read_html now supports ``extract_hrefs`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`pandas.read_html` can now extract hrefs from table cells (:issue:`13141`). - -.. ipython:: python - - html_table = """ - - - - - - - -
-    <table>
-      <tr>
-        <th>GitHub</th>
-      </tr>
-      <tr>
-        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
-      </tr>
-    </table>
- """ - - df = pd.read_html( - html_table, - extract_hrefs=True - )[0] - df - df["GitHub"] - df["GitHub"].str[1] - .. _whatsnew_150.api_breaking.api_breaking2: api_breaking_change2 From cd352e7e1d7f99d9915ce6341f7eb931f91ee31a Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Wed, 23 Feb 2022 19:17:10 +0000 Subject: [PATCH 06/24] More flexible link extraction --- pandas/io/html.py | 32 +++++++++++++++++++------------- pandas/tests/io/test_html.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index e8fccefa68546..4b0819ff64db1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -10,6 +10,7 @@ import numbers import re from typing import ( + Literal, Pattern, Sequence, cast, @@ -180,8 +181,10 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - extract_hrefs : bool, default False - Whether table elements with
tags should have the href extracted. + extract_hrefs : all/header/body/footer or None + Table elements in the specified section(s) with tags will have their + href extracted. Note that specifying "header" will result in a + :class:`~pandas.MultiIndex`. Attributes ---------- @@ -228,7 +231,7 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, - extract_hrefs: bool, + extract_hrefs: Literal["all", "header", "body", "footer", None], ): self.io = io self.match = match @@ -460,13 +463,15 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, header=True) - body = self._expand_colspan_rowspan(body_rows) - footer = self._expand_colspan_rowspan(footer_rows) + header = self._expand_colspan_rowspan(header_rows, section="header") + body = self._expand_colspan_rowspan(body_rows, section="body") + footer = self._expand_colspan_rowspan(footer_rows, section="footer") return header, body, footer - def _expand_colspan_rowspan(self, rows, header=False): + def _expand_colspan_rowspan( + self, rows, section: Literal["header", "body", "footer"] + ): """ Given a list of s, return a list of text rows. @@ -474,8 +479,7 @@ def _expand_colspan_rowspan(self, rows, header=False): ---------- rows : list of node-like List of s - header : whether the current row is the header - don't capture links if so, - as this results in a MultiIndex which is undesirable. + section : the section that the rows belong to (header, body or footer). Returns ------- @@ -511,7 +515,7 @@ def _expand_colspan_rowspan(self, rows, header=False): # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) - if not header and self.extract_hrefs: + if self.extract_hrefs == "all" or self.extract_hrefs == section: # All cells will be tuples except for the headers for # consistency in selection (e.g. using .str indexing) href = self._href_getter(td) @@ -1007,7 +1011,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, - extract_hrefs: bool = False, + extract_hrefs: Literal["all", "header", "body", "footer", None] = None, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1102,8 +1106,10 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_hrefs : bool, default False - Whether table elements with tags should have the href extracted. + extract_hrefs : all/header/body/footer or None, default None + Table elements in the specified section(s) with tags will have their + href extracted. Note that specifying "header" will result in a + :class:`~pandas.MultiIndex`. 
Returns ------- diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 7fd05d3f342cb..457c3f888efb7 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1287,7 +1287,7 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_hrefs(self): + def test_extract_hrefs_body(self): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( @@ -1305,7 +1305,7 @@ def test_extract_hrefs(self): """, - extract_hrefs=True, + extract_hrefs="body", )[0] expected = DataFrame( @@ -1324,3 +1324,27 @@ def test_extract_hrefs(self): ) tm.assert_frame_equal(result, expected) + + def test_extract_hrefs_header(self): + # GH 13141: + # read_html argument to interpret hyperlinks as links (not merely text) + result = self.read_html( + """ + + + + + + + +
+            <table>
+              <tr>
+                <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+              </tr>
+              <tr>
+                <td>Wikipedia</td>
+              </tr>
+            </table>
+ """, + extract_hrefs="header", + )[0] + + expected = DataFrame( + [["Wikipedia"]], + columns=(("Linkless", "https://en.wiktionary.org/wiki/linkless"),), + ) + + tm.assert_frame_equal(result, expected) From 1de13246106587fb17bc2306f910b6df7e6c4fd5 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 26 Feb 2022 19:08:43 +0000 Subject: [PATCH 07/24] Suggested changes --- doc/source/user_guide/io.rst | 2 ++ pandas/io/html.py | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6bf25b9e9bb46..0dba537025fae 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2729,6 +2729,8 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) +.. versionadded:: 1.5.0 + Links can be extracted from cells along with the text using ``extract_hrefs=True``. .. ipython:: python diff --git a/pandas/io/html.py b/pandas/io/html.py index 4b0819ff64db1..0fae7eae8c816 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -181,7 +181,9 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - extract_hrefs : all/header/body/footer or None + .. versionadded:: 1.5.0 + + extract_hrefs : "all"/"header"/"body"/"footer" or None Table elements in the specified section(s) with
tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. @@ -204,8 +206,12 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - extract_hrefs : bool, default False - Whether table elements with tags should have the href extracted. + .. versionadded:: 1.5.0 + + extract_hrefs : "all"/"header"/"body"/"footer" or None, default None + Table elements in the specified section(s) with tags will have their + href extracted. Note that specifying "header" will result in a + :class:`~pandas.MultiIndex`. Notes ----- @@ -624,7 +630,7 @@ def _parse_tables(self, doc, match, attrs): raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") return result - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: a = obj.find("a", href=True) return None if not a else a["href"] @@ -713,7 +719,7 @@ class _LxmlFrameParser(_HtmlFrameParser): :class:`_HtmlFrameParser`. """ - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: href = obj.xpath(".//a/@href") return None if not href else href[0] @@ -1106,7 +1112,9 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_hrefs : all/header/body/footer or None, default None + .. versionadded:: 1.5.0 + + extract_hrefs : "all"/"header"/"body"/"footer" or None, default None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. From 1190ea7e039dc7c4d4d59a38410481e4a9b2ef34 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 28 Feb 2022 17:37:23 +0000 Subject: [PATCH 08/24] extract_hrefs -> extract_links --- doc/source/user_guide/io.rst | 4 ++-- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/io/html.py | 20 ++++++++++---------- pandas/tests/io/test_html.py | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0dba537025fae..8306ed801eeed 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2731,7 +2731,7 @@ succeeds, the function will return*. .. versionadded:: 1.5.0 -Links can be extracted from cells along with the text using ``extract_hrefs=True``. +Links can be extracted from cells along with the text using ``extract_links=True``. .. ipython:: python @@ -2748,7 +2748,7 @@ Links can be extracted from cells along with the text using ``extract_hrefs=True df = pd.read_html( html_table, - extract_hrefs=True + extract_links=True )[0] df df["GitHub"] diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 013e6668482f2..88caf4d6e41aa 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -39,7 +39,7 @@ Other enhancements - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) -- :func:`pandas.read_html` now supports extracting hrefs from table cells (:issue:`13141`). 
+- :func:`pandas.read_html` now supports extracting links from table cells (:issue:`13141`). - diff --git a/pandas/io/html.py b/pandas/io/html.py index 0fae7eae8c816..51fe673e04146 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -183,7 +183,7 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 - extract_hrefs : "all"/"header"/"body"/"footer" or None + extract_links : "all"/"header"/"body"/"footer" or None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. @@ -208,7 +208,7 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 - extract_hrefs : "all"/"header"/"body"/"footer" or None, default None + extract_links : "all"/"header"/"body"/"footer" or None, default None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. @@ -237,14 +237,14 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, - extract_hrefs: Literal["all", "header", "body", "footer", None], + extract_links: Literal["all", "header", "body", "footer", None], ): self.io = io self.match = match self.attrs = attrs self.encoding = encoding self.displayed_only = displayed_only - self.extract_hrefs = extract_hrefs + self.extract_links = extract_links def parse_tables(self): """ @@ -521,7 +521,7 @@ def _expand_colspan_rowspan( # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) - if self.extract_hrefs == "all" or self.extract_hrefs == section: + if self.extract_links == "all" or self.extract_links == section: # All cells will be tuples except for the headers for # consistency in selection (e.g. using .str indexing) href = self._href_getter(td) @@ -959,14 +959,14 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs): +def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs) + p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links) try: tables = p.parse_tables() @@ -1017,7 +1017,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, - extract_hrefs: Literal["all", "header", "body", "footer", None] = None, + extract_links: Literal["all", "header", "body", "footer", None] = None, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1114,7 +1114,7 @@ def read_html( .. versionadded:: 1.5.0 - extract_hrefs : "all"/"header"/"body"/"footer" or None, default None + extract_links : "all"/"header"/"body"/"footer" or None, default None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. 
@@ -1187,5 +1187,5 @@ def read_html( na_values=na_values, keep_default_na=keep_default_na, displayed_only=displayed_only, - extract_hrefs=extract_hrefs, + extract_links=extract_links, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 457c3f888efb7..0f6febd720eb9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1287,7 +1287,7 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_hrefs_body(self): + def test_extract_links_body(self): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( @@ -1305,7 +1305,7 @@ def test_extract_hrefs_body(self): """, - extract_hrefs="body", + extract_links="body", )[0] expected = DataFrame( @@ -1325,7 +1325,7 @@ def test_extract_hrefs_body(self): tm.assert_frame_equal(result, expected) - def test_extract_hrefs_header(self): + def test_extract_links_header(self): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( @@ -1339,7 +1339,7 @@ def test_extract_hrefs_header(self): """, - extract_hrefs="header", + extract_links="header", )[0] expected = DataFrame( From db8b6db4beeb1c81c15dee4885567c6ea60cd68d Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 20 Mar 2022 19:21:35 +0000 Subject: [PATCH 09/24] Move versionadded to correct place and improve docstring for extract_links (@attack68) --- doc/source/user_guide/io.rst | 4 ++-- pandas/io/html.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 8306ed801eeed..f372b4682d733 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2729,8 +2729,6 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) -.. versionadded:: 1.5.0 - Links can be extracted from cells along with the text using ``extract_links=True``. .. ipython:: python @@ -2754,6 +2752,8 @@ Links can be extracted from cells along with the text using ``extract_links=True df["GitHub"] df["GitHub"].str[1] +.. versionadded:: 1.5.0 + .. _io.html: Writing to HTML files diff --git a/pandas/io/html.py b/pandas/io/html.py index 51fe673e04146..6808554e220b6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -181,13 +181,13 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - .. versionadded:: 1.5.0 - - extract_links : "all"/"header"/"body"/"footer" or None + extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. + .. versionadded:: 1.5.0 + Attributes ---------- io : str or file-like @@ -206,13 +206,13 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - .. versionadded:: 1.5.0 - - extract_links : "all"/"header"/"body"/"footer" or None, default None + extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. + .. 
versionadded:: 1.5.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -1112,13 +1112,13 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - .. versionadded:: 1.5.0 - - extract_links : "all"/"header"/"body"/"footer" or None, default None + extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. + .. versionadded:: 1.5.0 + Returns ------- dfs From 1c8c89162dc61641ac53cf945c70527a5e2f2be6 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 20 Mar 2022 21:29:41 +0000 Subject: [PATCH 10/24] Test for invalid extract_links value --- pandas/io/html.py | 12 +++++++++--- pandas/tests/io/test_html.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6808554e220b6..4f13e806ec07e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -237,7 +237,7 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, - extract_links: Literal["all", "header", "body", "footer", None], + extract_links: Literal[None, "header", "footer", "body", "all"], ): self.io = io self.match = match @@ -476,7 +476,7 @@ def row_is_all_th(row): return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "body", "footer"] + self, rows, section: Literal["header", "footer", "body"] ): """ Given a list of s, return a list of text rows. @@ -1017,7 +1017,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, - extract_links: Literal["all", "header", "body", "footer", None] = None, + extract_links: Literal[None, "header", "footer", "body", "all"] = None, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1167,6 +1167,12 @@ def read_html( "cannot skip rows starting from the end of the " "data (you passed a negative value)" ) + if extract_links not in [None, "header", "footer", "body", "all"]: + raise ValueError( + "`extract_links` must be one of " + '{None, "header", "footer", "body", "all"}, got ' + f'"{extract_links}"' + ) validate_header_arg(header) io = stringify_path(io) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 0f6febd720eb9..f2354b845b40f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1348,3 +1348,21 @@ def test_extract_links_header(self): ) tm.assert_frame_equal(result, expected) + + def test_extract_links_bad(self): + html = """ + + + + + + + +
+          <table>
+            <tr>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td>Wikipedia</td>
+            </tr>
+          </table>
+ """ + msg = ( + "`extract_links` must be one of " + '{None, "header", "footer", "body", "all"}, got "incorrect"' + ) + with pytest.raises(ValueError, match=msg): + read_html(html, extract_links="incorrect") From 1555fbd4642e4a1e310c4f8309de633fd4ba7ef4 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 2 Apr 2022 03:42:19 +0100 Subject: [PATCH 11/24] Test all extract_link options --- pandas/tests/io/test_html.py | 126 +++++++++++++++++++++++++---------- 1 file changed, 92 insertions(+), 34 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f2354b845b40f..27e1d2caee0e6 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -113,6 +113,28 @@ def spam_data(self, datapath): def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") + @pytest.fixture + def gh_13141_data(self): + return """ + + + + + + + + + + + + + + + + +
+          <table>
+            <tr>
+              <th>HTTP</th>
+              <th>FTP</th>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+              <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
+              <td>Linkless</td>
+            </tr>
+            <tfoot>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+              </tr>
+            </tfoot>
+          </table>
+ """ + @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor): self.read_html = partial(read_html, flavor=flavor) @@ -1287,25 +1309,12 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_links_body(self): + def test_extract_links(self, gh_13141_data): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( - """ - - - - - - - - - - - -
-            <table>
-              <tr>
-                <th>HTTP</th>
-                <th>FTP</th>
-                <th>None</th>
-              </tr>
-              <tr>
-                <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
-                <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
-                <td>Linkless</td>
-              </tr>
-            </table>
- """, - extract_links="body", + gh_13141_data, + extract_links="all", )[0] expected = DataFrame( @@ -1314,37 +1323,86 @@ def test_extract_links_body(self): ("Wikipedia", "https://en.wikipedia.org/"), ("Debian", "ftp://ftp.us.debian.org/"), ("Linkless",), - ] + ], + [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], + ], + columns=( + ("HTTP", np.nan), + ("FTP", np.nan), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ), + ) + + tm.assert_frame_equal(result, expected) + + def test_extract_links_header(self, gh_13141_data): + result = self.read_html( + gh_13141_data, + extract_links="header", + )[0] + + expected = DataFrame( + [ + [ + "Wikipedia", + "Debian", + "Linkless", + ], + ["Footer", None, None], + ], + columns=( + ("HTTP", np.nan), + ("FTP", np.nan), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ), + ) + + tm.assert_frame_equal(result, expected) + + def test_extract_links_footer(self, gh_13141_data): + result = self.read_html( + gh_13141_data, + extract_links="footer", + )[0] + + expected = DataFrame( + [ + [ + "Wikipedia", + "Debian", + "Linkless", + ], + [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], ], columns=( "HTTP", "FTP", - "None", + "Linkless", ), ) tm.assert_frame_equal(result, expected) - def test_extract_links_header(self): - # GH 13141: - # read_html argument to interpret hyperlinks as links (not merely text) + def test_extract_links_body(self, gh_13141_data): result = self.read_html( - """ - - - - - - - -
-            <table>
-              <tr>
-                <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
-              </tr>
-              <tr>
-                <td>Wikipedia</td>
-              </tr>
-            </table>
- """, - extract_links="header", + gh_13141_data, + extract_links="body", )[0] expected = DataFrame( - [["Wikipedia"]], - columns=(("Linkless", "https://en.wiktionary.org/wiki/linkless"),), + [ + [ + ("Wikipedia", "https://en.wikipedia.org/"), + ("Debian", "ftp://ftp.us.debian.org/"), + ("Linkless",), + ], + ["Footer", None, None], + ], + columns=( + "HTTP", + "FTP", + "Linkless", + ), ) tm.assert_frame_equal(result, expected) From 09356966b520a4e5a8b20cf56ff05df9c550fffb Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 25 Apr 2022 02:36:04 +0100 Subject: [PATCH 12/24] Fix for MultiIndex headers (also fixes tests) --- pandas/io/html.py | 17 ++-- pandas/tests/io/test_html.py | 151 ++++++++++------------------------- 2 files changed, 55 insertions(+), 113 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 4f13e806ec07e..7b1d259086bd9 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -31,6 +31,8 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame +from pandas.core.indexes.base import Index +from pandas.core.indexes.multi import MultiIndex from pandas.io.common import ( file_exists, @@ -490,7 +492,8 @@ def _expand_colspan_rowspan( Returns ------- list of list - Each returned row is a list of str text. + Each returned row is a list of str text, or tuple (text, link) + if extract_links is not None. Notes ----- @@ -522,10 +525,8 @@ def _expand_colspan_rowspan( # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) if self.extract_links == "all" or self.extract_links == section: - # All cells will be tuples except for the headers for - # consistency in selection (e.g. using .str indexing) href = self._href_getter(td) - text = (text, href) if href else (text,) + text = (text, href) if href else (text, None) rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) @@ -874,7 +875,13 @@ def _data_to_frame(**kwargs): # fill out elements of body that are "ragged" _expand_elements(body) with TextParser(body, header=header, **kwargs) as tp: - return tp.read() + df = tp.read() + + # Cast MultiIndex header to an Index of tuples. + # This maintains consistency of selection (e.g. 
df.columns.str[1]) + if isinstance(df.columns, MultiIndex): + df.columns = Index(df.columns) + return df _valid_parsers = { diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 27e1d2caee0e6..88d405435c947 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -135,6 +135,29 @@ def gh_13141_data(self): """ + @pytest.fixture + def gh_13141_expected(self): + return { + "head_ignore": ["HTTP", "FTP", "Linkless"], + "head_extract": [ + ("HTTP", np.nan), + ("FTP", np.nan), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ], + "body_ignore": ["Wikipedia", "Debian", "Linkless"], + "body_extract": [ + ("Wikipedia", "https://en.wikipedia.org/"), + ("Debian", "ftp://ftp.us.debian.org/"), + ("Linkless", None), + ], + "footer_ignore": ["Footer", None, None], + "footer_extract": [ + ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), + None, + None, + ], + } + @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor): self.read_html = partial(read_html, flavor=flavor) @@ -1309,118 +1332,30 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_links(self, gh_13141_data): - # GH 13141: - # read_html argument to interpret hyperlinks as links (not merely text) - result = self.read_html( - gh_13141_data, - extract_links="all", - )[0] - - expected = DataFrame( - [ - [ - ("Wikipedia", "https://en.wikipedia.org/"), - ("Debian", "ftp://ftp.us.debian.org/"), - ("Linkless",), - ], - [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], - ], - columns=( - ("HTTP", np.nan), - ("FTP", np.nan), - ("Linkless", "https://en.wiktionary.org/wiki/linkless"), - ), - ) - - tm.assert_frame_equal(result, expected) - - def test_extract_links_header(self, gh_13141_data): - result = self.read_html( - gh_13141_data, - extract_links="header", - )[0] - - expected = DataFrame( - [ - [ - "Wikipedia", - "Debian", - "Linkless", - ], - ["Footer", None, None], - ], - columns=( - ("HTTP", np.nan), - ("FTP", np.nan), - ("Linkless", "https://en.wiktionary.org/wiki/linkless"), - ), - ) - - tm.assert_frame_equal(result, expected) - - def test_extract_links_footer(self, gh_13141_data): - result = self.read_html( - gh_13141_data, - extract_links="footer", - )[0] - - expected = DataFrame( - [ - [ - "Wikipedia", - "Debian", - "Linkless", - ], - [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], - ], - columns=( - "HTTP", - "FTP", - "Linkless", - ), - ) - + @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) + def test_extract_links(self, gh_13141_data, gh_13141_expected, arg): + data_exp = gh_13141_expected["body_ignore"] + foot_exp = gh_13141_expected["footer_ignore"] + head_exp = gh_13141_expected["head_ignore"] + if arg == "all": + data_exp = gh_13141_expected["body_extract"] + foot_exp = gh_13141_expected["footer_extract"] + head_exp = gh_13141_expected["head_extract"] + elif arg == "body": + data_exp = gh_13141_expected["body_extract"] + elif arg == "footer": + foot_exp = gh_13141_expected["footer_extract"] + elif arg == "header": + head_exp = gh_13141_expected["head_extract"] + + result = self.read_html(gh_13141_data, extract_links=arg)[0] + expected = DataFrame([data_exp, foot_exp], columns=head_exp) tm.assert_frame_equal(result, expected) - def test_extract_links_body(self, gh_13141_data): - result = self.read_html( - gh_13141_data, - extract_links="body", - )[0] - - expected = DataFrame( - [ - [ - ("Wikipedia", 
"https://en.wikipedia.org/"), - ("Debian", "ftp://ftp.us.debian.org/"), - ("Linkless",), - ], - ["Footer", None, None], - ], - columns=( - "HTTP", - "FTP", - "Linkless", - ), - ) - - tm.assert_frame_equal(result, expected) - - def test_extract_links_bad(self): - html = """ - - - - - - - -
-          <table>
-            <tr>
-              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
-            </tr>
-            <tr>
-              <td>Wikipedia</td>
-            </tr>
-          </table>
- """ + def test_extract_links_bad(self, gh_13141_data): msg = ( "`extract_links` must be one of " '{None, "header", "footer", "body", "all"}, got "incorrect"' ) with pytest.raises(ValueError, match=msg): - read_html(html, extract_links="incorrect") + read_html(gh_13141_data, extract_links="incorrect") From afaad1accfc771e031c74c68d74e05f1ee8deec9 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 25 Apr 2022 02:44:11 +0100 Subject: [PATCH 13/24] Test that text surrounding
tag is still captured --- pandas/tests/io/test_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 88d405435c947..801fd8119eb7e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -124,7 +124,7 @@ def gh_13141_data(self): Wikipedia - Debian + SURROUNDING Debian TEXT Linkless @@ -144,10 +144,10 @@ def gh_13141_expected(self): ("FTP", np.nan), ("Linkless", "https://en.wiktionary.org/wiki/linkless"), ], - "body_ignore": ["Wikipedia", "Debian", "Linkless"], + "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], "body_extract": [ ("Wikipedia", "https://en.wikipedia.org/"), - ("Debian", "ftp://ftp.us.debian.org/"), + ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), ("Linkless", None), ], "footer_ignore": ["Footer", None, None], From 20e24e9ba44b63cb3603885ab22ef42786556496 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 25 Apr 2022 02:51:33 +0100 Subject: [PATCH 14/24] Test for multiple tags in cell --- pandas/tests/io/test_html.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 801fd8119eb7e..676b057aa2093 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -130,6 +130,9 @@ def gh_13141_data(self): Footer + + Multiple links: Only first captured. + @@ -150,10 +153,14 @@ def gh_13141_expected(self): ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), ("Linkless", None), ], - "footer_ignore": ["Footer", None, None], + "footer_ignore": [ + "Footer", + "Multiple links: Only first captured.", + None, + ], "footer_extract": [ ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), - None, + ("Multiple links: Only first captured.", "1"), None, ], } From ffdcf8adc98a5968af0d339bc48b14c38f95fd39 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 15 May 2022 19:40:21 +0100 Subject: [PATCH 15/24] Fix all tests, with both MultiIndex -> Index and np.nan -> None conversions resolved --- pandas/io/html.py | 34 +++++++++++++++++++--------------- pandas/tests/io/test_html.py | 4 ++-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 7b1d259086bd9..3d4fefcecf97e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -16,6 +16,8 @@ cast, ) +import numpy as np + from pandas._typing import ( FilePath, ReadBuffer, @@ -32,7 +34,6 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index -from pandas.core.indexes.multi import MultiIndex from pandas.io.common import ( file_exists, @@ -185,8 +186,7 @@ class _HtmlFrameParser: extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their - href extracted. Note that specifying "header" will result in a - :class:`~pandas.MultiIndex`. + href extracted. .. versionadded:: 1.5.0 @@ -210,8 +210,7 @@ class _HtmlFrameParser: extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their - href extracted. Note that specifying "header" will result in a - :class:`~pandas.MultiIndex`. + href extracted. .. 
versionadded:: 1.5.0 @@ -875,13 +874,7 @@ def _data_to_frame(**kwargs): # fill out elements of body that are "ragged" _expand_elements(body) with TextParser(body, header=header, **kwargs) as tp: - df = tp.read() - - # Cast MultiIndex header to an Index of tuples. - # This maintains consistency of selection (e.g. df.columns.str[1]) - if isinstance(df.columns, MultiIndex): - df.columns = Index(df.columns) - return df + return tp.read() _valid_parsers = { @@ -1001,7 +994,19 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, ** ret = [] for table in tables: try: - ret.append(_data_to_frame(data=table, **kwargs)) + df = _data_to_frame(data=table, **kwargs) + # Cast MultiIndex header to an Index of tuples when extracting header + # links and replace np.nan with None. + # This maintains consistency of selection (e.g. df.columns.str[1]) + if extract_links in ("all", "header"): + idx = df.columns.values + idx[:] = np.vectorize( + lambda cols: tuple(None if col is np.nan else col for col in cols), + otypes=["object"], + )(idx) + df.columns = Index(df.columns) + + ret.append(df) except EmptyDataError: # empty table continue return ret @@ -1121,8 +1126,7 @@ def read_html( extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their - href extracted. Note that specifying "header" will result in a - :class:`~pandas.MultiIndex`. + href extracted. .. versionadded:: 1.5.0 diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 676b057aa2093..3c42cacc1fdcd 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -143,8 +143,8 @@ def gh_13141_expected(self): return { "head_ignore": ["HTTP", "FTP", "Linkless"], "head_extract": [ - ("HTTP", np.nan), - ("FTP", np.nan), + ("HTTP", None), + ("FTP", None), ("Linkless", "https://en.wiktionary.org/wiki/linkless"), ], "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], From 490005af42e67f959d8296b16322651e35b939d4 Mon Sep 17 00:00:00 2001 From: abmyii <52673001+abmyii@users.noreply.github.com> Date: Sat, 18 Jun 2022 11:01:32 +0100 Subject: [PATCH 16/24] Add back EOF newline to test_html.py --- pandas/tests/io/test_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index ac546b5447aec..6f038413d3f07 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1419,4 +1419,4 @@ def test_extract_links_bad(self, gh_13141_data): '{None, "header", "footer", "body", "all"}, got "incorrect"' ) with pytest.raises(ValueError, match=msg): - read_html(gh_13141_data, extract_links="incorrect") \ No newline at end of file + read_html(gh_13141_data, extract_links="incorrect") From a5ff5c16768cb6923ba52a365a02deb53f6169e1 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 18 Jun 2022 18:06:24 +0100 Subject: [PATCH 17/24] Correct user guide example --- doc/source/user_guide/io.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7cf43de99ba8a..4896e8db323ef 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2723,7 +2723,7 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) -Links can be extracted from cells along with the text using ``extract_links=True``. 
+Links can be extracted from cells along with the text using ``extract_links="all"``. .. ipython:: python @@ -2740,11 +2740,11 @@ Links can be extracted from cells along with the text using ``extract_links=True df = pd.read_html( html_table, - extract_links=True + extract_links="all" )[0] df - df["GitHub"] - df["GitHub"].str[1] + df[("GitHub", None)] + df[("GitHub", None)].str[1] .. versionadded:: 1.5.0 From 58fdb0c2b2214448bbd726910831f82fd2887b7e Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Fri, 29 Jul 2022 23:34:57 +0200 Subject: [PATCH 18/24] Update pandas/io/html.py --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index ec2d6986b6ecc..eaf2575fb08aa 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -189,7 +189,7 @@ class _HtmlFrameParser: Table elements in the specified section(s) with tags will have their href extracted. - .. versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Attributes ---------- From c34d8ff9be12bfe58a49fdfaffdf6caeb33a5e8f Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Fri, 29 Jul 2022 23:35:04 +0200 Subject: [PATCH 19/24] Update pandas/io/html.py --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index eaf2575fb08aa..8d03a783c1324 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -213,7 +213,7 @@ class _HtmlFrameParser: Table elements in the specified section(s) with tags will have their href extracted. - .. versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Notes ----- From 7389b84e0bd75bb1726144e18d8c5a8f8494c7ad Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Fri, 29 Jul 2022 23:35:10 +0200 Subject: [PATCH 20/24] Update pandas/io/html.py --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 8d03a783c1324..edf8e203bca45 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1139,7 +1139,7 @@ def read_html( Table elements in the specified section(s) with tags will have their href extracted. - .. versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Returns ------- From ba7caab64c4c388b8f1349e872e1aca1200c9696 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 30 Jul 2022 11:59:47 +0100 Subject: [PATCH 21/24] Simplify MultiIndex -> Index conversion --- pandas/io/html.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index edf8e203bca45..6f71fdb5d3a91 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -17,8 +17,6 @@ cast, ) -import numpy as np - from pandas._typing import ( FilePath, ReadBuffer, @@ -32,6 +30,7 @@ from pandas.core.dtypes.common import is_list_like +from pandas import isna from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index @@ -1007,15 +1006,13 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, ** try: df = _data_to_frame(data=table, **kwargs) # Cast MultiIndex header to an Index of tuples when extracting header - # links and replace np.nan with None. + # links and replace nan with None. # This maintains consistency of selection (e.g. 
df.columns.str[1]) if extract_links in ("all", "header"): - idx = df.columns.values - idx[:] = np.vectorize( - lambda cols: tuple(None if col is np.nan else col for col in cols), - otypes=["object"], - )(idx) - df.columns = Index(df.columns) + df.columns = Index( + ((col[0], None if isna(col[1]) else col[1]) for col in df.columns), + tupleize_cols=False, + ) ret.append(df) except EmptyDataError: # empty table From 4c7f5321e0dc9526f3b2680c13eed2c38f9c3404 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 30 Jul 2022 12:01:42 +0100 Subject: [PATCH 22/24] Move unnecessary fixtures into test body --- pandas/tests/io/test_html.py | 106 +++++++++++++++++------------------ 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6f038413d3f07..045c22f106105 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -111,58 +111,6 @@ def spam_data(self, datapath): def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") - @pytest.fixture - def gh_13141_data(self): - return """ - - - - - - - - - - - - - - - - - -
-          <table>
-            <tr>
-              <th>HTTP</th>
-              <th>FTP</th>
-              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
-            </tr>
-            <tr>
-              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
-              <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
-              <td>Linkless</td>
-            </tr>
-            <tfoot>
-              <tr>
-                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
-                <td>Multiple <a href="1">links:</a> <a href="2">Only first captured.</a></td>
-              </tr>
-            </tfoot>
-          </table>
- """ - - @pytest.fixture - def gh_13141_expected(self): - return { - "head_ignore": ["HTTP", "FTP", "Linkless"], - "head_extract": [ - ("HTTP", None), - ("FTP", None), - ("Linkless", "https://en.wiktionary.org/wiki/linkless"), - ], - "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], - "body_extract": [ - ("Wikipedia", "https://en.wikipedia.org/"), - ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), - ("Linkless", None), - ], - "footer_ignore": [ - "Footer", - "Multiple links: Only first captured.", - None, - ], - "footer_extract": [ - ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), - ("Multiple links: Only first captured.", "1"), - None, - ], - } - @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor): self.read_html = partial(read_html, flavor=flavor) @@ -1394,7 +1342,55 @@ def test_parse_br_as_space(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) - def test_extract_links(self, gh_13141_data, gh_13141_expected, arg): + def test_extract_links(self, arg): + gh_13141_data = """ + + + + + + + + + + + + + + + + + +
+          <table>
+            <tr>
+              <th>HTTP</th>
+              <th>FTP</th>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+              <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
+              <td>Linkless</td>
+            </tr>
+            <tfoot>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+                <td>Multiple <a href="1">links:</a> <a href="2">Only first captured.</a></td>
+              </tr>
+            </tfoot>
+          </table>
+ """ + + gh_13141_expected = { + "head_ignore": ["HTTP", "FTP", "Linkless"], + "head_extract": [ + ("HTTP", None), + ("FTP", None), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ], + "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], + "body_extract": [ + ("Wikipedia", "https://en.wikipedia.org/"), + ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), + ("Linkless", None), + ], + "footer_ignore": [ + "Footer", + "Multiple links: Only first captured.", + None, + ], + "footer_extract": [ + ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), + ("Multiple links: Only first captured.", "1"), + None, + ], + } + data_exp = gh_13141_expected["body_ignore"] foot_exp = gh_13141_expected["footer_ignore"] head_exp = gh_13141_expected["head_ignore"] @@ -1413,10 +1409,10 @@ def test_extract_links(self, gh_13141_data, gh_13141_expected, arg): expected = DataFrame([data_exp, foot_exp], columns=head_exp) tm.assert_frame_equal(result, expected) - def test_extract_links_bad(self, gh_13141_data): + def test_extract_links_bad(self, spam_data): msg = ( "`extract_links` must be one of " '{None, "header", "footer", "body", "all"}, got "incorrect"' ) with pytest.raises(ValueError, match=msg): - read_html(gh_13141_data, extract_links="incorrect") + read_html(spam_data, extract_links="incorrect") From 98a46e2f3b1f002aca42884da4c24533f48082a5 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 16 Aug 2022 01:51:02 +0100 Subject: [PATCH 23/24] Simplify statement --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6f71fdb5d3a91..2e9e84b4c877c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -525,7 +525,7 @@ def _expand_colspan_rowspan( text = _remove_whitespace(self._text_getter(td)) if self.extract_links == "all" or self.extract_links == section: href = self._href_getter(td) - text = (text, href) if href else (text, None) + text = (text, href) rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) From 614c6368635e72c8f6ad249e796d5abc922a530e Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 16 Aug 2022 02:45:22 +0100 Subject: [PATCH 24/24] Fix code checks --- pandas/io/html.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6d02862b3c99e..f890ad86519df 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -33,7 +33,6 @@ from pandas import isna from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.io.common import (