From d69ce74944f96baf422ab16fc36581a4b6fc7327 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 13 Feb 2022 14:44:05 +0000 Subject: [PATCH 01/24] ENH: pd.read_html argument to extract hrefs along with text from cells --- doc/source/whatsnew/v1.5.0.rst | 26 ++++++++++++++++ pandas/io/html.py | 53 ++++++++++++++++++++++++++++++--- pandas/tests/io/test_html.py | 54 ++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a35ca589065d8..dd71907b1b3dd 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -109,6 +109,32 @@ apply converter methods, and parse dates (:issue:`43567`). df df.dtypes +read_html now supports ``extract_hrefs`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`pandas.read_html` can now extract hrefs from table cells (:issue:`13141`). + +.. ipython:: python + + html_table = """ + + + + + + + +
+    <table>
+      <tr>
+        <th>GitHub</th>
+      </tr>
+      <tr>
+        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
+      </tr>
+    </table>
+ """ + + df = pd.read_html( + html_table, + extract_hrefs=True + )[0] + df + df["GitHub"] + df["GitHub"].str[1] + .. _whatsnew_150.api_breaking.api_breaking2: api_breaking_change2 diff --git a/pandas/io/html.py b/pandas/io/html.py index 05d7c2998ef27..002a238808526 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -180,6 +180,9 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored + extract_hrefs : bool, default False + Whether table elements with tags should have the href extracted. + Attributes ---------- io : str or file-like @@ -198,11 +201,15 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored + extract_hrefs : bool, default False + Whether table elements with tags should have the href extracted. + Notes ----- To subclass this class effectively you must override the following methods: * :func:`_build_doc` * :func:`_attr_getter` + * :func:`_href_getter` * :func:`_text_getter` * :func:`_parse_td` * :func:`_parse_thead_tr` @@ -221,12 +228,14 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, + extract_hrefs: bool, ): self.io = io self.match = match self.attrs = attrs self.encoding = encoding self.displayed_only = displayed_only + self.extract_hrefs = extract_hrefs def parse_tables(self): """ @@ -259,6 +268,22 @@ def _attr_getter(self, obj, attr): # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) + def _href_getter(self, obj): + """ + Return a href if the DOM node contains a child or None. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + href : str or unicode + The href from the child of the DOM node. + """ + raise AbstractMethodError(self) + def _text_getter(self, obj): """ Return the text of an individual DOM node. @@ -435,13 +460,13 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows) + header = self._expand_colspan_rowspan(header_rows, header=True) body = self._expand_colspan_rowspan(body_rows) footer = self._expand_colspan_rowspan(footer_rows) return header, body, footer - def _expand_colspan_rowspan(self, rows): + def _expand_colspan_rowspan(self, rows, header=False): """ Given a list of s, return a list of text rows. @@ -449,6 +474,8 @@ def _expand_colspan_rowspan(self, rows): ---------- rows : list of node-like List of s + header : whether the current row is the header - don't capture links if so, + as this results in a MultiIndex which is undesirable. Returns ------- @@ -481,6 +508,11 @@ def _expand_colspan_rowspan(self, rows): # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) + if not header and self.extract_hrefs: + # All cells will be tuples except for the headers for + # consistency in selection (e.g. using .str indexing) + href = self._href_getter(td) + text = (text, href) if href else (text,) rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) @@ -585,6 +617,10 @@ def _parse_tables(self, doc, match, attrs): raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") return result + def _href_getter(self, obj): + a = obj.find("a", href=True) + return None if not a else a["href"] + def _text_getter(self, obj): return obj.text @@ -670,6 +706,10 @@ class _LxmlFrameParser(_HtmlFrameParser): :class:`_HtmlFrameParser`. 
""" + def _href_getter(self, obj): + href = obj.xpath(".//a/@href") + return None if not href else href[0] + def _text_getter(self, obj): return obj.text_content() @@ -906,14 +946,14 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): +def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding, displayed_only) + p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs) try: tables = p.parse_tables() @@ -964,6 +1004,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, + extract_hrefs: bool = False, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1058,6 +1099,9 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. + extract_hrefs : bool, default False + Whether table elements with tags should have the href extracted. + Returns ------- dfs @@ -1126,4 +1170,5 @@ def read_html( na_values=na_values, keep_default_na=keep_default_na, displayed_only=displayed_only, + extract_hrefs=extract_hrefs, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index eeebb9a638afb..cfa90237b52f4 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1286,3 +1286,57 @@ def test_parse_path_object(self, datapath): df1 = self.read_html(file_path_string)[0] df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) + + def test_extract_hrefs(self): + # GH 13141: + # read_html argument to interpret hyperlinks as links (not merely text) + result = self.read_html( + """ + + + + + + + + + + + + + + + + + + + +
+            <table>
+              <tr>
+                <th>Kingdom</th>
+                <th>Phylum</th>
+                <th>Class</th>
+                <th>Order</th>
+                <th>Family</th>
+                <th>Genus</th>
+                <th>Species</th>
+              </tr>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Animal">Animalia</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Chordate">Chordata</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Mammal">Mammalia</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Carnivora">Carnivora</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Bear">Ursidae</a></td>
+                <td><a href="https://en.wikipedia.org/wiki/Ailuropoda">Ailuropoda</a></td>
+                <td>A. melanoleuca</td>
+              </tr>
+            </table>
+ """, + extract_hrefs=True, + )[0] + + expected = DataFrame( + [ + [ + ("Animalia", "https://en.wikipedia.org/wiki/Animal"), + ("Chordata", "https://en.wikipedia.org/wiki/Chordate"), + ("Mammalia", "https://en.wikipedia.org/wiki/Mammal"), + ("Carnivora", "https://en.wikipedia.org/wiki/Carnivora"), + ("Ursidae", "https://en.wikipedia.org/wiki/Bear"), + ("Ailuropoda", "https://en.wikipedia.org/wiki/Ailuropoda"), + ("A. melanoleuca",), + ] + ], + columns=( + "Kingdom", + "Phylum", + "Class", + "Order", + "Family", + "Genus", + "Species", + ), + ) + + tm.assert_frame_equal(result, expected) From ac86888d2aaa33cb0897d94b659f6e77ae26f224 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 14 Feb 2022 15:40:59 +0000 Subject: [PATCH 02/24] Fix typing error --- pandas/io/html.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 002a238808526..6c4a51833b916 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -488,7 +488,9 @@ def _expand_colspan_rowspan(self, rows, header=False): to subsequent cells. """ all_texts = [] # list of rows, each a list of str - remainder: list[tuple[int, str, int]] = [] # list of (index, text, nrows) + remainder: list[ + tuple[int, str | tuple, int] + ] = [] # list of (index, text, nrows) for tr in rows: texts = [] # the output for this row From b33dc9ee8b54ff46059b1d9b71da0e20d444185e Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 15 Feb 2022 10:05:34 +0000 Subject: [PATCH 03/24] Simplify tests --- pandas/tests/io/test_html.py | 40 +++++++++++------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index cfa90237b52f4..7fd05d3f342cb 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1294,22 +1294,14 @@ def test_extract_hrefs(self): """ - - - - - - - + + + - - - - - - - + + +
-                <th>Kingdom</th>
-                <th>Phylum</th>
-                <th>Class</th>
-                <th>Order</th>
-                <th>Family</th>
-                <th>Genus</th>
-                <th>Species</th>
+                <th>HTTP</th>
+                <th>FTP</th>
+                <th>None</th>
               </tr>
               <tr>
-                <td><a href="https://en.wikipedia.org/wiki/Animal">Animalia</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Chordate">Chordata</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Mammal">Mammalia</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Carnivora">Carnivora</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Bear">Ursidae</a></td>
-                <td><a href="https://en.wikipedia.org/wiki/Ailuropoda">Ailuropoda</a></td>
-                <td>A. melanoleuca</td>
+                <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+                <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
+                <td>Linkless</td>
               </tr>
""", @@ -1319,23 +1311,15 @@ def test_extract_hrefs(self): expected = DataFrame( [ [ - ("Animalia", "https://en.wikipedia.org/wiki/Animal"), - ("Chordata", "https://en.wikipedia.org/wiki/Chordate"), - ("Mammalia", "https://en.wikipedia.org/wiki/Mammal"), - ("Carnivora", "https://en.wikipedia.org/wiki/Carnivora"), - ("Ursidae", "https://en.wikipedia.org/wiki/Bear"), - ("Ailuropoda", "https://en.wikipedia.org/wiki/Ailuropoda"), - ("A. melanoleuca",), + ("Wikipedia", "https://en.wikipedia.org/"), + ("Debian", "ftp://ftp.us.debian.org/"), + ("Linkless",), ] ], columns=( - "Kingdom", - "Phylum", - "Class", - "Order", - "Family", - "Genus", - "Species", + "HTTP", + "FTP", + "None", ), ) From a13c5f070d3f937c1947699eb86d83600aeb71b8 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 15 Feb 2022 14:22:37 +0000 Subject: [PATCH 04/24] Fix still incorrect typing --- pandas/io/html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6c4a51833b916..e8fccefa68546 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -488,6 +488,7 @@ def _expand_colspan_rowspan(self, rows, header=False): to subsequent cells. """ all_texts = [] # list of rows, each a list of str + text: str | tuple remainder: list[ tuple[int, str | tuple, int] ] = [] # list of (index, text, nrows) From 76ebe35e6a626cedfa295ddd53c5a2b8fe49a3ff Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Thu, 17 Feb 2022 13:55:22 +0000 Subject: [PATCH 05/24] Summarise whatsnew entry and move detailed explanation into user guide --- doc/source/user_guide/io.rst | 22 ++++++++++++++++++++++ doc/source/whatsnew/v1.5.0.rst | 28 ++-------------------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 34f10c1b3ec28..6bf25b9e9bb46 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2729,6 +2729,28 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) +Links can be extracted from cells along with the text using ``extract_hrefs=True``. + +.. ipython:: python + + html_table = """ + + + + + + + +
+    <table>
+      <tr>
+        <th>GitHub</th>
+      </tr>
+      <tr>
+        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
+      </tr>
+    </table>
+ """ + + df = pd.read_html( + html_table, + extract_hrefs=True + )[0] + df + df["GitHub"] + df["GitHub"].str[1] .. _io.html: diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index dd71907b1b3dd..013e6668482f2 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -39,6 +39,8 @@ Other enhancements - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) +- :func:`pandas.read_html` now supports extracting hrefs from table cells (:issue:`13141`). + - @@ -109,32 +111,6 @@ apply converter methods, and parse dates (:issue:`43567`). df df.dtypes -read_html now supports ``extract_hrefs`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`pandas.read_html` can now extract hrefs from table cells (:issue:`13141`). - -.. ipython:: python - - html_table = """ - - - - - - - -
-    <table>
-      <tr>
-        <th>GitHub</th>
-      </tr>
-      <tr>
-        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
-      </tr>
-    </table>
- """ - - df = pd.read_html( - html_table, - extract_hrefs=True - )[0] - df - df["GitHub"] - df["GitHub"].str[1] - .. _whatsnew_150.api_breaking.api_breaking2: api_breaking_change2 From cd352e7e1d7f99d9915ce6341f7eb931f91ee31a Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Wed, 23 Feb 2022 19:17:10 +0000 Subject: [PATCH 06/24] More flexible link extraction --- pandas/io/html.py | 32 +++++++++++++++++++------------- pandas/tests/io/test_html.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index e8fccefa68546..4b0819ff64db1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -10,6 +10,7 @@ import numbers import re from typing import ( + Literal, Pattern, Sequence, cast, @@ -180,8 +181,10 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - extract_hrefs : bool, default False - Whether table elements with
tags should have the href extracted. + extract_hrefs : all/header/body/footer or None + Table elements in the specified section(s) with tags will have their + href extracted. Note that specifying "header" will result in a + :class:`~pandas.MultiIndex`. Attributes ---------- @@ -228,7 +231,7 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, - extract_hrefs: bool, + extract_hrefs: Literal["all", "header", "body", "footer", None], ): self.io = io self.match = match @@ -460,13 +463,15 @@ def row_is_all_th(row): while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, header=True) - body = self._expand_colspan_rowspan(body_rows) - footer = self._expand_colspan_rowspan(footer_rows) + header = self._expand_colspan_rowspan(header_rows, section="header") + body = self._expand_colspan_rowspan(body_rows, section="body") + footer = self._expand_colspan_rowspan(footer_rows, section="footer") return header, body, footer - def _expand_colspan_rowspan(self, rows, header=False): + def _expand_colspan_rowspan( + self, rows, section: Literal["header", "body", "footer"] + ): """ Given a list of s, return a list of text rows. @@ -474,8 +479,7 @@ def _expand_colspan_rowspan(self, rows, header=False): ---------- rows : list of node-like List of s - header : whether the current row is the header - don't capture links if so, - as this results in a MultiIndex which is undesirable. + section : the section that the rows belong to (header, body or footer). Returns ------- @@ -511,7 +515,7 @@ def _expand_colspan_rowspan(self, rows, header=False): # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) - if not header and self.extract_hrefs: + if self.extract_hrefs == "all" or self.extract_hrefs == section: # All cells will be tuples except for the headers for # consistency in selection (e.g. using .str indexing) href = self._href_getter(td) @@ -1007,7 +1011,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, - extract_hrefs: bool = False, + extract_hrefs: Literal["all", "header", "body", "footer", None] = None, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1102,8 +1106,10 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_hrefs : bool, default False - Whether table elements with tags should have the href extracted. + extract_hrefs : all/header/body/footer or None, default None + Table elements in the specified section(s) with tags will have their + href extracted. Note that specifying "header" will result in a + :class:`~pandas.MultiIndex`. 
Returns ------- diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 7fd05d3f342cb..457c3f888efb7 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1287,7 +1287,7 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_hrefs(self): + def test_extract_hrefs_body(self): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( @@ -1305,7 +1305,7 @@ def test_extract_hrefs(self): """, - extract_hrefs=True, + extract_hrefs="body", )[0] expected = DataFrame( @@ -1324,3 +1324,27 @@ def test_extract_hrefs(self): ) tm.assert_frame_equal(result, expected) + + def test_extract_hrefs_header(self): + # GH 13141: + # read_html argument to interpret hyperlinks as links (not merely text) + result = self.read_html( + """ + + + + + + + +
+            <table>
+              <tr>
+                <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+              </tr>
+              <tr>
+                <td>Wikipedia</td>
+              </tr>
+            </table>
+ """, + extract_hrefs="header", + )[0] + + expected = DataFrame( + [["Wikipedia"]], + columns=(("Linkless", "https://en.wiktionary.org/wiki/linkless"),), + ) + + tm.assert_frame_equal(result, expected) From 1de13246106587fb17bc2306f910b6df7e6c4fd5 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 26 Feb 2022 19:08:43 +0000 Subject: [PATCH 07/24] Suggested changes --- doc/source/user_guide/io.rst | 2 ++ pandas/io/html.py | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6bf25b9e9bb46..0dba537025fae 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2729,6 +2729,8 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) +.. versionadded:: 1.5.0 + Links can be extracted from cells along with the text using ``extract_hrefs=True``. .. ipython:: python diff --git a/pandas/io/html.py b/pandas/io/html.py index 4b0819ff64db1..0fae7eae8c816 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -181,7 +181,9 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - extract_hrefs : all/header/body/footer or None + .. versionadded:: 1.5.0 + + extract_hrefs : "all"/"header"/"body"/"footer" or None Table elements in the specified section(s) with
tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. @@ -204,8 +206,12 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - extract_hrefs : bool, default False - Whether table elements with tags should have the href extracted. + .. versionadded:: 1.5.0 + + extract_hrefs : "all"/"header"/"body"/"footer" or None, default None + Table elements in the specified section(s) with tags will have their + href extracted. Note that specifying "header" will result in a + :class:`~pandas.MultiIndex`. Notes ----- @@ -624,7 +630,7 @@ def _parse_tables(self, doc, match, attrs): raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") return result - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: a = obj.find("a", href=True) return None if not a else a["href"] @@ -713,7 +719,7 @@ class _LxmlFrameParser(_HtmlFrameParser): :class:`_HtmlFrameParser`. """ - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: href = obj.xpath(".//a/@href") return None if not href else href[0] @@ -1106,7 +1112,9 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_hrefs : all/header/body/footer or None, default None + .. versionadded:: 1.5.0 + + extract_hrefs : "all"/"header"/"body"/"footer" or None, default None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. From 1190ea7e039dc7c4d4d59a38410481e4a9b2ef34 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 28 Feb 2022 17:37:23 +0000 Subject: [PATCH 08/24] extract_hrefs -> extract_links --- doc/source/user_guide/io.rst | 4 ++-- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/io/html.py | 20 ++++++++++---------- pandas/tests/io/test_html.py | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0dba537025fae..8306ed801eeed 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2731,7 +2731,7 @@ succeeds, the function will return*. .. versionadded:: 1.5.0 -Links can be extracted from cells along with the text using ``extract_hrefs=True``. +Links can be extracted from cells along with the text using ``extract_links=True``. .. ipython:: python @@ -2748,7 +2748,7 @@ Links can be extracted from cells along with the text using ``extract_hrefs=True df = pd.read_html( html_table, - extract_hrefs=True + extract_links=True )[0] df df["GitHub"] diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 013e6668482f2..88caf4d6e41aa 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -39,7 +39,7 @@ Other enhancements - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) - Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`) -- :func:`pandas.read_html` now supports extracting hrefs from table cells (:issue:`13141`). 
+- :func:`pandas.read_html` now supports extracting links from table cells (:issue:`13141`). - diff --git a/pandas/io/html.py b/pandas/io/html.py index 0fae7eae8c816..51fe673e04146 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -183,7 +183,7 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 - extract_hrefs : "all"/"header"/"body"/"footer" or None + extract_links : "all"/"header"/"body"/"footer" or None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. @@ -208,7 +208,7 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 - extract_hrefs : "all"/"header"/"body"/"footer" or None, default None + extract_links : "all"/"header"/"body"/"footer" or None, default None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. @@ -237,14 +237,14 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, - extract_hrefs: Literal["all", "header", "body", "footer", None], + extract_links: Literal["all", "header", "body", "footer", None], ): self.io = io self.match = match self.attrs = attrs self.encoding = encoding self.displayed_only = displayed_only - self.extract_hrefs = extract_hrefs + self.extract_links = extract_links def parse_tables(self): """ @@ -521,7 +521,7 @@ def _expand_colspan_rowspan( # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) - if self.extract_hrefs == "all" or self.extract_hrefs == section: + if self.extract_links == "all" or self.extract_links == section: # All cells will be tuples except for the headers for # consistency in selection (e.g. using .str indexing) href = self._href_getter(td) @@ -959,14 +959,14 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs): +def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs) + p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links) try: tables = p.parse_tables() @@ -1017,7 +1017,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, - extract_hrefs: Literal["all", "header", "body", "footer", None] = None, + extract_links: Literal["all", "header", "body", "footer", None] = None, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1114,7 +1114,7 @@ def read_html( .. versionadded:: 1.5.0 - extract_hrefs : "all"/"header"/"body"/"footer" or None, default None + extract_links : "all"/"header"/"body"/"footer" or None, default None Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. 
@@ -1187,5 +1187,5 @@ def read_html( na_values=na_values, keep_default_na=keep_default_na, displayed_only=displayed_only, - extract_hrefs=extract_hrefs, + extract_links=extract_links, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 457c3f888efb7..0f6febd720eb9 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1287,7 +1287,7 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_hrefs_body(self): + def test_extract_links_body(self): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( @@ -1305,7 +1305,7 @@ def test_extract_hrefs_body(self): """, - extract_hrefs="body", + extract_links="body", )[0] expected = DataFrame( @@ -1325,7 +1325,7 @@ def test_extract_hrefs_body(self): tm.assert_frame_equal(result, expected) - def test_extract_hrefs_header(self): + def test_extract_links_header(self): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( @@ -1339,7 +1339,7 @@ def test_extract_hrefs_header(self): """, - extract_hrefs="header", + extract_links="header", )[0] expected = DataFrame( From db8b6db4beeb1c81c15dee4885567c6ea60cd68d Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 20 Mar 2022 19:21:35 +0000 Subject: [PATCH 09/24] Move versionadded to correct place and improve docstring for extract_links (@attack68) --- doc/source/user_guide/io.rst | 4 ++-- pandas/io/html.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 8306ed801eeed..f372b4682d733 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2729,8 +2729,6 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) -.. versionadded:: 1.5.0 - Links can be extracted from cells along with the text using ``extract_links=True``. .. ipython:: python @@ -2754,6 +2752,8 @@ Links can be extracted from cells along with the text using ``extract_links=True df["GitHub"] df["GitHub"].str[1] +.. versionadded:: 1.5.0 + .. _io.html: Writing to HTML files diff --git a/pandas/io/html.py b/pandas/io/html.py index 51fe673e04146..6808554e220b6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -181,13 +181,13 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - .. versionadded:: 1.5.0 - - extract_links : "all"/"header"/"body"/"footer" or None + extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. + .. versionadded:: 1.5.0 + Attributes ---------- io : str or file-like @@ -206,13 +206,13 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - .. versionadded:: 1.5.0 - - extract_links : "all"/"header"/"body"/"footer" or None, default None + extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. + .. 
versionadded:: 1.5.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -1112,13 +1112,13 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - .. versionadded:: 1.5.0 - - extract_links : "all"/"header"/"body"/"footer" or None, default None + extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their href extracted. Note that specifying "header" will result in a :class:`~pandas.MultiIndex`. + .. versionadded:: 1.5.0 + Returns ------- dfs From 1c8c89162dc61641ac53cf945c70527a5e2f2be6 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 20 Mar 2022 21:29:41 +0000 Subject: [PATCH 10/24] Test for invalid extract_links value --- pandas/io/html.py | 12 +++++++++--- pandas/tests/io/test_html.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6808554e220b6..4f13e806ec07e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -237,7 +237,7 @@ def __init__( attrs: dict[str, str] | None, encoding: str, displayed_only: bool, - extract_links: Literal["all", "header", "body", "footer", None], + extract_links: Literal[None, "header", "footer", "body", "all"], ): self.io = io self.match = match @@ -476,7 +476,7 @@ def row_is_all_th(row): return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "body", "footer"] + self, rows, section: Literal["header", "footer", "body"] ): """ Given a list of s, return a list of text rows. @@ -1017,7 +1017,7 @@ def read_html( na_values=None, keep_default_na: bool = True, displayed_only: bool = True, - extract_links: Literal["all", "header", "body", "footer", None] = None, + extract_links: Literal[None, "header", "footer", "body", "all"] = None, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1167,6 +1167,12 @@ def read_html( "cannot skip rows starting from the end of the " "data (you passed a negative value)" ) + if extract_links not in [None, "header", "footer", "body", "all"]: + raise ValueError( + "`extract_links` must be one of " + '{None, "header", "footer", "body", "all"}, got ' + f'"{extract_links}"' + ) validate_header_arg(header) io = stringify_path(io) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 0f6febd720eb9..f2354b845b40f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1348,3 +1348,21 @@ def test_extract_links_header(self): ) tm.assert_frame_equal(result, expected) + + def test_extract_links_bad(self): + html = """ + + + + + + + +
+          <table>
+            <tr>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td>Wikipedia</td>
+            </tr>
+          </table>
+ """ + msg = ( + "`extract_links` must be one of " + '{None, "header", "footer", "body", "all"}, got "incorrect"' + ) + with pytest.raises(ValueError, match=msg): + read_html(html, extract_links="incorrect") From 1555fbd4642e4a1e310c4f8309de633fd4ba7ef4 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 2 Apr 2022 03:42:19 +0100 Subject: [PATCH 11/24] Test all extract_link options --- pandas/tests/io/test_html.py | 126 +++++++++++++++++++++++++---------- 1 file changed, 92 insertions(+), 34 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f2354b845b40f..27e1d2caee0e6 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -113,6 +113,28 @@ def spam_data(self, datapath): def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") + @pytest.fixture + def gh_13141_data(self): + return """ + + + + + + + + + + + + + + + + +
+          <table>
+            <tr>
+              <th>HTTP</th>
+              <th>FTP</th>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+              <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
+              <td>Linkless</td>
+            </tr>
+            <tfoot>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+              </tr>
+            </tfoot>
+          </table>
+ """ + @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor): self.read_html = partial(read_html, flavor=flavor) @@ -1287,25 +1309,12 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_links_body(self): + def test_extract_links(self, gh_13141_data): # GH 13141: # read_html argument to interpret hyperlinks as links (not merely text) result = self.read_html( - """ - - - - - - - - - - - -
-            <table>
-              <tr>
-                <th>HTTP</th>
-                <th>FTP</th>
-                <th>None</th>
-              </tr>
-              <tr>
-                <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
-                <td><a href="ftp://ftp.us.debian.org/">Debian</a></td>
-                <td>Linkless</td>
-              </tr>
-            </table>
- """, - extract_links="body", + gh_13141_data, + extract_links="all", )[0] expected = DataFrame( @@ -1314,37 +1323,86 @@ def test_extract_links_body(self): ("Wikipedia", "https://en.wikipedia.org/"), ("Debian", "ftp://ftp.us.debian.org/"), ("Linkless",), - ] + ], + [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], + ], + columns=( + ("HTTP", np.nan), + ("FTP", np.nan), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ), + ) + + tm.assert_frame_equal(result, expected) + + def test_extract_links_header(self, gh_13141_data): + result = self.read_html( + gh_13141_data, + extract_links="header", + )[0] + + expected = DataFrame( + [ + [ + "Wikipedia", + "Debian", + "Linkless", + ], + ["Footer", None, None], + ], + columns=( + ("HTTP", np.nan), + ("FTP", np.nan), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ), + ) + + tm.assert_frame_equal(result, expected) + + def test_extract_links_footer(self, gh_13141_data): + result = self.read_html( + gh_13141_data, + extract_links="footer", + )[0] + + expected = DataFrame( + [ + [ + "Wikipedia", + "Debian", + "Linkless", + ], + [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], ], columns=( "HTTP", "FTP", - "None", + "Linkless", ), ) tm.assert_frame_equal(result, expected) - def test_extract_links_header(self): - # GH 13141: - # read_html argument to interpret hyperlinks as links (not merely text) + def test_extract_links_body(self, gh_13141_data): result = self.read_html( - """ - - - - - - - -
-            <table>
-              <tr>
-                <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
-              </tr>
-              <tr>
-                <td>Wikipedia</td>
-              </tr>
-            </table>
- """, - extract_links="header", + gh_13141_data, + extract_links="body", )[0] expected = DataFrame( - [["Wikipedia"]], - columns=(("Linkless", "https://en.wiktionary.org/wiki/linkless"),), + [ + [ + ("Wikipedia", "https://en.wikipedia.org/"), + ("Debian", "ftp://ftp.us.debian.org/"), + ("Linkless",), + ], + ["Footer", None, None], + ], + columns=( + "HTTP", + "FTP", + "Linkless", + ), ) tm.assert_frame_equal(result, expected) From 09356966b520a4e5a8b20cf56ff05df9c550fffb Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 25 Apr 2022 02:36:04 +0100 Subject: [PATCH 12/24] Fix for MultiIndex headers (also fixes tests) --- pandas/io/html.py | 17 ++-- pandas/tests/io/test_html.py | 151 ++++++++++------------------------- 2 files changed, 55 insertions(+), 113 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 4f13e806ec07e..7b1d259086bd9 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -31,6 +31,8 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame +from pandas.core.indexes.base import Index +from pandas.core.indexes.multi import MultiIndex from pandas.io.common import ( file_exists, @@ -490,7 +492,8 @@ def _expand_colspan_rowspan( Returns ------- list of list - Each returned row is a list of str text. + Each returned row is a list of str text, or tuple (text, link) + if extract_links is not None. Notes ----- @@ -522,10 +525,8 @@ def _expand_colspan_rowspan( # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) if self.extract_links == "all" or self.extract_links == section: - # All cells will be tuples except for the headers for - # consistency in selection (e.g. using .str indexing) href = self._href_getter(td) - text = (text, href) if href else (text,) + text = (text, href) if href else (text, None) rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) @@ -874,7 +875,13 @@ def _data_to_frame(**kwargs): # fill out elements of body that are "ragged" _expand_elements(body) with TextParser(body, header=header, **kwargs) as tp: - return tp.read() + df = tp.read() + + # Cast MultiIndex header to an Index of tuples. + # This maintains consistency of selection (e.g. 
df.columns.str[1]) + if isinstance(df.columns, MultiIndex): + df.columns = Index(df.columns) + return df _valid_parsers = { diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 27e1d2caee0e6..88d405435c947 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -135,6 +135,29 @@ def gh_13141_data(self): """ + @pytest.fixture + def gh_13141_expected(self): + return { + "head_ignore": ["HTTP", "FTP", "Linkless"], + "head_extract": [ + ("HTTP", np.nan), + ("FTP", np.nan), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ], + "body_ignore": ["Wikipedia", "Debian", "Linkless"], + "body_extract": [ + ("Wikipedia", "https://en.wikipedia.org/"), + ("Debian", "ftp://ftp.us.debian.org/"), + ("Linkless", None), + ], + "footer_ignore": ["Footer", None, None], + "footer_extract": [ + ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), + None, + None, + ], + } + @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor): self.read_html = partial(read_html, flavor=flavor) @@ -1309,118 +1332,30 @@ def test_parse_path_object(self, datapath): df2 = self.read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_extract_links(self, gh_13141_data): - # GH 13141: - # read_html argument to interpret hyperlinks as links (not merely text) - result = self.read_html( - gh_13141_data, - extract_links="all", - )[0] - - expected = DataFrame( - [ - [ - ("Wikipedia", "https://en.wikipedia.org/"), - ("Debian", "ftp://ftp.us.debian.org/"), - ("Linkless",), - ], - [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], - ], - columns=( - ("HTTP", np.nan), - ("FTP", np.nan), - ("Linkless", "https://en.wiktionary.org/wiki/linkless"), - ), - ) - - tm.assert_frame_equal(result, expected) - - def test_extract_links_header(self, gh_13141_data): - result = self.read_html( - gh_13141_data, - extract_links="header", - )[0] - - expected = DataFrame( - [ - [ - "Wikipedia", - "Debian", - "Linkless", - ], - ["Footer", None, None], - ], - columns=( - ("HTTP", np.nan), - ("FTP", np.nan), - ("Linkless", "https://en.wiktionary.org/wiki/linkless"), - ), - ) - - tm.assert_frame_equal(result, expected) - - def test_extract_links_footer(self, gh_13141_data): - result = self.read_html( - gh_13141_data, - extract_links="footer", - )[0] - - expected = DataFrame( - [ - [ - "Wikipedia", - "Debian", - "Linkless", - ], - [("Footer", "https://en.wikipedia.org/wiki/Page_footer"), None, None], - ], - columns=( - "HTTP", - "FTP", - "Linkless", - ), - ) - + @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) + def test_extract_links(self, gh_13141_data, gh_13141_expected, arg): + data_exp = gh_13141_expected["body_ignore"] + foot_exp = gh_13141_expected["footer_ignore"] + head_exp = gh_13141_expected["head_ignore"] + if arg == "all": + data_exp = gh_13141_expected["body_extract"] + foot_exp = gh_13141_expected["footer_extract"] + head_exp = gh_13141_expected["head_extract"] + elif arg == "body": + data_exp = gh_13141_expected["body_extract"] + elif arg == "footer": + foot_exp = gh_13141_expected["footer_extract"] + elif arg == "header": + head_exp = gh_13141_expected["head_extract"] + + result = self.read_html(gh_13141_data, extract_links=arg)[0] + expected = DataFrame([data_exp, foot_exp], columns=head_exp) tm.assert_frame_equal(result, expected) - def test_extract_links_body(self, gh_13141_data): - result = self.read_html( - gh_13141_data, - extract_links="body", - )[0] - - expected = DataFrame( - [ - [ - ("Wikipedia", 
"https://en.wikipedia.org/"), - ("Debian", "ftp://ftp.us.debian.org/"), - ("Linkless",), - ], - ["Footer", None, None], - ], - columns=( - "HTTP", - "FTP", - "Linkless", - ), - ) - - tm.assert_frame_equal(result, expected) - - def test_extract_links_bad(self): - html = """ - - - - - - - -
-          <table>
-            <tr>
-              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
-            </tr>
-            <tr>
-              <td>Wikipedia</td>
-            </tr>
-          </table>
- """ + def test_extract_links_bad(self, gh_13141_data): msg = ( "`extract_links` must be one of " '{None, "header", "footer", "body", "all"}, got "incorrect"' ) with pytest.raises(ValueError, match=msg): - read_html(html, extract_links="incorrect") + read_html(gh_13141_data, extract_links="incorrect") From afaad1accfc771e031c74c68d74e05f1ee8deec9 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 25 Apr 2022 02:44:11 +0100 Subject: [PATCH 13/24] Test that text surrounding
tag is still captured --- pandas/tests/io/test_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 88d405435c947..801fd8119eb7e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -124,7 +124,7 @@ def gh_13141_data(self): Wikipedia - Debian + SURROUNDING Debian TEXT Linkless @@ -144,10 +144,10 @@ def gh_13141_expected(self): ("FTP", np.nan), ("Linkless", "https://en.wiktionary.org/wiki/linkless"), ], - "body_ignore": ["Wikipedia", "Debian", "Linkless"], + "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], "body_extract": [ ("Wikipedia", "https://en.wikipedia.org/"), - ("Debian", "ftp://ftp.us.debian.org/"), + ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), ("Linkless", None), ], "footer_ignore": ["Footer", None, None], From 20e24e9ba44b63cb3603885ab22ef42786556496 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Mon, 25 Apr 2022 02:51:33 +0100 Subject: [PATCH 14/24] Test for multiple tags in cell --- pandas/tests/io/test_html.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 801fd8119eb7e..676b057aa2093 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -130,6 +130,9 @@ def gh_13141_data(self): Footer + + Multiple links: Only first captured. + @@ -150,10 +153,14 @@ def gh_13141_expected(self): ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), ("Linkless", None), ], - "footer_ignore": ["Footer", None, None], + "footer_ignore": [ + "Footer", + "Multiple links: Only first captured.", + None, + ], "footer_extract": [ ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), - None, + ("Multiple links: Only first captured.", "1"), None, ], } From ffdcf8adc98a5968af0d339bc48b14c38f95fd39 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sun, 15 May 2022 19:40:21 +0100 Subject: [PATCH 15/24] Fix all tests, with both MultiIndex -> Index and np.nan -> None conversions resolved --- pandas/io/html.py | 34 +++++++++++++++++++--------------- pandas/tests/io/test_html.py | 4 ++-- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 7b1d259086bd9..3d4fefcecf97e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -16,6 +16,8 @@ cast, ) +import numpy as np + from pandas._typing import ( FilePath, ReadBuffer, @@ -32,7 +34,6 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index -from pandas.core.indexes.multi import MultiIndex from pandas.io.common import ( file_exists, @@ -185,8 +186,7 @@ class _HtmlFrameParser: extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their - href extracted. Note that specifying "header" will result in a - :class:`~pandas.MultiIndex`. + href extracted. .. versionadded:: 1.5.0 @@ -210,8 +210,7 @@ class _HtmlFrameParser: extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their - href extracted. Note that specifying "header" will result in a - :class:`~pandas.MultiIndex`. + href extracted. .. 
versionadded:: 1.5.0 @@ -875,13 +874,7 @@ def _data_to_frame(**kwargs): # fill out elements of body that are "ragged" _expand_elements(body) with TextParser(body, header=header, **kwargs) as tp: - df = tp.read() - - # Cast MultiIndex header to an Index of tuples. - # This maintains consistency of selection (e.g. df.columns.str[1]) - if isinstance(df.columns, MultiIndex): - df.columns = Index(df.columns) - return df + return tp.read() _valid_parsers = { @@ -1001,7 +994,19 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, ** ret = [] for table in tables: try: - ret.append(_data_to_frame(data=table, **kwargs)) + df = _data_to_frame(data=table, **kwargs) + # Cast MultiIndex header to an Index of tuples when extracting header + # links and replace np.nan with None. + # This maintains consistency of selection (e.g. df.columns.str[1]) + if extract_links in ("all", "header"): + idx = df.columns.values + idx[:] = np.vectorize( + lambda cols: tuple(None if col is np.nan else col for col in cols), + otypes=["object"], + )(idx) + df.columns = Index(df.columns) + + ret.append(df) except EmptyDataError: # empty table continue return ret @@ -1121,8 +1126,7 @@ def read_html( extract_links : {None, "all", "header", "body", "footer"} Table elements in the specified section(s) with tags will have their - href extracted. Note that specifying "header" will result in a - :class:`~pandas.MultiIndex`. + href extracted. .. versionadded:: 1.5.0 diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 676b057aa2093..3c42cacc1fdcd 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -143,8 +143,8 @@ def gh_13141_expected(self): return { "head_ignore": ["HTTP", "FTP", "Linkless"], "head_extract": [ - ("HTTP", np.nan), - ("FTP", np.nan), + ("HTTP", None), + ("FTP", None), ("Linkless", "https://en.wiktionary.org/wiki/linkless"), ], "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], From 490005af42e67f959d8296b16322651e35b939d4 Mon Sep 17 00:00:00 2001 From: abmyii <52673001+abmyii@users.noreply.github.com> Date: Sat, 18 Jun 2022 11:01:32 +0100 Subject: [PATCH 16/24] Add back EOF newline to test_html.py --- pandas/tests/io/test_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index ac546b5447aec..6f038413d3f07 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1419,4 +1419,4 @@ def test_extract_links_bad(self, gh_13141_data): '{None, "header", "footer", "body", "all"}, got "incorrect"' ) with pytest.raises(ValueError, match=msg): - read_html(gh_13141_data, extract_links="incorrect") \ No newline at end of file + read_html(gh_13141_data, extract_links="incorrect") From a5ff5c16768cb6923ba52a365a02deb53f6169e1 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 18 Jun 2022 18:06:24 +0100 Subject: [PATCH 17/24] Correct user guide example --- doc/source/user_guide/io.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7cf43de99ba8a..4896e8db323ef 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2723,7 +2723,7 @@ succeeds, the function will return*. dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) -Links can be extracted from cells along with the text using ``extract_links=True``. 
+Links can be extracted from cells along with the text using ``extract_links="all"``. .. ipython:: python @@ -2740,11 +2740,11 @@ Links can be extracted from cells along with the text using ``extract_links=True df = pd.read_html( html_table, - extract_links=True + extract_links="all" )[0] df - df["GitHub"] - df["GitHub"].str[1] + df[("GitHub", None)] + df[("GitHub", None)].str[1] .. versionadded:: 1.5.0 From 58fdb0c2b2214448bbd726910831f82fd2887b7e Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Fri, 29 Jul 2022 23:34:57 +0200 Subject: [PATCH 18/24] Update pandas/io/html.py --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index ec2d6986b6ecc..eaf2575fb08aa 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -189,7 +189,7 @@ class _HtmlFrameParser: Table elements in the specified section(s) with tags will have their href extracted. - .. versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Attributes ---------- From c34d8ff9be12bfe58a49fdfaffdf6caeb33a5e8f Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Fri, 29 Jul 2022 23:35:04 +0200 Subject: [PATCH 19/24] Update pandas/io/html.py --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index eaf2575fb08aa..8d03a783c1324 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -213,7 +213,7 @@ class _HtmlFrameParser: Table elements in the specified section(s) with tags will have their href extracted. - .. versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Notes ----- From 7389b84e0bd75bb1726144e18d8c5a8f8494c7ad Mon Sep 17 00:00:00 2001 From: JHM Darbyshire <24256554+attack68@users.noreply.github.com> Date: Fri, 29 Jul 2022 23:35:10 +0200 Subject: [PATCH 20/24] Update pandas/io/html.py --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 8d03a783c1324..edf8e203bca45 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1139,7 +1139,7 @@ def read_html( Table elements in the specified section(s) with tags will have their href extracted. - .. versionadded:: 1.5.0 + .. versionadded:: 1.5.0 Returns ------- From ba7caab64c4c388b8f1349e872e1aca1200c9696 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 30 Jul 2022 11:59:47 +0100 Subject: [PATCH 21/24] Simplify MultiIndex -> Index conversion --- pandas/io/html.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index edf8e203bca45..6f71fdb5d3a91 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -17,8 +17,6 @@ cast, ) -import numpy as np - from pandas._typing import ( FilePath, ReadBuffer, @@ -32,6 +30,7 @@ from pandas.core.dtypes.common import is_list_like +from pandas import isna from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index @@ -1007,15 +1006,13 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, ** try: df = _data_to_frame(data=table, **kwargs) # Cast MultiIndex header to an Index of tuples when extracting header - # links and replace np.nan with None. + # links and replace nan with None. # This maintains consistency of selection (e.g. 
df.columns.str[1]) if extract_links in ("all", "header"): - idx = df.columns.values - idx[:] = np.vectorize( - lambda cols: tuple(None if col is np.nan else col for col in cols), - otypes=["object"], - )(idx) - df.columns = Index(df.columns) + df.columns = Index( + ((col[0], None if isna(col[1]) else col[1]) for col in df.columns), + tupleize_cols=False, + ) ret.append(df) except EmptyDataError: # empty table From 4c7f5321e0dc9526f3b2680c13eed2c38f9c3404 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Sat, 30 Jul 2022 12:01:42 +0100 Subject: [PATCH 22/24] Move unnecessary fixtures into test body --- pandas/tests/io/test_html.py | 106 +++++++++++++++++------------------ 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6f038413d3f07..045c22f106105 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -111,58 +111,6 @@ def spam_data(self, datapath): def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") - @pytest.fixture - def gh_13141_data(self): - return """ - - - - - - - - - - - - - - - - - -
-          <table>
-            <tr>
-              <th>HTTP</th>
-              <th>FTP</th>
-              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
-            </tr>
-            <tr>
-              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
-              <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
-              <td>Linkless</td>
-            </tr>
-            <tfoot>
-              <tr>
-                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
-                <td>Multiple <a href="1">links:</a> <a href="2">Only first captured.</a></td>
-              </tr>
-            </tfoot>
-          </table>
- """ - - @pytest.fixture - def gh_13141_expected(self): - return { - "head_ignore": ["HTTP", "FTP", "Linkless"], - "head_extract": [ - ("HTTP", None), - ("FTP", None), - ("Linkless", "https://en.wiktionary.org/wiki/linkless"), - ], - "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], - "body_extract": [ - ("Wikipedia", "https://en.wikipedia.org/"), - ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), - ("Linkless", None), - ], - "footer_ignore": [ - "Footer", - "Multiple links: Only first captured.", - None, - ], - "footer_extract": [ - ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), - ("Multiple links: Only first captured.", "1"), - None, - ], - } - @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor): self.read_html = partial(read_html, flavor=flavor) @@ -1394,7 +1342,55 @@ def test_parse_br_as_space(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) - def test_extract_links(self, gh_13141_data, gh_13141_expected, arg): + def test_extract_links(self, arg): + gh_13141_data = """ + + + + + + + + + + + + + + + + + +
+          <table>
+            <tr>
+              <th>HTTP</th>
+              <th>FTP</th>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+              <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
+              <td>Linkless</td>
+            </tr>
+            <tfoot>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+                <td>Multiple <a href="1">links:</a> <a href="2">Only first captured.</a></td>
+              </tr>
+            </tfoot>
+          </table>
+ """ + + gh_13141_expected = { + "head_ignore": ["HTTP", "FTP", "Linkless"], + "head_extract": [ + ("HTTP", None), + ("FTP", None), + ("Linkless", "https://en.wiktionary.org/wiki/linkless"), + ], + "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"], + "body_extract": [ + ("Wikipedia", "https://en.wikipedia.org/"), + ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"), + ("Linkless", None), + ], + "footer_ignore": [ + "Footer", + "Multiple links: Only first captured.", + None, + ], + "footer_extract": [ + ("Footer", "https://en.wikipedia.org/wiki/Page_footer"), + ("Multiple links: Only first captured.", "1"), + None, + ], + } + data_exp = gh_13141_expected["body_ignore"] foot_exp = gh_13141_expected["footer_ignore"] head_exp = gh_13141_expected["head_ignore"] @@ -1413,10 +1409,10 @@ def test_extract_links(self, gh_13141_data, gh_13141_expected, arg): expected = DataFrame([data_exp, foot_exp], columns=head_exp) tm.assert_frame_equal(result, expected) - def test_extract_links_bad(self, gh_13141_data): + def test_extract_links_bad(self, spam_data): msg = ( "`extract_links` must be one of " '{None, "header", "footer", "body", "all"}, got "incorrect"' ) with pytest.raises(ValueError, match=msg): - read_html(gh_13141_data, extract_links="incorrect") + read_html(spam_data, extract_links="incorrect") From 98a46e2f3b1f002aca42884da4c24533f48082a5 Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 16 Aug 2022 01:51:02 +0100 Subject: [PATCH 23/24] Simplify statement --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6f71fdb5d3a91..2e9e84b4c877c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -525,7 +525,7 @@ def _expand_colspan_rowspan( text = _remove_whitespace(self._text_getter(td)) if self.extract_links == "all" or self.extract_links == section: href = self._href_getter(td) - text = (text, href) if href else (text, None) + text = (text, href) rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) From 614c6368635e72c8f6ad249e796d5abc922a530e Mon Sep 17 00:00:00 2001 From: Abdurrahmaan Iqbal Date: Tue, 16 Aug 2022 02:45:22 +0100 Subject: [PATCH 24/24] Fix code checks --- pandas/io/html.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6d02862b3c99e..f890ad86519df 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -33,7 +33,6 @@ from pandas import isna from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.io.common import (