diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 3ad2263cc4b97..a5fb90d20ed37 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -2743,6 +2743,30 @@ succeeds, the function will return*.
 
    dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"])
 
+Links can be extracted from cells along with the text using ``extract_links="all"``.
+
+.. ipython:: python
+
+    html_table = """
+    <table>
+      <tr>
+        <th>GitHub</th>
+      </tr>
+      <tr>
+        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
+      </tr>
+    </table>
+    """
+
+    df = pd.read_html(
+        html_table,
+        extract_links="all"
+    )[0]
+    df
+    df[("GitHub", None)]
+    df[("GitHub", None)].str[1]
+
+.. versionadded:: 1.5.0
 
 .. _io.html:
 
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index a1a2149da7cf6..58a1dbe73a9cf 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -289,6 +289,7 @@ Other enhancements
 - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
 - Add support for :meth:`GroupBy.ohlc` for extension array dtypes (:issue:`37493`)
 - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
+- :func:`pandas.read_html` now supports extracting links from table cells (:issue:`13141`)
 - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
 - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
 - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index dbbe61dcb8247..f890ad86519df 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -12,6 +12,7 @@
 from typing import (
     TYPE_CHECKING,
     Iterable,
+    Literal,
     Pattern,
     Sequence,
     cast,
@@ -30,7 +31,9 @@
 
 from pandas.core.dtypes.common import is_list_like
 
+from pandas import isna
 from pandas.core.construction import create_series_with_explicit_dtype
+from pandas.core.indexes.base import Index
 
 from pandas.io.common import (
     file_exists,
@@ -184,6 +187,12 @@ class _HtmlFrameParser:
     displayed_only : bool
         Whether or not items with "display:none" should be ignored
 
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
     Attributes
     ----------
     io : str or file-like
@@ -202,11 +211,18 @@ class _HtmlFrameParser:
     displayed_only : bool
         Whether or not items with "display:none" should be ignored
 
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
     Notes
     -----
     To subclass this class effectively you must override the following methods:
         * :func:`_build_doc`
         * :func:`_attr_getter`
+        * :func:`_href_getter`
         * :func:`_text_getter`
         * :func:`_parse_td`
         * :func:`_parse_thead_tr`
@@ -225,12 +241,14 @@ def __init__(
         attrs: dict[str, str] | None,
         encoding: str,
         displayed_only: bool,
+        extract_links: Literal[None, "header", "footer", "body", "all"],
     ) -> None:
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
         self.displayed_only = displayed_only
+        self.extract_links = extract_links
 
     def parse_tables(self):
         """
@@ -263,6 +281,22 @@ def _attr_getter(self, obj, attr):
         # Both lxml and BeautifulSoup have the same implementation:
         return obj.get(attr)
 
+    def _href_getter(self, obj):
+        """
+        Return a href if the DOM node contains a child <a> or None.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        href : str or unicode
+            The href from the <a> child of the DOM node.
+        """
+        raise AbstractMethodError(self)
+
     def _text_getter(self, obj):
         """
         Return the text of an individual DOM node.
@@ -439,13 +473,15 @@ def row_is_all_th(row):
         while body_rows and row_is_all_th(body_rows[0]):
             header_rows.append(body_rows.pop(0))
 
-        header = self._expand_colspan_rowspan(header_rows)
-        body = self._expand_colspan_rowspan(body_rows)
-        footer = self._expand_colspan_rowspan(footer_rows)
+        header = self._expand_colspan_rowspan(header_rows, section="header")
+        body = self._expand_colspan_rowspan(body_rows, section="body")
+        footer = self._expand_colspan_rowspan(footer_rows, section="footer")
 
         return header, body, footer
 
-    def _expand_colspan_rowspan(self, rows):
+    def _expand_colspan_rowspan(
+        self, rows, section: Literal["header", "footer", "body"]
+    ):
         """
         Given a list of <tr>s, return a list of text rows.
 
@@ -453,11 +489,13 @@ def _expand_colspan_rowspan(self, rows):
         ----------
         rows : list of node-like
             List of <tr>s
+        section : the section that the rows belong to (header, body or footer).
 
         Returns
         -------
         list of list
-            Each returned row is a list of str text.
+            Each returned row is a list of str text, or tuple (text, link)
+            if extract_links is not None.
 
         Notes
         -----
@@ -465,7 +503,10 @@
         to subsequent cells.
         """
         all_texts = []  # list of rows, each a list of str
-        remainder: list[tuple[int, str, int]] = []  # list of (index, text, nrows)
+        text: str | tuple
+        remainder: list[
+            tuple[int, str | tuple, int]
+        ] = []  # list of (index, text, nrows)
 
         for tr in rows:
             texts = []  # the output for this row
@@ -485,6 +526,9 @@
 
                 # Append the text from this <td>, colspan times
                 text = _remove_whitespace(self._text_getter(td))
+                if self.extract_links == "all" or self.extract_links == section:
+                    href = self._href_getter(td)
+                    text = (text, href)
                 rowspan = int(self._attr_getter(td, "rowspan") or 1)
                 colspan = int(self._attr_getter(td, "colspan") or 1)
 
@@ -589,6 +633,10 @@ def _parse_tables(self, doc, match, attrs):
             raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
         return result
 
+    def _href_getter(self, obj) -> str | None:
+        a = obj.find("a", href=True)
+        return None if not a else a["href"]
+
     def _text_getter(self, obj):
         return obj.text
 
@@ -680,6 +728,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
     :class:`_HtmlFrameParser`.
     """
 
+    def _href_getter(self, obj) -> str | None:
+        href = obj.xpath(".//a/@href")
+        return None if not href else href[0]
+
     def _text_getter(self, obj):
         return obj.text_content()
 
@@ -920,14 +972,14 @@ def _validate_flavor(flavor):
     return flavor
 
 
-def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
+def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
 
         try:
             tables = p.parse_tables()
 
@@ -955,7 +1007,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
     ret = []
     for table in tables:
         try:
-            ret.append(_data_to_frame(data=table, **kwargs))
+            df = _data_to_frame(data=table, **kwargs)
+            # Cast MultiIndex header to an Index of tuples when extracting header
+            # links and replace nan with None.
+            # This maintains consistency of selection (e.g. df.columns.str[1])
+            if extract_links in ("all", "header"):
+                df.columns = Index(
+                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
+                    tupleize_cols=False,
+                )
+
+            ret.append(df)
         except EmptyDataError:  # empty table
             continue
     return ret
@@ -978,6 +1040,7 @@ def read_html(
     na_values: Iterable[object] | None = None,
     keep_default_na: bool = True,
     displayed_only: bool = True,
+    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
 ) -> list[DataFrame]:
     r"""
     Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1072,6 +1135,12 @@ def read_html(
     displayed_only : bool, default True
         Whether elements with "display: none" should be parsed.
 
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+        .. versionadded:: 1.5.0
+
     Returns
     -------
     dfs
@@ -1120,6 +1189,12 @@ def read_html(
             "cannot skip rows starting from the end of the "
             "data (you passed a negative value)"
         )
+    if extract_links not in [None, "header", "footer", "body", "all"]:
+        raise ValueError(
+            "`extract_links` must be one of "
+            '{None, "header", "footer", "body", "all"}, got '
+            f'"{extract_links}"'
+        )
     validate_header_arg(header)
 
     io = stringify_path(io)
@@ -1140,4 +1215,5 @@ def read_html(
         na_values=na_values,
         keep_default_na=keep_default_na,
         displayed_only=displayed_only,
+        extract_links=extract_links,
     )
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 20bcfe202f68a..045c22f106105 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1340,3 +1340,79 @@ def test_parse_br_as_space(self):
         expected = DataFrame(data=[["word1 word2"]], columns=["A"])
 
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
+    def test_extract_links(self, arg):
+        gh_13141_data = """
+          <table>
+            <tr>
+              <th>HTTP</th>
+              <th>FTP</th>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+              <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
+              <td>Linkless</td>
+            </tr>
+            <tfoot>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+                <td>
+                  Multiple <a href="1">links:</a> <a href="2">Only first captured.</a>
+                </td>
+              </tr>
+            </tfoot>
+          </table>
+          """
+
+        gh_13141_expected = {
+            "head_ignore": ["HTTP", "FTP", "Linkless"],
+            "head_extract": [
+                ("HTTP", None),
+                ("FTP", None),
+                ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
+            ],
+            "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"],
+            "body_extract": [
+                ("Wikipedia", "https://en.wikipedia.org/"),
+                ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"),
+                ("Linkless", None),
+            ],
+            "footer_ignore": [
+                "Footer",
+                "Multiple links: Only first captured.",
+                None,
+            ],
+            "footer_extract": [
+                ("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
+                ("Multiple links: Only first captured.", "1"),
+                None,
+            ],
+        }
+
+        data_exp = gh_13141_expected["body_ignore"]
+        foot_exp = gh_13141_expected["footer_ignore"]
+        head_exp = gh_13141_expected["head_ignore"]
+        if arg == "all":
+            data_exp = gh_13141_expected["body_extract"]
+            foot_exp = gh_13141_expected["footer_extract"]
+            head_exp = gh_13141_expected["head_extract"]
+        elif arg == "body":
+            data_exp = gh_13141_expected["body_extract"]
+        elif arg == "footer":
+            foot_exp = gh_13141_expected["footer_extract"]
+        elif arg == "header":
+            head_exp = gh_13141_expected["head_extract"]
+
+        result = self.read_html(gh_13141_data, extract_links=arg)[0]
+        expected = DataFrame([data_exp, foot_exp], columns=head_exp)
+        tm.assert_frame_equal(result, expected)
+
+    def test_extract_links_bad(self, spam_data):
+        msg = (
+            "`extract_links` must be one of "
+            '{None, "header", "footer", "body", "all"}, got "incorrect"'
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_html(spam_data, extract_links="incorrect")
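
Usage sketch (not part of the patch): a minimal example of how the new ``extract_links`` option behaves, mirroring the ``io.rst`` example added above; the table contents and URL are illustrative only.

    import pandas as pd

    # A one-column table whose body cell wraps its text in an <a> tag.
    html_table = """
    <table>
      <tr>
        <th>GitHub</th>
      </tr>
      <tr>
        <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
      </tr>
    </table>
    """

    # With extract_links="all", every cell in the header, body, and footer
    # becomes a (text, href) tuple; cells without an <a> tag get href=None.
    # Header tuples become the column labels (as an Index of tuples).
    df = pd.read_html(html_table, extract_links="all")[0]

    df[("GitHub", None)]         # Series of (text, href) tuples
    df[("GitHub", None)].str[1]  # just the hrefs

Passing ``extract_links="header"``, ``"body"``, or ``"footer"`` limits the tuple conversion to that section, as exercised by the parametrized test above.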