Skip to content

Commit 9f81aa6

Browse files
abmyii and attack68 authored
ENH: pd.read_html argument to extract hrefs along with text from cells (#45973)
* ENH: pd.read_html argument to extract hrefs along with text from cells * Fix typing error * Simplify tests * Fix still incorrect typing * Summarise whatsnew entry and move detailed explanation into user guide * More flexible link extraction * Suggested changes * extract_hrefs -> extract_links * Move versionadded to correct place and improve docstring for extract_links (@attack68) * Test for invalid extract_links value * Test all extract_link options * Fix for MultiIndex headers (also fixes tests) * Test that text surrounding <a> tag is still captured * Test for multiple <a> tags in cell * Fix all tests, with both MultiIndex -> Index and np.nan -> None conversions resolved * Add back EOF newline to test_html.py * Correct user guide example * Update pandas/io/html.py * Update pandas/io/html.py * Update pandas/io/html.py * Simplify MultiIndex -> Index conversion * Move unnecessary fixtures into test body * Simplify statement * Fix code checks Co-authored-by: JHM Darbyshire <[email protected]>
1 parent c7b470c commit 9f81aa6

File tree

4 files changed

+186
-9
lines changed

4 files changed

+186
-9
lines changed

doc/source/user_guide/io.rst

+24
Original file line numberDiff line numberDiff line change
@@ -2743,6 +2743,30 @@ succeeds, the function will return*.
27432743
27442744
dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"])
27452745
2746+
Links can be extracted from cells along with the text using ``extract_links="all"``.
2747+
2748+
.. ipython:: python
2749+
2750+
html_table = """
2751+
<table>
2752+
<tr>
2753+
<th>GitHub</th>
2754+
</tr>
2755+
<tr>
2756+
<td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
2757+
</tr>
2758+
</table>
2759+
"""
2760+
2761+
df = pd.read_html(
2762+
html_table,
2763+
extract_links="all"
2764+
)[0]
2765+
df
2766+
df[("GitHub", None)]
2767+
df[("GitHub", None)].str[1]
2768+
2769+
.. versionadded:: 1.5.0
27462770

27472771
.. _io.html:
27482772

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ Other enhancements
289289
- Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
290290
- Add support for :meth:`GroupBy.ohlc` for extension array dtypes (:issue:`37493`)
291291
- Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
292+
- :func:`pandas.read_html` now supports extracting links from table cells (:issue:`13141`)
292293
- :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
293294
- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
294295
- :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)

pandas/io/html.py

+85-9
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import (
1313
TYPE_CHECKING,
1414
Iterable,
15+
Literal,
1516
Pattern,
1617
Sequence,
1718
cast,
@@ -30,7 +31,9 @@
3031

3132
from pandas.core.dtypes.common import is_list_like
3233

34+
from pandas import isna
3335
from pandas.core.construction import create_series_with_explicit_dtype
36+
from pandas.core.indexes.base import Index
3437

3538
from pandas.io.common import (
3639
file_exists,
@@ -184,6 +187,12 @@ class _HtmlFrameParser:
184187
displayed_only : bool
185188
Whether or not items with "display:none" should be ignored
186189
190+
extract_links : {None, "all", "header", "body", "footer"}
191+
Table elements in the specified section(s) with <a> tags will have their
192+
href extracted.
193+
194+
.. versionadded:: 1.5.0
195+
187196
Attributes
188197
----------
189198
io : str or file-like
@@ -202,11 +211,18 @@ class _HtmlFrameParser:
202211
displayed_only : bool
203212
Whether or not items with "display:none" should be ignored
204213
214+
extract_links : {None, "all", "header", "body", "footer"}
215+
Table elements in the specified section(s) with <a> tags will have their
216+
href extracted.
217+
218+
.. versionadded:: 1.5.0
219+
205220
Notes
206221
-----
207222
To subclass this class effectively you must override the following methods:
208223
* :func:`_build_doc`
209224
* :func:`_attr_getter`
225+
* :func:`_href_getter`
210226
* :func:`_text_getter`
211227
* :func:`_parse_td`
212228
* :func:`_parse_thead_tr`
@@ -225,12 +241,14 @@ def __init__(
225241
attrs: dict[str, str] | None,
226242
encoding: str,
227243
displayed_only: bool,
244+
extract_links: Literal[None, "header", "footer", "body", "all"],
228245
) -> None:
229246
self.io = io
230247
self.match = match
231248
self.attrs = attrs
232249
self.encoding = encoding
233250
self.displayed_only = displayed_only
251+
self.extract_links = extract_links
234252

235253
def parse_tables(self):
236254
"""
@@ -263,6 +281,22 @@ def _attr_getter(self, obj, attr):
263281
# Both lxml and BeautifulSoup have the same implementation:
264282
return obj.get(attr)
265283

284+
def _href_getter(self, obj):
285+
"""
286+
Return a href if the DOM node contains a child <a> or None.
287+
288+
Parameters
289+
----------
290+
obj : node-like
291+
A DOM node.
292+
293+
Returns
294+
-------
295+
href : str or unicode
296+
The href from the <a> child of the DOM node.
297+
"""
298+
raise AbstractMethodError(self)
299+
266300
def _text_getter(self, obj):
267301
"""
268302
Return the text of an individual DOM node.
@@ -439,33 +473,40 @@ def row_is_all_th(row):
439473
while body_rows and row_is_all_th(body_rows[0]):
440474
header_rows.append(body_rows.pop(0))
441475

442-
header = self._expand_colspan_rowspan(header_rows)
443-
body = self._expand_colspan_rowspan(body_rows)
444-
footer = self._expand_colspan_rowspan(footer_rows)
476+
header = self._expand_colspan_rowspan(header_rows, section="header")
477+
body = self._expand_colspan_rowspan(body_rows, section="body")
478+
footer = self._expand_colspan_rowspan(footer_rows, section="footer")
445479

446480
return header, body, footer
447481

448-
def _expand_colspan_rowspan(self, rows):
482+
def _expand_colspan_rowspan(
483+
self, rows, section: Literal["header", "footer", "body"]
484+
):
449485
"""
450486
Given a list of <tr>s, return a list of text rows.
451487
452488
Parameters
453489
----------
454490
rows : list of node-like
455491
List of <tr>s
492+
section : the section that the rows belong to (header, body or footer).
456493
457494
Returns
458495
-------
459496
list of list
460-
Each returned row is a list of str text.
497+
Each returned row is a list of str text, or tuple (text, link)
498+
if extract_links is not None.
461499
462500
Notes
463501
-----
464502
Any cell with ``rowspan`` or ``colspan`` will have its contents copied
465503
to subsequent cells.
466504
"""
467505
all_texts = [] # list of rows, each a list of str
468-
remainder: list[tuple[int, str, int]] = [] # list of (index, text, nrows)
506+
text: str | tuple
507+
remainder: list[
508+
tuple[int, str | tuple, int]
509+
] = [] # list of (index, text, nrows)
469510

470511
for tr in rows:
471512
texts = [] # the output for this row
@@ -485,6 +526,9 @@ def _expand_colspan_rowspan(self, rows):
485526

486527
# Append the text from this <td>, colspan times
487528
text = _remove_whitespace(self._text_getter(td))
529+
if self.extract_links == "all" or self.extract_links == section:
530+
href = self._href_getter(td)
531+
text = (text, href)
488532
rowspan = int(self._attr_getter(td, "rowspan") or 1)
489533
colspan = int(self._attr_getter(td, "colspan") or 1)
490534

@@ -589,6 +633,10 @@ def _parse_tables(self, doc, match, attrs):
589633
raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
590634
return result
591635

636+
def _href_getter(self, obj) -> str | None:
637+
a = obj.find("a", href=True)
638+
return None if not a else a["href"]
639+
592640
def _text_getter(self, obj):
593641
return obj.text
594642

@@ -680,6 +728,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
680728
:class:`_HtmlFrameParser`.
681729
"""
682730

731+
def _href_getter(self, obj) -> str | None:
732+
href = obj.xpath(".//a/@href")
733+
return None if not href else href[0]
734+
683735
def _text_getter(self, obj):
684736
return obj.text_content()
685737

@@ -920,14 +972,14 @@ def _validate_flavor(flavor):
920972
return flavor
921973

922974

923-
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
975+
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
924976
flavor = _validate_flavor(flavor)
925977
compiled_match = re.compile(match) # you can pass a compiled regex here
926978

927979
retained = None
928980
for flav in flavor:
929981
parser = _parser_dispatch(flav)
930-
p = parser(io, compiled_match, attrs, encoding, displayed_only)
982+
p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
931983

932984
try:
933985
tables = p.parse_tables()
@@ -955,7 +1007,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
9551007
ret = []
9561008
for table in tables:
9571009
try:
958-
ret.append(_data_to_frame(data=table, **kwargs))
1010+
df = _data_to_frame(data=table, **kwargs)
1011+
# Cast MultiIndex header to an Index of tuples when extracting header
1012+
# links and replace nan with None.
1013+
# This maintains consistency of selection (e.g. df.columns.str[1])
1014+
if extract_links in ("all", "header"):
1015+
df.columns = Index(
1016+
((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
1017+
tupleize_cols=False,
1018+
)
1019+
1020+
ret.append(df)
9591021
except EmptyDataError: # empty table
9601022
continue
9611023
return ret
@@ -978,6 +1040,7 @@ def read_html(
9781040
na_values: Iterable[object] | None = None,
9791041
keep_default_na: bool = True,
9801042
displayed_only: bool = True,
1043+
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
9811044
) -> list[DataFrame]:
9821045
r"""
9831046
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1072,6 +1135,12 @@ def read_html(
10721135
displayed_only : bool, default True
10731136
Whether elements with "display: none" should be parsed.
10741137
1138+
extract_links : {None, "all", "header", "body", "footer"}
1139+
Table elements in the specified section(s) with <a> tags will have their
1140+
href extracted.
1141+
1142+
.. versionadded:: 1.5.0
1143+
10751144
Returns
10761145
-------
10771146
dfs
@@ -1120,6 +1189,12 @@ def read_html(
11201189
"cannot skip rows starting from the end of the "
11211190
"data (you passed a negative value)"
11221191
)
1192+
if extract_links not in [None, "header", "footer", "body", "all"]:
1193+
raise ValueError(
1194+
"`extract_links` must be one of "
1195+
'{None, "header", "footer", "body", "all"}, got '
1196+
f'"{extract_links}"'
1197+
)
11231198
validate_header_arg(header)
11241199

11251200
io = stringify_path(io)
@@ -1140,4 +1215,5 @@ def read_html(
11401215
na_values=na_values,
11411216
keep_default_na=keep_default_na,
11421217
displayed_only=displayed_only,
1218+
extract_links=extract_links,
11431219
)

pandas/tests/io/test_html.py

+76
Original file line numberDiff line numberDiff line change
@@ -1340,3 +1340,79 @@ def test_parse_br_as_space(self):
13401340
expected = DataFrame(data=[["word1 word2"]], columns=["A"])
13411341

13421342
tm.assert_frame_equal(result, expected)
1343+
1344+
@pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
1345+
def test_extract_links(self, arg):
1346+
gh_13141_data = """
1347+
<table>
1348+
<tr>
1349+
<th>HTTP</th>
1350+
<th>FTP</th>
1351+
<th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
1352+
</tr>
1353+
<tr>
1354+
<td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
1355+
<td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
1356+
<td>Linkless</td>
1357+
</tr>
1358+
<tfoot>
1359+
<tr>
1360+
<td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
1361+
<td>
1362+
Multiple <a href="1">links:</a> <a href="2">Only first captured.</a>
1363+
</td>
1364+
</tr>
1365+
</tfoot>
1366+
</table>
1367+
"""
1368+
1369+
gh_13141_expected = {
1370+
"head_ignore": ["HTTP", "FTP", "Linkless"],
1371+
"head_extract": [
1372+
("HTTP", None),
1373+
("FTP", None),
1374+
("Linkless", "https://en.wiktionary.org/wiki/linkless"),
1375+
],
1376+
"body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"],
1377+
"body_extract": [
1378+
("Wikipedia", "https://en.wikipedia.org/"),
1379+
("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"),
1380+
("Linkless", None),
1381+
],
1382+
"footer_ignore": [
1383+
"Footer",
1384+
"Multiple links: Only first captured.",
1385+
None,
1386+
],
1387+
"footer_extract": [
1388+
("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
1389+
("Multiple links: Only first captured.", "1"),
1390+
None,
1391+
],
1392+
}
1393+
1394+
data_exp = gh_13141_expected["body_ignore"]
1395+
foot_exp = gh_13141_expected["footer_ignore"]
1396+
head_exp = gh_13141_expected["head_ignore"]
1397+
if arg == "all":
1398+
data_exp = gh_13141_expected["body_extract"]
1399+
foot_exp = gh_13141_expected["footer_extract"]
1400+
head_exp = gh_13141_expected["head_extract"]
1401+
elif arg == "body":
1402+
data_exp = gh_13141_expected["body_extract"]
1403+
elif arg == "footer":
1404+
foot_exp = gh_13141_expected["footer_extract"]
1405+
elif arg == "header":
1406+
head_exp = gh_13141_expected["head_extract"]
1407+
1408+
result = self.read_html(gh_13141_data, extract_links=arg)[0]
1409+
expected = DataFrame([data_exp, foot_exp], columns=head_exp)
1410+
tm.assert_frame_equal(result, expected)
1411+
1412+
def test_extract_links_bad(self, spam_data):
1413+
msg = (
1414+
"`extract_links` must be one of "
1415+
'{None, "header", "footer", "body", "all"}, got "incorrect"'
1416+
)
1417+
with pytest.raises(ValueError, match=msg):
1418+
read_html(spam_data, extract_links="incorrect")

0 commit comments

Comments (0)