Skip to content

Commit d69ce74

Browse files
committed
ENH: pd.read_html argument to extract hrefs along with text from cells
1 parent 5bf346c commit d69ce74

File tree

3 files changed

+129
-4
lines changed

3 files changed

+129
-4
lines changed

doc/source/whatsnew/v1.5.0.rst

+26
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,32 @@ apply converter methods, and parse dates (:issue:`43567`).
109109
df
110110
df.dtypes
111111
112+
read_html now supports ``extract_hrefs``
113+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114+
115+
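:func:`pandas.read_html` can now extract hrefs from table cells through the new ``extract_hrefs`` argument. When it is ``True``, each non-header cell is returned as a tuple ``(text, href)``, or ``(text,)`` if the cell contains no link (:issue:`13141`).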
116+
117+
.. ipython:: python
118+
119+
html_table = """
120+
<table>
121+
<tr>
122+
<th>GitHub</th>
123+
</tr>
124+
<tr>
125+
<td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
126+
</tr>
127+
</table>
128+
"""
129+
130+
df = pd.read_html(
131+
html_table,
132+
extract_hrefs=True
133+
)[0]
134+
df
135+
df["GitHub"]
136+
df["GitHub"].str[1]
137+
112138
.. _whatsnew_150.api_breaking.api_breaking2:
113139

114140
api_breaking_change2

pandas/io/html.py

+49-4
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,9 @@ class _HtmlFrameParser:
180180
displayed_only : bool
181181
Whether or not items with "display:none" should be ignored
182182
183+
extract_hrefs : bool, default False
184+
Whether to extract the href of ``<a>`` tags in table cells along with the cell text.
185+
183186
Attributes
184187
----------
185188
io : str or file-like
@@ -198,11 +201,15 @@ class _HtmlFrameParser:
198201
displayed_only : bool
199202
Whether or not items with "display:none" should be ignored
200203
204+
extract_hrefs : bool, default False
205+
Whether to extract the href of ``<a>`` tags in table cells along with the cell text.
206+
201207
Notes
202208
-----
203209
To subclass this class effectively you must override the following methods:
204210
* :func:`_build_doc`
205211
* :func:`_attr_getter`
212+
* :func:`_href_getter`
206213
* :func:`_text_getter`
207214
* :func:`_parse_td`
208215
* :func:`_parse_thead_tr`
@@ -221,12 +228,14 @@ def __init__(
221228
attrs: dict[str, str] | None,
222229
encoding: str,
223230
displayed_only: bool,
231+
extract_hrefs: bool,
224232
):
225233
self.io = io
226234
self.match = match
227235
self.attrs = attrs
228236
self.encoding = encoding
229237
self.displayed_only = displayed_only
238+
self.extract_hrefs = extract_hrefs
230239

231240
def parse_tables(self):
232241
"""
@@ -259,6 +268,22 @@ def _attr_getter(self, obj, attr):
259268
# Both lxml and BeautifulSoup have the same implementation:
260269
return obj.get(attr)
261270

271+
def _href_getter(self, obj):
272+
"""
273+
Return the href of an <a> element inside the DOM node, or None if there is none.
274+
275+
Parameters
276+
----------
277+
obj : node-like
278+
A DOM node.
279+
280+
Returns
281+
-------
282+
href : str or None
283+
The href of the <a> element inside the DOM node, or None if the node has no such element.
284+
"""
285+
raise AbstractMethodError(self)
286+
262287
def _text_getter(self, obj):
263288
"""
264289
Return the text of an individual DOM node.
@@ -435,20 +460,22 @@ def row_is_all_th(row):
435460
while body_rows and row_is_all_th(body_rows[0]):
436461
header_rows.append(body_rows.pop(0))
437462

438-
header = self._expand_colspan_rowspan(header_rows)
463+
header = self._expand_colspan_rowspan(header_rows, header=True)
439464
body = self._expand_colspan_rowspan(body_rows)
440465
footer = self._expand_colspan_rowspan(footer_rows)
441466

442467
return header, body, footer
443468

444-
def _expand_colspan_rowspan(self, rows):
469+
def _expand_colspan_rowspan(self, rows, header=False):
445470
"""
446471
Given a list of <tr>s, return a list of text rows.
447472
448473
Parameters
449474
----------
450475
rows : list of node-like
451476
List of <tr>s
477+
header : bool, default False
478+
Whether ``rows`` are header rows. Hrefs are not captured for header rows, as doing so results in a MultiIndex, which is undesirable.
452479
453480
Returns
454481
-------
@@ -481,6 +508,11 @@ def _expand_colspan_rowspan(self, rows):
481508

482509
# Append the text from this <td>, colspan times
483510
text = _remove_whitespace(self._text_getter(td))
511+
if not header and self.extract_hrefs:
512+
# All cells will be tuples except for the headers for
513+
# consistency in selection (e.g. using .str indexing)
514+
href = self._href_getter(td)
515+
text = (text, href) if href else (text,)
484516
rowspan = int(self._attr_getter(td, "rowspan") or 1)
485517
colspan = int(self._attr_getter(td, "colspan") or 1)
486518

@@ -585,6 +617,10 @@ def _parse_tables(self, doc, match, attrs):
585617
raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
586618
return result
587619

620+
def _href_getter(self, obj):
621+
a = obj.find("a", href=True)
622+
return None if not a else a["href"]
623+
588624
def _text_getter(self, obj):
589625
return obj.text
590626

@@ -670,6 +706,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
670706
:class:`_HtmlFrameParser`.
671707
"""
672708

709+
def _href_getter(self, obj):
710+
href = obj.xpath(".//a/@href")
711+
return None if not href else href[0]
712+
673713
def _text_getter(self, obj):
674714
return obj.text_content()
675715

@@ -906,14 +946,14 @@ def _validate_flavor(flavor):
906946
return flavor
907947

908948

909-
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
949+
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_hrefs, **kwargs):
910950
flavor = _validate_flavor(flavor)
911951
compiled_match = re.compile(match) # you can pass a compiled regex here
912952

913953
retained = None
914954
for flav in flavor:
915955
parser = _parser_dispatch(flav)
916-
p = parser(io, compiled_match, attrs, encoding, displayed_only)
956+
p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_hrefs)
917957

918958
try:
919959
tables = p.parse_tables()
@@ -964,6 +1004,7 @@ def read_html(
9641004
na_values=None,
9651005
keep_default_na: bool = True,
9661006
displayed_only: bool = True,
1007+
extract_hrefs: bool = False,
9671008
) -> list[DataFrame]:
9681009
r"""
9691010
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1058,6 +1099,9 @@ def read_html(
10581099
displayed_only : bool, default True
10591100
Whether elements with "display: none" should be parsed.
10601101
1102+
extract_hrefs : bool, default False
1103+
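Whether to extract the href of ``<a>`` tags in table cells along with the cell text. If True, every non-header cell is returned as a tuple: ``(text, href)`` when the cell contains a link, ``(text,)`` otherwise.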
1104+
10611105
Returns
10621106
-------
10631107
dfs
@@ -1126,4 +1170,5 @@ def read_html(
11261170
na_values=na_values,
11271171
keep_default_na=keep_default_na,
11281172
displayed_only=displayed_only,
1173+
extract_hrefs=extract_hrefs,
11291174
)

pandas/tests/io/test_html.py

+54
Original file line numberDiff line numberDiff line change
@@ -1286,3 +1286,57 @@ def test_parse_path_object(self, datapath):
12861286
df1 = self.read_html(file_path_string)[0]
12871287
df2 = self.read_html(file_path)[0]
12881288
tm.assert_frame_equal(df1, df2)
1289+
1290+
def test_extract_hrefs(self):
1291+
# GH 13141:
1292+
# read_html argument to interpret hyperlinks as links (not merely text)
1293+
result = self.read_html(
1294+
"""
1295+
<table>
1296+
<tr>
1297+
<th>Kingdom</th>
1298+
<th>Phylum</th>
1299+
<th>Class</th>
1300+
<th>Order</th>
1301+
<th>Family</th>
1302+
<th>Genus</th>
1303+
<th>Species</th>
1304+
</tr>
1305+
<tr>
1306+
<td><a href="https://en.wikipedia.org/wiki/Animal">Animalia</a></td>
1307+
<td><a href="https://en.wikipedia.org/wiki/Chordate">Chordata</a></td>
1308+
<td><a href="https://en.wikipedia.org/wiki/Mammal">Mammalia</a></td>
1309+
<td><a href="https://en.wikipedia.org/wiki/Carnivora">Carnivora</a></td>
1310+
<td><a href="https://en.wikipedia.org/wiki/Bear">Ursidae</a></td>
1311+
<td><a href="https://en.wikipedia.org/wiki/Ailuropoda">Ailuropoda</a></td>
1312+
<td>A. melanoleuca</td>
1313+
</tr>
1314+
</table>
1315+
""",
1316+
extract_hrefs=True,
1317+
)[0]
1318+
1319+
expected = DataFrame(
1320+
[
1321+
[
1322+
("Animalia", "https://en.wikipedia.org/wiki/Animal"),
1323+
("Chordata", "https://en.wikipedia.org/wiki/Chordate"),
1324+
("Mammalia", "https://en.wikipedia.org/wiki/Mammal"),
1325+
("Carnivora", "https://en.wikipedia.org/wiki/Carnivora"),
1326+
("Ursidae", "https://en.wikipedia.org/wiki/Bear"),
1327+
("Ailuropoda", "https://en.wikipedia.org/wiki/Ailuropoda"),
1328+
("A. melanoleuca",),
1329+
]
1330+
],
1331+
columns=(
1332+
"Kingdom",
1333+
"Phylum",
1334+
"Class",
1335+
"Order",
1336+
"Family",
1337+
"Genus",
1338+
"Species",
1339+
),
1340+
)
1341+
1342+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)