@@ -180,6 +180,9 @@ class _HtmlFrameParser:
180
180
displayed_only : bool
181
181
Whether or not items with "display:none" should be ignored
182
182
183
+ extract_hrefs : bool, default False
184
+ Whether table elements with <a> tags should have the href extracted.
185
+
183
186
Attributes
184
187
----------
185
188
io : str or file-like
@@ -198,11 +201,15 @@ class _HtmlFrameParser:
198
201
displayed_only : bool
199
202
Whether or not items with "display:none" should be ignored
200
203
204
+ extract_hrefs : bool, default False
205
+ Whether table elements with <a> tags should have the href extracted.
206
+
201
207
Notes
202
208
-----
203
209
To subclass this class effectively you must override the following methods:
204
210
* :func:`_build_doc`
205
211
* :func:`_attr_getter`
212
+ * :func:`_href_getter`
206
213
* :func:`_text_getter`
207
214
* :func:`_parse_td`
208
215
* :func:`_parse_thead_tr`
@@ -221,12 +228,14 @@ def __init__(
221
228
attrs : dict [str , str ] | None ,
222
229
encoding : str ,
223
230
displayed_only : bool ,
231
+ extract_hrefs : bool
224
232
):
225
233
self .io = io
226
234
self .match = match
227
235
self .attrs = attrs
228
236
self .encoding = encoding
229
237
self .displayed_only = displayed_only
238
+ self .extract_hrefs = extract_hrefs
230
239
231
240
def parse_tables (self ):
232
241
"""
@@ -259,6 +268,22 @@ def _attr_getter(self, obj, attr):
259
268
# Both lxml and BeautifulSoup have the same implementation:
260
269
return obj .get (attr )
261
270
271
+ def _href_getter (self , obj ):
272
+ """
273
+ Return the href of an <a> inside the DOM node, or None if there is none.
274
+
275
+ Parameters
276
+ ----------
277
+ obj : node-like
278
+ A DOM node.
279
+
280
+ Returns
281
+ -------
282
+ href : str or None
283
+ The href from the <a> found in the DOM node, or None when no such link exists.
284
+ """
285
+ raise AbstractMethodError (self )
286
+
262
287
def _text_getter (self , obj ):
263
288
"""
264
289
Return the text of an individual DOM node.
@@ -435,20 +460,22 @@ def row_is_all_th(row):
435
460
while body_rows and row_is_all_th (body_rows [0 ]):
436
461
header_rows .append (body_rows .pop (0 ))
437
462
438
- header = self ._expand_colspan_rowspan (header_rows )
463
+ header = self ._expand_colspan_rowspan (header_rows , header = True )
439
464
body = self ._expand_colspan_rowspan (body_rows )
440
465
footer = self ._expand_colspan_rowspan (footer_rows )
441
466
442
467
return header , body , footer
443
468
444
- def _expand_colspan_rowspan (self , rows ):
469
+ def _expand_colspan_rowspan (self , rows , header = False ):
445
470
"""
446
471
Given a list of <tr>s, return a list of text rows.
447
472
448
473
Parameters
449
474
----------
450
475
rows : list of node-like
451
476
List of <tr>s
477
+ header : bool, default False
478
+ Whether ``rows`` are header rows. Links are not captured for header rows, as this results in a MultiIndex, which is undesirable.
452
479
453
480
Returns
454
481
-------
@@ -481,6 +508,11 @@ def _expand_colspan_rowspan(self, rows):
481
508
482
509
# Append the text from this <td>, colspan times
483
510
text = _remove_whitespace (self ._text_getter (td ))
511
+ if not header and self .extract_hrefs :
512
+ # All cells will be tuples except for the headers for
513
+ # consistency in selection (e.g. using .str indexing)
514
+ href = self ._href_getter (td )
515
+ text = (text , href ) if href else (text ,)
484
516
rowspan = int (self ._attr_getter (td , "rowspan" ) or 1 )
485
517
colspan = int (self ._attr_getter (td , "colspan" ) or 1 )
486
518
@@ -585,6 +617,10 @@ def _parse_tables(self, doc, match, attrs):
585
617
raise ValueError (f"No tables found matching pattern { repr (match .pattern )} " )
586
618
return result
587
619
620
+ def _href_getter (self , obj ):
621
+ a = obj .find ("a" , href = True )
622
+ return None if not a else a ["href" ]
623
+
588
624
def _text_getter (self , obj ):
589
625
return obj .text
590
626
@@ -670,6 +706,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
670
706
:class:`_HtmlFrameParser`.
671
707
"""
672
708
709
+ def _href_getter (self , obj ):
710
+ href = obj .xpath (".//a/@href" )
711
+ return None if not href else href [0 ]
712
+
673
713
def _text_getter (self , obj ):
674
714
return obj .text_content ()
675
715
@@ -906,14 +946,14 @@ def _validate_flavor(flavor):
906
946
return flavor
907
947
908
948
909
- def _parse (flavor , io , match , attrs , encoding , displayed_only , ** kwargs ):
949
+ def _parse (flavor , io , match , attrs , encoding , displayed_only , extract_hrefs , ** kwargs ):
910
950
flavor = _validate_flavor (flavor )
911
951
compiled_match = re .compile (match ) # you can pass a compiled regex here
912
952
913
953
retained = None
914
954
for flav in flavor :
915
955
parser = _parser_dispatch (flav )
916
- p = parser (io , compiled_match , attrs , encoding , displayed_only )
956
+ p = parser (io , compiled_match , attrs , encoding , displayed_only , extract_hrefs )
917
957
918
958
try :
919
959
tables = p .parse_tables ()
@@ -964,6 +1004,7 @@ def read_html(
964
1004
na_values = None ,
965
1005
keep_default_na : bool = True ,
966
1006
displayed_only : bool = True ,
1007
+ extract_hrefs : bool = False ,
967
1008
) -> list [DataFrame ]:
968
1009
r"""
969
1010
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1058,6 +1099,9 @@ def read_html(
1058
1099
displayed_only : bool, default True
1059
1100
Whether elements with "display: none" should be parsed.
1060
1101
1102
+ extract_hrefs : bool, default False
1103
+ Whether table elements with <a> tags should have the href extracted.
1104
+
1061
1105
Returns
1062
1106
-------
1063
1107
dfs
@@ -1126,4 +1170,5 @@ def read_html(
1126
1170
na_values = na_values ,
1127
1171
keep_default_na = keep_default_na ,
1128
1172
displayed_only = displayed_only ,
1173
+ extract_hrefs = extract_hrefs ,
1129
1174
)
0 commit comments