12
12
from typing import (
13
13
TYPE_CHECKING ,
14
14
Iterable ,
15
+ Literal ,
15
16
Pattern ,
16
17
Sequence ,
17
18
cast ,
30
31
31
32
from pandas .core .dtypes .common import is_list_like
32
33
34
+ from pandas import isna
33
35
from pandas .core .construction import create_series_with_explicit_dtype
36
+ from pandas .core .indexes .base import Index
34
37
35
38
from pandas .io .common import (
36
39
file_exists ,
@@ -184,6 +187,12 @@ class _HtmlFrameParser:
184
187
displayed_only : bool
185
188
Whether or not items with "display:none" should be ignored
186
189
190
+ extract_links : {None, "all", "header", "body", "footer"}
191
+ Table elements in the specified section(s) with <a> tags will have their
192
+ href extracted.
193
+
194
+ .. versionadded:: 1.5.0
195
+
187
196
Attributes
188
197
----------
189
198
io : str or file-like
@@ -202,11 +211,18 @@ class _HtmlFrameParser:
202
211
displayed_only : bool
203
212
Whether or not items with "display:none" should be ignored
204
213
214
+ extract_links : {None, "all", "header", "body", "footer"}
215
+ Table elements in the specified section(s) with <a> tags will have their
216
+ href extracted.
217
+
218
+ .. versionadded:: 1.5.0
219
+
205
220
Notes
206
221
-----
207
222
To subclass this class effectively you must override the following methods:
208
223
* :func:`_build_doc`
209
224
* :func:`_attr_getter`
225
+ * :func:`_href_getter`
210
226
* :func:`_text_getter`
211
227
* :func:`_parse_td`
212
228
* :func:`_parse_thead_tr`
@@ -225,12 +241,14 @@ def __init__(
225
241
attrs : dict [str , str ] | None ,
226
242
encoding : str ,
227
243
displayed_only : bool ,
244
+ extract_links : Literal [None , "header" , "footer" , "body" , "all" ],
228
245
) -> None :
229
246
self .io = io
230
247
self .match = match
231
248
self .attrs = attrs
232
249
self .encoding = encoding
233
250
self .displayed_only = displayed_only
251
+ self .extract_links = extract_links
234
252
235
253
def parse_tables (self ):
236
254
"""
@@ -263,6 +281,22 @@ def _attr_getter(self, obj, attr):
263
281
# Both lxml and BeautifulSoup have the same implementation:
264
282
return obj .get (attr )
265
283
284
def _href_getter(self, obj) -> str | None:
    """
    Return a href if the DOM node contains a child <a> or None.

    Parameters
    ----------
    obj : node-like
        A DOM node.

    Returns
    -------
    href : str or None
        The href from the <a> child of the DOM node, or None when the node
        has no such child.

    Raises
    ------
    AbstractMethodError
        Always raised here; this base implementation only raises, so a
        concrete parser has to provide its own version.
    """
    raise AbstractMethodError(self)
299
+
266
300
def _text_getter (self , obj ):
267
301
"""
268
302
Return the text of an individual DOM node.
@@ -439,33 +473,40 @@ def row_is_all_th(row):
439
473
while body_rows and row_is_all_th (body_rows [0 ]):
440
474
header_rows .append (body_rows .pop (0 ))
441
475
442
- header = self ._expand_colspan_rowspan (header_rows )
443
- body = self ._expand_colspan_rowspan (body_rows )
444
- footer = self ._expand_colspan_rowspan (footer_rows )
476
+ header = self ._expand_colspan_rowspan (header_rows , section = "header" )
477
+ body = self ._expand_colspan_rowspan (body_rows , section = "body" )
478
+ footer = self ._expand_colspan_rowspan (footer_rows , section = "footer" )
445
479
446
480
return header , body , footer
447
481
448
- def _expand_colspan_rowspan (self , rows ):
482
+ def _expand_colspan_rowspan (
483
+ self , rows , section : Literal ["header" , "footer" , "body" ]
484
+ ):
449
485
"""
450
486
Given a list of <tr>s, return a list of text rows.
451
487
452
488
Parameters
453
489
----------
454
490
rows : list of node-like
455
491
List of <tr>s
492
+ section : {"header", "body", "footer"}
+ The section that the rows belong to.
456
493
457
494
Returns
458
495
-------
459
496
list of list
460
- Each returned row is a list of str text.
497
+ Each returned row is a list of str text, or tuple (text, link)
498
+ if extract_links is not None.
461
499
462
500
Notes
463
501
-----
464
502
Any cell with ``rowspan`` or ``colspan`` will have its contents copied
465
503
to subsequent cells.
466
504
"""
467
505
all_texts = [] # list of rows, each a list of str
468
- remainder : list [tuple [int , str , int ]] = [] # list of (index, text, nrows)
506
+ text : str | tuple
507
+ remainder : list [
508
+ tuple [int , str | tuple , int ]
509
+ ] = [] # list of (index, text, nrows)
469
510
470
511
for tr in rows :
471
512
texts = [] # the output for this row
@@ -485,6 +526,9 @@ def _expand_colspan_rowspan(self, rows):
485
526
486
527
# Append the text from this <td>, colspan times
487
528
text = _remove_whitespace (self ._text_getter (td ))
529
+ if self .extract_links == "all" or self .extract_links == section :
530
+ href = self ._href_getter (td )
531
+ text = (text , href )
488
532
rowspan = int (self ._attr_getter (td , "rowspan" ) or 1 )
489
533
colspan = int (self ._attr_getter (td , "colspan" ) or 1 )
490
534
@@ -589,6 +633,10 @@ def _parse_tables(self, doc, match, attrs):
589
633
raise ValueError (f"No tables found matching pattern { repr (match .pattern )} " )
590
634
return result
591
635
636
+ def _href_getter (self , obj ) -> str | None :
637
+ a = obj .find ("a" , href = True )
638
+ return None if not a else a ["href" ]
639
+
592
640
def _text_getter(self, obj):
    """Return the text of the DOM node ``obj`` as held by its ``text`` attribute."""
    return obj.text
594
642
@@ -680,6 +728,10 @@ class _LxmlFrameParser(_HtmlFrameParser):
680
728
:class:`_HtmlFrameParser`.
681
729
"""
682
730
731
+ def _href_getter (self , obj ) -> str | None :
732
+ href = obj .xpath (".//a/@href" )
733
+ return None if not href else href [0 ]
734
+
683
735
def _text_getter(self, obj):
    """Return the text of the DOM node ``obj`` by calling its ``text_content()``."""
    return obj.text_content()
685
737
@@ -920,14 +972,14 @@ def _validate_flavor(flavor):
920
972
return flavor
921
973
922
974
923
- def _parse (flavor , io , match , attrs , encoding , displayed_only , ** kwargs ):
975
+ def _parse (flavor , io , match , attrs , encoding , displayed_only , extract_links , ** kwargs ):
924
976
flavor = _validate_flavor (flavor )
925
977
compiled_match = re .compile (match ) # you can pass a compiled regex here
926
978
927
979
retained = None
928
980
for flav in flavor :
929
981
parser = _parser_dispatch (flav )
930
- p = parser (io , compiled_match , attrs , encoding , displayed_only )
982
+ p = parser (io , compiled_match , attrs , encoding , displayed_only , extract_links )
931
983
932
984
try :
933
985
tables = p .parse_tables ()
@@ -955,7 +1007,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
955
1007
ret = []
956
1008
for table in tables :
957
1009
try :
958
- ret .append (_data_to_frame (data = table , ** kwargs ))
1010
+ df = _data_to_frame (data = table , ** kwargs )
1011
+ # Cast MultiIndex header to an Index of tuples when extracting header
1012
+ # links and replace nan with None.
1013
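+ # This maintains consistency of selection (e.g. df.columns.str[1])
+ # NOTE(review): the cast below indexes every column label as a
+ # (text, href) pair -- confirm labels are always pairs at this point
+ # (e.g. for a table parsed without a header row) or guard the cast.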
1014
+ if extract_links in ("all" , "header" ):
1015
+ df .columns = Index (
1016
+ ((col [0 ], None if isna (col [1 ]) else col [1 ]) for col in df .columns ),
1017
+ tupleize_cols = False ,
1018
+ )
1019
+
1020
+ ret .append (df )
959
1021
except EmptyDataError : # empty table
960
1022
continue
961
1023
return ret
@@ -978,6 +1040,7 @@ def read_html(
978
1040
na_values : Iterable [object ] | None = None ,
979
1041
keep_default_na : bool = True ,
980
1042
displayed_only : bool = True ,
1043
+ extract_links : Literal [None , "header" , "footer" , "body" , "all" ] = None ,
981
1044
) -> list [DataFrame ]:
982
1045
r"""
983
1046
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1072,6 +1135,12 @@ def read_html(
1072
1135
displayed_only : bool, default True
1073
1136
Whether elements with "display: none" should be parsed.
1074
1137
1138
+ extract_links : {None, "all", "header", "body", "footer"}
1139
+ Table elements in the specified section(s) with <a> tags will have their
1140
+ href extracted.
1141
+
1142
+ .. versionadded:: 1.5.0
1143
+
1075
1144
Returns
1076
1145
-------
1077
1146
dfs
@@ -1120,6 +1189,12 @@ def read_html(
1120
1189
"cannot skip rows starting from the end of the "
1121
1190
"data (you passed a negative value)"
1122
1191
)
1192
+ if extract_links not in [None , "header" , "footer" , "body" , "all" ]:
1193
+ raise ValueError (
1194
+ "`extract_links` must be one of "
1195
+ '{None, "header", "footer", "body", "all"}, got '
1196
+ f'"{ extract_links } "'
1197
+ )
1123
1198
validate_header_arg (header )
1124
1199
1125
1200
io = stringify_path (io )
@@ -1140,4 +1215,5 @@ def read_html(
1140
1215
na_values = na_values ,
1141
1216
keep_default_na = keep_default_na ,
1142
1217
displayed_only = displayed_only ,
1218
+ extract_links = extract_links ,
1143
1219
)
0 commit comments