@@ -160,6 +160,14 @@ class _HtmlFrameParser(object):
160
160
attrs : dict
161
161
List of HTML <table> element attributes to match.
162
162
163
+ encoding : str
164
+ Encoding to be used by parser
165
+
166
+ displayed_only : bool
167
+ Whether or not items with "display:none" should be ignored
168
+
169
+ .. versionadded:: 0.23.0
170
+
163
171
Attributes
164
172
----------
165
173
io : str or file-like
@@ -172,6 +180,14 @@ class _HtmlFrameParser(object):
172
180
A dictionary of valid table attributes to use to search for table
173
181
elements.
174
182
183
+ encoding : str
184
+ Encoding to be used by parser
185
+
186
+ displayed_only : bool
187
+ Whether or not items with "display:none" should be ignored
188
+
189
+ .. versionadded:: 0.23.0
190
+
175
191
Notes
176
192
-----
177
193
To subclass this class effectively you must override the following methods:
@@ -187,11 +203,12 @@ class _HtmlFrameParser(object):
187
203
functionality.
188
204
"""
189
205
190
- def __init__ (self , io , match , attrs , encoding ):
206
def __init__(self, io, match, attrs, encoding, displayed_only=True):
    # Store the raw input and the table-matching criteria; parsing itself
    # happens later in ``parse_tables``.
    self.io = io
    self.match = match
    self.attrs = attrs
    self.encoding = encoding
    # ``displayed_only`` defaults to True (the same default ``read_html``
    # exposes) so callers and subclasses written against the previous
    # four-argument signature keep working.
    self.displayed_only = displayed_only
195
212
196
213
def parse_tables (self ):
197
214
tables = self ._parse_tables (self ._build_doc (), self .match , self .attrs )
@@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table):
380
397
res = self ._parse_tr (table )
381
398
return self ._parse_raw_data (res )
382
399
400
def _handle_hidden_tables(self, tbl_list, attr_name):
    """Return the tables in `tbl_list`, dropping hidden ones if requested.

    A table is treated as hidden when its own inline ``style`` attribute
    contains a ``display: none`` declaration. Only the table element's
    attribute mapping is inspected here.

    Parameters
    ----------
    tbl_list : list of Tag or list of Element
        Type of list elements will vary depending upon parser used
    attr_name : str
        Name of the accessor for retrieving HTML attributes

    Returns
    -------
    list of Tag or list of Element
        Return type matches `tbl_list`. When ``self.displayed_only`` is
        false, `tbl_list` itself is returned unchanged.
    """
    if not self.displayed_only:
        return tbl_list

    # CSS property names and keywords are case-insensitive, and the
    # declaration may contain any whitespace (tabs, newlines), not only
    # spaces. Match with a regex instead of stripping spaces and comparing
    # against a literal so those spellings are recognised as hidden too.
    hidden = re.compile(r"display\s*:\s*none", re.IGNORECASE)
    return [tbl for tbl in tbl_list
            if not hidden.search(getattr(tbl, attr_name).get("style", ""))]
420
+
383
421
384
422
class _BeautifulSoupHtml5LibFrameParser (_HtmlFrameParser ):
385
423
"""HTML to DataFrame parser that uses BeautifulSoup under the hood.
@@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs):
431
469
432
470
result = []
433
471
unique_tables = set ()
472
+ tables = self ._handle_hidden_tables (tables , "attrs" )
434
473
435
474
for table in tables :
475
+ if self .displayed_only :
476
+ for elem in table .find_all (
477
+ style = re .compile (r"display:\s*none" )):
478
+ elem .decompose ()
479
+
436
480
if (table not in unique_tables and
437
481
table .find (text = match ) is not None ):
438
482
result .append (table )
@@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs):
528
572
529
573
tables = doc .xpath (xpath_expr , namespaces = _re_namespace )
530
574
575
+ tables = self ._handle_hidden_tables (tables , "attrib" )
576
+ if self .displayed_only :
577
+ for table in tables :
578
+ # lxml utilizes XPATH 1.0 which does not have regex
579
+ # support. As a result, we find all elements with a style
580
+ # attribute and iterate them to check for display:none
581
+ for elem in table .xpath ('.//*[@style]' ):
582
+ if "display:none" in elem .attrib .get (
583
+ "style" , "" ).replace (" " , "" ):
584
+ elem .getparent ().remove (elem )
585
+
531
586
if not tables :
532
587
raise ValueError ("No tables found matching regex {patt!r}"
533
588
.format (patt = pattern ))
@@ -729,15 +784,15 @@ def _validate_flavor(flavor):
729
784
return flavor
730
785
731
786
732
- def _parse (flavor , io , match , attrs , encoding , ** kwargs ):
787
+ def _parse (flavor , io , match , attrs , encoding , displayed_only , ** kwargs ):
733
788
flavor = _validate_flavor (flavor )
734
789
compiled_match = re .compile (match ) # you can pass a compiled regex here
735
790
736
791
# hack around python 3 deleting the exception variable
737
792
retained = None
738
793
for flav in flavor :
739
794
parser = _parser_dispatch (flav )
740
- p = parser (io , compiled_match , attrs , encoding )
795
+ p = parser (io , compiled_match , attrs , encoding , displayed_only )
741
796
742
797
try :
743
798
tables = p .parse_tables ()
@@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
773
828
skiprows = None , attrs = None , parse_dates = False ,
774
829
tupleize_cols = None , thousands = ',' , encoding = None ,
775
830
decimal = '.' , converters = None , na_values = None ,
776
- keep_default_na = True ):
831
+ keep_default_na = True , displayed_only = True ):
777
832
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
778
833
779
834
Parameters
@@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
877
932
878
933
.. versionadded:: 0.19.0
879
934
935
+ displayed_only : bool, default True
936
+ Whether to parse only displayed elements, skipping any styled with "display: none"
937
+
938
+ .. versionadded:: 0.23.0
939
+
880
940
Returns
881
941
-------
882
942
dfs : list of DataFrames
@@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
924
984
parse_dates = parse_dates , tupleize_cols = tupleize_cols ,
925
985
thousands = thousands , attrs = attrs , encoding = encoding ,
926
986
decimal = decimal , converters = converters , na_values = na_values ,
927
- keep_default_na = keep_default_na )
987
+ keep_default_na = keep_default_na ,
988
+ displayed_only = displayed_only )
0 commit comments