Skip to content

Commit 543dc9a

Browse files
WillAyd authored and pandres committed
Added 'displayed_only' option to 'read_html' (pandas-dev#20047)
1 parent 0505c05 commit 543dc9a

File tree

3 files changed

+133
-5
lines changed

3 files changed

+133
-5
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,7 @@ Other Enhancements
343343
- :meth:`Timestamp.day_name` and :meth:`DatetimeIndex.day_name` are now available to return day names with a specified locale (:issue:`12806`)
344344
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
345345
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
346+
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default, meaning hidden elements are skipped) (:issue:`20027`)
346347

347348
.. _whatsnew_0230.api_breaking:
348349

pandas/io/html.py

+66-5
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,14 @@ class _HtmlFrameParser(object):
160160
attrs : dict
161161
List of HTML <table> element attributes to match.
162162
163+
encoding : str
164+
Encoding to be used by parser
165+
166+
displayed_only : bool
167+
Whether or not items with "display:none" should be ignored
168+
169+
.. versionadded:: 0.23.0
170+
163171
Attributes
164172
----------
165173
io : str or file-like
@@ -172,6 +180,14 @@ class _HtmlFrameParser(object):
172180
A dictionary of valid table attributes to use to search for table
173181
elements.
174182
183+
encoding : str
184+
Encoding to be used by parser
185+
186+
displayed_only : bool
187+
Whether or not items with "display:none" should be ignored
188+
189+
.. versionadded:: 0.23.0
190+
175191
Notes
176192
-----
177193
To subclass this class effectively you must override the following methods:
@@ -187,11 +203,12 @@ class _HtmlFrameParser(object):
187203
functionality.
188204
"""
189205

190-
def __init__(self, io, match, attrs, encoding):
206+
def __init__(self, io, match, attrs, encoding, displayed_only):
191207
self.io = io
192208
self.match = match
193209
self.attrs = attrs
194210
self.encoding = encoding
211+
self.displayed_only = displayed_only
195212

196213
def parse_tables(self):
197214
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table):
380397
res = self._parse_tr(table)
381398
return self._parse_raw_data(res)
382399

400+
def _handle_hidden_tables(self, tbl_list, attr_name):
401+
"""Returns list of tables, potentially removing hidden elements
402+
403+
Parameters
404+
----------
405+
tbl_list : list of Tag or list of Element
406+
Type of list elements will vary depending upon parser used
407+
attr_name : str
408+
Name of the accessor for retrieving HTML attributes
409+
410+
Returns
411+
-------
412+
list of Tag or list of Element
413+
Return type matches `tbl_list`
414+
"""
415+
if not self.displayed_only:
416+
return tbl_list
417+
418+
return [x for x in tbl_list if "display:none" not in
419+
getattr(x, attr_name).get('style', '').replace(" ", "")]
420+
383421

384422
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
385423
"""HTML to DataFrame parser that uses BeautifulSoup under the hood.
@@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs):
431469

432470
result = []
433471
unique_tables = set()
472+
tables = self._handle_hidden_tables(tables, "attrs")
434473

435474
for table in tables:
475+
if self.displayed_only:
476+
for elem in table.find_all(
477+
style=re.compile(r"display:\s*none")):
478+
elem.decompose()
479+
436480
if (table not in unique_tables and
437481
table.find(text=match) is not None):
438482
result.append(table)
@@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs):
528572

529573
tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
530574

575+
tables = self._handle_hidden_tables(tables, "attrib")
576+
if self.displayed_only:
577+
for table in tables:
578+
# lxml utilizes XPATH 1.0 which does not have regex
579+
# support. As a result, we find all elements with a style
580+
# attribute and iterate them to check for display:none
581+
for elem in table.xpath('.//*[@style]'):
582+
if "display:none" in elem.attrib.get(
583+
"style", "").replace(" ", ""):
584+
elem.getparent().remove(elem)
585+
531586
if not tables:
532587
raise ValueError("No tables found matching regex {patt!r}"
533588
.format(patt=pattern))
@@ -729,15 +784,15 @@ def _validate_flavor(flavor):
729784
return flavor
730785

731786

732-
def _parse(flavor, io, match, attrs, encoding, **kwargs):
787+
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
733788
flavor = _validate_flavor(flavor)
734789
compiled_match = re.compile(match) # you can pass a compiled regex here
735790

736791
# hack around python 3 deleting the exception variable
737792
retained = None
738793
for flav in flavor:
739794
parser = _parser_dispatch(flav)
740-
p = parser(io, compiled_match, attrs, encoding)
795+
p = parser(io, compiled_match, attrs, encoding, displayed_only)
741796

742797
try:
743798
tables = p.parse_tables()
@@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
773828
skiprows=None, attrs=None, parse_dates=False,
774829
tupleize_cols=None, thousands=',', encoding=None,
775830
decimal='.', converters=None, na_values=None,
776-
keep_default_na=True):
831+
keep_default_na=True, displayed_only=True):
777832
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
778833
779834
Parameters
@@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
877932
878933
.. versionadded:: 0.19.0
879934
935+
displayed_only : bool, default True
936+
Whether only displayed elements should be parsed; if True, elements styled with "display: none" are skipped
937+
938+
.. versionadded:: 0.23.0
939+
880940
Returns
881941
-------
882942
dfs : list of DataFrames
@@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
924984
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
925985
thousands=thousands, attrs=attrs, encoding=encoding,
926986
decimal=decimal, converters=converters, na_values=na_values,
927-
keep_default_na=keep_default_na)
987+
keep_default_na=keep_default_na,
988+
displayed_only=displayed_only)

pandas/tests/io/test_html.py

+66
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,39 @@ def test_wikipedia_states_table(self):
674674
result = self.read_html(data, 'Arizona', header=1)[0]
675675
assert result['sq mi'].dtype == np.dtype('float64')
676676

677+
@pytest.mark.parametrize("displayed_only,exp0,exp1", [
678+
(True, DataFrame(["foo"]), None),
679+
(False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
680+
def test_displayed_only(self, displayed_only, exp0, exp1):
681+
# GH 20027
682+
data = StringIO("""<html>
683+
<body>
684+
<table>
685+
<tr>
686+
<td>
687+
foo
688+
<span style="display:none;text-align:center">bar</span>
689+
<span style="display:none">baz</span>
690+
<span style="display: none">qux</span>
691+
</td>
692+
</tr>
693+
</table>
694+
<table style="display: none">
695+
<tr>
696+
<td>foo</td>
697+
</tr>
698+
</table>
699+
</body>
700+
</html>""")
701+
702+
dfs = self.read_html(data, displayed_only=displayed_only)
703+
tm.assert_frame_equal(dfs[0], exp0)
704+
705+
if exp1 is not None:
706+
tm.assert_frame_equal(dfs[1], exp1)
707+
else:
708+
assert len(dfs) == 1 # Should not parse hidden table
709+
677710
def test_decimal_rows(self):
678711

679712
# GH 12907
@@ -896,6 +929,39 @@ def test_computer_sales_page(self):
896929
data = os.path.join(DATA_PATH, 'computer_sales_page.html')
897930
self.read_html(data, header=[0, 1])
898931

932+
@pytest.mark.parametrize("displayed_only,exp0,exp1", [
933+
(True, DataFrame(["foo"]), None),
934+
(False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))])
935+
def test_displayed_only(self, displayed_only, exp0, exp1):
936+
# GH 20027
937+
data = StringIO("""<html>
938+
<body>
939+
<table>
940+
<tr>
941+
<td>
942+
foo
943+
<span style="display:none;text-align:center">bar</span>
944+
<span style="display:none">baz</span>
945+
<span style="display: none">qux</span>
946+
</td>
947+
</tr>
948+
</table>
949+
<table style="display: none">
950+
<tr>
951+
<td>foo</td>
952+
</tr>
953+
</table>
954+
</body>
955+
</html>""")
956+
957+
dfs = self.read_html(data, displayed_only=displayed_only)
958+
tm.assert_frame_equal(dfs[0], exp0)
959+
960+
if exp1 is not None:
961+
tm.assert_frame_equal(dfs[1], exp1)
962+
else:
963+
assert len(dfs) == 1 # Should not parse hidden table
964+
899965

900966
def test_invalid_flavor():
901967
url = 'google.com'

0 commit comments

Comments
 (0)