-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Added 'displayed_only' option to 'read_html' #20047
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
234fe8b
b2f24bb
509c9e2
00f1b5f
61bac89
0c7b137
b1d0f91
2960025
2cc98d9
3093879
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -160,6 +160,12 @@ class _HtmlFrameParser(object): | |
attrs : dict | ||
List of HTML <table> element attributes to match. | ||
|
||
encoding : str | ||
Encoding to be used by parser | ||
|
||
displayed_only : bool | ||
Whether or not items with "display:none" should be ignored | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a versionadded tag |
||
|
||
Attributes | ||
---------- | ||
io : str or file-like | ||
|
@@ -187,11 +193,12 @@ class _HtmlFrameParser(object): | |
functionality. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add this to the attributes list |
||
""" | ||
|
||
def __init__(self, io, match, attrs, encoding): | ||
def __init__(self, io, match, attrs, encoding, displayed_only): | ||
self.io = io | ||
self.match = match | ||
self.attrs = attrs | ||
self.encoding = encoding | ||
self.displayed_only = displayed_only | ||
|
||
def parse_tables(self): | ||
tables = self._parse_tables(self._build_doc(), self.match, self.attrs) | ||
|
@@ -432,7 +439,18 @@ def _parse_tables(self, doc, match, attrs): | |
result = [] | ||
unique_tables = set() | ||
|
||
if self.displayed_only: | ||
# Remove any hidden tables | ||
tables = [x for x in tables if not ( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is almost identical to what is done in the LXML parser, save it having a different accessor for the tag element ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor unrelated comment: I think you could make this a generator comprehension instead of a list comprehension. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately I don't think that's possible (save further rework) as the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like in this case there's just a single loop over Probably best to just leave this as-is for consistency with the other method though, as I don't think converting to a generator will provide much of a benefit anyways. Was more trying to be in line with some other cleaning related PRs I've noticed where the preference was to convert to generators where possible. |
||
"display:none" in x.attrs.get('style', '').replace(' ', ''))] | ||
|
||
for table in tables: | ||
if self.displayed_only: | ||
import re | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. import at the top |
||
for elem in table.find_all( | ||
style=re.compile(r"display:\s*none")): | ||
elem.decompose() | ||
|
||
if (table not in unique_tables and | ||
table.find(text=match) is not None): | ||
result.append(table) | ||
|
@@ -528,6 +546,15 @@ def _parse_tables(self, doc, match, kwargs): | |
|
||
tables = doc.xpath(xpath_expr, namespaces=_re_namespace) | ||
|
||
if self.displayed_only: | ||
# Remove any hidden tables | ||
tables = [x for x in tables if not ( | ||
"display:none" in x.attrib.get('style', '').replace(' ', ''))] | ||
for table in tables: | ||
for elem in table.xpath( | ||
'.//*[contains(@style, "display:none")]'): | ||
elem.getparent().remove(elem) | ||
|
||
if not tables: | ||
raise ValueError("No tables found matching regex {patt!r}" | ||
.format(patt=pattern)) | ||
|
@@ -729,15 +756,15 @@ def _validate_flavor(flavor): | |
return flavor | ||
|
||
|
||
def _parse(flavor, io, match, attrs, encoding, **kwargs): | ||
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): | ||
flavor = _validate_flavor(flavor) | ||
compiled_match = re.compile(match) # you can pass a compiled regex here | ||
|
||
# hack around python 3 deleting the exception variable | ||
retained = None | ||
for flav in flavor: | ||
parser = _parser_dispatch(flav) | ||
p = parser(io, compiled_match, attrs, encoding) | ||
p = parser(io, compiled_match, attrs, encoding, displayed_only) | ||
|
||
try: | ||
tables = p.parse_tables() | ||
|
@@ -773,7 +800,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
skiprows=None, attrs=None, parse_dates=False, | ||
tupleize_cols=None, thousands=',', encoding=None, | ||
decimal='.', converters=None, na_values=None, | ||
keep_default_na=True): | ||
keep_default_na=True, displayed_only=True): | ||
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. | ||
|
||
Parameters | ||
|
@@ -877,6 +904,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
|
||
.. versionadded:: 0.19.0 | ||
|
||
display_only : bool, default True | ||
Whether elements with "display: none" should be parsed | ||
|
||
.. versionadded:: 0.23.0 | ||
|
||
Returns | ||
------- | ||
dfs : list of DataFrames | ||
|
@@ -924,4 +956,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
parse_dates=parse_dates, tupleize_cols=tupleize_cols, | ||
thousands=thousands, attrs=attrs, encoding=encoding, | ||
decimal=decimal, converters=converters, na_values=na_values, | ||
keep_default_na=keep_default_na) | ||
keep_default_na=keep_default_na, | ||
displayed_only=displayed_only) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -674,6 +674,34 @@ def test_wikipedia_states_table(self): | |
result = self.read_html(data, 'Arizona', header=1)[0] | ||
assert result['sq mi'].dtype == np.dtype('float64') | ||
|
||
@pytest.mark.parametrize("displayed_only,exp0,exp1", [ | ||
(True, DataFrame(["foo"]), None), | ||
(False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))]) | ||
def test_displayed_only(self, displayed_only, exp0, exp1): | ||
# GH 20027 | ||
data = StringIO("""<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td>foo<span style="display:none">foo</span></td> | ||
</tr> | ||
</table> | ||
<table style="display: none"> | ||
<tr> | ||
<td>foo<span>foo</span></td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html>""") | ||
|
||
dfs = self.read_html(data, displayed_only=displayed_only) | ||
tm.assert_frame_equal(dfs[0], exp0) | ||
|
||
if exp1 is not None: | ||
tm.assert_frame_equal(dfs[1], exp1) | ||
else: | ||
assert len(dfs) == 1 # Should not parse hidden table | ||
|
||
def test_decimal_rows(self): | ||
|
||
# GH 12907 | ||
|
@@ -896,6 +924,34 @@ def test_computer_sales_page(self): | |
data = os.path.join(DATA_PATH, 'computer_sales_page.html') | ||
self.read_html(data, header=[0, 1]) | ||
|
||
@pytest.mark.parametrize("displayed_only,exp0,exp1", [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Blatant copy/paste of the method above. I was kind of surprised how few tests were shared between the parsers, so there's opportunity here to consolidate a lot of these tests into a base class but I figured that was better done comprehensively than trying to shimmy into this change |
||
(True, DataFrame(["foo"]), None), | ||
(False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))]) | ||
def test_displayed_only(self, displayed_only, exp0, exp1): | ||
# GH 20027 | ||
data = StringIO("""<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td>foo<span style="display:none">foo</span></td> | ||
</tr> | ||
</table> | ||
<table style="display: none"> | ||
<tr> | ||
<td>foo<span>foo</span></td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html>""") | ||
|
||
dfs = self.read_html(data, displayed_only=displayed_only) | ||
tm.assert_frame_equal(dfs[0], exp0) | ||
|
||
if exp1 is not None: | ||
tm.assert_frame_equal(dfs[1], exp1) | ||
else: | ||
assert len(dfs) == 1 # Should not parse hidden table | ||
|
||
|
||
def test_invalid_flavor(): | ||
url = 'google.com' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't add encoding as part of this change but it looks to have been undocumented from whenever it was added. Tossed something in there for now, assuming the sprint this weekend may address in more detail.
The docstrings throughout this module I think technically violate the standard by introducing blank space in between each parameter, but figured better left to the sprint than tossing in this change