Skip to content

Commit 525f1ef

Browse files
authored
BUG: change lxml remove to drop_tree (#51629) (#52135)
* BUG: change lxml remove to drop_tree (#51629), used for removing elements styled with display:none in read_html * Test added * DOC: #51629 added to whatsnew
1 parent 3c4be2b commit 525f1ef

File tree

3 files changed

+24
-1
lines changed

3 files changed

+24
-1
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ MultiIndex
205205

206206
I/O
207207
^^^
208+
- Bug in :func:`read_html` where tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
208209
- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
209210
-
210211

pandas/io/html.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,7 @@ def _parse_tables(self, doc, match, kwargs):
732732
# attribute and iterate them to check for display:none
733733
for elem in table.xpath(".//*[@style]"):
734734
if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
735-
elem.getparent().remove(elem)
735+
elem.drop_tree()
736736

737737
if not tables:
738738
raise ValueError(f"No tables found matching regex {repr(pattern)}")

pandas/tests/io/test_html.py

+22
Original file line numberDiff line numberDiff line change
@@ -1238,6 +1238,28 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
12381238
else:
12391239
assert len(dfs) == 1 # Should not parse hidden table
12401240

1241+
@pytest.mark.parametrize("displayed_only", [True, False])
1242+
def test_displayed_only_with_many_elements(self, displayed_only):
1243+
html_table = """
1244+
<table>
1245+
<tr>
1246+
<th>A</th>
1247+
<th>B</th>
1248+
</tr>
1249+
<tr>
1250+
<td>1</td>
1251+
<td>2</td>
1252+
</tr>
1253+
<tr>
1254+
<td><span style="display:none"></span>4</td>
1255+
<td>5</td>
1256+
</tr>
1257+
</table>
1258+
"""
1259+
result = read_html(html_table, displayed_only=displayed_only)[0]
1260+
expected = DataFrame({"A": [1, 4], "B": [2, 5]})
1261+
tm.assert_frame_equal(result, expected)
1262+
12411263
@pytest.mark.filterwarnings(
12421264
"ignore:You provided Unicode markup but also provided a value for "
12431265
"from_encoding.*:UserWarning"

0 commit comments

Comments
 (0)