Skip to content

Commit 2ca9b3c

Browse files
authored
* Bug fix for pandas-dev#52197: the contents of `<style>` elements inside an HTML table are styling details that are not intended to end up in the DataFrame. When a `<style>` element is encountered during parsing, the tree rooted at that element is now dropped. * Removed an extraneous line used for testing. * Added an entry to the latest whatsnew doc file.
1 parent cb6d8fd commit 2ca9b3c

File tree

3 files changed

+32
-3
lines changed

3 files changed

+32
-3
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ I/O
209209
^^^
210210
- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
211211
- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
212+
- Bug in :func:`read_html` where the contents of ``<style>`` elements were read into the resulting DataFrames (:issue:`52197`)
212213
-
213214

214215
Period

pandas/io/html.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,6 @@ def __init__(self, *args, **kwargs) -> None:
582582
def _parse_tables(self, doc, match, attrs):
583583
element_name = self._strainer.name
584584
tables = doc.find_all(element_name, attrs=attrs)
585-
586585
if not tables:
587586
raise ValueError("No tables found")
588587

@@ -592,13 +591,15 @@ def _parse_tables(self, doc, match, attrs):
592591

593592
for table in tables:
594593
if self.displayed_only:
594+
for elem in table.find_all("style"):
595+
elem.decompose()
596+
595597
for elem in table.find_all(style=re.compile(r"display:\s*none")):
596598
elem.decompose()
597599

598600
if table not in unique_tables and table.find(string=match) is not None:
599601
result.append(table)
600602
unique_tables.add(table)
601-
602603
if not result:
603604
raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
604605
return result
@@ -730,10 +731,11 @@ def _parse_tables(self, doc, match, kwargs):
730731
# lxml utilizes XPATH 1.0 which does not have regex
731732
# support. As a result, we find all elements with a style
732733
# attribute and iterate them to check for display:none
734+
for elem in table.xpath(".//style"):
735+
elem.drop_tree()
733736
for elem in table.xpath(".//*[@style]"):
734737
if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
735738
elem.drop_tree()
736-
737739
if not tables:
738740
raise ValueError(f"No tables found matching regex {repr(pattern)}")
739741
return tables
@@ -1170,6 +1172,7 @@ def read_html(
11701172
'{None, "header", "footer", "body", "all"}, got '
11711173
f'"{extract_links}"'
11721174
)
1175+
11731176
validate_header_arg(header)
11741177
check_dtype_backend(dtype_backend)
11751178

pandas/tests/io/test_html.py

+25
Original file line numberDiff line numberDiff line change
@@ -1495,3 +1495,28 @@ def test_invalid_dtype_backend(self):
14951495
)
14961496
with pytest.raises(ValueError, match=msg):
14971497
read_html("test", dtype_backend="numpy")
1498+
1499+
def test_style_tag(self):
1500+
# GH 52197
1501+
data = """
1502+
<table>
1503+
<tr>
1504+
<th>
1505+
<style>.style</style>
1506+
A
1507+
</th>
1508+
<th>B</th>
1509+
</tr>
1510+
<tr>
1511+
<td>A1</td>
1512+
<td>B1</td>
1513+
</tr>
1514+
<tr>
1515+
<td>A2</td>
1516+
<td>B2</td>
1517+
</tr>
1518+
</table>
1519+
"""
1520+
result = self.read_html(data)[0]
1521+
expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
1522+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)