BUG pandas-dev#52197 proposed fix (pandas-dev#52251)

zbreger · web-flow · commit 2ca9b3cb1d1a · 2023-03-28T11:23:57.000-07:00
* Bug fix for pandas-dev#52197. `<style>` elements are styling details that are not intended to appear in the DataFrame; when such an element is encountered during parsing, the tree rooted at it is dropped. * Removed an extraneous line used for testing. * Added an entry to the latest whatsnew doc file (v2.1.0).
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -209,6 +209,7 @@ I/O
 ^^^
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
 - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
+- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
 -
 
 Period
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -582,7 +582,6 @@ def __init__(self, *args, **kwargs) -> None:
     def _parse_tables(self, doc, match, attrs):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
-
         if not tables:
             raise ValueError("No tables found")
 
@@ -592,13 +591,15 @@ def _parse_tables(self, doc, match, attrs):
 
         for table in tables:
             if self.displayed_only:
+                for elem in table.find_all("style"):
+                    elem.decompose()
+
                 for elem in table.find_all(style=re.compile(r"display:\s*none")):
                     elem.decompose()
 
             if table not in unique_tables and table.find(string=match) is not None:
                 result.append(table)
             unique_tables.add(table)
-
         if not result:
             raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
         return result
@@ -730,10 +731,11 @@ def _parse_tables(self, doc, match, kwargs):
                 # lxml utilizes XPATH 1.0 which does not have regex
                 # support. As a result, we find all elements with a style
                 # attribute and iterate them to check for display:none
+                for elem in table.xpath(".//style"):
+                    elem.drop_tree()
                 for elem in table.xpath(".//*[@style]"):
                     if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                         elem.drop_tree()
-
         if not tables:
             raise ValueError(f"No tables found matching regex {repr(pattern)}")
         return tables
@@ -1170,6 +1172,7 @@ def read_html(
             '{None, "header", "footer", "body", "all"}, got '
             f'"{extract_links}"'
         )
+
     validate_header_arg(header)
     check_dtype_backend(dtype_backend)
 
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1495,3 +1495,28 @@ def test_invalid_dtype_backend(self):
         )
         with pytest.raises(ValueError, match=msg):
             read_html("test", dtype_backend="numpy")
+
+    def test_style_tag(self):
+        # GH 52197
+        data = """
+        <table>
+            <tr>
+                <th>
+                    <style>.style</style>
+                    A
+                    </th>
+                <th>B</th>
+            </tr>
+            <tr>
+                <td>A1</td>
+                <td>B1</td>
+            </tr>
+            <tr>
+                <td>A2</td>
+                <td>B2</td>
+            </tr>
+        </table>
+        """
+        result = self.read_html(data)[0]
+        expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
+        tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -209,6 +209,7 @@ I/O`
`209`	`209`	`^^^`
`210`	`210`	- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
`211`	`211`	- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
	`212`	+- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
`212`	`213`	`-`
`213`	`214`
`214`	`215`	`Period`