diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 71fda39a05e55..9f37d12d0fa56 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -208,6 +208,7 @@ I/O ^^^ - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) +- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) - Period diff --git a/pandas/io/html.py b/pandas/io/html.py index ce95c2be8581f..02661329b58de 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -582,7 +582,6 @@ def __init__(self, *args, **kwargs) -> None: def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) - if not tables: raise ValueError("No tables found") @@ -592,13 +591,15 @@ def _parse_tables(self, doc, match, attrs): for table in tables: if self.displayed_only: + for elem in table.find_all("style"): + elem.decompose() + for elem in table.find_all(style=re.compile(r"display:\s*none")): elem.decompose() if table not in unique_tables and table.find(string=match) is not None: result.append(table) unique_tables.add(table) - if not result: raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") return result @@ -730,10 +731,11 @@ def _parse_tables(self, doc, match, kwargs): # lxml utilizes XPATH 1.0 which does not have regex # support. 
As a result, we find all elements with a style # attribute and iterate them to check for display:none + for elem in table.xpath(".//style"): + elem.drop_tree() for elem in table.xpath(".//*[@style]"): if "display:none" in elem.attrib.get("style", "").replace(" ", ""): elem.drop_tree() - if not tables: raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables @@ -1170,6 +1172,7 @@ def read_html( '{None, "header", "footer", "body", "all"}, got ' f'"{extract_links}"' ) + validate_header_arg(header) check_dtype_backend(dtype_backend) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 252f028e0dffc..047918d4694e0 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1495,3 +1495,28 @@ def test_invalid_dtype_backend(self): ) with pytest.raises(ValueError, match=msg): read_html("test", dtype_backend="numpy") + + def test_style_tag(self): + # GH 48316 + data = """ + <table> + <tr> + <th> + <style>.style</style> + A + </th> + <th>B</th> + </tr> + <tr> + <td>A1</td> + <td>B1</td> + </tr> + <tr> + <td>A2</td> + <td>B2</td> + </tr> + </table> + """ + result = self.read_html(data)[0] + expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected)