diff --git a/pandas/io/html.py b/pandas/io/html.py
index eafcca0e85bb3..04f9f317d7dae 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -591,9 +591,14 @@ def _setup_build_doc(self):
     def _build_doc(self):
         from bs4 import BeautifulSoup
 
-        return BeautifulSoup(
-            self._setup_build_doc(), features="html5lib", from_encoding=self.encoding
-        )
+        bdoc = self._setup_build_doc()
+        if isinstance(bdoc, bytes) and self.encoding is not None:
+            udoc = bdoc.decode(self.encoding)
+            from_encoding = None
+        else:
+            udoc = bdoc
+            from_encoding = self.encoding
+        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
 
 
 def _build_xpath_expr(attrs) -> str:
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 626df839363cb..7a814ce82fd73 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1158,9 +1158,9 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
             assert len(dfs) == 1  # Should not parse hidden table
 
     def test_encode(self, html_encoding_file):
-        _, encoding = os.path.splitext(os.path.basename(html_encoding_file))[0].split(
-            "_"
-        )
+        base_path = os.path.basename(html_encoding_file)
+        root = os.path.splitext(base_path)[0]
+        _, encoding = root.split("_")
 
         try:
             with open(html_encoding_file, "rb") as fobj:
@@ -1183,7 +1183,7 @@ def test_encode(self, html_encoding_file):
             if is_platform_windows():
                 if "16" in encoding or "32" in encoding:
                     pytest.skip()
-                raise
+            raise
 
     def test_parse_failure_unseekable(self):
         # Issue #17975