diff --git a/pandas/io/html.py b/pandas/io/html.py
index 809ce77eef0bb..75cb0fafaa6b3 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -591,9 +591,14 @@ def _setup_build_doc(self):
def _build_doc(self):
from bs4 import BeautifulSoup
- return BeautifulSoup(
- self._setup_build_doc(), features="html5lib", from_encoding=self.encoding
- )
+ bdoc = self._setup_build_doc()
+ if isinstance(bdoc, bytes) and self.encoding is not None:
+ udoc = bdoc.decode(self.encoding)
+ from_encoding = None
+ else:
+ udoc = bdoc
+ from_encoding = self.encoding
+ return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
def _build_xpath_expr(attrs) -> str:
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 626df839363cb..7a814ce82fd73 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1158,9 +1158,9 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
assert len(dfs) == 1 # Should not parse hidden table
def test_encode(self, html_encoding_file):
- _, encoding = os.path.splitext(os.path.basename(html_encoding_file))[0].split(
- "_"
- )
+ base_path = os.path.basename(html_encoding_file)
+ root = os.path.splitext(base_path)[0]
+ _, encoding = root.split("_")
try:
with open(html_encoding_file, "rb") as fobj:
@@ -1183,7 +1183,7 @@ def test_encode(self, html_encoding_file):
if is_platform_windows():
if "16" in encoding or "32" in encoding:
pytest.skip()
- raise
+ raise
def test_parse_failure_unseekable(self):
# Issue #17975