pandas-dev · phofl · Feb 10, 2024 · Feb 8, 2024 · Feb 8, 2024 · Feb 8, 2024
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -3247,7 +3247,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``:
       </row>
     </response>"""
 
-   df = pd.read_xml(StringIO(xml), stylesheet=xsl)
+   df = pd.read_xml(StringIO(xml), stylesheet=StringIO(xsl))
    df
 
 For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
@@ -3418,7 +3418,7 @@ Write an XML and transform with stylesheet:
       </xsl:template>
     </xsl:stylesheet>"""
 
-   print(geom_df.to_xml(stylesheet=xsl))
+   print(geom_df.to_xml(stylesheet=StringIO(xsl)))
 
 
 XML Final Notes

diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
@@ -201,12 +201,12 @@ IO enhancements
     You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so
 
     .. ipython:: python
-       :okwarning:
 
+        import io
         df = pd.DataFrame({"a": range(3), "b": list("abc")})
         print(df)
         html = df.to_html()
-        alist = pd.read_html(html, index_col=0)
+        alist = pd.read_html(io.StringIO(html), index_col=0)
         print(df == alist[0])
 
     Note that ``alist`` here is a Python ``list`` so ``pd.read_html()`` and

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -103,6 +103,7 @@ Deprecations
 
 Removal of prior version deprecations/changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
 - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
 - Removed ``DataFrameGroupBy.grouper`` and ``SeriesGroupBy.grouper`` (:issue:`56521`)
 - Removed ``DataFrameGroupby.fillna`` and ``SeriesGroupBy.fillna``` (:issue:`55719`)

@@ -8,7 +8,6 @@
 )
 import datetime
 from functools import partial
-from io import BytesIO
 import os
 from textwrap import fill
 from typing import (
@@ -94,7 +93,7 @@
 
 Parameters
 ----------
-io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
+io : str, ExcelFile, xlrd.Book, path object, or file-like object
     Any valid string path is acceptable. The string could be a URL. Valid
     URL schemes include http, ftp, s3, and file. For file URLs, a host is
     expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
@@ -552,10 +551,6 @@ def __init__(
         if engine_kwargs is None:
             engine_kwargs = {}
 
-        # First argument can also be bytes, so create a buffer
-        if isinstance(filepath_or_buffer, bytes):
-            filepath_or_buffer = BytesIO(filepath_or_buffer)
-
         self.handles = IOHandles(
             handle=filepath_or_buffer, compression={"method": None}
         )
@@ -1405,9 +1400,6 @@ def inspect_excel_format(
     BadZipFile
         If resulting stream does not have an XLS signature and is not a valid zipfile.
     """
-    if isinstance(content_or_path, bytes):
-        content_or_path = BytesIO(content_or_path)
-
     with get_handle(
         content_or_path, "rb", storage_options=storage_options, is_text=False
     ) as handle:
@@ -1526,19 +1518,6 @@ def __init__(
         if engine is not None and engine not in self._engines:
             raise ValueError(f"Unknown engine: {engine}")
 
-        # First argument can also be bytes, so create a buffer
-        if isinstance(path_or_buffer, bytes):
-            path_or_buffer = BytesIO(path_or_buffer)
-            warnings.warn(
-                "Passing bytes to 'read_excel' is deprecated and "
-                "will be removed in a future version. To read from a "
-                "byte string, wrap it in a `BytesIO` object.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-
-        # Could be a str, ExcelFile, Book, etc.
-        self.io = path_or_buffer
         # Always a string
         self._io = stringify_path(path_or_buffer)
 

diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
@@ -24,10 +24,7 @@
 from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import get_handle
-from pandas.io.xml import (
-    get_data_from_filepath,
-    preprocess_data,
-)
+from pandas.io.xml import get_data_from_filepath
 
 if TYPE_CHECKING:
     from pandas._typing import (
@@ -548,7 +545,7 @@ def _transform_doc(self) -> bytes:
             storage_options=self.storage_options,
         )
 
-        with preprocess_data(handle_data) as xml_data:
+        with handle_data as xml_data:
             curr_parser = XMLParser(encoding=self.encoding)
 
             if isinstance(xml_data, io.StringIO):

diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -7,15 +7,16 @@
 from __future__ import annotations
 
 from collections import abc
+import errno
 import numbers
+import os
 import re
 from re import Pattern
 from typing import (
     TYPE_CHECKING,
     Literal,
     cast,
 )
-import warnings
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
@@ -24,7 +25,6 @@
     EmptyDataError,
 )
 from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import is_list_like
@@ -36,10 +36,7 @@
 from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import (
-    file_exists,
     get_handle,
-    is_file_like,
-    is_fsspec_url,
     is_url,
     stringify_path,
     validate_header_arg,
@@ -134,21 +131,17 @@ def _read(
     -------
     raw_text : str
     """
-    text: str | bytes
-    if (
-        is_url(obj)
-        or hasattr(obj, "read")
-        or (isinstance(obj, str) and file_exists(obj))
-    ):
+    try:
         with get_handle(
             obj, "r", encoding=encoding, storage_options=storage_options
         ) as handles:
-            text = handles.handle.read()
-    elif isinstance(obj, (str, bytes)):
-        text = obj
-    else:
-        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
-    return text
+            return handles.handle.read()
+    except OSError as err:
+        if not is_url(obj):
+            raise FileNotFoundError(
+                f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}"
+            ) from err
+        raise
 
 
 class _HtmlFrameParser:
@@ -158,7 +151,7 @@ class _HtmlFrameParser:
     Parameters
     ----------
     io : str or file-like
-        This can be either a string of raw HTML, a valid URL using the HTTP,
+        This can be either a string path, a valid URL using the HTTP,
         FTP, or FILE protocols or a file-like object.
 
     match : str or regex
@@ -780,36 +773,26 @@ def _build_doc(self):
         from lxml.etree import XMLSyntaxError
         from lxml.html import (
             HTMLParser,
-            fromstring,
             parse,
         )
 
         parser = HTMLParser(recover=True, encoding=self.encoding)
 
-        try:
-            if is_url(self.io):
-                with get_handle(
-                    self.io, "r", storage_options=self.storage_options
-                ) as f:
-                    r = parse(f.handle, parser=parser)
-            else:
-                # try to parse the input in the simplest way
-                r = parse(self.io, parser=parser)
+        if is_url(self.io):
+            with get_handle(self.io, "r", storage_options=self.storage_options) as f:
+                r = parse(f.handle, parser=parser)
+        else:
+            # try to parse the input in the simplest way
             try:
-                r = r.getroot()
-            except AttributeError:
-                pass
-        except (UnicodeDecodeError, OSError) as e:
-            # if the input is a blob of html goop
-            if not is_url(self.io):
-                r = fromstring(self.io, parser=parser)
-
-                try:
-                    r = r.getroot()
-                except AttributeError:
-                    pass
-            else:
-                raise e
+                r = parse(self.io, parser=parser)
+            except OSError as err:
+                raise FileNotFoundError(
+                    f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}"
+                ) from err
+        try:
+            r = r.getroot()
+        except AttributeError:
+            pass
         else:
             if not hasattr(r, "text_content"):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
@@ -1059,7 +1042,7 @@ def read_html(
     io : str, path object, or file-like object
         String, path object (implementing ``os.PathLike[str]``), or file-like
         object implementing a string ``read()`` function.
-        The string can represent a URL or the HTML itself. Note that
+        The string can represent a URL. Note that
         lxml only accepts the http, ftp and file url protocols. If you have a
         URL that starts with ``'https'`` you might try removing the ``'s'``.
 
@@ -1227,22 +1210,6 @@ def read_html(
 
     io = stringify_path(io)
 
-    if isinstance(io, str) and not any(
-        [
-            is_file_like(io),
-            file_exists(io),
-            is_url(io),
-            is_fsspec_url(io),
-        ]
-    ):
-        warnings.warn(
-            "Passing literal html to 'read_html' is deprecated and "
-            "will be removed in a future version. To read from a "
-            "literal string, wrap it in a 'StringIO' object.",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
-
     return _parse(
         flavor=flavor,
         io=io,