From f909e0d5af8066da804b02a764e74019eae537da Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 10:43:13 -0800
Subject: [PATCH 01/10] Ban literal json in read_json

---
 pandas/io/json/_json.py             | 87 ++++++++---------------------
 pandas/tests/io/json/test_pandas.py | 46 +++++----------
 2 files changed, 35 insertions(+), 98 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index cea34cdfb0b9d..a75e4e6aa9600 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -5,7 +5,6 @@
     abstractmethod,
 )
 from collections import abc
-from io import StringIO
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -30,7 +29,6 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import (
@@ -58,9 +56,7 @@
     extension_to_compression,
     file_exists,
     get_handle,
-    is_fsspec_url,
     is_potential_multi_index,
-    is_url,
     stringify_path,
 )
 from pandas.io.json._normalize import convert_to_line_delimits
@@ -530,7 +526,7 @@ def read_json(
 
     Parameters
     ----------
-    path_or_buf : a valid JSON str, path object or file-like object
+    path_or_buf : a string path, path object or file-like object
         Any valid string path is acceptable. The string could be a URL. Valid
         URL schemes include http, ftp, s3, and file. For file URLs, a host is
         expected. A local file could be:
@@ -879,18 +875,6 @@ def __init__(
             self.nrows = validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")
-        if (
-            isinstance(filepath_or_buffer, str)
-            and not self.lines
-            and "\n" in filepath_or_buffer
-        ):
-            warnings.warn(
-                "Passing literal json to 'read_json' is deprecated and "
-                "will be removed in a future version. To read from a "
-                "literal string, wrap it in a 'StringIO' object.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
         if self.engine == "pyarrow":
             if not self.lines:
                 raise ValueError(
@@ -900,45 +884,22 @@ def __init__(
             self.data = filepath_or_buffer
         elif self.engine == "ujson":
             data = self._get_data_from_filepath(filepath_or_buffer)
-            self.data = self._preprocess_data(data)
-
-    def _preprocess_data(self, data):
-        """
-        At this point, the data either has a `read` attribute (e.g. a file
-        object or a StringIO) or is a string that is a JSON document.
-
-        If self.chunksize, we prepare the data for the `__next__` method.
-        Otherwise, we read it into memory for the `read` method.
-        """
-        if hasattr(data, "read") and not (self.chunksize or self.nrows):
-            with self:
-                data = data.read()
-        if not hasattr(data, "read") and (self.chunksize or self.nrows):
-            data = StringIO(data)
-
-        return data
+            # If self.chunksize, we prepare the data for the `__next__` method.
+            # Otherwise, we read it into memory for the `read` method.
+            if not (self.chunksize or self.nrows):
+                with self:
+                    self.data = data.read()
+            else:
+                self.data = data
 
     def _get_data_from_filepath(self, filepath_or_buffer):
         """
-        The function read_json accepts three input types:
+        The function read_json accepts two input types:
             1. filepath (string-like)
             2. file-like object (e.g. open file object, StringIO)
-            3. JSON string
-
-        This method turns (1) into (2) to simplify the rest of the processing.
-        It returns input types (2) and (3) unchanged.
-
-        It raises FileNotFoundError if the input is a string ending in
-        one of .json, .json.gz, .json.bz2, etc. but no such file exists.
         """
-        # if it is a string but the file does not exist, it might be a JSON string
         filepath_or_buffer = stringify_path(filepath_or_buffer)
-        if (
-            not isinstance(filepath_or_buffer, str)
-            or is_url(filepath_or_buffer)
-            or is_fsspec_url(filepath_or_buffer)
-            or file_exists(filepath_or_buffer)
-        ):
+        try:
             self.handles = get_handle(
                 filepath_or_buffer,
                 "r",
@@ -947,23 +908,19 @@ def _get_data_from_filepath(self, filepath_or_buffer):
                 storage_options=self.storage_options,
                 errors=self.encoding_errors,
             )
-            filepath_or_buffer = self.handles.handle
-        elif (
-            isinstance(filepath_or_buffer, str)
-            and filepath_or_buffer.lower().endswith(
-                (".json",) + tuple(f".json{c}" for c in extension_to_compression)
-            )
-            and not file_exists(filepath_or_buffer)
-        ):
-            raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
-        else:
-            warnings.warn(
-                "Passing literal json to 'read_json' is deprecated and "
-                "will be removed in a future version. To read from a "
-                "literal string, wrap it in a 'StringIO' object.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
+        except OSError as err:
+            if (
+                isinstance(filepath_or_buffer, str)
+                and filepath_or_buffer.lower().endswith(
+                    (".json",) + tuple(f".json{c}" for c in extension_to_compression)
+                )
+                and not file_exists(filepath_or_buffer)
+            ):
+                raise FileNotFoundError(
+                    f"File {filepath_or_buffer} does not exist"
+                ) from err
+            raise
+        filepath_or_buffer = self.handles.handle
         return filepath_or_buffer
 
     def _combine_lines(self, lines) -> str:
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 9a263e8bc5f44..b5e197303d12c 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -36,49 +36,29 @@
 from pandas.io.json import ujson_dumps
 
 
-def test_literal_json_deprecation():
+def test_literal_json_raises():
     # PR 53409
-    expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
-
     jsonl = """{"a": 1, "b": 2}
         {"a": 3, "b": 4}
         {"a": 5, "b": 6}
         {"a": 7, "b": 8}"""
 
-    msg = (
-        "Passing literal json to 'read_json' is deprecated and "
-        "will be removed in a future version. To read from a "
-        "literal string, wrap it in a 'StringIO' object."
-    )
+    msg = r"\[Errno 2\] No such file or directory"
 
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        try:
-            read_json(jsonl, lines=False)
-        except ValueError:
-            pass
+    with pytest.raises(FileNotFoundError, match=msg):
+        read_json(jsonl, lines=False)
 
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        read_json(expected.to_json(), lines=False)
+    with pytest.raises(FileNotFoundError, match=msg):
+        read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
 
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
-        tm.assert_frame_equal(result, expected)
+    with pytest.raises(FileNotFoundError, match=msg):
+        read_json(
+            '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n',
+            lines=False,
+        )
 
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        try:
-            result = read_json(
-                '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n',
-                lines=False,
-            )
-        except ValueError:
-            pass
-
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        try:
-            result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False)
-        except ValueError:
-            pass
-        tm.assert_frame_equal(result, expected)
+    with pytest.raises(FileNotFoundError, match=msg):
+        read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False)
 
 
 def assert_json_roundtrip_equal(result, expected, orient):

From ddcf0344b5d83f76c98ae26372ebf8c2ef141471 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:34:04 -0800
Subject: [PATCH 02/10] address xml

---
 pandas/io/formats/xml.py           |  7 +--
 pandas/io/xml.py                   | 75 ++++++++----------------------
 pandas/tests/io/xml/test_to_xml.py | 37 +++++++--------
 pandas/tests/io/xml/test_xml.py    | 72 ++++++++++------------------
 4 files changed, 62 insertions(+), 129 deletions(-)

diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
index 775f1842692cb..e55561902d4d3 100644
--- a/pandas/io/formats/xml.py
+++ b/pandas/io/formats/xml.py
@@ -24,10 +24,7 @@
 from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import get_handle
-from pandas.io.xml import (
-    get_data_from_filepath,
-    preprocess_data,
-)
+from pandas.io.xml import get_data_from_filepath
 
 if TYPE_CHECKING:
     from pandas._typing import (
@@ -548,7 +545,7 @@ def _transform_doc(self) -> bytes:
             storage_options=self.storage_options,
         )
 
-        with preprocess_data(handle_data) as xml_data:
+        with handle_data as xml_data:
             curr_parser = XMLParser(encoding=self.encoding)
 
             if isinstance(xml_data, io.StringIO):
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 97bf520a77611..24ab111215218 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -11,7 +11,6 @@
     Any,
     Callable,
 )
-import warnings
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
@@ -20,7 +19,6 @@
     ParserError,
 )
 from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import is_list_like
@@ -28,10 +26,8 @@
 from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import (
-    file_exists,
     get_handle,
     infer_compression,
-    is_file_like,
     is_fsspec_url,
     is_url,
     stringify_path,
@@ -528,7 +524,7 @@ def _parse_doc(
             storage_options=self.storage_options,
         )
 
-        with preprocess_data(handle_data) as xml_data:
+        with handle_data as xml_data:
             curr_parser = XMLParser(encoding=self.encoding)
             document = parse(xml_data, parser=curr_parser)
 
@@ -635,7 +631,7 @@ def _parse_doc(
             storage_options=self.storage_options,
         )
 
-        with preprocess_data(handle_data) as xml_data:
+        with handle_data as xml_data:
             curr_parser = XMLParser(encoding=self.encoding)
 
             if isinstance(xml_data, io.StringIO):
@@ -677,40 +673,23 @@ def get_data_from_filepath(
     """
     Extract raw XML data.
 
-    The method accepts three input types:
+    The method accepts two input types:
         1. filepath (string-like)
         2. file-like object (e.g. open file object, StringIO)
-        3. XML string or bytes
-
-    This method turns (1) into (2) to simplify the rest of the processing.
-    It returns input types (2) and (3) unchanged.
     """
-    if not isinstance(filepath_or_buffer, bytes):
-        filepath_or_buffer = stringify_path(filepath_or_buffer)
-
-    if (
-        isinstance(filepath_or_buffer, str)
-        and not filepath_or_buffer.startswith(("<?xml", "<"))
-    ) and (
-        not isinstance(filepath_or_buffer, str)
-        or is_url(filepath_or_buffer)
-        or is_fsspec_url(filepath_or_buffer)
-        or file_exists(filepath_or_buffer)
-    ):
-        with get_handle(
-            filepath_or_buffer,
-            "r",
-            encoding=encoding,
-            compression=compression,
-            storage_options=storage_options,
-        ) as handle_obj:
-            filepath_or_buffer = (
-                handle_obj.handle.read()
-                if hasattr(handle_obj.handle, "read")
-                else handle_obj.handle
-            )
-
-    return filepath_or_buffer
+    filepath_or_buffer = stringify_path(filepath_or_buffer)
+    with get_handle(
+        filepath_or_buffer,
+        "r",
+        encoding=encoding,
+        compression=compression,
+        storage_options=storage_options,
+    ) as handle_obj:
+        return (
+            preprocess_data(handle_obj.handle.read())
+            if hasattr(handle_obj.handle, "read")
+            else handle_obj.handle
+        )
 
 
 def preprocess_data(data) -> io.StringIO | io.BytesIO:
@@ -790,22 +769,6 @@ def _parse(
 
     p: _EtreeFrameParser | _LxmlFrameParser
 
-    if isinstance(path_or_buffer, str) and not any(
-        [
-            is_file_like(path_or_buffer),
-            file_exists(path_or_buffer),
-            is_url(path_or_buffer),
-            is_fsspec_url(path_or_buffer),
-        ]
-    ):
-        warnings.warn(
-            "Passing literal xml to 'read_xml' is deprecated and "
-            "will be removed in a future version. To read from a "
-            "literal string, wrap it in a 'StringIO' object.",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
-
     if parser == "lxml":
         lxml = import_optional_dependency("lxml.etree", errors="ignore")
 
@@ -894,8 +857,8 @@ def read_xml(
     ----------
     path_or_buffer : str, path object, or file-like object
         String, path object (implementing ``os.PathLike[str]``), or file-like
-        object implementing a ``read()`` function. The string can be any valid XML
-        string or a path. The string can further be a URL. Valid URL schemes
+        object implementing a ``read()`` function. The string can be a path.
+        The string can further be a URL. Valid URL schemes
         include http, ftp, s3, and file.
 
         .. deprecated:: 2.1.0
@@ -969,7 +932,7 @@ def read_xml(
         and ability to use XSLT stylesheet are supported.
 
     stylesheet : str, path object or file-like object
-        A URL, file-like object, or a raw string containing an XSLT script.
+        A URL, file-like object, or a string path to a file containing an XSLT script.
         This stylesheet should flatten complex, deeply nested XML documents
         for easier parsing. To use this feature you must have ``lxml`` module
         installed and specify 'lxml' as ``parser``. The ``xpath`` must
diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index a123f6dd52c08..62cc33376c630 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -1034,26 +1034,23 @@ def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df):
     with open(
         xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None
     ) as f:
-        xsl_obj = f.read()
-
-    output = geom_df.to_xml(stylesheet=xsl_obj)
+        output = geom_df.to_xml(stylesheet=f)
 
     assert output == xsl_expected
 
 
 def test_stylesheet_wrong_path(geom_df):
-    lxml_etree = pytest.importorskip("lxml.etree")
+    pytest.importorskip("lxml.etree")
 
-    xsl = os.path.join("data", "xml", "row_field_output.xslt")
+    xsl = os.path.join("does", "not", "exist", "row_field_output.xslt")
 
     with pytest.raises(
-        lxml_etree.XMLSyntaxError,
-        match=("Start tag expected, '<' not found"),
+        FileNotFoundError, match=r"\[Errno 2\] No such file or director"
     ):
         geom_df.to_xml(stylesheet=xsl)
 
 
-@pytest.mark.parametrize("val", ["", b""])
+@pytest.mark.parametrize("val", [StringIO(""), BytesIO(b"")])
 def test_empty_string_stylesheet(val, geom_df):
     lxml_etree = pytest.importorskip("lxml.etree")
 
@@ -1095,9 +1092,9 @@ def test_incorrect_xsl_syntax(geom_df):
 </xsl:stylesheet>"""
 
     with pytest.raises(
-        lxml_etree.XMLSyntaxError, match=("Opening and ending tag mismatch")
+        lxml_etree.XMLSyntaxError, match="Opening and ending tag mismatch"
     ):
-        geom_df.to_xml(stylesheet=xsl)
+        geom_df.to_xml(stylesheet=StringIO(xsl))
 
 
 def test_incorrect_xsl_eval(geom_df):
@@ -1124,8 +1121,8 @@ def test_incorrect_xsl_eval(geom_df):
     </xsl:template>
 </xsl:stylesheet>"""
 
-    with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")):
-        geom_df.to_xml(stylesheet=xsl)
+    with pytest.raises(lxml_etree.XSLTParseError, match="failed to compile"):
+        geom_df.to_xml(stylesheet=StringIO(xsl))
 
 
 def test_incorrect_xsl_apply(geom_df):
@@ -1143,9 +1140,9 @@ def test_incorrect_xsl_apply(geom_df):
     </xsl:template>
 </xsl:stylesheet>"""
 
-    with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")):
+    with pytest.raises(lxml_etree.XSLTApplyError, match="Cannot resolve URI"):
         with tm.ensure_clean("test.xml") as path:
-            geom_df.to_xml(path, stylesheet=xsl)
+            geom_df.to_xml(path, stylesheet=StringIO(xsl))
 
 
 def test_stylesheet_with_etree(geom_df):
@@ -1160,10 +1157,8 @@ def test_stylesheet_with_etree(geom_df):
         </xsl:copy>
     </xsl:template>"""
 
-    with pytest.raises(
-        ValueError, match=("To use stylesheet, you need lxml installed")
-    ):
-        geom_df.to_xml(parser="etree", stylesheet=xsl)
+    with pytest.raises(ValueError, match="To use stylesheet, you need lxml installed"):
+        geom_df.to_xml(parser="etree", stylesheet=StringIO(xsl))
 
 
 def test_style_to_csv(geom_df):
@@ -1190,7 +1185,7 @@ def test_style_to_csv(geom_df):
 
     if out_csv is not None:
         out_csv = out_csv.strip()
-    out_xml = geom_df.to_xml(stylesheet=xsl)
+    out_xml = geom_df.to_xml(stylesheet=StringIO(xsl))
 
     assert out_csv == out_xml
 
@@ -1224,7 +1219,7 @@ def test_style_to_string(geom_df):
 </xsl:stylesheet>"""
 
     out_str = geom_df.to_string()
-    out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl)
+    out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=StringIO(xsl))
 
     assert out_xml == out_str
 
@@ -1269,7 +1264,7 @@ def test_style_to_json(geom_df):
 </xsl:stylesheet>"""
 
     out_json = geom_df.to_json()
-    out_xml = geom_df.to_xml(stylesheet=xsl)
+    out_xml = geom_df.to_xml(stylesheet=StringIO(xsl))
 
     assert out_json == out_xml
 
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 0ee3ec85ab6c6..2f5eca5e1e353 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -247,16 +247,12 @@
 )
 
 
-def test_literal_xml_deprecation():
+def test_literal_xml_raises():
     # GH 53809
     pytest.importorskip("lxml")
-    msg = (
-        "Passing literal xml to 'read_xml' is deprecated and "
-        "will be removed in a future version. To read from a "
-        "literal string, wrap it in a 'StringIO' object."
-    )
+    msg = r"\[Errno 2\] No such file or directory"
 
-    with tm.assert_produces_warning(FutureWarning, match=msg):
+    with pytest.raises(FileNotFoundError, match=msg):
         read_xml(xml_default_nmsp)
 
 
@@ -490,16 +486,10 @@ def test_empty_string_etree(val):
 
 
 def test_wrong_file_path(parser):
-    msg = (
-        "Passing literal xml to 'read_xml' is deprecated and "
-        "will be removed in a future version. To read from a "
-        "literal string, wrap it in a 'StringIO' object."
-    )
-    filename = os.path.join("data", "html", "books.xml")
+    filename = os.path.join("does", "not", "exist", "books.xml")
 
     with pytest.raises(
-        FutureWarning,
-        match=msg,
+        FileNotFoundError, match=r"\[Errno 2\] No such file or directory"
     ):
         read_xml(filename, parser=parser)
 
@@ -1197,14 +1187,12 @@ def test_stylesheet_io(kml_cta_rail_lines, xsl_flatten_doc, mode):
 def test_stylesheet_buffered_reader(kml_cta_rail_lines, xsl_flatten_doc, mode):
     pytest.importorskip("lxml")
     with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f:
-        xsl_obj = f.read()
-
-    df_style = read_xml(
-        kml_cta_rail_lines,
-        xpath=".//k:Placemark",
-        namespaces={"k": "http://www.opengis.net/kml/2.2"},
-        stylesheet=xsl_obj,
-    )
+        df_style = read_xml(
+            kml_cta_rail_lines,
+            xpath=".//k:Placemark",
+            namespaces={"k": "http://www.opengis.net/kml/2.2"},
+            stylesheet=f,
+        )
 
     tm.assert_frame_equal(df_kml, df_style)
 
@@ -1233,7 +1221,7 @@ def test_style_charset():
 </xsl:stylesheet>"""
 
     df_orig = read_xml(StringIO(xml))
-    df_style = read_xml(StringIO(xml), stylesheet=xsl)
+    df_style = read_xml(StringIO(xml), stylesheet=StringIO(xsl))
 
     tm.assert_frame_equal(df_orig, df_style)
 
@@ -1271,9 +1259,9 @@ def test_incorrect_xsl_syntax(kml_cta_rail_lines):
 </xsl:stylesheet>"""
 
     with pytest.raises(
-        lxml_etree.XMLSyntaxError, match=("Extra content at the end of the document")
+        lxml_etree.XMLSyntaxError, match="Extra content at the end of the document"
     ):
-        read_xml(kml_cta_rail_lines, stylesheet=xsl)
+        read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl))
 
 
 def test_incorrect_xsl_eval(kml_cta_rail_lines):
@@ -1299,8 +1287,8 @@ def test_incorrect_xsl_eval(kml_cta_rail_lines):
     <xsl:template match="k:description|k:Snippet|k:Style"/>
 </xsl:stylesheet>"""
 
-    with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")):
-        read_xml(kml_cta_rail_lines, stylesheet=xsl)
+    with pytest.raises(lxml_etree.XSLTParseError, match="failed to compile"):
+        read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl))
 
 
 def test_incorrect_xsl_apply(kml_cta_rail_lines):
@@ -1318,18 +1306,17 @@ def test_incorrect_xsl_apply(kml_cta_rail_lines):
     </xsl:template>
 </xsl:stylesheet>"""
 
-    with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")):
-        read_xml(kml_cta_rail_lines, stylesheet=xsl)
+    with pytest.raises(lxml_etree.XSLTApplyError, match="Cannot resolve URI"):
+        read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl))
 
 
 def test_wrong_stylesheet(kml_cta_rail_lines, xml_data_path):
-    xml_etree = pytest.importorskip("lxml.etree")
+    pytest.importorskip("lxml.etree")
 
-    xsl = xml_data_path / "flatten.xsl"
+    xsl = xml_data_path / "flatten_doesnt_exist.xsl"
 
     with pytest.raises(
-        xml_etree.XMLSyntaxError,
-        match=("Start tag expected, '<' not found"),
+        FileNotFoundError, match=r"\[Errno 2\] No such file or directory"
     ):
         read_xml(kml_cta_rail_lines, stylesheet=xsl)
 
@@ -1359,20 +1346,11 @@ def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc):
         read_xml(kml_cta_rail_lines, parser="etree", stylesheet=xsl_flatten_doc)
 
 
-@pytest.mark.parametrize("val", ["", b""])
-def test_empty_stylesheet(val):
+@pytest.mark.parametrize("val", [StringIO(""), BytesIO(b"")])
+def test_empty_stylesheet(val, kml_cta_rail_lines):
     lxml_etree = pytest.importorskip("lxml.etree")
-
-    msg = (
-        "Passing literal xml to 'read_xml' is deprecated and "
-        "will be removed in a future version. To read from a "
-        "literal string, wrap it in a 'StringIO' object."
-    )
-    kml = os.path.join("data", "xml", "cta_rail_lines.kml")
-
     with pytest.raises(lxml_etree.XMLSyntaxError):
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            read_xml(kml, stylesheet=val)
+        read_xml(kml_cta_rail_lines, stylesheet=val)
 
 
 # ITERPARSE
@@ -1910,7 +1888,7 @@ def test_online_stylesheet():
         StringIO(xml),
         xpath=".//tr[td and position() <= 6]",
         names=["title", "artist"],
-        stylesheet=xsl,
+        stylesheet=StringIO(xsl),
     )
 
     df_expected = DataFrame(

From 1bf5e9c72b32959a6c93f426fba67b37b3cffab3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:52:41 -0800
Subject: [PATCH 03/10] Add html

---
 pandas/io/html.py            | 83 +++++++++++-------------------------
 pandas/tests/io/test_html.py | 10 ++---
 2 files changed, 27 insertions(+), 66 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index adcb78d3fb7d1..3197880f4eaaf 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -7,7 +7,9 @@
 from __future__ import annotations
 
 from collections import abc
+import errno
 import numbers
+import os
 import re
 from re import Pattern
 from typing import (
@@ -15,7 +17,6 @@
     Literal,
     cast,
 )
-import warnings
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
@@ -24,7 +25,6 @@
     EmptyDataError,
 )
 from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import is_list_like
@@ -36,10 +36,7 @@
 from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import (
-    file_exists,
     get_handle,
-    is_file_like,
-    is_fsspec_url,
     is_url,
     stringify_path,
     validate_header_arg,
@@ -134,21 +131,15 @@ def _read(
     -------
     raw_text : str
     """
-    text: str | bytes
-    if (
-        is_url(obj)
-        or hasattr(obj, "read")
-        or (isinstance(obj, str) and file_exists(obj))
-    ):
+    try:
         with get_handle(
             obj, "r", encoding=encoding, storage_options=storage_options
         ) as handles:
-            text = handles.handle.read()
-    elif isinstance(obj, (str, bytes)):
-        text = obj
-    else:
-        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
-    return text
+            return handles.handle.read()
+    except OSError as err:
+        raise FileNotFoundError(
+            f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}"
+        ) from err
 
 
 class _HtmlFrameParser:
@@ -158,7 +149,7 @@ class _HtmlFrameParser:
     Parameters
     ----------
     io : str or file-like
-        This can be either a string of raw HTML, a valid URL using the HTTP,
+        This can be either a string path, a valid URL using the HTTP,
         FTP, or FILE protocols or a file-like object.
 
     match : str or regex
@@ -780,36 +771,26 @@ def _build_doc(self):
         from lxml.etree import XMLSyntaxError
         from lxml.html import (
             HTMLParser,
-            fromstring,
             parse,
         )
 
         parser = HTMLParser(recover=True, encoding=self.encoding)
 
-        try:
-            if is_url(self.io):
-                with get_handle(
-                    self.io, "r", storage_options=self.storage_options
-                ) as f:
-                    r = parse(f.handle, parser=parser)
-            else:
-                # try to parse the input in the simplest way
-                r = parse(self.io, parser=parser)
+        if is_url(self.io):
+            with get_handle(self.io, "r", storage_options=self.storage_options) as f:
+                r = parse(f.handle, parser=parser)
+        else:
+            # try to parse the input in the simplest way
             try:
-                r = r.getroot()
-            except AttributeError:
-                pass
-        except (UnicodeDecodeError, OSError) as e:
-            # if the input is a blob of html goop
-            if not is_url(self.io):
-                r = fromstring(self.io, parser=parser)
-
-                try:
-                    r = r.getroot()
-                except AttributeError:
-                    pass
-            else:
-                raise e
+                r = parse(self.io, parser=parser)
+            except OSError as err:
+                raise FileNotFoundError(
+                    f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}"
+                ) from err
+        try:
+            r = r.getroot()
+        except AttributeError:
+            pass
         else:
             if not hasattr(r, "text_content"):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
@@ -1059,7 +1040,7 @@ def read_html(
     io : str, path object, or file-like object
         String, path object (implementing ``os.PathLike[str]``), or file-like
         object implementing a string ``read()`` function.
-        The string can represent a URL or the HTML itself. Note that
+        The string can represent a URL. Note that
         lxml only accepts the http, ftp and file url protocols. If you have a
         URL that starts with ``'https'`` you might try removing the ``'s'``.
 
@@ -1227,22 +1208,6 @@ def read_html(
 
     io = stringify_path(io)
 
-    if isinstance(io, str) and not any(
-        [
-            is_file_like(io),
-            file_exists(io),
-            is_url(io),
-            is_fsspec_url(io),
-        ]
-    ):
-        warnings.warn(
-            "Passing literal html to 'read_html' is deprecated and "
-            "will be removed in a future version. To read from a "
-            "literal string, wrap it in a 'StringIO' object.",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
-
     return _parse(
         flavor=flavor,
         io=io,
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 73044b8c24a53..2251fa20f0b63 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -112,13 +112,9 @@ def flavor_read_html(request):
 class TestReadHtml:
     def test_literal_html_deprecation(self, flavor_read_html):
         # GH 53785
-        msg = (
-            "Passing literal html to 'read_html' is deprecated and "
-            "will be removed in a future version. To read from a "
-            "literal string, wrap it in a 'StringIO' object."
-        )
+        msg = r"\[Errno 2\] No such file or director"
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with pytest.raises(FileNotFoundError, match=msg):
             flavor_read_html(
                 """<table>
                 <thead>
@@ -1405,7 +1401,7 @@ def test_encode(self, html_encoding_file, flavor_read_html):
         try:
             with open(html_encoding_file, "rb") as fobj:
                 from_string = flavor_read_html(
-                    fobj.read(), encoding=encoding, index_col=0
+                    BytesIO(fobj.read()), encoding=encoding, index_col=0
                 ).pop()
 
             with open(html_encoding_file, "rb") as fobj:

From 60b5d3f43c537d524c4029b8bf57d7666faae479 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 14:04:54 -0800
Subject: [PATCH 04/10] Fix excel

---
 pandas/io/excel/_base.py              | 23 +----------------------
 pandas/tests/io/excel/test_readers.py | 13 +++----------
 2 files changed, 4 insertions(+), 32 deletions(-)

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 1f272d0e09db8..cf9c3be97ee5c 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -8,7 +8,6 @@
 )
 import datetime
 from functools import partial
-from io import BytesIO
 import os
 from textwrap import fill
 from typing import (
@@ -94,7 +93,7 @@
 
 Parameters
 ----------
-io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
+io : str, ExcelFile, xlrd.Book, path object, or file-like object
     Any valid string path is acceptable. The string could be a URL. Valid
     URL schemes include http, ftp, s3, and file. For file URLs, a host is
     expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
@@ -552,10 +551,6 @@ def __init__(
         if engine_kwargs is None:
             engine_kwargs = {}
 
-        # First argument can also be bytes, so create a buffer
-        if isinstance(filepath_or_buffer, bytes):
-            filepath_or_buffer = BytesIO(filepath_or_buffer)
-
         self.handles = IOHandles(
             handle=filepath_or_buffer, compression={"method": None}
         )
@@ -1405,9 +1400,6 @@ def inspect_excel_format(
     BadZipFile
         If resulting stream does not have an XLS signature and is not a valid zipfile.
     """
-    if isinstance(content_or_path, bytes):
-        content_or_path = BytesIO(content_or_path)
-
     with get_handle(
         content_or_path, "rb", storage_options=storage_options, is_text=False
     ) as handle:
@@ -1526,19 +1518,6 @@ def __init__(
         if engine is not None and engine not in self._engines:
             raise ValueError(f"Unknown engine: {engine}")
 
-        # First argument can also be bytes, so create a buffer
-        if isinstance(path_or_buffer, bytes):
-            path_or_buffer = BytesIO(path_or_buffer)
-            warnings.warn(
-                "Passing bytes to 'read_excel' is deprecated and "
-                "will be removed in a future version. To read from a "
-                "byte string, wrap it in a `BytesIO` object.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-
-        # Could be a str, ExcelFile, Book, etc.
-        self.io = path_or_buffer
         # Always a string
         self._io = stringify_path(path_or_buffer)
 
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 708f01839a23c..e4a8791396ee2 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1448,17 +1448,10 @@ def test_euro_decimal_format(self, read_ext):
 
 
 class TestExcelFileRead:
-    def test_deprecate_bytes_input(self, engine, read_ext):
+    def test_raises_bytes_input(self, engine, read_ext):
         # GH 53830
-        msg = (
-            "Passing bytes to 'read_excel' is deprecated and "
-            "will be removed in a future version. To read from a "
-            "byte string, wrap it in a `BytesIO` object."
-        )
-
-        with tm.assert_produces_warning(
-            FutureWarning, match=msg, raise_on_extra_warnings=False
-        ):
+        msg = "Expected file path name or file-like object"
+        with pytest.raises(TypeError, match=msg):
             with open("test1" + read_ext, "rb") as f:
                 pd.read_excel(f.read(), engine=engine)
 

From 3e0ccb90b50e4642346fc7ba7cf3abf6366d393f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 14:08:19 -0800
Subject: [PATCH 05/10] Add whatsnew

---
 doc/source/whatsnew/v3.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 2c39318fa28b3..f43cd7547e4b2 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -103,6 +103,7 @@ Deprecations
 
 Removal of prior version deprecations/changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept a raw string or bytes representation of the data. Such data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` object (:issue:`53767`)
 - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
 - Removed :meth:`DataFrame.first` and :meth:`DataFrame.last` (:issue:`53710`)
 - Removed :meth:`DataFrameGroupby.fillna` and :meth:`SeriesGroupBy.fillna` (:issue:`55719`)

From f9a986438cdc85bb294915a8df0915d1db6d79a5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 14:59:51 -0800
Subject: [PATCH 06/10] Fix docs

---
 doc/source/user_guide/io.rst    | 4 ++--
 doc/source/whatsnew/v0.12.0.rst | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 80572de91e0c7..a08315818366f 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3247,7 +3247,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``:
       </row>
     </response>"""
 
-   df = pd.read_xml(StringIO(xml), stylesheet=xsl)
+   df = pd.read_xml(StringIO(xml), stylesheet=StringIO(xsl))
    df
 
 For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
@@ -3418,7 +3418,7 @@ Write an XML and transform with stylesheet:
       </xsl:template>
     </xsl:stylesheet>"""
 
-   print(geom_df.to_xml(stylesheet=xsl))
+   print(geom_df.to_xml(stylesheet=StringIO(xsl)))
 
 
 XML Final Notes
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index 59d104cb3e96c..c805758f85b35 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -201,12 +201,12 @@ IO enhancements
     You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so
 
     .. ipython:: python
-       :okwarning:
 
+        import io
         df = pd.DataFrame({"a": range(3), "b": list("abc")})
         print(df)
         html = df.to_html()
-        alist = pd.read_html(html, index_col=0)
+        alist = pd.read_html(io.StringIO(html), index_col=0)
         print(df == alist[0])
 
     Note that ``alist`` here is a Python ``list`` so ``pd.read_html()`` and

From 1a71d9fc5e2dd722725c1095e67cf3f049d4c7d5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 16:36:18 -0800
Subject: [PATCH 07/10] Adjust some tests

---
 pandas/io/html.py                   | 8 +++++---
 pandas/tests/io/json/test_pandas.py | 2 +-
 pandas/tests/io/xml/test_xml.py     | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 3197880f4eaaf..b4f6a5508726b 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -137,9 +137,11 @@ def _read(
         ) as handles:
             return handles.handle.read()
     except OSError as err:
-        raise FileNotFoundError(
-            f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}"
-        ) from err
+        if not is_url(obj):
+            raise FileNotFoundError(
+                f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}"
+            ) from err
+        raise
 
 
 class _HtmlFrameParser:
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index b5e197303d12c..9f4c7bdc46067 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -43,7 +43,7 @@ def test_literal_json_raises():
         {"a": 5, "b": 6}
         {"a": 7, "b": 8}"""
 
-    msg = r"\[Errno 2\] No such file or directory"
+    msg = r".*No such file or directory"
 
     with pytest.raises(FileNotFoundError, match=msg):
         read_json(jsonl, lines=False)
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 2f5eca5e1e353..c0800da37dca5 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -250,7 +250,7 @@
 def test_literal_xml_raises():
     # GH 53809
     pytest.importorskip("lxml")
-    msg = r"\[Errno 2\] No such file or directory"
+    msg = r".*No such file or directory"
 
     with pytest.raises(FileNotFoundError, match=msg):
         read_xml(xml_default_nmsp)

From f84ba8661582acbc27be64cb103558231fd8969c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 17:44:06 -0800
Subject: [PATCH 08/10] Ignore typing for now

---
 pandas/io/xml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 24ab111215218..2f5a1e9f5a4bb 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -669,7 +669,7 @@ def get_data_from_filepath(
     encoding: str | None,
     compression: CompressionOptions,
     storage_options: StorageOptions,
-) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
+):
     """
     Extract raw XML data.
 
@@ -677,7 +677,7 @@ def get_data_from_filepath(
         1. filepath (string-like)
         2. file-like object (e.g. open file object, StringIO)
     """
-    filepath_or_buffer = stringify_path(filepath_or_buffer)
+    filepath_or_buffer = stringify_path(filepath_or_buffer)  # type: ignore[arg-type]
     with get_handle(
         filepath_or_buffer,
         "r",

From d225f4da692fba2984287014df30af59e9017695 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 9 Feb 2024 10:25:31 -0800
Subject: [PATCH 09/10] Windows compat

---
 pandas/io/json/_json.py             | 16 +++-------------
 pandas/tests/io/json/test_pandas.py |  2 +-
 pandas/tests/io/xml/test_xml.py     |  4 ++--
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index a75e4e6aa9600..91d0732c52cce 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -53,8 +53,6 @@
 from pandas.io.common import (
     IOHandles,
     dedup_names,
-    extension_to_compression,
-    file_exists,
     get_handle,
     is_potential_multi_index,
     stringify_path,
@@ -909,17 +907,9 @@ def _get_data_from_filepath(self, filepath_or_buffer):
                 errors=self.encoding_errors,
             )
         except OSError as err:
-            if (
-                isinstance(filepath_or_buffer, str)
-                and filepath_or_buffer.lower().endswith(
-                    (".json",) + tuple(f".json{c}" for c in extension_to_compression)
-                )
-                and not file_exists(filepath_or_buffer)
-            ):
-                raise FileNotFoundError(
-                    f"File {filepath_or_buffer} does not exist"
-                ) from err
-            raise
+            raise FileNotFoundError(
+                f"File {filepath_or_buffer} does not exist"
+            ) from err
         filepath_or_buffer = self.handles.handle
         return filepath_or_buffer
 
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 9f4c7bdc46067..8eadbb9aac3c3 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -43,7 +43,7 @@ def test_literal_json_raises():
         {"a": 5, "b": 6}
         {"a": 7, "b": 8}"""
 
-    msg = r".*No such file or directory"
+    msg = r".* does not exist"
 
     with pytest.raises(FileNotFoundError, match=msg):
         read_json(jsonl, lines=False)
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index c0800da37dca5..97599722cb93f 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -250,9 +250,9 @@
 def test_literal_xml_raises():
     # GH 53809
     pytest.importorskip("lxml")
-    msg = r".*No such file or directory"
+    msg = "|".join([r".*No such file or directory", r".*Invalid argument"])
 
-    with pytest.raises(FileNotFoundError, match=msg):
+    with pytest.raises((FileNotFoundError, OSError), match=msg):
         read_xml(xml_default_nmsp)
 
 

From 3b1778fbaccab4aab8fa8aeace6b05f22801a755 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 9 Feb 2024 12:14:24 -0800
Subject: [PATCH 10/10] Add generaltypeissues

---
 pandas/io/json/_json.py | 2 +-
 pandas/io/xml.py        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 91d0732c52cce..de246a2757409 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -524,7 +524,7 @@ def read_json(
 
     Parameters
     ----------
-    path_or_buf : a string path, path object or file-like object
+    path_or_buf : a str path, path object or file-like object
         Any valid string path is acceptable. The string could be a URL. Valid
         URL schemes include http, ftp, s3, and file. For file URLs, a host is
         expected. A local file could be:
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 2f5a1e9f5a4bb..2038733bee808 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -678,8 +678,8 @@ def get_data_from_filepath(
         2. file-like object (e.g. open file object, StringIO)
     """
     filepath_or_buffer = stringify_path(filepath_or_buffer)  # type: ignore[arg-type]
-    with get_handle(
-        filepath_or_buffer,
+    with get_handle(  # pyright: ignore[reportGeneralTypeIssues]
+        filepath_or_buffer,  # pyright: ignore[reportGeneralTypeIssues]
         "r",
         encoding=encoding,
         compression=compression,