pandas-dev · mroeschke · Jul 11, 2023 · Jun 22, 2023 · Jun 23, 2023 · Jun 23, 2023
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -2920,6 +2920,7 @@ Read an XML string:
 
 .. ipython:: python
 
+    from io import StringIO
    xml = """<?xml version="1.0" encoding="UTF-8"?>
    <bookstore>
      <book category="cooking">
@@ -2942,7 +2943,7 @@ Read an XML string:
      </book>
    </bookstore>"""
 
-   df = pd.read_xml(xml)
+   df = pd.read_xml(StringIO(xml))
    df
 
 Read a URL with no options:
@@ -2962,7 +2963,7 @@ as a string:
        f.write(xml)
 
    with open(file_path, "r") as f:
-       df = pd.read_xml(f.read())
+       df = pd.read_xml(StringIO(f.read()))
    df
 
 Read in the content of the "books.xml" as instance of ``StringIO`` or
@@ -3053,7 +3054,7 @@ For example, below XML contains a namespace with prefix, ``doc``, and URI at
      </doc:row>
    </doc:data>"""
 
-   df = pd.read_xml(xml,
+   df = pd.read_xml(StringIO(xml),
                     xpath="//doc:row",
                     namespaces={"doc": "https://example.com"})
    df
@@ -3083,7 +3084,7 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
     </row>
    </data>"""
 
-   df = pd.read_xml(xml,
+   df = pd.read_xml(StringIO(xml),
                     xpath="//pandas:row",
                     namespaces={"pandas": "https://example.com"})
    df
@@ -3118,7 +3119,7 @@ However, if XPath does not reference node names such as default, ``/*``, then
         </row>
       </data>"""
 
-      df = pd.read_xml(xml, xpath="./row")
+      df = pd.read_xml(StringIO(xml), xpath="./row")
       df
 
    shows the attribute ``sides`` on ``shape`` element was not parsed as
@@ -3219,7 +3220,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``:
       </row>
     </response>"""
 
-   df = pd.read_xml(xml, stylesheet=xsl)
+   df = pd.read_xml(StringIO(xml), stylesheet=xsl)
    df
 
 For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml`

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -221,6 +221,7 @@ apply converter methods, and parse dates (:issue:`43567`).
 
 .. ipython:: python
 
+    from io import StringIO
     xml_dates = """<?xml version='1.0' encoding='utf-8'?>
     <data>
       <row>
@@ -244,7 +245,7 @@ apply converter methods, and parse dates (:issue:`43567`).
     </data>"""
 
     df = pd.read_xml(
-        xml_dates,
+        StringIO(xml_dates),
         dtype={'sides': 'Int64'},
         converters={'degrees': str},
         parse_dates=['date']

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -298,13 +298,15 @@ Deprecations
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`)
 - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`)
+- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`)
- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`)
+- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead. (:issue:`53767`)
- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`)
+- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead. (:issue:`53767`)
 - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`)
 - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`)
 - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`)
 - Deprecated strings ``T``, ``t``, ``L`` and ``l`` denoting units in :func:`to_timedelta` (:issue:`52536`)
 - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`)
 - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`)
 - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.performance:

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -12,6 +12,7 @@
     Callable,
     Sequence,
 )
+import warnings
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
@@ -20,6 +21,7 @@
     ParserError,
 )
 from pandas.util._decorators import doc
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import is_list_like
@@ -894,6 +896,9 @@ def read_xml(
         string or a path. The string can further be a URL. Valid URL schemes
         include http, ftp, s3, and file.
 
+        .. deprecated:: 2.1.0
+            Passing html literal strings is deprecated.
+
     xpath : str, optional, default './\*'
         The XPath to parse required set of nodes for migration to DataFrame.
         XPath should return a collection of elements and not a single
@@ -1049,6 +1054,7 @@ def read_xml(
 
     Examples
     --------
+    >>> import io
     >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
     ... <data xmlns="http://example.com">
     ...  <row>
@@ -1068,7 +1074,7 @@ def read_xml(
     ...  </row>
     ... </data>'''
 
-    >>> df = pd.read_xml(xml)
+    >>> df = pd.read_xml(io.StringIO(xml))
     >>> df
           shape  degrees  sides
     0    square      360    4.0
@@ -1082,7 +1088,7 @@ def read_xml(
     ...   <row shape="triangle" degrees="180" sides="3.0"/>
     ... </data>'''
 
-    >>> df = pd.read_xml(xml, xpath=".//row")
+    >>> df = pd.read_xml(io.StringIO(xml), xpath=".//row")
     >>> df
           shape  degrees  sides
     0    square      360    4.0
@@ -1108,7 +1114,7 @@ def read_xml(
     ...   </doc:row>
     ... </doc:data>'''
 
-    >>> df = pd.read_xml(xml,
+    >>> df = pd.read_xml(io.StringIO(xml),
     ...                  xpath="//doc:row",
     ...                  namespaces={{"doc": "https://example.com"}})
     >>> df
@@ -1119,6 +1125,15 @@ def read_xml(
     """
     check_dtype_backend(dtype_backend)
 
+    if isinstance(path_or_buffer, str) and "\n" in path_or_buffer:
+        warnings.warn(
+            "Passing literal xml to 'read_xml' is deprecated and "
+            "will be removed in a future version. To read from a "
+            "literal string, wrap it in a 'StringIO' object.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+
     return _parse(
         path_or_buffer=path_or_buffer,
         xpath=xpath,

diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -247,6 +247,19 @@
 )
 
 
+@td.skip_if_no("lxml")
+def test_literal_xml_deprecation():
+    # GH 53809
+    msg = (
+        "Passing literal xml to 'read_xml' is deprecated and "
+        "will be removed in a future version. To read from a "
+        "literal string, wrap it in a 'StringIO' object."
+    )
+
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        read_xml(xml_default_nmsp)
+
+
 @pytest.fixture(params=["rb", "r"])
 def mode(request):
     return request.param
@@ -391,6 +404,11 @@ def test_file_buffered_reader_string(xml_books, parser, mode):
     with open(xml_books, mode, encoding="utf-8" if mode == "r" else None) as f:
         xml_obj = f.read()
 
+    if mode == "rb":
+        xml_obj = StringIO(xml_obj.decode())
+    elif mode == "r":
+        xml_obj = StringIO(xml_obj)
+
     df_str = read_xml(xml_obj, parser=parser)
 
     df_expected = DataFrame(
@@ -411,6 +429,11 @@ def test_file_buffered_reader_no_xml_declaration(xml_books, parser, mode):
         next(f)
         xml_obj = f.read()
 
+    if mode == "rb":
+        xml_obj = StringIO(xml_obj.decode())
+    elif mode == "r":
+        xml_obj = StringIO(xml_obj)
+
     df_str = read_xml(xml_obj, parser=parser)
 
     df_expected = DataFrame(
@@ -580,7 +603,7 @@ def test_bad_xpath_lxml(xml_books):
 
 def test_default_namespace(parser):
     df_nmsp = read_xml(
-        xml_default_nmsp,
+        StringIO(xml_default_nmsp),
         xpath=".//ns:row",
         namespaces={"ns": "http://example.com"},
         parser=parser,
@@ -606,7 +629,7 @@ def test_default_namespace(parser):
 
 def test_prefix_namespace(parser):
     df_nmsp = read_xml(
-        xml_prefix_nmsp,
+        StringIO(xml_prefix_nmsp),
         xpath=".//doc:row",
         namespaces={"doc": "http://example.com"},
         parser=parser,
@@ -630,14 +653,14 @@ def test_prefix_namespace(parser):
 @td.skip_if_no("lxml")
 def test_consistency_default_namespace():
     df_lxml = read_xml(
-        xml_default_nmsp,
+        StringIO(xml_default_nmsp),
         xpath=".//ns:row",
         namespaces={"ns": "http://example.com"},
         parser="lxml",
     )
 
     df_etree = read_xml(
-        xml_default_nmsp,
+        StringIO(xml_default_nmsp),
         xpath=".//doc:row",
         namespaces={"doc": "http://example.com"},
         parser="etree",
@@ -649,14 +672,14 @@ def test_consistency_default_namespace():
 @td.skip_if_no("lxml")
 def test_consistency_prefix_namespace():
     df_lxml = read_xml(
-        xml_prefix_nmsp,
+        StringIO(xml_prefix_nmsp),
         xpath=".//doc:row",
         namespaces={"doc": "http://example.com"},
         parser="lxml",
     )
 
     df_etree = read_xml(
-        xml_prefix_nmsp,
+        StringIO(xml_prefix_nmsp),
         xpath=".//doc:row",
         namespaces={"doc": "http://example.com"},
         parser="etree",
@@ -693,7 +716,7 @@ def test_none_namespace_prefix(key):
         TypeError, match=("empty namespace prefix is not supported in XPath")
     ):
         read_xml(
-            xml_default_nmsp,
+            StringIO(xml_default_nmsp),
             xpath=".//kml:Placemark",
             namespaces={key: "http://www.opengis.net/kml/2.2"},
             parser="lxml",
@@ -782,7 +805,7 @@ def test_empty_attrs_only(parser):
         ValueError,
         match=("xpath does not return any nodes or attributes"),
     ):
-        read_xml(xml, xpath="./row", attrs_only=True, parser=parser)
+        read_xml(StringIO(xml), xpath="./row", attrs_only=True, parser=parser)
 
 
 def test_empty_elems_only(parser):
@@ -797,7 +820,7 @@ def test_empty_elems_only(parser):
         ValueError,
         match=("xpath does not return any nodes or attributes"),
     ):
-        read_xml(xml, xpath="./row", elems_only=True, parser=parser)
+        read_xml(StringIO(xml), xpath="./row", elems_only=True, parser=parser)
 
 
 @td.skip_if_no("lxml")
@@ -822,8 +845,8 @@ def test_attribute_centric_xml():
       </Stations>
 </TrainSchedule>"""
 
-    df_lxml = read_xml(xml, xpath=".//station")
-    df_etree = read_xml(xml, xpath=".//station", parser="etree")
+    df_lxml = read_xml(StringIO(xml), xpath=".//station")
+    df_etree = read_xml(StringIO(xml), xpath=".//station", parser="etree")
 
     df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]})
     df_iter_et = read_xml_iterparse(
@@ -875,7 +898,10 @@ def test_repeat_names(parser):
   </shape>
 </shapes>"""
     df_xpath = read_xml(
-        xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"]
+        StringIO(xml),
+        xpath=".//shape",
+        parser=parser,
+        names=["type_dim", "shape", "type_edge"],
     )
 
     df_iter = read_xml_iterparse(
@@ -917,7 +943,9 @@ def test_repeat_values_new_names(parser):
     <family>ellipse</family>
   </shape>
 </shapes>"""
-    df_xpath = read_xml(xml, xpath=".//shape", parser=parser, names=["name", "group"])
+    df_xpath = read_xml(
+        StringIO(xml), xpath=".//shape", parser=parser, names=["name", "group"]
+    )
 
     df_iter = read_xml_iterparse(
         xml,
@@ -960,7 +988,7 @@ def test_repeat_elements(parser):
   </shape>
 </shapes>"""
     df_xpath = read_xml(
-        xml,
+        StringIO(xml),
         xpath=".//shape",
         parser=parser,
         names=["name", "family", "degrees", "sides"],
@@ -1339,19 +1367,6 @@ def test_empty_stylesheet(val):
 
 
 # ITERPARSE
-
-
-def test_string_error(parser):
-    with pytest.raises(
-        ParserError, match=("iterparse is designed for large XML files")
-    ):
-        read_xml(
-            xml_default_nmsp,
-            parser=parser,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
-
-
 def test_file_like_iterparse(xml_books, parser, mode):
     with open(xml_books, mode, encoding="utf-8" if mode == "r" else None) as f:
         if mode == "r" and parser == "lxml":
@@ -1532,7 +1547,7 @@ def test_comment(parser):
 </shapes>
 <!-- comment after root -->"""
 
-    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
+    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)
 
     df_iter = read_xml_iterparse(
         xml, parser=parser, iterparse={"shape": ["name", "type"]}
@@ -1568,7 +1583,7 @@ def test_dtd(parser):
   </shape>
 </shapes>"""
 
-    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
+    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)
 
     df_iter = read_xml_iterparse(
         xml, parser=parser, iterparse={"shape": ["name", "type"]}
@@ -1604,7 +1619,7 @@ def test_processing_instruction(parser):
   </shape>
 </shapes>"""
 
-    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
+    df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser)
 
     df_iter = read_xml_iterparse(
         xml, parser=parser, iterparse={"shape": ["name", "type"]}
@@ -1808,7 +1823,7 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend):
         string_array_na = ArrowStringArray(pa.array(["x", None]))
 
     with pd.option_context("mode.string_storage", string_storage):
-        result = read_xml(data, parser=parser, dtype_backend=dtype_backend)
+        result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend)
 
     expected = DataFrame(
         {