DEPR: Remove literal string input for read_xml (#53809)

rmhowe425 · web-flow · commit c68449a65cb5 · 2023-07-11T13:52:05.000-07:00
* Updating documentation and adding deprecation logic for read_xml.

* Fixing documentation issue and adding unit test

* Updating unit tests and documentation.

* Fixing unit tests and documentation issues

* Fixing unit tests and documentation issues

* Fixing unit tests and documentation issues

* Fixing import error in documentation

* Updated deprecation logic per reviewer recommendations.

* Updating deprecation logic and documentation per reviewer recommendations.

* Fixing logic error

* Fixing implementation per reviewer recommendations.

* Updating implementation per reviewer recommendations.

* Cleaning up the deprecation logic a bit.

* Updating implementation per reviewer recommendations.

* Updating unit tests

* Fixing discrepancy in doc string.

* Updating implementation based on reviewer recommendations.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -2919,6 +2919,7 @@ Read an XML string:
 
 .. ipython:: python
 
+    from io import StringIO
    xml = """<?xml version="1.0" encoding="UTF-8"?>
    <bookstore>
      <book category="cooking">
@@ -2941,7 +2942,7 @@ Read an XML string:
      </book>
    </bookstore>"""
 
-   df = pd.read_xml(xml)
+   df = pd.read_xml(StringIO(xml))
    df
 
 Read a URL with no options:
@@ -2961,7 +2962,7 @@ as a string:
        f.write(xml)
 
    with open(file_path, "r") as f:
-       df = pd.read_xml(f.read())
+       df = pd.read_xml(StringIO(f.read()))
    df
 
 Read in the content of the "books.xml" as instance of ``StringIO`` or
@@ -3052,7 +3053,7 @@ For example, below XML contains a namespace with prefix, ``doc``, and URI at
      </doc:row>
    </doc:data>"""
 
-   df = pd.read_xml(xml,
+   df = pd.read_xml(StringIO(xml),
                     xpath="//doc:row",
                     namespaces={"doc": "https://example.com"})
    df
@@ -3082,7 +3083,7 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
     </row>
    </data>"""
 
-   df = pd.read_xml(xml,
+   df = pd.read_xml(StringIO(xml),
                     xpath="//pandas:row",
                     namespaces={"pandas": "https://example.com"})
    df
@@ -3117,7 +3118,7 @@ However, if XPath does not reference node names such as default, ``/*``, then
         </row>
       </data>"""
 
-      df = pd.read_xml(xml, xpath="./row")
+      df = pd.read_xml(StringIO(xml), xpath="./row")
       df
 
    shows the attribute ``sides`` on ``shape`` element was not parsed as
@@ -3218,7 +3219,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``:
       </row>
     </response>"""
 
-   df = pd.read_xml(xml, stylesheet=xsl)
+   df = pd.read_xml(StringIO(xml), stylesheet=xsl)
    df
 
 For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -221,6 +221,7 @@ apply converter methods, and parse dates (:issue:`43567`).
 
 .. ipython:: python
 
+    from io import StringIO
     xml_dates = """<?xml version='1.0' encoding='utf-8'?>
     <data>
       <row>
@@ -244,7 +245,7 @@ apply converter methods, and parse dates (:issue:`43567`).
     </data>"""
 
     df = pd.read_xml(
-        xml_dates,
+        StringIO(xml_dates),
         dtype={'sides': 'Int64'},
         converters={'degrees': str},
         parse_dates=['date']
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -313,6 +313,7 @@ Deprecations
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`)
 - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`)
+- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`)
 - Deprecated literal string/bytes input to :func:`read_html`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`)
 - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`)
 - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`)
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -11,6 +11,7 @@
     Any,
     Callable,
 )
+import warnings
 
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
@@ -19,6 +20,7 @@
     ParserError,
 )
 from pandas.util._decorators import doc
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
 from pandas.core.dtypes.common import is_list_like
@@ -29,6 +31,7 @@
     file_exists,
     get_handle,
     infer_compression,
+    is_file_like,
     is_fsspec_url,
     is_url,
     stringify_path,
@@ -802,6 +805,22 @@ def _parse(
 
     p: _EtreeFrameParser | _LxmlFrameParser
 
+    if isinstance(path_or_buffer, str) and not any(
+        [
+            is_file_like(path_or_buffer),
+            file_exists(path_or_buffer),
+            is_url(path_or_buffer),
+            is_fsspec_url(path_or_buffer),
+        ]
+    ):
+        warnings.warn(
+            "Passing literal xml to 'read_xml' is deprecated and "
+            "will be removed in a future version. To read from a "
+            "literal string, wrap it in a 'StringIO' object.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+
     if parser == "lxml":
         lxml = import_optional_dependency("lxml.etree", errors="ignore")
 
@@ -894,6 +913,10 @@ def read_xml(
         string or a path. The string can further be a URL. Valid URL schemes
         include http, ftp, s3, and file.
 
+        .. deprecated:: 2.1.0
+            Passing xml literal strings is deprecated.
+            Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead.
+
     xpath : str, optional, default './\*'
         The XPath to parse required set of nodes for migration to DataFrame.
         XPath should return a collection of elements and not a single
@@ -1049,6 +1072,7 @@ def read_xml(
 
     Examples
     --------
+    >>> import io
     >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
     ... <data xmlns="http://example.com">
     ...  <row>
@@ -1068,7 +1092,7 @@ def read_xml(
     ...  </row>
     ... </data>'''
 
-    >>> df = pd.read_xml(xml)
+    >>> df = pd.read_xml(io.StringIO(xml))
     >>> df
           shape  degrees  sides
     0    square      360    4.0
@@ -1082,7 +1106,7 @@ def read_xml(
     ...   <row shape="triangle" degrees="180" sides="3.0"/>
     ... </data>'''
 
-    >>> df = pd.read_xml(xml, xpath=".//row")
+    >>> df = pd.read_xml(io.StringIO(xml), xpath=".//row")
     >>> df
           shape  degrees  sides
     0    square      360    4.0
@@ -1108,7 +1132,7 @@ def read_xml(
     ...   </doc:row>
     ... </doc:data>'''
 
-    >>> df = pd.read_xml(xml,
+    >>> df = pd.read_xml(io.StringIO(xml),
     ...                  xpath="//doc:row",
     ...                  namespaces={{"doc": "https://example.com"}})
     >>> df
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py