BUG: Fix file-like objects failing in read_xml when iterparse is used (pandas-dev#50759)

ParfaitG · web-flow · commit e9b35621bf5a · 2023-01-18T11:47:22.000-08:00
* BUG: Fix file-like objects failing in read_xml when iterparse is used

* Revert whatsnew version conflict issue

* Add entry to 1.5.3 whatsnew version

* Move whatsnew note to v2.0.0; parameterize read mode in tests

* Split iterparse tests between file-like and file IO
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1024,6 +1024,7 @@ I/O
 - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
 - Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
 - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
+- Bug in :func:`read_xml` where file-like objects failed when ``iterparse`` was used (:issue:`50641`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -297,7 +297,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
         TypeError
             * If `iterparse` is not a dict or its dict value is not list-like.
         ParserError
-            * If `path_or_buffer` is not a physical, decompressed file on disk.
+            * If `path_or_buffer` is not a physical file on disk or file-like object.
             * If no data is returned from selected items in `iterparse`.
 
         Notes
@@ -322,7 +322,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
                 "for value in iterparse"
             )
 
-        if (
+        if (not hasattr(self.path_or_buffer, "read")) and (
             not isinstance(self.path_or_buffer, str)
             or is_url(self.path_or_buffer)
             or is_fsspec_url(self.path_or_buffer)
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -1351,18 +1351,81 @@ def test_string_error(parser):
         )
 
 
-def test_file_like_error(datapath, parser, mode):
+def test_file_like_iterparse(datapath, parser, mode):  # iterparse on an open file handle
     filename = datapath("io", "data", "xml", "books.xml")
-    with pytest.raises(
-        ParserError, match=("iterparse is designed for large XML files")
-    ):
-        with open(filename) as f:
-            read_xml(
+
+    with open(filename, mode) as f:  # pass the open handle itself, not the path
+        if mode == "r" and parser == "lxml":  # this combination is expected to raise
+            with pytest.raises(
+                TypeError, match=("reading file objects must return bytes objects")
+            ):
+                read_xml(
+                    f,
+                    parser=parser,
+                    iterparse={
+                        "book": ["category", "title", "year", "author", "price"]
+                    },
+                )
+            return None  # nothing to compare after the expected error
+        else:
+            df_filelike = read_xml(
                 f,
                 parser=parser,
                 iterparse={"book": ["category", "title", "year", "author", "price"]},
             )
 
+    df_expected = DataFrame(
+        {
+            "category": ["cooking", "children", "web"],
+            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
+            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
+            "year": [2005, 2005, 2003],
+            "price": [30.00, 29.99, 39.95],
+        }
+    )
+
+    tm.assert_frame_equal(df_filelike, df_expected)  # parsed frame must equal the expected frame
+
+
+def test_file_io_iterparse(datapath, parser, mode):  # iterparse on in-memory StringIO/BytesIO
+    filename = datapath("io", "data", "xml", "books.xml")
+
+    funcIO = StringIO if mode == "r" else BytesIO  # buffer class chosen from the read mode
+    with open(filename, mode) as f:
+        with funcIO(f.read()) as b:  # whole file content copied into the in-memory buffer
+            if mode == "r" and parser == "lxml":  # this combination is expected to raise
+                with pytest.raises(
+                    TypeError, match=("reading file objects must return bytes objects")
+                ):
+                    read_xml(
+                        b,
+                        parser=parser,
+                        iterparse={
+                            "book": ["category", "title", "year", "author", "price"]
+                        },
+                    )
+                return None  # nothing to compare after the expected error
+            else:
+                df_fileio = read_xml(
+                    b,
+                    parser=parser,
+                    iterparse={
+                        "book": ["category", "title", "year", "author", "price"]
+                    },
+                )
+
+    df_expected = DataFrame(
+        {
+            "category": ["cooking", "children", "web"],
+            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
+            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
+            "year": [2005, 2005, 2003],
+            "price": [30.00, 29.99, 39.95],
+        }
+    )
+
+    tm.assert_frame_equal(df_fileio, df_expected)  # parsed frame must equal the expected frame
+
 
 @pytest.mark.network
 @tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True)