Skip to content

Commit e9b3562

Browse files
authored
BUG: Fix file-like objects failing in read_xml when iterparse is used (pandas-dev#50759)
* BUG: Fix file-like objects failing in read_xml when iterparse is used
* Revert whatsnew version conflict issue
* Add entry to 1.5.3 whatsnew version
* Move whatsnew note to v2.0.0; parameterize read mode in tests
* Split iterparse tests between file like and file IO
1 parent 68cc56d commit e9b3562

File tree

3 files changed

+72
-8
lines changed

3 files changed

+72
-8
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1024,6 +1024,7 @@ I/O
10241024
- Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
10251025
- Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`)
10261026
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
1027+
- Bug in :func:`read_xml` where file-like objects failed when ``iterparse`` was used (:issue:`50641`)
10271028

10281029
Period
10291030
^^^^^^

pandas/io/xml.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
297297
TypeError
298298
* If `iterparse` is not a dict or its dict value is not list-like.
299299
ParserError
300-
* If `path_or_buffer` is not a physical, decompressed file on disk.
300+
* If `path_or_buffer` is neither a physical file on disk nor a file-like object.
301301
* If no data is returned from selected items in `iterparse`.
302302
303303
Notes
@@ -322,7 +322,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
322322
"for value in iterparse"
323323
)
324324

325-
if (
325+
if (not hasattr(self.path_or_buffer, "read")) and (
326326
not isinstance(self.path_or_buffer, str)
327327
or is_url(self.path_or_buffer)
328328
or is_fsspec_url(self.path_or_buffer)

pandas/tests/io/xml/test_xml.py

+69-6
Original file line numberDiff line numberDiff line change
@@ -1351,18 +1351,81 @@ def test_string_error(parser):
13511351
)
13521352

13531353

1354-
def test_file_like_error(datapath, parser, mode):
1354+
def test_file_like_iterparse(datapath, parser, mode):
13551355
filename = datapath("io", "data", "xml", "books.xml")
1356-
with pytest.raises(
1357-
ParserError, match=("iterparse is designed for large XML files")
1358-
):
1359-
with open(filename) as f:
1360-
read_xml(
1356+
1357+
with open(filename, mode) as f:
1358+
if mode == "r" and parser == "lxml":
1359+
with pytest.raises(
1360+
TypeError, match=("reading file objects must return bytes objects")
1361+
):
1362+
read_xml(
1363+
f,
1364+
parser=parser,
1365+
iterparse={
1366+
"book": ["category", "title", "year", "author", "price"]
1367+
},
1368+
)
1369+
return None
1370+
else:
1371+
df_filelike = read_xml(
13611372
f,
13621373
parser=parser,
13631374
iterparse={"book": ["category", "title", "year", "author", "price"]},
13641375
)
13651376

1377+
df_expected = DataFrame(
1378+
{
1379+
"category": ["cooking", "children", "web"],
1380+
"title": ["Everyday Italian", "Harry Potter", "Learning XML"],
1381+
"author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
1382+
"year": [2005, 2005, 2003],
1383+
"price": [30.00, 29.99, 39.95],
1384+
}
1385+
)
1386+
1387+
tm.assert_frame_equal(df_filelike, df_expected)
1388+
1389+
1390+
def test_file_io_iterparse(datapath, parser, mode):
1391+
filename = datapath("io", "data", "xml", "books.xml")
1392+
1393+
funcIO = StringIO if mode == "r" else BytesIO
1394+
with open(filename, mode) as f:
1395+
with funcIO(f.read()) as b:
1396+
if mode == "r" and parser == "lxml":
1397+
with pytest.raises(
1398+
TypeError, match=("reading file objects must return bytes objects")
1399+
):
1400+
read_xml(
1401+
b,
1402+
parser=parser,
1403+
iterparse={
1404+
"book": ["category", "title", "year", "author", "price"]
1405+
},
1406+
)
1407+
return None
1408+
else:
1409+
df_fileio = read_xml(
1410+
b,
1411+
parser=parser,
1412+
iterparse={
1413+
"book": ["category", "title", "year", "author", "price"]
1414+
},
1415+
)
1416+
1417+
df_expected = DataFrame(
1418+
{
1419+
"category": ["cooking", "children", "web"],
1420+
"title": ["Everyday Italian", "Harry Potter", "Learning XML"],
1421+
"author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
1422+
"year": [2005, 2005, 2003],
1423+
"price": [30.00, 29.99, 39.95],
1424+
}
1425+
)
1426+
1427+
tm.assert_frame_equal(df_fileio, df_expected)
1428+
13661429

13671430
@pytest.mark.network
13681431
@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True)

0 commit comments

Comments
 (0)