Skip to content

Commit 1bc9197

Browse files
authored
BUG: read_xml iterparse doesn't handle multiple toplevel elements with lxml parser (#47504)
BUG: read_xml iterparse doesn't handle multiple toplevel elements with lxml parser
1 parent 734db4f commit 1bc9197

File tree

2 files changed

+109
-2
lines changed

2 files changed

+109
-2
lines changed

pandas/io/xml.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,7 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
693693
row = None
694694

695695
elem.clear()
696-
while elem.getprevious() is not None:
696+
while elem.getprevious() is not None and elem.getparent() is not None:
697697
del elem.getparent()[0]
698698

699699
if dicts == []:

pandas/tests/io/xml/test_xml.py

+108-1
Original file line numberDiff line numberDiff line change
@@ -1271,7 +1271,7 @@ def test_wrong_dict_value(datapath, parser):
12711271
read_xml(filename, parser=parser, iterparse={"book": "category"})
12721272

12731273

1274-
def test_bad_xml(datapath, parser):
1274+
def test_bad_xml(parser):
12751275
bad_xml = """\
12761276
<?xml version='1.0' encoding='utf-8'?>
12771277
<row>
@@ -1312,6 +1312,113 @@ def test_bad_xml(datapath, parser):
13121312
)
13131313

13141314

1315+
def test_comment(parser):
1316+
xml = """\
1317+
<!-- comment before root -->
1318+
<shapes>
1319+
<!-- comment within root -->
1320+
<shape>
1321+
<name>circle</name>
1322+
<type>2D</type>
1323+
</shape>
1324+
<shape>
1325+
<name>sphere</name>
1326+
<type>3D</type>
1327+
<!-- comment within child -->
1328+
</shape>
1329+
<!-- comment within root -->
1330+
</shapes>
1331+
<!-- comment after root -->"""
1332+
1333+
df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
1334+
1335+
df_iter = read_xml_iterparse(
1336+
xml, parser=parser, iterparse={"shape": ["name", "type"]}
1337+
)
1338+
1339+
df_expected = DataFrame(
1340+
{
1341+
"name": ["circle", "sphere"],
1342+
"type": ["2D", "3D"],
1343+
}
1344+
)
1345+
1346+
tm.assert_frame_equal(df_xpath, df_expected)
1347+
tm.assert_frame_equal(df_iter, df_expected)
1348+
1349+
1350+
def test_dtd(parser):
1351+
xml = """\
1352+
<?xml version="1.0" encoding="UTF-8"?>
1353+
<!DOCTYPE non-profits [
1354+
<!ELEMENT shapes (shape*) >
1355+
<!ELEMENT shape ( name, type )>
1356+
<!ELEMENT name (#PCDATA)>
1357+
]>
1358+
<shapes>
1359+
<shape>
1360+
<name>circle</name>
1361+
<type>2D</type>
1362+
</shape>
1363+
<shape>
1364+
<name>sphere</name>
1365+
<type>3D</type>
1366+
</shape>
1367+
</shapes>"""
1368+
1369+
df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
1370+
1371+
df_iter = read_xml_iterparse(
1372+
xml, parser=parser, iterparse={"shape": ["name", "type"]}
1373+
)
1374+
1375+
df_expected = DataFrame(
1376+
{
1377+
"name": ["circle", "sphere"],
1378+
"type": ["2D", "3D"],
1379+
}
1380+
)
1381+
1382+
tm.assert_frame_equal(df_xpath, df_expected)
1383+
tm.assert_frame_equal(df_iter, df_expected)
1384+
1385+
1386+
def test_processing_instruction(parser):
1387+
xml = """\
1388+
<?xml version="1.0" encoding="UTF-8"?>
1389+
<?xml-stylesheet type="text/xsl" href="style.xsl"?>
1390+
<?display table-view?>
1391+
<?sort alpha-ascending?>
1392+
<?textinfo whitespace is allowed ?>
1393+
<?elementnames <shape>, <name>, <type> ?>
1394+
<shapes>
1395+
<shape>
1396+
<name>circle</name>
1397+
<type>2D</type>
1398+
</shape>
1399+
<shape>
1400+
<name>sphere</name>
1401+
<type>3D</type>
1402+
</shape>
1403+
</shapes>"""
1404+
1405+
df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
1406+
1407+
df_iter = read_xml_iterparse(
1408+
xml, parser=parser, iterparse={"shape": ["name", "type"]}
1409+
)
1410+
1411+
df_expected = DataFrame(
1412+
{
1413+
"name": ["circle", "sphere"],
1414+
"type": ["2D", "3D"],
1415+
}
1416+
)
1417+
1418+
tm.assert_frame_equal(df_xpath, df_expected)
1419+
tm.assert_frame_equal(df_iter, df_expected)
1420+
1421+
13151422
def test_no_result(datapath, parser):
13161423
filename = datapath("io", "data", "xml", "books.xml")
13171424
with pytest.raises(

0 commit comments

Comments
 (0)