From 8915fad472adee40fdf1ff6736e3a75565b0f5ec Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Fri, 24 Jun 2022 22:49:30 -0500 Subject: [PATCH] BUG: read_xml iterparse doesn't handle multiple toplevel elements with lxml parser --- pandas/io/xml.py | 2 +- pandas/tests/io/xml/test_xml.py | 109 +++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 2 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 78fbeaad09300..9398e995c81ce 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -693,7 +693,7 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: row = None elem.clear() - while elem.getprevious() is not None: + while elem.getprevious() is not None and elem.getparent() is not None: del elem.getparent()[0] if dicts == []: diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index eb2230bbf7fd5..b89adf85d8e26 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1271,7 +1271,7 @@ def test_wrong_dict_value(datapath, parser): read_xml(filename, parser=parser, iterparse={"book": "category"}) -def test_bad_xml(datapath, parser): +def test_bad_xml(parser): bad_xml = """\ @@ -1312,6 +1312,113 @@ def test_bad_xml(datapath, parser): ) +def test_comment(parser): + xml = """\ + + + + + circle + 2D + + + sphere + 3D + + + + +""" + + df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + + df_iter = read_xml_iterparse( + xml, parser=parser, iterparse={"shape": ["name", "type"]} + ) + + df_expected = DataFrame( + { + "name": ["circle", "sphere"], + "type": ["2D", "3D"], + } + ) + + tm.assert_frame_equal(df_xpath, df_expected) + tm.assert_frame_equal(df_iter, df_expected) + + +def test_dtd(parser): + xml = """\ + + + + +]> + + + circle + 2D + + + sphere + 3D + +""" + + df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + + df_iter = read_xml_iterparse( + xml, parser=parser, iterparse={"shape": ["name", "type"]} + ) + + df_expected = DataFrame( + { + "name": ["circle", "sphere"], + "type": ["2D", "3D"], + } + ) + + tm.assert_frame_equal(df_xpath, df_expected) + tm.assert_frame_equal(df_iter, df_expected) + + +def test_processing_instruction(parser): + xml = """\ + + + + + +, , ?> + + + circle + 2D + + + sphere + 3D + +""" + + df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + + df_iter = read_xml_iterparse( + xml, parser=parser, iterparse={"shape": ["name", "type"]} + ) + + df_expected = DataFrame( + { + "name": ["circle", "sphere"], + "type": ["2D", "3D"], + } + ) + + tm.assert_frame_equal(df_xpath, df_expected) + tm.assert_frame_equal(df_iter, df_expected) + + def test_no_result(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(