Skip to content

Commit f19c23d

Browse files
ParfaitG authored and yehoshuadimarsky committed
BUG: using read_xml with iterparse and names will ignore duplicate values (pandas-dev#47630)
1 parent a6685b9 commit f19c23d

File tree

2 files changed

+41
-1
lines changed

2 files changed

+41
-1
lines changed

pandas/io/xml.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -342,7 +342,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
342342
for col, nm in zip(self.iterparse[row_node], self.names):
343343
if curr_elem == col:
344344
elem_val = elem.text.strip() if elem.text else None
345-
if elem_val not in row.values() and nm not in row:
345+
if row.get(nm) != elem_val and nm not in row:
346346
row[nm] = elem_val
347347
if col in elem.attrib:
348348
if elem.attrib[col] not in row.values() and nm not in row:

pandas/tests/io/xml/test_xml.py

+40
Original file line number | Diff line number | Diff line change
@@ -824,6 +824,46 @@ def test_repeat_names(parser):
824824
tm.assert_frame_equal(df_iter, df_expected)
825825

826826

827+
def test_repeat_values_new_names(parser):
828+
xml = """\
829+
<shapes>
830+
<shape>
831+
<name>rectangle</name>
832+
<family>rectangle</family>
833+
</shape>
834+
<shape>
835+
<name>square</name>
836+
<family>rectangle</family>
837+
</shape>
838+
<shape>
839+
<name>ellipse</name>
840+
<family>ellipse</family>
841+
</shape>
842+
<shape>
843+
<name>circle</name>
844+
<family>ellipse</family>
845+
</shape>
846+
</shapes>"""
847+
df_xpath = read_xml(xml, xpath=".//shape", parser=parser, names=["name", "group"])
848+
849+
df_iter = read_xml_iterparse(
850+
xml,
851+
parser=parser,
852+
iterparse={"shape": ["name", "family"]},
853+
names=["name", "group"],
854+
)
855+
856+
df_expected = DataFrame(
857+
{
858+
"name": ["rectangle", "square", "ellipse", "circle"],
859+
"group": ["rectangle", "rectangle", "ellipse", "ellipse"],
860+
}
861+
)
862+
863+
tm.assert_frame_equal(df_xpath, df_expected)
864+
tm.assert_frame_equal(df_iter, df_expected)
865+
866+
827867
def test_names_option_wrong_length(datapath, parser):
828868
filename = datapath("io", "data", "xml", "books.xml")
829869

0 commit comments

Comments
 (0)