diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 95fac0c739895..9b6eb31dafc07 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -342,7 +342,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
                     for col, nm in zip(self.iterparse[row_node], self.names):
                         if curr_elem == col:
                             elem_val = elem.text.strip() if elem.text else None
-                            if elem_val not in row.values() and nm not in row:
+                            if row.get(nm) != elem_val and nm not in row:
                                 row[nm] = elem_val
                         if col in elem.attrib:
                             if elem.attrib[col] not in row.values() and nm not in row:
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index b89adf85d8e26..410c5f6703dcd 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -824,6 +824,46 @@ def test_repeat_names(parser):
     tm.assert_frame_equal(df_iter, df_expected)
 
 
+def test_repeat_values_new_names(parser):
+    xml = """\
+<shapes>
+  <shape>
+    <name>rectangle</name>
+    <family>rectangle</family>
+  </shape>
+  <shape>
+    <name>square</name>
+    <family>rectangle</family>
+  </shape>
+  <shape>
+    <name>ellipse</name>
+    <family>ellipse</family>
+  </shape>
+  <shape>
+    <name>circle</name>
+    <family>ellipse</family>
+  </shape>
+</shapes>"""
+    df_xpath = read_xml(xml, xpath=".//shape", parser=parser, names=["name", "group"])
+
+    df_iter = read_xml_iterparse(
+        xml,
+        parser=parser,
+        iterparse={"shape": ["name", "family"]},
+        names=["name", "group"],
+    )
+
+    df_expected = DataFrame(
+        {
+            "name": ["rectangle", "square", "ellipse", "circle"],
+            "group": ["rectangle", "rectangle", "ellipse", "ellipse"],
+        }
+    )
+
+    tm.assert_frame_equal(df_xpath, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
 def test_names_option_wrong_length(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
 