From 170ca23075977340830b3883b2a0e8bd3204eb4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20He=C3=9Felmann=20=28lgtf/39809=29?= Date: Fri, 9 Jul 2021 17:27:10 +0200 Subject: [PATCH 1/2] BUG: `to_xml` with `index=False` and offset input index Fixes #42458 It was assumed that the index contains the element `0`. This led to a defect when the index of the input Dataframe has an offset, which is a common use case when streaming Dataframes via generators. This fix consists of not relying on accessing the `0` element of `frame_dicts`. --- doc/source/whatsnew/v1.3.1.rst | 1 + pandas/io/formats/xml.py | 16 +++++++----- pandas/tests/io/xml/test_to_xml.py | 41 +++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 255747c3c5c6d..c1cf9d208673f 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -27,6 +27,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.transpose` dropping values when the DataFrame had an Extension Array dtype and a duplicate index (:issue:`42380`) +- Fixed bug in :meth:`DataFrame.to_xml` raising ``KeyError`` when called with ``index=False`` and an offset index (:issue:`42458`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index d2b86cc458b74..e7ed4036fda20 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -195,14 +195,16 @@ def handle_indexes(self) -> None: This method will add indexes into attr_cols or elem_cols. 
""" - indexes: list[str] = [ - x for x in self.frame_dicts[0].keys() if x not in self.orig_cols - ] + if not self.index: + return + + first_dict = next(iter(self.frame_dicts.values())) + indexes: list[str] = [x for x in first_dict.keys() if x not in self.orig_cols] - if self.attr_cols and self.index: + if self.attr_cols: self.attr_cols = indexes + self.attr_cols - if self.elem_cols and self.index: + if self.elem_cols: self.elem_cols = indexes + self.elem_cols def get_prefix_uri(self) -> str: @@ -307,7 +309,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(self.frame_dicts[0].keys()) + self.elem_cols = list(d.keys()) self.build_elems() else: @@ -477,7 +479,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(self.frame_dicts[0].keys()) + self.elem_cols = list(d.keys()) self.build_elems() else: diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 478f4c803479d..c73966a8d8786 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -11,7 +11,10 @@ import pandas.util._test_decorators as td -from pandas import DataFrame +from pandas import ( + DataFrame, + RangeIndex, +) import pandas._testing as tm from pandas.io.common import get_handle @@ -290,6 +293,42 @@ def test_index_false_rename_row_root(datapath, parser): assert output == expected +def test_index_false_with_offset_input_index(parser): + """ + Tests that the output does not contain the `<index>` field when the index of the + input Dataframe has an offset. + + This is a regression test for issue #42458. 
+ """ + + expected = """\ +<?xml version='1.0' encoding='utf-8'?> +<data> + <row> + <shape>square</shape> + <degrees>360</degrees> + <sides>4.0</sides> + </row> + <row> + <shape>circle</shape> + <degrees>360</degrees> + <sides/> + </row> + <row> + <shape>triangle</shape> + <degrees>180</degrees> + <sides>3.0</sides> + </row> +</data>""" + + offset_geom_df = geom_df.copy() + offset_geom_df.index = RangeIndex(start=10, stop=13, step=1) + output = offset_geom_df.to_xml(index=False, parser=parser) + output = equalize_decl(output) + + assert output == expected + + # NA_REP na_expected = """\ From 374c78cd50d9f2107c34955b7323839820e8df9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20He=C3=9Felmann=20=28lgtf/39809=29?= Date: Mon, 12 Jul 2021 17:36:58 +0200 Subject: [PATCH 2/2] Address review comments --- pandas/io/formats/xml.py | 10 ++++++---- pandas/tests/io/xml/test_to_xml.py | 9 ++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index e7ed4036fda20..f5ba8c6b53335 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -198,8 +198,10 @@ def handle_indexes(self) -> None: if not self.index: return - first_dict = next(iter(self.frame_dicts.values())) - indexes: list[str] = [x for x in first_dict.keys() if x not in self.orig_cols] + first_key = next(iter(self.frame_dicts)) + indexes: list[str] = [ + x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols + ] if self.attr_cols: self.attr_cols = indexes + self.attr_cols @@ -309,7 +311,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(d.keys()) + self.elem_cols = list(self.d.keys()) self.build_elems() else: @@ -479,7 +481,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(d.keys()) + self.elem_cols = list(self.d.keys()) self.build_elems() else: diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index c73966a8d8786..4f4815b9008ad 100644 --- 
a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -13,7 +13,7 @@ from pandas import ( DataFrame, - RangeIndex, + Index, ) import pandas._testing as tm @@ -293,7 +293,10 @@ def test_index_false_rename_row_root(datapath, parser): assert output == expected -def test_index_false_with_offset_input_index(parser): +@pytest.mark.parametrize( + "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]] +) +def test_index_false_with_offset_input_index(parser, offset_index): """ Tests that the output does not contain the `<index>` field when the index of the input Dataframe has an offset. @@ -322,7 +325,7 @@ def test_index_false_with_offset_input_index(parser): </row> </data>""" offset_geom_df = geom_df.copy() - offset_geom_df.index = RangeIndex(start=10, stop=13, step=1) + offset_geom_df.index = Index(offset_index) output = offset_geom_df.to_xml(index=False, parser=parser) output = equalize_decl(output)