diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2aac2596c18cb..b4289973a26cd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -296,6 +296,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) +- Bug in :meth:`read_excel` with `engine="odf"` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, and :issue:`35802`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 02575ab878f6e..09a4bfb9accc3 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -197,22 +197,24 @@ def _get_cell_string_value(self, cell) -> str: Find and decode OpenDocument text:s tags that represent a run length encoded sequence of space characters. """ - from odf.element import Element, Text + from odf.element import Element from odf.namespaces import TEXTNS - from odf.text import P, S + from odf.text import S - text_p = P().qname text_s = S().qname - p = cell.childNodes[0] - value = [] - if p.qname == text_p: - for k, fragment in enumerate(p.childNodes): - if isinstance(fragment, Text): - value.append(fragment.data) - elif isinstance(fragment, Element): - if fragment.qname == text_s: - spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) + + for fragment in cell.childNodes: + if isinstance(fragment, Element): + if fragment.qname == text_s: + spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) value.append(" " * spaces) + else: + # recursive impl needed in case of nested fragments + # with multiple spaces + # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704 + value.append(self._get_cell_string_value(fragment)) + else: + value.append(str(fragment)) return "".join(value) diff --git a/pandas/tests/io/data/excel/gh-35802.ods b/pandas/tests/io/data/excel/gh-35802.ods new file mode 100755 index 0000000000000..f3ad061f1d995 Binary files /dev/null and b/pandas/tests/io/data/excel/gh-35802.ods differ diff --git a/pandas/tests/io/data/excel/gh-36122.ods b/pandas/tests/io/data/excel/gh-36122.ods new file mode 100755 index 0000000000000..3dfdaf976da45 Binary files /dev/null and b/pandas/tests/io/data/excel/gh-36122.ods differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 431a50477fccc..33467be42dfd9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -499,6 +499,23 @@ def test_reader_spaces(self, read_ext): ) tm.assert_frame_equal(actual, expected) + # gh-36122, gh-35802 + @pytest.mark.parametrize( + "basename,expected", + [ + ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})), + ("gh-36122", DataFrame(columns=["got 2nd sa"])), + ], + ) + def test_read_excel_ods_nested_xml(self, read_ext, basename, expected): + # see gh-35802 + engine = pd.read_excel.keywords["engine"] + if engine != "odf": + pytest.skip(f"Skipped for engine: {engine}") + + actual = pd.read_excel(basename + read_ext) + tm.assert_frame_equal(actual, expected) + def test_reading_all_sheets(self, read_ext): # Test reading all sheet names by setting sheet_name to None, # Ensure a dict is returned.