diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 19e8acdaa7384..329aa3200d43a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -408,6 +408,7 @@ I/O - Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) - Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) +- Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 7af776dc1a10f..739c77d1c0b99 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -171,7 +171,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) elif cell_type == "string": - return str(cell) + return self._get_cell_string_value(cell) elif cell_type == "currency": cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) @@ -182,3 +182,28 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: return pd.to_datetime(str(cell)).time() else: raise ValueError(f"Unrecognized type {cell_type}") + + def _get_cell_string_value(self, cell) -> str: + """ + Find and decode OpenDocument text:s tags that represent + a run length encoded sequence of space characters. + """ + from odf.element import Text, Element + from odf.text import S, P + from odf.namespaces import TEXTNS + + text_p = P().qname + text_s = S().qname + + p = cell.childNodes[0] + + value = [] + if p.qname == text_p: + for k, fragment in enumerate(p.childNodes): + if isinstance(fragment, Text): + value.append(fragment.data) + elif isinstance(fragment, Element): + if fragment.qname == text_s: + spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) + value.append(" " * spaces) + return "".join(value) diff --git a/pandas/tests/io/data/excel/test_spaces.ods b/pandas/tests/io/data/excel/test_spaces.ods new file mode 100644 index 0000000000000..375e839c8c221 Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.ods differ diff --git a/pandas/tests/io/data/excel/test_spaces.xls b/pandas/tests/io/data/excel/test_spaces.xls new file mode 100644 index 0000000000000..316db172360d0 Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xls differ diff --git a/pandas/tests/io/data/excel/test_spaces.xlsb b/pandas/tests/io/data/excel/test_spaces.xlsb new file mode 100644 index 0000000000000..e38b6c2d8f170 Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xlsb differ diff --git a/pandas/tests/io/data/excel/test_spaces.xlsm b/pandas/tests/io/data/excel/test_spaces.xlsm new file mode 100644 index 0000000000000..a41ebe5bb0e65 Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xlsm differ diff --git a/pandas/tests/io/data/excel/test_spaces.xlsx b/pandas/tests/io/data/excel/test_spaces.xlsx new file mode 100644 index 0000000000000..9071543c4739b Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xlsx differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b1502ed3f3c09..99447c03e89af 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -464,6 +464,24 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) + def test_reader_spaces(self, read_ext): + # see gh-32207 + basename = "test_spaces" + + actual = pd.read_excel(basename + read_ext) + expected = DataFrame( + { + "testcol": [ + "this is great", + "4 spaces", + "1 trailing ", + " 1 leading", + "2 spaces multiple times", + ] + } + ) + tm.assert_frame_equal(actual, expected) + def test_reading_all_sheets(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned.