Skip to content

Commit f404a3f

Browse files
authored
Ods loses spaces 32207 (#33233)
1 parent 6300abe commit f404a3f

File tree

8 files changed

+45
-1
lines changed

8 files changed

+45
-1
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ I/O
414414
- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`)
415415
- Bug in :meth:`read_csv` was raising an ``IndexError`` when ``header=None`` and there were 2 extra data columns
416416
- Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`)
417+
- Bug in :meth:`read_excel` where multiple embedded spaces in OpenDocument text cells were not handled correctly (:issue:`32207`)
417418

418419
Plotting
419420
^^^^^^^^

pandas/io/excel/_odfreader.py

+26-1
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
171171
cell_value = cell.attributes.get((OFFICENS, "value"))
172172
return float(cell_value)
173173
elif cell_type == "string":
174-
return str(cell)
174+
return self._get_cell_string_value(cell)
175175
elif cell_type == "currency":
176176
cell_value = cell.attributes.get((OFFICENS, "value"))
177177
return float(cell_value)
@@ -182,3 +182,28 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
182182
return pd.to_datetime(str(cell)).time()
183183
else:
184184
raise ValueError(f"Unrecognized type {cell_type}")
185+
186+
def _get_cell_string_value(self, cell) -> str:
    """
    Return the text content of a string cell, expanding ``text:s`` tags.

    OpenDocument stores a run of consecutive spaces as a ``text:s``
    element whose ``text:c`` attribute holds the run length (a missing
    attribute means a single space). This walks every child node of
    ``cell`` and rebuilds the text with those runs decoded.

    Parameters
    ----------
    cell : odf element
        The table cell (or, when recursing, any element nested inside
        it) whose text should be extracted.

    Returns
    -------
    str
        The concatenated text of all descendants. An element without
        any children yields an empty string.
    """
    from odf.element import Element, Text
    from odf.namespaces import TEXTNS
    from odf.office import Annotation
    from odf.text import S

    text_s = S().qname
    office_annotation = Annotation().qname

    value = []
    # Visit every child rather than only ``childNodes[0]`` so that empty
    # cells do not raise IndexError and later paragraphs are not dropped.
    for fragment in cell.childNodes:
        if isinstance(fragment, Text):
            value.append(fragment.data)
        elif isinstance(fragment, Element):
            if fragment.qname == text_s:
                # Run-length encoded spaces; "c" defaults to 1 when absent.
                spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
                value.append(" " * spaces)
            elif fragment.qname != office_annotation:
                # Recurse so text (and space runs) inside nested elements
                # such as text:p or text:span is kept. Annotations are
                # cell comments, not cell content, so they are skipped.
                value.append(self._get_cell_string_value(fragment))
    return "".join(value)
9.05 KB
Binary file not shown.
5.5 KB
Binary file not shown.
7.85 KB
Binary file not shown.
4.73 KB
Binary file not shown.
8.42 KB
Binary file not shown.

pandas/tests/io/excel/test_readers.py

+18
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,24 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
464464
actual = pd.read_excel(basename + read_ext, dtype=dtype)
465465
tm.assert_frame_equal(actual, expected)
466466

467+
def test_reader_spaces(self, read_ext):
    """Embedded, leading and trailing spaces must survive ``read_excel``."""
    # see gh-32207
    basename = "test_spaces"

    actual = pd.read_excel(basename + read_ext)
    # Each literal carries exactly the whitespace its text describes, so
    # that runs of more than one space are actually exercised.
    # NOTE(review): confirm the run lengths against the test_spaces fixtures.
    expected = DataFrame(
        {
            "testcol": [
                "this is great",
                "4    spaces",
                "1 trailing ",
                " 1 leading",
                "2  spaces  multiple  times",
            ]
        }
    )
    tm.assert_frame_equal(actual, expected)
484+
467485
def test_reading_all_sheets(self, read_ext):
468486
# Test reading all sheetnames by setting sheetname to None,
469487
# Ensure a dict is returned.

0 commit comments

Comments
 (0)