diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7743f762d8898..ce9255142d0c1 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -282,6 +282,7 @@ Bug fixes - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) - Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) +- Bug in :meth:`pandas.read_excel` where ODS files with comments on time value cells failed to parse (:issue:`55200`) - Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) Categorical diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 277f64f636731..05f5ee5951c6c 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,5 +1,7 @@ from __future__ import annotations +import datetime +import re from typing import ( TYPE_CHECKING, cast, @@ -26,6 +28,12 @@ from pandas._libs.tslibs.nattype import NaTType +# ODF variant of ISO 8601 time/duration format: "PThhhHmmMss.sssS" +# see https://www.w3.org/TR/xmlschema-2/#duration for details +ODF_ISOTIME_PATTERN = re.compile( + r"^\s*PT\s*(\d+)\s*H\s*(\d+)\s*M\s*(\d+)(\.(\d+))?\s*S$" +) + @doc(storage_options=_shared_docs["storage_options"]) class ODFReader(BaseExcelReader["OpenDocument"]): @@ -214,9 +222,9 @@ def _get_cell_value(self, cell) -> Scalar | NaTType: cell_value = cell.attributes.get((OFFICENS, "date-value")) return pd.Timestamp(cell_value) elif cell_type == "time": - stamp = pd.Timestamp(str(cell)) + stamp = self._get_cell_time_value(cell) # cast needed here because Scalar doesn't include datetime.time - return cast(Scalar, stamp.time()) + return cast(Scalar, stamp) else: self.close() raise ValueError(f"Unrecognized type 
def _get_cell_time_value(self, cell) -> datetime.time:
    """
    Parse the ``office:time-value`` attribute of an ODF time cell.

    The attribute is expected to hold an ISO 8601 style duration such as
    ``PT13H45M30.5S``, as matched by ``ODF_ISOTIME_PATTERN``.

    Parameters
    ----------
    cell
        Cell element whose ``attributes`` mapping is looked up with the
        ``(OFFICENS, "time-value")`` key.

    Returns
    -------
    datetime.time
        The time of day. Hours are reduced modulo 24 and fractional
        seconds beyond microsecond precision are truncated.

    Raises
    ------
    ValueError
        If the attribute is missing, does not match
        ``ODF_ISOTIME_PATTERN``, or holds minutes/seconds that
        ``datetime.time`` rejects.
    """
    from odf.namespaces import OFFICENS

    value = cell.attributes.get((OFFICENS, "time-value"))
    # A missing attribute comes back as None; report it like any other
    # unparsable value instead of letting Pattern.match raise TypeError.
    parts = None if value is None else ODF_ISOTIME_PATTERN.match(value)
    if parts is None:
        raise ValueError(f"Failed to parse ODF time value: {value}")

    # Group 4 is the optional ".fraction" wrapper; only its digits (5) matter.
    hours, minutes, seconds, fraction = parts.group(1, 2, 3, 5)
    if fraction is None:
        microseconds = 0
    else:
        # Truncate/pad the digits to exactly six places. Doing this on the
        # string keeps the result exact; scaling by a negative power of ten
        # goes through float and can round up to 1_000_000.
        microseconds = int(fraction[:6].ljust(6, "0"))

    return datetime.time(
        # ignore date part from some representations
        # and datetime.time restrict hour values to 0..23
        hour=int(hours) % 24,
        minute=int(minutes),
        second=int(seconds),
        microsecond=microseconds,
    )
pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) + if read_ext == ".ods": + msg = "Failed to parse ODF time value: PT01H5a2M00S" + with pytest.raises(ValueError, match=msg): + pd.read_excel("test_corrupted_time" + read_ext) + def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 if engine == "pyxlsb":