backported pandas.CalamineExcelReader from pandas-dev/pandas#50581

dimastbk · dimastbk · commit 4dfca9b364dc · 2023-03-23T00:55:15.000+06:00
diff --git a/python/python_calamine/pandas.py b/python/python_calamine/pandas.py
@@ -1,68 +1,92 @@
 from __future__ import annotations
 
-from io import BufferedReader, BytesIO
-from pathlib import PurePath
-from tempfile import NamedTemporaryFile
-from typing import Any, Type
+from datetime import date, datetime, time
+from typing import Union
 
+import pandas as pd
+from pandas._typing import FilePath, ReadBuffer, Scalar, StorageOptions
+from pandas.compat._optional import import_optional_dependency
+from pandas.core.shared_docs import _shared_docs
 from pandas.io.excel import ExcelFile
-from pandas.io.excel._base import BaseExcelReader, inspect_excel_format
+from pandas.io.excel._base import BaseExcelReader
+from pandas.util._decorators import doc
 
-from ._python_calamine import get_sheet_data, get_sheet_names
-
-
-class __calamine__:
-    pass
+_ValueT = Union[int, float, str, bool, time, date, datetime]
 
 
 class CalamineExcelReader(BaseExcelReader):
-    book: str
     _sheet_names: list[str] | None = None
 
-    @property
-    def _workbook_class(self) -> Type[__calamine__]:
-        return __calamine__
+    @doc(storage_options=_shared_docs["storage_options"])
+    def __init__(
+        self,
+        filepath_or_buffer: FilePath | ReadBuffer[bytes],
+        storage_options: StorageOptions = None,
+    ) -> None:
+        """
+        Reader using the calamine engine (xlsx/xls/xlsb/ods).
+
+        Parameters
+        ----------
+        filepath_or_buffer : file path or readable bytes buffer
+            Path of the workbook to parse, or an open readable byte stream.
+        {storage_options}
+        """
+        import_optional_dependency("python_calamine")
+        super().__init__(filepath_or_buffer, storage_options=storage_options)
 
-    def load_workbook(
-        self, filepath_or_buffer: str | PurePath | BufferedReader | BytesIO
-    ) -> str:
-        if isinstance(filepath_or_buffer, BufferedReader):
-            filepath_or_buffer = filepath_or_buffer.name
-
-        elif isinstance(filepath_or_buffer, BytesIO):
-            ext = inspect_excel_format(filepath_or_buffer)
-            with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file:
-                tmp_file.write(filepath_or_buffer.getvalue())
-                filepath_or_buffer = tmp_file.name
+    @property
+    def _workbook_class(self):
+        from python_calamine import CalamineWorkbook
 
-        elif isinstance(filepath_or_buffer, PurePath):
-            filepath_or_buffer = filepath_or_buffer.as_posix()
+        return CalamineWorkbook
 
-        assert isinstance(filepath_or_buffer, str)
+    def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
+        from python_calamine import CalamineWorkbook
 
-        self._sheet_names = get_sheet_names(filepath_or_buffer)
-        return filepath_or_buffer
+        return CalamineWorkbook.from_pyobject(filepath_or_buffer)
 
     @property
     def sheet_names(self) -> list[str]:
-        if self._sheet_names is None:
-            self._sheet_names = get_sheet_names(self.book)
-        return self._sheet_names
+        return self.book.sheet_names  # pyright: ignore
 
-    def get_sheet_by_name(self, name: str) -> int:
+    def get_sheet_by_name(self, name: str):
         self.raise_if_bad_sheet_by_name(name)
-        return self.sheet_names.index(name)
+        return self.book.get_sheet_by_name(name)  # pyright: ignore
 
-    def get_sheet_by_index(self, index: int) -> int:
+    def get_sheet_by_index(self, index: int):
         self.raise_if_bad_sheet_by_index(index)
-        return index
+        return self.book.get_sheet_by_index(index)  # pyright: ignore
 
-    def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Any]]:
-        return get_sheet_data(self.book, sheet)
+    def get_sheet_data(
+        self, sheet, file_rows_needed: int | None = None
+    ) -> list[list[Scalar]]:
+        """
+        Return the rows of ``sheet`` with every cell converted for pandas.
+        """
 
+        def _convert_cell(value: _ValueT) -> Scalar:
+            if isinstance(value, float):
+                # is_integer() is False for NaN/inf, which int() rejects.
+                return int(value) if value.is_integer() else value
+            if isinstance(value, date):  # datetime is a date subclass
+                return pd.Timestamp(value)
+            if isinstance(value, time):
+                return value.isoformat()
+            return value
 
-def pandas_monkeypatch():
+        rows: list[list[_ValueT]] = sheet.to_python(skip_empty_area=False)
+        data: list[list[Scalar]] = []
+
+        for row in rows:
+            data.append([_convert_cell(cell) for cell in row])
+            # Stop once the requested number of rows has been collected.
+            if file_rows_needed is not None and len(data) >= file_rows_needed:
+                break
+        return data
 
+
+def pandas_monkeypatch():
     ExcelFile._engines = {
         "calamine": CalamineExcelReader,
         **ExcelFile._engines,