Skip to content

Commit 4dfca9b

Browse files
committed
backported pandas.CalamineExcelReader from pandas-dev/pandas#50581
1 parent 1f91242 commit 4dfca9b

File tree

1 file changed

+64
-40
lines changed

1 file changed

+64
-40
lines changed

python/python_calamine/pandas.py

+64-40
Original file line number | Diff line number | Diff line change
@@ -1,68 +1,92 @@
11
from __future__ import annotations
22

3-
from io import BufferedReader, BytesIO
4-
from pathlib import PurePath
5-
from tempfile import NamedTemporaryFile
6-
from typing import Any, Type
3+
from datetime import date, datetime, time
4+
from typing import Union
75

6+
import pandas as pd
7+
from pandas._typing import FilePath, ReadBuffer, Scalar, StorageOptions
8+
from pandas.compat._optional import import_optional_dependency
9+
from pandas.core.shared_docs import _shared_docs
810
from pandas.io.excel import ExcelFile
9-
from pandas.io.excel._base import BaseExcelReader, inspect_excel_format
11+
from pandas.io.excel._base import BaseExcelReader
12+
from pandas.util._decorators import doc
1013

11-
from ._python_calamine import get_sheet_data, get_sheet_names
12-
13-
14-
class __calamine__:
15-
pass
14+
_ValueT = Union[int, float, str, bool, time, date, datetime]
1615

1716

1817
class CalamineExcelReader(BaseExcelReader):
19-
book: str
2018
_sheet_names: list[str] | None = None
2119

22-
@property
23-
def _workbook_class(self) -> Type[__calamine__]:
24-
return __calamine__
20+
@doc(storage_options=_shared_docs["storage_options"])
21+
def __init__(
22+
self,
23+
filepath_or_buffer: FilePath | ReadBuffer[bytes],
24+
storage_options: StorageOptions = None,
25+
) -> None:
26+
"""
27+
Reader using calamine engine (xlsx/xls/xlsb/ods).
28+
29+
Parameters
30+
----------
31+
filepath_or_buffer : str, path to be parsed or
32+
an open readable stream.
33+
{storage_options}
34+
"""
35+
import_optional_dependency("python_calamine")
36+
super().__init__(filepath_or_buffer, storage_options=storage_options)
2537

26-
def load_workbook(
27-
self, filepath_or_buffer: str | PurePath | BufferedReader | BytesIO
28-
) -> str:
29-
if isinstance(filepath_or_buffer, BufferedReader):
30-
filepath_or_buffer = filepath_or_buffer.name
31-
32-
elif isinstance(filepath_or_buffer, BytesIO):
33-
ext = inspect_excel_format(filepath_or_buffer)
34-
with NamedTemporaryFile(suffix=f".{ext}", delete=False) as tmp_file:
35-
tmp_file.write(filepath_or_buffer.getvalue())
36-
filepath_or_buffer = tmp_file.name
38+
@property
39+
def _workbook_class(self):
40+
from python_calamine import CalamineWorkbook
3741

38-
elif isinstance(filepath_or_buffer, PurePath):
39-
filepath_or_buffer = filepath_or_buffer.as_posix()
42+
return CalamineWorkbook
4043

41-
assert isinstance(filepath_or_buffer, str)
44+
def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
45+
from python_calamine import CalamineWorkbook
4246

43-
self._sheet_names = get_sheet_names(filepath_or_buffer)
44-
return filepath_or_buffer
47+
return CalamineWorkbook.from_pyobject(filepath_or_buffer)
4548

4649
@property
4750
def sheet_names(self) -> list[str]:
48-
if self._sheet_names is None:
49-
self._sheet_names = get_sheet_names(self.book)
50-
return self._sheet_names
51+
return self.book.sheet_names # pyright: ignore
5152

52-
def get_sheet_by_name(self, name: str) -> int:
53+
def get_sheet_by_name(self, name: str):
5354
self.raise_if_bad_sheet_by_name(name)
54-
return self.sheet_names.index(name)
55+
return self.book.get_sheet_by_name(name) # pyright: ignore
5556

56-
def get_sheet_by_index(self, index: int) -> int:
57+
def get_sheet_by_index(self, index: int):
5758
self.raise_if_bad_sheet_by_index(index)
58-
return index
59+
return self.book.get_sheet_by_index(index) # pyright: ignore
5960

60-
def get_sheet_data(self, sheet: int, convert_float: bool) -> list[list[Any]]:
61-
return get_sheet_data(self.book, sheet)
61+
def get_sheet_data(
62+
self, sheet, file_rows_needed: int | None = None
63+
) -> list[list[Scalar]]:
64+
def _convert_cell(value: _ValueT) -> Scalar:
65+
if isinstance(value, float):
66+
val = int(value)
67+
if val == value:
68+
return val
69+
else:
70+
return value
71+
elif isinstance(value, date):
72+
return pd.Timestamp(value)
73+
elif isinstance(value, time):
74+
return value.isoformat()
6275

76+
return value
6377

64-
def pandas_monkeypatch():
78+
rows: list[list[_ValueT]] = sheet.to_python(skip_empty_area=False)
79+
data: list[list[Scalar]] = []
80+
81+
for row in rows:
82+
data.append([_convert_cell(cell) for cell in row])
83+
if file_rows_needed is not None and len(data) >= file_rows_needed:
84+
break
85+
86+
return data
6587

88+
89+
def pandas_monkeypatch():
6690
ExcelFile._engines = {
6791
"calamine": CalamineExcelReader,
6892
**ExcelFile._engines,

0 commit comments

Comments
 (0)