Skip to content

Commit b6701f0

Browse files
committed
ENH: add calamine excel reader (close #50395)
Co-author: Kostya Farber (#50581)
1 parent 4b456e2 commit b6701f0

20 files changed

+217
-52
lines changed

ci/deps/actions-310.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,5 @@ dependencies:
5858

5959
- pip:
6060
- pyqt5>=5.15.6
61+
- python-calamine>=0.1.6
6162
- tzdata>=2022.1

ci/deps/actions-311-downstream_compat.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,5 @@ dependencies:
7373
- pip:
7474
- dataframe-api-compat>=0.1.7
7575
- pyqt5>=5.15.6
76+
- python-calamine>=0.1.6
7677
- tzdata>=2022.1

ci/deps/actions-311.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,5 @@ dependencies:
5858

5959
- pip:
6060
- pyqt5>=5.15.6
61+
- python-calamine>=0.1.6
6162
- tzdata>=2022.1

ci/deps/actions-39-minimum_versions.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -61,4 +61,5 @@ dependencies:
6161
- pip:
6262
- dataframe-api-compat==0.1.7
6363
- pyqt5==5.15.6
64+
- python-calamine==0.1.6
6465
- tzdata==2022.1

ci/deps/actions-39.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,5 @@ dependencies:
5858

5959
- pip:
6060
- pyqt5>=5.15.6
61+
- python-calamine>=0.1.6
6162
- tzdata>=2022.1

ci/deps/circle-310-arm64.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,6 @@ dependencies:
5656
- xlrd>=2.0.1
5757
- xlsxwriter>=3.0.3
5858
- zstandard>=0.17.0
59+
60+
- pip:
61+
- python-calamine>=0.1.6

doc/source/getting_started/install.rst

+1
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ xlrd 2.0.1 excel Reading Excel
281281
xlsxwriter 3.0.3 excel Writing Excel
282282
openpyxl 3.0.10 excel Reading / writing for xlsx files
283283
pyxlsb 1.0.9 excel Reading for xlsb files
284+
python-calamine 0.1.6 excel Reading for xls/xlsx/xlsb/ods files
284285
========================= ================== =============== =============================================================
285286

286287
HTML

doc/source/user_guide/io.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -3453,7 +3453,8 @@ Excel files
34533453
The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files
34543454
using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files
34553455
can be read using ``xlrd``. Binary Excel (``.xlsb``)
3456-
files can be read using ``pyxlsb``.
3456+
files can be read using ``pyxlsb``. Also, all of these formats can be read using ``python-calamine``,
3457+
but this library behaves somewhat differently from the other libraries (mostly for ``.ods``).
34573458
The :meth:`~DataFrame.to_excel` instance method is used for
34583459
saving a ``DataFrame`` to Excel. Generally the semantics are
34593460
similar to working with :ref:`csv<io.read_csv_table>` data.

doc/source/whatsnew/v2.2.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ enhancement2
2828

2929
Other enhancements
3030
^^^^^^^^^^^^^^^^^^
31-
-
31+
- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`)
3232
-
3333

3434
.. ---------------------------------------------------------------------------

environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -115,5 +115,6 @@ dependencies:
115115
- pip:
116116
- dataframe-api-compat>=0.1.7
117117
- sphinx-toggleprompt # conda-forge version has stricter pins on jinja2
118+
- python-calamine>=0.1.6
118119
- typing_extensions; python_version<"3.11"
119120
- tzdata>=2022.1

pandas/compat/_optional.py

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"pyarrow": "7.0.0",
3838
"pyreadstat": "1.1.5",
3939
"pytest": "7.3.2",
40+
"python-calamine": "0.1.6",
4041
"pyxlsb": "1.0.9",
4142
"s3fs": "2022.05.0",
4243
"scipy": "1.8.1",
@@ -62,6 +63,7 @@
6263
"lxml.etree": "lxml",
6364
"odf": "odfpy",
6465
"pandas_gbq": "pandas-gbq",
66+
"python_calamine": "python-calamine",
6567
"sqlalchemy": "SQLAlchemy",
6668
"tables": "pytables",
6769
}

pandas/core/config_init.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -513,11 +513,11 @@ def use_inf_as_na_cb(key) -> None:
513513
auto, {others}.
514514
"""
515515

516-
_xls_options = ["xlrd"]
517-
_xlsm_options = ["xlrd", "openpyxl"]
518-
_xlsx_options = ["xlrd", "openpyxl"]
519-
_ods_options = ["odf"]
520-
_xlsb_options = ["pyxlsb"]
516+
_xls_options = ["xlrd", "calamine"]
517+
_xlsm_options = ["xlrd", "openpyxl", "calamine"]
518+
_xlsx_options = ["xlrd", "openpyxl", "calamine"]
519+
_ods_options = ["odf", "calamine"]
520+
_xlsb_options = ["pyxlsb", "calamine"]
521521

522522

523523
with cf.config_prefix("io.excel.xls"):

pandas/io/excel/_base.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -160,13 +160,15 @@
160160
of dtype conversion.
161161
engine : str, default None
162162
If io is not a buffer or path, this must be set to identify io.
163-
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
163+
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine".
164164
Engine compatibility :
165165
166166
- "xlrd" supports old-style Excel files (.xls).
167167
- "openpyxl" supports newer Excel file formats.
168168
- "odf" supports OpenDocument file formats (.odf, .ods, .odt).
169169
- "pyxlsb" supports Binary Excel files.
170+
- "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb)
171+
and OpenDocument (.ods) file formats.
170172
171173
.. versionchanged:: 1.2.0
172174
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
@@ -395,7 +397,7 @@ def read_excel(
395397
| Callable[[str], bool]
396398
| None = ...,
397399
dtype: DtypeArg | None = ...,
398-
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
400+
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
399401
converters: dict[str, Callable] | dict[int, Callable] | None = ...,
400402
true_values: Iterable[Hashable] | None = ...,
401403
false_values: Iterable[Hashable] | None = ...,
@@ -434,7 +436,7 @@ def read_excel(
434436
| Callable[[str], bool]
435437
| None = ...,
436438
dtype: DtypeArg | None = ...,
437-
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
439+
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
438440
converters: dict[str, Callable] | dict[int, Callable] | None = ...,
439441
true_values: Iterable[Hashable] | None = ...,
440442
false_values: Iterable[Hashable] | None = ...,
@@ -473,7 +475,7 @@ def read_excel(
473475
| Callable[[str], bool]
474476
| None = None,
475477
dtype: DtypeArg | None = None,
476-
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None,
478+
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None,
477479
converters: dict[str, Callable] | dict[int, Callable] | None = None,
478480
true_values: Iterable[Hashable] | None = None,
479481
false_values: Iterable[Hashable] | None = None,
@@ -1463,13 +1465,15 @@ class ExcelFile:
14631465
.xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
14641466
engine : str, default None
14651467
If io is not a buffer or path, this must be set to identify io.
1466-
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
1468+
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine``
14671469
Engine compatibility :
14681470
14691471
- ``xlrd`` supports old-style Excel files (.xls).
14701472
- ``openpyxl`` supports newer Excel file formats.
14711473
- ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
14721474
- ``pyxlsb`` supports Binary Excel files.
1475+
- ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
1476+
and OpenDocument (.ods) file formats.
14731477
14741478
.. versionchanged:: 1.2.0
14751479
@@ -1505,6 +1509,7 @@ class ExcelFile:
15051509
... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP
15061510
"""
15071511

1512+
from pandas.io.excel._calamine import CalamineReader
15081513
from pandas.io.excel._odfreader import ODFReader
15091514
from pandas.io.excel._openpyxl import OpenpyxlReader
15101515
from pandas.io.excel._pyxlsb import PyxlsbReader
@@ -1515,6 +1520,7 @@ class ExcelFile:
15151520
"openpyxl": OpenpyxlReader,
15161521
"odf": ODFReader,
15171522
"pyxlsb": PyxlsbReader,
1523+
"calamine": CalamineReader,
15181524
}
15191525

15201526
def __init__(

pandas/io/excel/_calamine.py

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
from __future__ import annotations
2+
3+
from datetime import (
4+
date,
5+
datetime,
6+
time,
7+
timedelta,
8+
)
9+
from typing import (
10+
TYPE_CHECKING,
11+
Any,
12+
Union,
13+
cast,
14+
)
15+
16+
from pandas._typing import Scalar
17+
from pandas.compat._optional import import_optional_dependency
18+
from pandas.util._decorators import doc
19+
20+
import pandas as pd
21+
from pandas.core.shared_docs import _shared_docs
22+
23+
from pandas.io.excel._base import BaseExcelReader
24+
25+
if TYPE_CHECKING:
26+
from python_calamine import (
27+
CalamineSheet,
28+
CalamineWorkbook,
29+
)
30+
31+
from pandas._typing import (
32+
FilePath,
33+
ReadBuffer,
34+
StorageOptions,
35+
)
36+
37+
_CellValueT = Union[int, float, str, bool, time, date, datetime, timedelta]
38+
39+
40+
class CalamineReader(BaseExcelReader["CalamineWorkbook"]):
41+
@doc(storage_options=_shared_docs["storage_options"])
42+
def __init__(
43+
self,
44+
filepath_or_buffer: FilePath | ReadBuffer[bytes],
45+
storage_options: StorageOptions | None = None,
46+
engine_kwargs: dict | None = None,
47+
) -> None:
48+
"""
49+
Reader using calamine engine (xlsx/xls/xlsb/ods).
50+
51+
Parameters
52+
----------
53+
filepath_or_buffer : str, path to be parsed or
54+
an open readable stream.
55+
{storage_options}
56+
engine_kwargs : dict, optional
57+
Arbitrary keyword arguments passed to excel engine.
58+
"""
59+
import_optional_dependency("python_calamine")
60+
super().__init__(
61+
filepath_or_buffer,
62+
storage_options=storage_options,
63+
engine_kwargs=engine_kwargs,
64+
)
65+
66+
@property
67+
def _workbook_class(self) -> type[CalamineWorkbook]:
68+
from python_calamine import CalamineWorkbook
69+
70+
return CalamineWorkbook
71+
72+
def load_workbook(
73+
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any
74+
) -> CalamineWorkbook:
75+
from python_calamine import load_workbook
76+
77+
return load_workbook(
78+
filepath_or_buffer, **engine_kwargs # type: ignore[arg-type]
79+
)
80+
81+
@property
82+
def sheet_names(self) -> list[str]:
83+
from python_calamine import SheetTypeEnum
84+
85+
return [
86+
sheet.name
87+
for sheet in self.book.sheets_metadata
88+
if sheet.typ == SheetTypeEnum.WorkSheet
89+
]
90+
91+
def get_sheet_by_name(self, name: str) -> CalamineSheet:
92+
self.raise_if_bad_sheet_by_name(name)
93+
return self.book.get_sheet_by_name(name)
94+
95+
def get_sheet_by_index(self, index: int) -> CalamineSheet:
96+
self.raise_if_bad_sheet_by_index(index)
97+
return self.book.get_sheet_by_index(index)
98+
99+
def get_sheet_data(
100+
self, sheet: CalamineSheet, file_rows_needed: int | None = None
101+
) -> list[list[Scalar]]:
102+
def _convert_cell(value: _CellValueT) -> Scalar:
103+
if isinstance(value, float):
104+
val = int(value)
105+
if val == value:
106+
return val
107+
else:
108+
return value
109+
elif isinstance(value, date):
110+
return pd.Timestamp(value)
111+
elif isinstance(value, timedelta):
112+
return pd.Timedelta(value)
113+
elif isinstance(value, time):
114+
# cast needed here because Scalar doesn't include datetime.time
115+
return cast(Scalar, value)
116+
117+
return value
118+
119+
rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False)
120+
data: list[list[Scalar]] = []
121+
122+
for row in rows:
123+
data.append([_convert_cell(cell) for cell in row])
124+
if file_rows_needed is not None and len(data) >= file_rows_needed:
125+
break
126+
127+
return data

0 commit comments

Comments
 (0)