Skip to content

Commit bafe865

Browse files
committed
ENH: add calamine excel reader (see also pandas-dev#50581)
1 parent db27c36 commit bafe865

22 files changed

+238
-59
lines changed

ci/deps/actions-310.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,5 @@ dependencies:
6060

6161
- pip:
6262
- pyqt5>=5.15.6
63+
- python-calamine>=0.1.4
6364
- tzdata>=2022.1

ci/deps/actions-311-downstream_compat.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,5 @@ dependencies:
7474
- py
7575
- pip:
7676
- pyqt5>=5.15.6
77+
- python-calamine>=0.1.4
7778
- tzdata>=2022.1

ci/deps/actions-311.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,5 @@ dependencies:
6060

6161
- pip:
6262
- pyqt5>=5.15.6
63+
- python-calamine>=0.1.4
6364
- tzdata>=2022.1

ci/deps/actions-39-minimum_versions.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,5 @@ dependencies:
6262

6363
- pip:
6464
- pyqt5==5.15.6
65+
- python-calamine==0.1.4
6566
- tzdata==2022.1

ci/deps/actions-39.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,5 @@ dependencies:
6060

6161
- pip:
6262
- pyqt5>=5.15.6
63+
- python-calamine>=0.1.4
6364
- tzdata>=2022.1

ci/deps/circle-310-arm64.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,6 @@ dependencies:
5858
- xlrd>=2.0.1
5959
- xlsxwriter>=3.0.3
6060
- zstandard>=0.17.0
61+
62+
- pip:
63+
- python-calamine>=0.1.4

doc/source/getting_started/install.rst

+1
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ xlrd 2.0.1 excel Reading Excel
337337
xlsxwriter 3.0.3 excel Writing Excel
338338
openpyxl 3.0.10 excel Reading / writing for xlsx files
339339
pyxlsb 1.0.9 excel Reading for xlsb files
340+
python-calamine 0.1.4 excel Reading for xls/xlsx/xlsb/ods files
340341
========================= ================== =============== =============================================================
341342

342343
HTML

doc/source/user_guide/io.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -3436,7 +3436,9 @@ Excel files
34363436
The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files
34373437
using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files
34383438
can be read using ``xlrd``. Binary Excel (``.xlsb``)
3439-
files can be read using ``pyxlsb``.
3439+
files can be read using ``pyxlsb``. All of these formats can also be read using ``python-calamine``,
3440+
but this library has some limitations and behaves differently from the other libraries;
3441+
for example, it cannot detect chartsheets.
34403442
The :meth:`~DataFrame.to_excel` instance method is used for
34413443
saving a ``DataFrame`` to Excel. Generally the semantics are
34423444
similar to working with :ref:`csv<io.read_csv_table>` data.

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ Other enhancements
164164
- Added :meth:`ExtensionArray.interpolate` used by :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`53659`)
165165
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
166166
- Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`).
167+
- Added ``calamine`` as an engine to ``read_excel`` (:issue:`50395`)
167168
- Groupby aggregations (such as :meth:`DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`)
168169
- Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`)
169170
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)

environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -117,3 +117,4 @@ dependencies:
117117
- sphinx-toggleprompt # conda-forge version has stricter pins on jinja2
118118
- typing_extensions; python_version<"3.11"
119119
- tzdata>=2022.1
120+
- python-calamine

pandas/compat/_optional.py

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"pyarrow": "7.0.0",
3838
"pyreadstat": "1.1.5",
3939
"pytest": "7.3.2",
40+
"python-calamine": "0.1.4",
4041
"pyxlsb": "1.0.9",
4142
"s3fs": "2022.05.0",
4243
"scipy": "1.8.1",

pandas/core/config_init.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -511,11 +511,11 @@ def use_inf_as_na_cb(key) -> None:
511511
auto, {others}.
512512
"""
513513

514-
_xls_options = ["xlrd"]
515-
_xlsm_options = ["xlrd", "openpyxl"]
516-
_xlsx_options = ["xlrd", "openpyxl"]
517-
_ods_options = ["odf"]
518-
_xlsb_options = ["pyxlsb"]
514+
_xls_options = ["xlrd", "calamine"]
515+
_xlsm_options = ["xlrd", "openpyxl", "calamine"]
516+
_xlsx_options = ["xlrd", "openpyxl", "calamine"]
517+
_ods_options = ["odf", "calamine"]
518+
_xlsb_options = ["pyxlsb", "calamine"]
519519

520520

521521
with cf.config_prefix("io.excel.xls"):

pandas/io/excel/_base.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,15 @@
156156
of dtype conversion.
157157
engine : str, default None
158158
If io is not a buffer or path, this must be set to identify io.
159-
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
159+
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", "calamine".
160160
Engine compatibility :
161161
162162
- "xlrd" supports old-style Excel files (.xls).
163163
- "openpyxl" supports newer Excel file formats.
164164
- "odf" supports OpenDocument file formats (.odf, .ods, .odt).
165165
- "pyxlsb" supports Binary Excel files.
166+
- "calamine" supports Excel (.xls, .xlsx, .xlsm, .xlsb)
167+
and OpenDocument (.ods) file formats.
166168
167169
.. versionchanged:: 1.2.0
168170
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
@@ -391,7 +393,7 @@ def read_excel(
391393
| Callable[[str], bool]
392394
| None = ...,
393395
dtype: DtypeArg | None = ...,
394-
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
396+
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
395397
converters: dict[str, Callable] | dict[int, Callable] | None = ...,
396398
true_values: Iterable[Hashable] | None = ...,
397399
false_values: Iterable[Hashable] | None = ...,
@@ -430,7 +432,7 @@ def read_excel(
430432
| Callable[[str], bool]
431433
| None = ...,
432434
dtype: DtypeArg | None = ...,
433-
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
435+
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ...,
434436
converters: dict[str, Callable] | dict[int, Callable] | None = ...,
435437
true_values: Iterable[Hashable] | None = ...,
436438
false_values: Iterable[Hashable] | None = ...,
@@ -469,7 +471,7 @@ def read_excel(
469471
| Callable[[str], bool]
470472
| None = None,
471473
dtype: DtypeArg | None = None,
472-
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None,
474+
engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None,
473475
converters: dict[str, Callable] | dict[int, Callable] | None = None,
474476
true_values: Iterable[Hashable] | None = None,
475477
false_values: Iterable[Hashable] | None = None,
@@ -1448,13 +1450,15 @@ class ExcelFile:
14481450
.xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
14491451
engine : str, default None
14501452
If io is not a buffer or path, this must be set to identify io.
1451-
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
1453+
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine``
14521454
Engine compatibility :
14531455
14541456
- ``xlrd`` supports old-style Excel files (.xls).
14551457
- ``openpyxl`` supports newer Excel file formats.
14561458
- ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
14571459
- ``pyxlsb`` supports Binary Excel files.
1460+
- ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb)
1461+
and OpenDocument (.ods) file formats.
14581462
14591463
.. versionchanged:: 1.2.0
14601464
@@ -1490,6 +1494,7 @@ class ExcelFile:
14901494
... df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP
14911495
"""
14921496

1497+
from pandas.io.excel._calamine import CalamineReader
14931498
from pandas.io.excel._odfreader import ODFReader
14941499
from pandas.io.excel._openpyxl import OpenpyxlReader
14951500
from pandas.io.excel._pyxlsb import PyxlsbReader
@@ -1500,6 +1505,7 @@ class ExcelFile:
15001505
"openpyxl": OpenpyxlReader,
15011506
"odf": ODFReader,
15021507
"pyxlsb": PyxlsbReader,
1508+
"calamine": CalamineReader,
15031509
}
15041510

15051511
def __init__(

pandas/io/excel/_calamine.py

+116
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
from __future__ import annotations
2+
3+
from datetime import (
4+
date,
5+
datetime,
6+
time,
7+
)
8+
from typing import (
9+
TYPE_CHECKING,
10+
Union,
11+
cast,
12+
)
13+
14+
from pandas._typing import Scalar
15+
from pandas.compat._optional import import_optional_dependency
16+
from pandas.util._decorators import doc
17+
18+
import pandas as pd
19+
from pandas.core.shared_docs import _shared_docs
20+
21+
from pandas.io.excel._base import BaseExcelReader
22+
23+
if TYPE_CHECKING:
24+
from pandas._typing import (
25+
FilePath,
26+
ReadBuffer,
27+
StorageOptions,
28+
)
29+
30+
_CellValueT = Union[int, float, str, bool, time, date, datetime]
31+
32+
33+
class CalamineReader(BaseExcelReader):
    """Excel reader backed by the optional ``python_calamine`` package."""

    @doc(storage_options=_shared_docs["storage_options"])
    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        storage_options: StorageOptions | None = None,
        engine_kwargs: dict | None = None,
    ) -> None:
        """
        Reader using calamine engine (xlsx/xls/xlsb/ods).

        Parameters
        ----------
        filepath_or_buffer : str, path to be parsed or
            an open readable stream.
        {storage_options}
        engine_kwargs : dict, optional
            Arbitrary keyword arguments passed to excel engine.
        """
        # Check for the optional dependency before the base class starts
        # working with the file.
        import_optional_dependency("python_calamine")
        super().__init__(
            filepath_or_buffer,
            storage_options=storage_options,
            engine_kwargs=engine_kwargs,
        )

    @property
    def _workbook_class(self):
        """Return the ``CalamineWorkbook`` class (imported lazily)."""
        from python_calamine import CalamineWorkbook

        return CalamineWorkbook

    def load_workbook(
        self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
    ):
        """
        Open ``filepath_or_buffer`` with ``python_calamine.load_workbook``.

        ``engine_kwargs`` is unpacked as keyword arguments, so it must be
        a mapping (not ``None``).
        """
        from python_calamine import load_workbook

        return load_workbook(
            filepath_or_buffer, **engine_kwargs  # type: ignore[arg-type]
        )

    @property
    def sheet_names(self) -> list[str]:
        """Sheet names as reported by the loaded workbook."""
        return self.book.sheet_names  # pyright: ignore[reportGeneralTypeIssues]

    def get_sheet_by_name(self, name: str):
        """Validate ``name`` and return the matching sheet from the workbook."""
        self.raise_if_bad_sheet_by_name(name)
        return self.book.get_sheet_by_name(  # pyright: ignore[reportGeneralTypeIssues]
            name
        )

    def get_sheet_by_index(self, index: int):
        """Validate ``index`` and return the matching sheet from the workbook."""
        self.raise_if_bad_sheet_by_index(index)
        return self.book.get_sheet_by_index(  # pyright: ignore[reportGeneralTypeIssues]
            index
        )

    def get_sheet_data(
        self, sheet, file_rows_needed: int | None = None
    ) -> list[list[Scalar]]:
        """
        Convert the cells of ``sheet`` into a list of rows of scalars.

        Parameters
        ----------
        sheet : object
            Sheet object exposing ``to_python(skip_empty_area=...)``.
        file_rows_needed : int, optional
            Stop converting once this many rows have been collected.
            ``None`` converts every row.

        Returns
        -------
        list of list
            One inner list per row. Integral floats become ``int``,
            ``date``/``datetime`` values become ``pd.Timestamp``, and all
            other values are returned unchanged.
        """

        def _convert_cell(value: _CellValueT) -> Scalar:
            if isinstance(value, float):
                # is_integer() is False for NaN and +/-inf, so non-finite
                # values pass through instead of making int() raise.
                return int(value) if value.is_integer() else value
            if isinstance(value, date):
                # datetime is a subclass of date, so this covers both.
                return pd.Timestamp(value)
            if isinstance(value, time):
                # cast needed here because Scalar doesn't include datetime.time
                return cast(Scalar, value)

            return value

        rows: list[list[_CellValueT]] = sheet.to_python(skip_empty_area=False)
        data: list[list[Scalar]] = []

        for row in rows:
            data.append([_convert_cell(cell) for cell in row])
            # Checked after appending, so at least one row is kept whenever
            # the sheet is non-empty.
            if file_rows_needed is not None and len(data) >= file_rows_needed:
                break

        return data
13 KB
Binary file not shown.
13 KB
Binary file not shown.

0 commit comments

Comments
 (0)