Skip to content

Commit 8d923c9

Browse files
authored
ENH: Add xlsb auto detection to read_excel and respect default options (#38710)
1 parent b337b61 commit 8d923c9

File tree

5 files changed

+64
-34
lines changed

5 files changed

+64
-34
lines changed

doc/source/whatsnew/v1.3.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ Other enhancements
5151
- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
5252
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
5353
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
54+
- :func:`pandas.read_excel` can now auto-detect ``.xlsb`` files (:issue:`35416`)
5455

5556
.. ---------------------------------------------------------------------------
5657
@@ -270,7 +271,7 @@ I/O
270271
for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
271272
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
272273
- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
273-
274+
- :func:`pandas.read_excel` now respects :func:`pandas.set_option` (:issue:`34252`)
274275

275276
Period
276277
^^^^^^

pandas/core/config_init.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -524,15 +524,15 @@ def use_inf_as_na_cb(key):
524524
"reader",
525525
"auto",
526526
reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
527-
validator=str,
527+
validator=is_one_of_factory(_xls_options + ["auto"]),
528528
)
529529

530530
with cf.config_prefix("io.excel.xlsm"):
531531
cf.register_option(
532532
"reader",
533533
"auto",
534534
reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
535-
validator=str,
535+
validator=is_one_of_factory(_xlsm_options + ["auto"]),
536536
)
537537

538538

@@ -541,7 +541,7 @@ def use_inf_as_na_cb(key):
541541
"reader",
542542
"auto",
543543
reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
544-
validator=str,
544+
validator=is_one_of_factory(_xlsx_options + ["auto"]),
545545
)
546546

547547

@@ -550,15 +550,15 @@ def use_inf_as_na_cb(key):
550550
"reader",
551551
"auto",
552552
reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
553-
validator=str,
553+
validator=is_one_of_factory(_ods_options + ["auto"]),
554554
)
555555

556556
with cf.config_prefix("io.excel.xlsb"):
557557
cf.register_option(
558558
"reader",
559559
"auto",
560560
reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
561-
validator=str,
561+
validator=is_one_of_factory(_xlsb_options + ["auto"]),
562562
)
563563

564564
# Set up the io.excel specific writer configuration.

pandas/io/excel/_base.py

+14-18
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg
2626
from pandas.io.excel._util import (
2727
fill_mi_header,
28-
get_default_writer,
28+
get_default_engine,
2929
get_writer,
3030
maybe_convert_usecols,
3131
pop_header_name,
@@ -123,6 +123,10 @@
123123
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
124124
- Otherwise if ``path_or_buffer`` is an xls format,
125125
``xlrd`` will be used.
126+
- Otherwise if ``path_or_buffer`` is in xlsb format,
127+
``pyxlsb`` will be used.
128+
129+
.. versionadded:: 1.3.0
126130
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
127131
then ``openpyxl`` will be used.
128132
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
@@ -707,7 +711,7 @@ def __new__(cls, path, engine=None, **kwargs):
707711
try:
708712
engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
709713
if engine == "auto":
710-
engine = get_default_writer(ext)
714+
engine = get_default_engine(ext, mode="writer")
711715
except KeyError as err:
712716
raise ValueError(f"No engine for filetype: '{ext}'") from err
713717

@@ -1009,6 +1013,10 @@ class ExcelFile:
10091013
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
10101014
- Otherwise if ``path_or_buffer`` is an xls format,
10111015
``xlrd`` will be used.
1016+
- Otherwise if ``path_or_buffer`` is in xlsb format,
1017+
`pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.
1018+
1019+
.. versionadded:: 1.3.0
10121020
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
10131021
then ``openpyxl`` will be used.
10141022
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
@@ -1065,21 +1073,10 @@ def __init__(
10651073
)
10661074

10671075
if engine is None:
1068-
if ext == "ods":
1069-
engine = "odf"
1070-
elif ext == "xls":
1071-
engine = "xlrd"
1072-
else:
1073-
# GH 35029 - Prefer openpyxl except for xls files
1074-
if (
1075-
import_optional_dependency(
1076-
"openpyxl", raise_on_missing=False, on_version="ignore"
1077-
)
1078-
is not None
1079-
):
1080-
engine = "openpyxl"
1081-
else:
1082-
engine = "xlrd"
1076+
# ext will always be valid, otherwise inspect_excel_format would raise
1077+
engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
1078+
if engine == "auto":
1079+
engine = get_default_engine(ext, mode="reader")
10831080

10841081
if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
10851082
if xlrd_version >= "2":
@@ -1107,7 +1104,6 @@ def __init__(
11071104
FutureWarning,
11081105
stacklevel=stacklevel,
11091106
)
1110-
assert engine in self._engines, f"Engine {engine} not recognized"
11111107

11121108
self.engine = engine
11131109
self.storage_options = storage_options

pandas/io/excel/_util.py

+37-8
Original file line numberDiff line numberDiff line change
@@ -23,32 +23,61 @@ def register_writer(klass):
2323
_writers[engine_name] = klass
2424

2525

26-
def get_default_writer(ext):
26+
def get_default_engine(ext, mode="reader"):
2727
"""
28-
Return the default writer for the given extension.
28+
Return the default reader/writer for the given extension.
2929
3030
Parameters
3131
----------
3232
ext : str
3333
The excel file extension for which to get the default engine.
34+
mode : str {'reader', 'writer'}
35+
Whether to get the default engine for reading or writing.
36+
Either 'reader' or 'writer'
3437
3538
Returns
3639
-------
3740
str
3841
The default engine for the extension.
3942
"""
43+
_default_readers = {
44+
"xlsx": "openpyxl",
45+
"xlsm": "openpyxl",
46+
"xlsb": "pyxlsb",
47+
"xls": "xlrd",
48+
"ods": "odf",
49+
}
4050
_default_writers = {
4151
"xlsx": "openpyxl",
4252
"xlsm": "openpyxl",
53+
"xlsb": "pyxlsb",
4354
"xls": "xlwt",
4455
"ods": "odf",
4556
}
46-
xlsxwriter = import_optional_dependency(
47-
"xlsxwriter", raise_on_missing=False, on_version="warn"
48-
)
49-
if xlsxwriter:
50-
_default_writers["xlsx"] = "xlsxwriter"
51-
return _default_writers[ext]
57+
assert mode in ["reader", "writer"]
58+
if mode == "writer":
59+
# Prefer xlsxwriter over openpyxl if installed
60+
xlsxwriter = import_optional_dependency(
61+
"xlsxwriter", raise_on_missing=False, on_version="warn"
62+
)
63+
if xlsxwriter:
64+
_default_writers["xlsx"] = "xlsxwriter"
65+
return _default_writers[ext]
66+
else:
67+
if (
68+
import_optional_dependency(
69+
"openpyxl", raise_on_missing=False, on_version="ignore"
70+
)
71+
is None
72+
and import_optional_dependency(
73+
"xlrd", raise_on_missing=False, on_version="ignore"
74+
)
75+
is not None
76+
):
77+
# if no openpyxl but xlrd installed, return xlrd
78+
# the version is handled elsewhere
79+
_default_readers["xlsx"] = "xlrd"
80+
return _default_readers[ext]
5281

5382

5483
def get_writer(engine_name):

pandas/tests/io/excel/test_readers.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -1234,8 +1234,6 @@ def test_excel_read_binary(self, engine, read_ext):
12341234

12351235
def test_excel_read_binary_via_read_excel(self, read_ext, engine):
12361236
# GH 38424
1237-
if read_ext == ".xlsb" and engine == "pyxlsb":
1238-
pytest.xfail("GH 38667 - should default to pyxlsb but doesn't")
12391237
with open("test1" + read_ext, "rb") as f:
12401238
result = pd.read_excel(f)
12411239
expected = pd.read_excel("test1" + read_ext, engine=engine)
@@ -1282,3 +1280,9 @@ def test_read_datetime_multiindex(self, engine, read_ext):
12821280
expected = DataFrame([], columns=expected_column_index)
12831281

12841282
tm.assert_frame_equal(expected, actual)
1283+
1284+
def test_engine_invalid_option(self, read_ext):
1285+
# read_ext includes the '.' hence the weird formatting
1286+
with pytest.raises(ValueError, match="Value must be one of *"):
1287+
with pd.option_context(f"io.excel{read_ext}.reader", "abc"):
1288+
pass

0 commit comments

Comments
 (0)