Skip to content

ENH: Add xlsb auto detection to read_excel and respect default options #38710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 3, 2021
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Other enhancements
- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
- :func:`pandas.read_excel` can now auto-detect ``.xlsb`` files and read them with the ``pyxlsb`` engine (:issue:`35416`)

.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -269,7 +270,7 @@ I/O
for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)

- :func:`pandas.read_excel` now respects the ``io.excel.<ext>.reader`` options set with :func:`pandas.set_option` (:issue:`34252`)

Period
^^^^^^
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,15 +524,15 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
validator=str,
validator=is_one_of_factory(_xls_options + ["auto"]),
)

with cf.config_prefix("io.excel.xlsm"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
validator=str,
validator=is_one_of_factory(_xlsm_options + ["auto"]),
)


Expand All @@ -541,7 +541,7 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
validator=str,
validator=is_one_of_factory(_xlsx_options + ["auto"]),
)


Expand All @@ -550,15 +550,15 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
validator=str,
validator=is_one_of_factory(_ods_options + ["auto"]),
)

with cf.config_prefix("io.excel.xlsb"):
cf.register_option(
"reader",
"auto",
reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
validator=str,
validator=is_one_of_factory(_xlsb_options + ["auto"]),
)

# Set up the io.excel specific writer configuration.
Expand Down
32 changes: 14 additions & 18 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg
from pandas.io.excel._util import (
fill_mi_header,
get_default_writer,
get_default_engine,
get_writer,
maybe_convert_usecols,
pop_header_name,
Expand Down Expand Up @@ -123,6 +123,10 @@
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is an xls format,
``xlrd`` will be used.
- Otherwise if ``path_or_buffer`` is in xlsb format,
``pyxlsb`` will be used.

.. versionadded:: 1.3.0
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
then ``openpyxl`` will be used.
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
Expand Down Expand Up @@ -707,7 +711,7 @@ def __new__(cls, path, engine=None, **kwargs):
try:
engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
if engine == "auto":
engine = get_default_writer(ext)
engine = get_default_engine(ext, mode="writer")
except KeyError as err:
raise ValueError(f"No engine for filetype: '{ext}'") from err

Expand Down Expand Up @@ -1009,6 +1013,10 @@ class ExcelFile:
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is an xls format,
``xlrd`` will be used.
- Otherwise if ``path_or_buffer`` is in xlsb format,
`pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.

.. versionadded:: 1.3.0
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
then ``openpyxl`` will be used.
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
Expand Down Expand Up @@ -1065,21 +1073,10 @@ def __init__(
)

if engine is None:
if ext == "ods":
engine = "odf"
elif ext == "xls":
engine = "xlrd"
else:
# GH 35029 - Prefer openpyxl except for xls files
if (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
is not None
):
engine = "openpyxl"
else:
engine = "xlrd"
# ext will always be valid, otherwise inspect_excel_format would raise
engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
if engine == "auto":
engine = get_default_engine(ext, mode="reader")

if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
if xlrd_version >= "2":
Expand Down Expand Up @@ -1107,7 +1104,6 @@ def __init__(
FutureWarning,
stacklevel=stacklevel,
)
assert engine in self._engines, f"Engine {engine} not recognized"

self.engine = engine
self.storage_options = storage_options
Expand Down
45 changes: 37 additions & 8 deletions pandas/io/excel/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,61 @@ def register_writer(klass):
_writers[engine_name] = klass


def get_default_writer(ext):
def get_default_engine(ext, mode="reader"):
"""
Return the default writer for the given extension.
Return the default reader/writer for the given extension.

Parameters
----------
ext : str
The excel file extension for which to get the default engine.
mode : str {'reader', 'writer'}
Whether to get the default engine for reading or writing.
Either 'reader' or 'writer'

Returns
-------
str
The default engine for the extension.
"""
_default_readers = {
"xlsx": "openpyxl",
"xlsm": "openpyxl",
"xlsb": "pyxlsb",
"xls": "xlrd",
"ods": "odf",
}
_default_writers = {
"xlsx": "openpyxl",
"xlsm": "openpyxl",
"xlsb": "pyxlsb",
"xls": "xlwt",
"ods": "odf",
}
xlsxwriter = import_optional_dependency(
"xlsxwriter", raise_on_missing=False, on_version="warn"
)
if xlsxwriter:
_default_writers["xlsx"] = "xlsxwriter"
return _default_writers[ext]
assert mode in ["reader", "writer"]
if mode == "writer":
# Prefer xlsxwriter over openpyxl if installed
xlsxwriter = import_optional_dependency(
"xlsxwriter", raise_on_missing=False, on_version="warn"
)
if xlsxwriter:
_default_writers["xlsx"] = "xlsxwriter"
return _default_writers[ext]
else:
if (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
is None
and import_optional_dependency(
"xlrd", raise_on_missing=False, on_version="ignore"
)
is not None
):
# if no openpyxl but xlrd installed, return xlrd
# the version is handled elsewhere
_default_readers["xlsx"] = "xlrd"
return _default_readers[ext]


def get_writer(engine_name):
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,8 +1234,6 @@ def test_excel_read_binary(self, engine, read_ext):

def test_excel_read_binary_via_read_excel(self, read_ext, engine):
# GH 38424
if read_ext == ".xlsb" and engine == "pyxlsb":
pytest.xfail("GH 38667 - should default to pyxlsb but doesn't")
with open("test1" + read_ext, "rb") as f:
result = pd.read_excel(f)
expected = pd.read_excel("test1" + read_ext, engine=engine)
Expand Down Expand Up @@ -1282,3 +1280,9 @@ def test_read_datetime_multiindex(self, engine, read_ext):
expected = DataFrame([], columns=expected_column_index)

tm.assert_frame_equal(expected, actual)

def test_engine_invalid_option(self, read_ext):
# read_ext includes the '.' hence the weird formatting
with pytest.raises(ValueError, match="Value must be one of *"):
with pd.option_context(f"io.excel{read_ext}.reader", "abc"):
pass