diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 85da250648c28..d89a868479d06 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -51,6 +51,7 @@ Other enhancements
- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
+- :func:`pandas.read_excel` can now auto-detect ``.xlsb`` files (:issue:`35416`)
.. ---------------------------------------------------------------------------
@@ -269,7 +270,7 @@ I/O
for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
- Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`)
-
+- :func:`pandas.read_excel` now respects :func:`pandas.set_option` (:issue:`34252`)
Period
^^^^^^
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 7d9664bd9f965..fba82ae499e90 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -524,7 +524,7 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
- validator=str,
+ validator=is_one_of_factory(_xls_options + ["auto"]),
)
with cf.config_prefix("io.excel.xlsm"):
@@ -532,7 +532,7 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
- validator=str,
+ validator=is_one_of_factory(_xlsm_options + ["auto"]),
)
@@ -541,7 +541,7 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
- validator=str,
+ validator=is_one_of_factory(_xlsx_options + ["auto"]),
)
@@ -550,7 +550,7 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
- validator=str,
+ validator=is_one_of_factory(_ods_options + ["auto"]),
)
with cf.config_prefix("io.excel.xlsb"):
@@ -558,7 +558,7 @@ def use_inf_as_na_cb(key):
"reader",
"auto",
reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
- validator=str,
+ validator=is_one_of_factory(_xlsb_options + ["auto"]),
)
# Set up the io.excel specific writer configuration.
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index b23b5fe5b34a8..495d51c2cebce 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -25,7 +25,7 @@
from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg
from pandas.io.excel._util import (
fill_mi_header,
- get_default_writer,
+ get_default_engine,
get_writer,
maybe_convert_usecols,
pop_header_name,
@@ -123,6 +123,10 @@
then `odf `_ will be used.
- Otherwise if ``path_or_buffer`` is an xls format,
``xlrd`` will be used.
+ - Otherwise if ``path_or_buffer`` is in xlsb format,
+ ``pyxlsb`` will be used.
+
+ .. versionadded:: 1.3.0
- Otherwise if `openpyxl `_ is installed,
then ``openpyxl`` will be used.
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
@@ -707,7 +711,7 @@ def __new__(cls, path, engine=None, **kwargs):
try:
engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
if engine == "auto":
- engine = get_default_writer(ext)
+ engine = get_default_engine(ext, mode="writer")
except KeyError as err:
raise ValueError(f"No engine for filetype: '{ext}'") from err
@@ -1009,6 +1013,10 @@ class ExcelFile:
then `odf `_ will be used.
- Otherwise if ``path_or_buffer`` is an xls format,
``xlrd`` will be used.
+ - Otherwise if ``path_or_buffer`` is in xlsb format,
+ `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.
+
+ .. versionadded:: 1.3.0
- Otherwise if `openpyxl `_ is installed,
then ``openpyxl`` will be used.
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
@@ -1065,21 +1073,10 @@ def __init__(
)
if engine is None:
- if ext == "ods":
- engine = "odf"
- elif ext == "xls":
- engine = "xlrd"
- else:
- # GH 35029 - Prefer openpyxl except for xls files
- if (
- import_optional_dependency(
- "openpyxl", raise_on_missing=False, on_version="ignore"
- )
- is not None
- ):
- engine = "openpyxl"
- else:
- engine = "xlrd"
+ # ext will always be valid, otherwise inspect_excel_format would raise
+ engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
+ if engine == "auto":
+ engine = get_default_engine(ext, mode="reader")
if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
if xlrd_version >= "2":
@@ -1107,7 +1104,6 @@ def __init__(
FutureWarning,
stacklevel=stacklevel,
)
- assert engine in self._engines, f"Engine {engine} not recognized"
self.engine = engine
self.storage_options = storage_options
diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py
index 47105916a9c78..b5d0d1347f119 100644
--- a/pandas/io/excel/_util.py
+++ b/pandas/io/excel/_util.py
@@ -23,32 +23,61 @@ def register_writer(klass):
_writers[engine_name] = klass
-def get_default_writer(ext):
+def get_default_engine(ext, mode="reader"):
"""
- Return the default writer for the given extension.
+ Return the default reader/writer for the given extension.
Parameters
----------
ext : str
The excel file extension for which to get the default engine.
+ mode : str {'reader', 'writer'}
+ Whether to get the default engine for reading or writing.
+ Either 'reader' or 'writer'
Returns
-------
str
The default engine for the extension.
"""
+ _default_readers = {
+ "xlsx": "openpyxl",
+ "xlsm": "openpyxl",
+ "xlsb": "pyxlsb",
+ "xls": "xlrd",
+ "ods": "odf",
+ }
_default_writers = {
"xlsx": "openpyxl",
"xlsm": "openpyxl",
+ "xlsb": "pyxlsb",
"xls": "xlwt",
"ods": "odf",
}
- xlsxwriter = import_optional_dependency(
- "xlsxwriter", raise_on_missing=False, on_version="warn"
- )
- if xlsxwriter:
- _default_writers["xlsx"] = "xlsxwriter"
- return _default_writers[ext]
+ assert mode in ["reader", "writer"]
+ if mode == "writer":
+ # Prefer xlsxwriter over openpyxl if installed
+ xlsxwriter = import_optional_dependency(
+ "xlsxwriter", raise_on_missing=False, on_version="warn"
+ )
+ if xlsxwriter:
+ _default_writers["xlsx"] = "xlsxwriter"
+ return _default_writers[ext]
+ else:
+ if (
+ import_optional_dependency(
+ "openpyxl", raise_on_missing=False, on_version="ignore"
+ )
+ is None
+ and import_optional_dependency(
+ "xlrd", raise_on_missing=False, on_version="ignore"
+ )
+ is not None
+ ):
+ # if no openpyxl but xlrd installed, return xlrd
+ # the version is handled elsewhere
+ _default_readers["xlsx"] = "xlrd"
+ return _default_readers[ext]
def get_writer(engine_name):
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 110b79adb5646..f472e24ac9498 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1234,8 +1234,6 @@ def test_excel_read_binary(self, engine, read_ext):
def test_excel_read_binary_via_read_excel(self, read_ext, engine):
# GH 38424
- if read_ext == ".xlsb" and engine == "pyxlsb":
- pytest.xfail("GH 38667 - should default to pyxlsb but doesn't")
with open("test1" + read_ext, "rb") as f:
result = pd.read_excel(f)
expected = pd.read_excel("test1" + read_ext, engine=engine)
@@ -1282,3 +1280,9 @@ def test_read_datetime_multiindex(self, engine, read_ext):
expected = DataFrame([], columns=expected_column_index)
tm.assert_frame_equal(expected, actual)
+
+ def test_engine_invalid_option(self, read_ext):
+ # read_ext includes the '.' hence the weird formatting
+ with pytest.raises(ValueError, match="Value must be one of *"):
+ with pd.option_context(f"io.excel{read_ext}.reader", "abc"):
+ pass