diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85da250648c28..d89a868479d06 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -51,6 +51,7 @@ Other enhancements - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) +- :func:`pandas.read_excel` can now auto-detect ``.xlsb`` files (:issue:`35416`) .. --------------------------------------------------------------------------- @@ -269,7 +270,7 @@ I/O for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`). - Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) - Bug in :func:`read_excel` forward filling :class:`MultiIndex` names with multiple header and index columns specified (:issue:`34673`) - +- :func:`pandas.read_excel` now respects :func:`pandas.set_option` (:issue:`34252`) Period ^^^^^^ diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 7d9664bd9f965..fba82ae499e90 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -524,7 +524,7 @@ def use_inf_as_na_cb(key): "reader", "auto", reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)), - validator=str, + validator=is_one_of_factory(_xls_options + ["auto"]), ) with cf.config_prefix("io.excel.xlsm"): @@ -532,7 +532,7 @@ def use_inf_as_na_cb(key): "reader", "auto", reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), - validator=str, + validator=is_one_of_factory(_xlsm_options + ["auto"]), ) @@ -541,7 +541,7 @@ def use_inf_as_na_cb(key): "reader", "auto", reader_engine_doc.format(ext="xlsx", others=", 
".join(_xlsx_options)), - validator=str, + validator=is_one_of_factory(_xlsx_options + ["auto"]), ) @@ -550,7 +550,7 @@ def use_inf_as_na_cb(key): "reader", "auto", reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)), - validator=str, + validator=is_one_of_factory(_ods_options + ["auto"]), ) with cf.config_prefix("io.excel.xlsb"): @@ -558,7 +558,7 @@ def use_inf_as_na_cb(key): "reader", "auto", reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)), - validator=str, + validator=is_one_of_factory(_xlsb_options + ["auto"]), ) # Set up the io.excel specific writer configuration. diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b23b5fe5b34a8..495d51c2cebce 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -25,7 +25,7 @@ from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg from pandas.io.excel._util import ( fill_mi_header, - get_default_writer, + get_default_engine, get_writer, maybe_convert_usecols, pop_header_name, @@ -123,6 +123,10 @@ then `odf `_ will be used. - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, + ``pyxlsb`` will be used. + + .. versionadded:: 1.3.0 - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. @@ -707,7 +711,7 @@ def __new__(cls, path, engine=None, **kwargs): try: engine = config.get_option(f"io.excel.{ext}.writer", silent=True) if engine == "auto": - engine = get_default_writer(ext) + engine = get_default_engine(ext, mode="writer") except KeyError as err: raise ValueError(f"No engine for filetype: '{ext}'") from err @@ -1009,6 +1013,10 @@ class ExcelFile: then `odf `_ will be used. - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, + `pyxlsb `_ will be used. + + .. 
versionadded:: 1.3.0 - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. @@ -1065,21 +1073,10 @@ def __init__( ) if engine is None: - if ext == "ods": - engine = "odf" - elif ext == "xls": - engine = "xlrd" - else: - # GH 35029 - Prefer openpyxl except for xls files - if ( - import_optional_dependency( - "openpyxl", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - engine = "openpyxl" - else: - engine = "xlrd" + # ext will always be valid, otherwise inspect_excel_format would raise + engine = config.get_option(f"io.excel.{ext}.reader", silent=True) + if engine == "auto": + engine = get_default_engine(ext, mode="reader") if engine == "xlrd" and ext != "xls" and xlrd_version is not None: if xlrd_version >= "2": @@ -1107,7 +1104,6 @@ def __init__( FutureWarning, stacklevel=stacklevel, ) - assert engine in self._engines, f"Engine {engine} not recognized" self.engine = engine self.storage_options = storage_options diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 47105916a9c78..b5d0d1347f119 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -23,32 +23,61 @@ def register_writer(klass): _writers[engine_name] = klass -def get_default_writer(ext): +def get_default_engine(ext, mode="reader"): """ - Return the default writer for the given extension. + Return the default reader/writer for the given extension. Parameters ---------- ext : str The excel file extension for which to get the default engine. + mode : str {'reader', 'writer'} + Whether to get the default engine for reading or writing. + Either 'reader' or 'writer' Returns ------- str The default engine for the extension. 
""" + _default_readers = { + "xlsx": "openpyxl", + "xlsm": "openpyxl", + "xlsb": "pyxlsb", + "xls": "xlrd", + "ods": "odf", + } _default_writers = { "xlsx": "openpyxl", "xlsm": "openpyxl", + "xlsb": "pyxlsb", "xls": "xlwt", "ods": "odf", } - xlsxwriter = import_optional_dependency( - "xlsxwriter", raise_on_missing=False, on_version="warn" - ) - if xlsxwriter: - _default_writers["xlsx"] = "xlsxwriter" - return _default_writers[ext] + assert mode in ["reader", "writer"] + if mode == "writer": + # Prefer xlsxwriter over openpyxl if installed + xlsxwriter = import_optional_dependency( + "xlsxwriter", raise_on_missing=False, on_version="warn" + ) + if xlsxwriter: + _default_writers["xlsx"] = "xlsxwriter" + return _default_writers[ext] + else: + if ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is None + and import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is not None + ): + # if no openpyxl but xlrd installed, return xlrd + # the version is handled elsewhere + _default_readers["xlsx"] = "xlrd" + return _default_readers[ext] def get_writer(engine_name): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 110b79adb5646..f472e24ac9498 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1234,8 +1234,6 @@ def test_excel_read_binary(self, engine, read_ext): def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 - if read_ext == ".xlsb" and engine == "pyxlsb": - pytest.xfail("GH 38667 - should default to pyxlsb but doesn't") with open("test1" + read_ext, "rb") as f: result = pd.read_excel(f) expected = pd.read_excel("test1" + read_ext, engine=engine) @@ -1282,3 +1280,9 @@ def test_read_datetime_multiindex(self, engine, read_ext): expected = DataFrame([], columns=expected_column_index) tm.assert_frame_equal(expected, actual) + + def test_engine_invalid_option(self, read_ext): + 
# read_ext includes the '.' hence the weird formatting + with pytest.raises(ValueError, match="Value must be one of *"): + with pd.option_context(f"io.excel{read_ext}.reader", "abc"): + pass