diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1002eb9ee8568..101932a23ca6a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3449,6 +3449,18 @@ Reading Excel files In the most basic use-case, ``read_excel`` takes a path to an Excel file, and the ``sheet_name`` indicating which sheet to parse. +When using the ``engine_kwargs`` parameter, pandas will pass these arguments to the +engine. For this, it is important to know which function pandas is +using internally. + +* For the engine openpyxl, pandas uses :func:`openpyxl.load_workbook` to read in ``.xlsx`` and ``.xlsm`` files. + +* For the engine xlrd, pandas uses :func:`xlrd.open_workbook` to read in ``.xls`` files. + +* For the engine pyxlsb, pandas uses :func:`pyxlsb.open_workbook` to read in ``.xlsb`` files. + +* For the engine odf, pandas uses :func:`odf.opendocument.load` to read in ``.ods`` files. + .. code-block:: python # Returns a DataFrame diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index afe361da1114d..245cc111f3794 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -87,6 +87,7 @@ Other enhancements - :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`) - :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`). - Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`) +- Added ``engine_kwargs`` parameter to :func:`read_excel` (:issue:`52214`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8c3bbb7798f68..92750bdd0f272 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -289,6 +289,9 @@ .. 
versionadded:: 2.0 +engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. + Returns ------- DataFrame or dict of DataFrames @@ -302,6 +305,11 @@ read_csv : Read a comma-separated values (csv) file into DataFrame. read_fwf : Read a table of fixed-width formatted lines into DataFrame. +Notes +----- +For specific information on the methods used for each Excel engine, refer to the pandas +:ref:`user guide <io.excel_reader>` + Examples -------- The file can be read using the file name as string or an open file object: @@ -472,13 +480,21 @@ def read_excel( skipfooter: int = 0, storage_options: StorageOptions = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + engine_kwargs: dict | None = None, ) -> DataFrame | dict[IntStrT, DataFrame]: check_dtype_backend(dtype_backend) - should_close = False + if engine_kwargs is None: + engine_kwargs = {} + if not isinstance(io, ExcelFile): should_close = True - io = ExcelFile(io, storage_options=storage_options, engine=engine) + io = ExcelFile( + io, + storage_options=storage_options, + engine=engine, + engine_kwargs=engine_kwargs, + ) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " @@ -520,8 +536,14 @@ def read_excel( class BaseExcelReader(metaclass=abc.ABCMeta): def __init__( - self, filepath_or_buffer, storage_options: StorageOptions = None + self, + filepath_or_buffer, + storage_options: StorageOptions = None, + engine_kwargs: dict | None = None, ) -> None: + if engine_kwargs is None: + engine_kwargs = {} + # First argument can also be bytes, so create a buffer if isinstance(filepath_or_buffer, bytes): filepath_or_buffer = BytesIO(filepath_or_buffer) @@ -540,7 +562,7 @@ def __init__( # N.B. 
xlrd.Book has a read attribute too self.handles.handle.seek(0) try: - self.book = self.load_workbook(self.handles.handle) + self.book = self.load_workbook(self.handles.handle, engine_kwargs) except Exception: self.close() raise @@ -555,7 +577,7 @@ def _workbook_class(self): pass @abc.abstractmethod - def load_workbook(self, filepath_or_buffer): + def load_workbook(self, filepath_or_buffer, engine_kwargs): pass def close(self) -> None: @@ -1450,6 +1472,8 @@ class ExcelFile: Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. This is not supported, switch to using ``openpyxl`` instead. + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. """ from pandas.io.excel._odfreader import ODFReader @@ -1469,7 +1493,11 @@ def __init__( path_or_buffer, engine: str | None = None, storage_options: StorageOptions = None, + engine_kwargs: dict | None = None, ) -> None: + if engine_kwargs is None: + engine_kwargs = {} + if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") @@ -1513,7 +1541,11 @@ def __init__( self.engine = engine self.storage_options = storage_options - self._reader = self._engines[engine](self._io, storage_options=storage_options) + self._reader = self._engines[engine]( + self._io, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) def __fspath__(self): return self._io diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index c3d7cb5df717f..c46424d5b26da 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -31,6 +31,7 @@ def __init__( self, filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, + engine_kwargs: dict | None = None, ) -> None: """ Read tables out of OpenDocument formatted files. @@ -40,9 +41,15 @@ def __init__( filepath_or_buffer : str, path to be parsed or an open readable stream. 
{storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. """ import_optional_dependency("odf") - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) @property def _workbook_class(self): @@ -50,10 +57,12 @@ def _workbook_class(self): return OpenDocument - def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): + def load_workbook( + self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs + ): from odf.opendocument import load - return load(filepath_or_buffer) + return load(filepath_or_buffer, **engine_kwargs) @property def empty_value(self) -> str: diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index e751c919ee8dc..195d3a3a8b263 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -536,6 +536,7 @@ def __init__( self, filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, + engine_kwargs: dict | None = None, ) -> None: """ Reader using openpyxl engine. @@ -545,9 +546,15 @@ def __init__( filepath_or_buffer : str, path object or Workbook Object to be parsed. {storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. 
""" import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) @property def _workbook_class(self): @@ -555,11 +562,17 @@ def _workbook_class(self): return Workbook - def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): + def load_workbook( + self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs + ): from openpyxl import load_workbook return load_workbook( - filepath_or_buffer, read_only=True, data_only=True, keep_links=False + filepath_or_buffer, + read_only=True, + data_only=True, + keep_links=False, + **engine_kwargs, ) @property diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index bfe21082cc4d0..a1234b0e74c3e 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -25,6 +25,7 @@ def __init__( self, filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions = None, + engine_kwargs: dict | None = None, ) -> None: """ Reader using pyxlsb engine. @@ -34,11 +35,17 @@ def __init__( filepath_or_buffer : str, path object, or Workbook Object to be parsed. {storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. 
""" import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) @property def _workbook_class(self): @@ -46,14 +53,16 @@ def _workbook_class(self): return Workbook - def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]): + def load_workbook( + self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs + ): from pyxlsb import open_workbook # TODO: hack in buffer capability # This might need some modifications to the Pyxlsb library # Actual work for opening it is in xlsbpackage.py, line 20-ish - return open_workbook(filepath_or_buffer) + return open_workbook(filepath_or_buffer, **engine_kwargs) @property def sheet_names(self) -> list[str]: diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 702d00e7fdea7..d131567cf70f7 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -22,7 +22,10 @@ class XlrdReader(BaseExcelReader): @doc(storage_options=_shared_docs["storage_options"]) def __init__( - self, filepath_or_buffer, storage_options: StorageOptions = None + self, + filepath_or_buffer, + storage_options: StorageOptions = None, + engine_kwargs: dict | None = None, ) -> None: """ Reader using xlrd engine. @@ -32,10 +35,16 @@ def __init__( filepath_or_buffer : str, path object or Workbook Object to be parsed. {storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. 
""" err_msg = "Install xlrd >= 2.0.1 for xls Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) @property def _workbook_class(self): @@ -43,14 +52,14 @@ def _workbook_class(self): return Book - def load_workbook(self, filepath_or_buffer): + def load_workbook(self, filepath_or_buffer, engine_kwargs): from xlrd import open_workbook if hasattr(filepath_or_buffer, "read"): data = filepath_or_buffer.read() - return open_workbook(file_contents=data) + return open_workbook(file_contents=data, **engine_kwargs) else: - return open_workbook(filepath_or_buffer) + return open_workbook(filepath_or_buffer, **engine_kwargs) @property def sheet_names(self): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c22051912d293..05c86be850b32 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -6,6 +6,7 @@ import os from pathlib import Path import platform +import re from urllib.error import URLError from zipfile import BadZipFile @@ -148,6 +149,32 @@ def parser(self, *args, **kwargs): expected = expected_defaults[read_ext[1:]] assert result == expected + def test_engine_kwargs(self, read_ext, engine): + # GH#52214 + expected_defaults = { + "xlsx": {"foo": "abcd"}, + "xlsm": {"foo": 123}, + "xlsb": {"foo": "True"}, + "xls": {"foo": True}, + "ods": {"foo": "abcd"}, + } + + if read_ext[1:] == "xls" or read_ext[1:] == "xlsb": + msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'") + elif read_ext[1:] == "ods": + msg = re.escape(r"load() got an unexpected keyword argument 'foo'") + else: + msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'") + + if engine is not None: + with pytest.raises(TypeError, match=msg): + pd.read_excel( + "test1" + read_ext, + 
sheet_name="Sheet1", + index_col=0, + engine_kwargs=expected_defaults[read_ext[1:]], + ) + def test_usecols_int(self, read_ext): # usecols as int msg = "Passing an integer for `usecols`"