Skip to content

Commit 7eeec0d

Browse files
rmhowe425mroeschke
andauthored
ENH: Adding engine_kwargs to Excel engines for issue #40274 (#52214)
* Fixing merge conflicts * Fixing merge conflict * Fixing documentation issues * standardized usage of engine_kwargs, fixed unit tests & doc strings * Fixing documentation issues * Fixing implementation logic and unit tests * Fixing implementation logic * Fixing formatting issues * Fixing error for test Docstring validation, typing, and other manual pre-commit hooks * Fixing documentation error * Standardizing engine_kwarg types * Fixing minor issues with unit tests and documentation * Fixing documentation issue * Fixing a formatting / documentation error * Fixing documentation errors * Fixing documentation errors * Fixing documentation errors * Fixing documentation errors * Fixing documentation errors * Adding an extra blank line to troubleshoot documentation error * Adding an extra blank line to troubleshoot documentation error * Fixing documentation issues * Fixing formatting errors * Fixing formatting errors * Fixing formatting errors * Fixing logic and formatting issues in unit tests * Fixing issues with merge conflict * Fixing formatting issue * Update pandas/io/excel/_base.py --------- Co-authored-by: Matthew Roeschke <[email protected]>
1 parent cfbbeb6 commit 7eeec0d

File tree

8 files changed

+132
-20
lines changed

8 files changed

+132
-20
lines changed

doc/source/user_guide/io.rst

+12
Original file line numberDiff line numberDiff line change
@@ -3449,6 +3449,18 @@ Reading Excel files
34493449
In the most basic use-case, ``read_excel`` takes a path to an Excel
34503450
file, and the ``sheet_name`` indicating which sheet to parse.
34513451

3452+
When using the ``engine_kwargs`` parameter, pandas passes these arguments to the
3453+
engine. It is therefore important to know which function pandas uses
3454+
internally for each engine.
3455+
3456+
* For the ``openpyxl`` engine, pandas uses :func:`openpyxl.load_workbook` to read ``.xlsx`` and ``.xlsm`` files.
3457+
3458+
* For the ``xlrd`` engine, pandas uses :func:`xlrd.open_workbook` to read ``.xls`` files.
3459+
3460+
* For the ``pyxlsb`` engine, pandas uses :func:`pyxlsb.open_workbook` to read ``.xlsb`` files.
3461+
3462+
* For the ``odf`` engine, pandas uses :func:`odf.opendocument.load` to read ``.ods`` files.
3463+
34523464
.. code-block:: python
34533465
34543466
# Returns a DataFrame

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ Other enhancements
8787
- :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`)
8888
- :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
8989
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
90+
- Added ``engine_kwargs`` parameter to :func:`read_excel` (:issue:`52214`)
9091
-
9192

9293
.. ---------------------------------------------------------------------------

pandas/io/excel/_base.py

+38-6
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,9 @@
289289
290290
.. versionadded:: 2.0
291291
292+
engine_kwargs : dict, optional
293+
Arbitrary keyword arguments passed to the Excel engine.
294+
292295
Returns
293296
-------
294297
DataFrame or dict of DataFrames
@@ -302,6 +305,11 @@
302305
read_csv : Read a comma-separated values (csv) file into DataFrame.
303306
read_fwf : Read a table of fixed-width formatted lines into DataFrame.
304307
308+
Notes
309+
-----
310+
For specific information on the methods used for each Excel engine, refer to the pandas
311+
:ref:`user guide <io.excel_reader>`.
312+
305313
Examples
306314
--------
307315
The file can be read using the file name as string or an open file object:
@@ -472,13 +480,21 @@ def read_excel(
472480
skipfooter: int = 0,
473481
storage_options: StorageOptions = None,
474482
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
483+
engine_kwargs: dict | None = None,
475484
) -> DataFrame | dict[IntStrT, DataFrame]:
476485
check_dtype_backend(dtype_backend)
477-
478486
should_close = False
487+
if engine_kwargs is None:
488+
engine_kwargs = {}
489+
479490
if not isinstance(io, ExcelFile):
480491
should_close = True
481-
io = ExcelFile(io, storage_options=storage_options, engine=engine)
492+
io = ExcelFile(
493+
io,
494+
storage_options=storage_options,
495+
engine=engine,
496+
engine_kwargs=engine_kwargs,
497+
)
482498
elif engine and engine != io.engine:
483499
raise ValueError(
484500
"Engine should not be specified when passing "
@@ -520,8 +536,14 @@ def read_excel(
520536

521537
class BaseExcelReader(metaclass=abc.ABCMeta):
522538
def __init__(
523-
self, filepath_or_buffer, storage_options: StorageOptions = None
539+
self,
540+
filepath_or_buffer,
541+
storage_options: StorageOptions = None,
542+
engine_kwargs: dict | None = None,
524543
) -> None:
544+
if engine_kwargs is None:
545+
engine_kwargs = {}
546+
525547
# First argument can also be bytes, so create a buffer
526548
if isinstance(filepath_or_buffer, bytes):
527549
filepath_or_buffer = BytesIO(filepath_or_buffer)
@@ -540,7 +562,7 @@ def __init__(
540562
# N.B. xlrd.Book has a read attribute too
541563
self.handles.handle.seek(0)
542564
try:
543-
self.book = self.load_workbook(self.handles.handle)
565+
self.book = self.load_workbook(self.handles.handle, engine_kwargs)
544566
except Exception:
545567
self.close()
546568
raise
@@ -555,7 +577,7 @@ def _workbook_class(self):
555577
pass
556578

557579
@abc.abstractmethod
558-
def load_workbook(self, filepath_or_buffer):
580+
def load_workbook(self, filepath_or_buffer, engine_kwargs):
559581
pass
560582

561583
def close(self) -> None:
@@ -1450,6 +1472,8 @@ class ExcelFile:
14501472
14511473
Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
14521474
This is not supported, switch to using ``openpyxl`` instead.
1475+
engine_kwargs : dict, optional
1476+
Arbitrary keyword arguments passed to the Excel engine.
14531477
"""
14541478

14551479
from pandas.io.excel._odfreader import ODFReader
@@ -1469,7 +1493,11 @@ def __init__(
14691493
path_or_buffer,
14701494
engine: str | None = None,
14711495
storage_options: StorageOptions = None,
1496+
engine_kwargs: dict | None = None,
14721497
) -> None:
1498+
if engine_kwargs is None:
1499+
engine_kwargs = {}
1500+
14731501
if engine is not None and engine not in self._engines:
14741502
raise ValueError(f"Unknown engine: {engine}")
14751503

@@ -1513,7 +1541,11 @@ def __init__(
15131541
self.engine = engine
15141542
self.storage_options = storage_options
15151543

1516-
self._reader = self._engines[engine](self._io, storage_options=storage_options)
1544+
self._reader = self._engines[engine](
1545+
self._io,
1546+
storage_options=storage_options,
1547+
engine_kwargs=engine_kwargs,
1548+
)
15171549

15181550
def __fspath__(self):
15191551
return self._io

pandas/io/excel/_odfreader.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def __init__(
3131
self,
3232
filepath_or_buffer: FilePath | ReadBuffer[bytes],
3333
storage_options: StorageOptions = None,
34+
engine_kwargs: dict | None = None,
3435
) -> None:
3536
"""
3637
Read tables out of OpenDocument formatted files.
@@ -40,20 +41,28 @@ def __init__(
4041
filepath_or_buffer : str, path to be parsed or
4142
an open readable stream.
4243
{storage_options}
44+
engine_kwargs : dict, optional
45+
Arbitrary keyword arguments passed to the Excel engine.
4346
"""
4447
import_optional_dependency("odf")
45-
super().__init__(filepath_or_buffer, storage_options=storage_options)
48+
super().__init__(
49+
filepath_or_buffer,
50+
storage_options=storage_options,
51+
engine_kwargs=engine_kwargs,
52+
)
4653

4754
@property
4855
def _workbook_class(self):
4956
from odf.opendocument import OpenDocument
5057

5158
return OpenDocument
5259

53-
def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
60+
def load_workbook(
61+
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
62+
):
5463
from odf.opendocument import load
5564

56-
return load(filepath_or_buffer)
65+
return load(filepath_or_buffer, **engine_kwargs)
5766

5867
@property
5968
def empty_value(self) -> str:

pandas/io/excel/_openpyxl.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,7 @@ def __init__(
536536
self,
537537
filepath_or_buffer: FilePath | ReadBuffer[bytes],
538538
storage_options: StorageOptions = None,
539+
engine_kwargs: dict | None = None,
539540
) -> None:
540541
"""
541542
Reader using openpyxl engine.
@@ -545,21 +546,33 @@ def __init__(
545546
filepath_or_buffer : str, path object or Workbook
546547
Object to be parsed.
547548
{storage_options}
549+
engine_kwargs : dict, optional
550+
Arbitrary keyword arguments passed to the Excel engine.
548551
"""
549552
import_optional_dependency("openpyxl")
550-
super().__init__(filepath_or_buffer, storage_options=storage_options)
553+
super().__init__(
554+
filepath_or_buffer,
555+
storage_options=storage_options,
556+
engine_kwargs=engine_kwargs,
557+
)
551558

552559
@property
553560
def _workbook_class(self):
554561
from openpyxl import Workbook
555562

556563
return Workbook
557564

558-
def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
565+
def load_workbook(
566+
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
567+
):
559568
from openpyxl import load_workbook
560569

561570
return load_workbook(
562-
filepath_or_buffer, read_only=True, data_only=True, keep_links=False
571+
filepath_or_buffer,
572+
read_only=True,
573+
data_only=True,
574+
keep_links=False,
575+
**engine_kwargs,
563576
)
564577

565578
@property

pandas/io/excel/_pyxlsb.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def __init__(
2525
self,
2626
filepath_or_buffer: FilePath | ReadBuffer[bytes],
2727
storage_options: StorageOptions = None,
28+
engine_kwargs: dict | None = None,
2829
) -> None:
2930
"""
3031
Reader using pyxlsb engine.
@@ -34,26 +35,34 @@ def __init__(
3435
filepath_or_buffer : str, path object, or Workbook
3536
Object to be parsed.
3637
{storage_options}
38+
engine_kwargs : dict, optional
39+
Arbitrary keyword arguments passed to the Excel engine.
3740
"""
3841
import_optional_dependency("pyxlsb")
3942
# This will call load_workbook on the filepath or buffer
4043
# And set the result to the book-attribute
41-
super().__init__(filepath_or_buffer, storage_options=storage_options)
44+
super().__init__(
45+
filepath_or_buffer,
46+
storage_options=storage_options,
47+
engine_kwargs=engine_kwargs,
48+
)
4249

4350
@property
4451
def _workbook_class(self):
4552
from pyxlsb import Workbook
4653

4754
return Workbook
4855

49-
def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
56+
def load_workbook(
57+
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
58+
):
5059
from pyxlsb import open_workbook
5160

5261
# TODO: hack in buffer capability
5362
# This might need some modifications to the Pyxlsb library
5463
# Actual work for opening it is in xlsbpackage.py, line 20-ish
5564

56-
return open_workbook(filepath_or_buffer)
65+
return open_workbook(filepath_or_buffer, **engine_kwargs)
5766

5867
@property
5968
def sheet_names(self) -> list[str]:

pandas/io/excel/_xlrd.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222
class XlrdReader(BaseExcelReader):
2323
@doc(storage_options=_shared_docs["storage_options"])
2424
def __init__(
25-
self, filepath_or_buffer, storage_options: StorageOptions = None
25+
self,
26+
filepath_or_buffer,
27+
storage_options: StorageOptions = None,
28+
engine_kwargs: dict | None = None,
2629
) -> None:
2730
"""
2831
Reader using xlrd engine.
@@ -32,25 +35,31 @@ def __init__(
3235
filepath_or_buffer : str, path object or Workbook
3336
Object to be parsed.
3437
{storage_options}
38+
engine_kwargs : dict, optional
39+
Arbitrary keyword arguments passed to the Excel engine.
3540
"""
3641
err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
3742
import_optional_dependency("xlrd", extra=err_msg)
38-
super().__init__(filepath_or_buffer, storage_options=storage_options)
43+
super().__init__(
44+
filepath_or_buffer,
45+
storage_options=storage_options,
46+
engine_kwargs=engine_kwargs,
47+
)
3948

4049
@property
4150
def _workbook_class(self):
4251
from xlrd import Book
4352

4453
return Book
4554

46-
def load_workbook(self, filepath_or_buffer):
55+
def load_workbook(self, filepath_or_buffer, engine_kwargs):
4756
from xlrd import open_workbook
4857

4958
if hasattr(filepath_or_buffer, "read"):
5059
data = filepath_or_buffer.read()
51-
return open_workbook(file_contents=data)
60+
return open_workbook(file_contents=data, **engine_kwargs)
5261
else:
53-
return open_workbook(filepath_or_buffer)
62+
return open_workbook(filepath_or_buffer, **engine_kwargs)
5463

5564
@property
5665
def sheet_names(self):

pandas/tests/io/excel/test_readers.py

+27
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import os
77
from pathlib import Path
88
import platform
9+
import re
910
from urllib.error import URLError
1011
from zipfile import BadZipFile
1112

@@ -148,6 +149,32 @@ def parser(self, *args, **kwargs):
148149
expected = expected_defaults[read_ext[1:]]
149150
assert result == expected
150151

152+
def test_engine_kwargs(self, read_ext, engine):
153+
# GH#52214
154+
expected_defaults = {
155+
"xlsx": {"foo": "abcd"},
156+
"xlsm": {"foo": 123},
157+
"xlsb": {"foo": "True"},
158+
"xls": {"foo": True},
159+
"ods": {"foo": "abcd"},
160+
}
161+
162+
if read_ext[1:] == "xls" or read_ext[1:] == "xlsb":
163+
msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'")
164+
elif read_ext[1:] == "ods":
165+
msg = re.escape(r"load() got an unexpected keyword argument 'foo'")
166+
else:
167+
msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'")
168+
169+
if engine is not None:
170+
with pytest.raises(TypeError, match=msg):
171+
pd.read_excel(
172+
"test1" + read_ext,
173+
sheet_name="Sheet1",
174+
index_col=0,
175+
engine_kwargs=expected_defaults[read_ext[1:]],
176+
)
177+
151178
def test_usecols_int(self, read_ext):
152179
# usecols as int
153180
msg = "Passing an integer for `usecols`"

0 commit comments

Comments
 (0)