Skip to content

Commit ea08ae2

Browse files
Debian Science Team, rebecca-palmer
Debian Science Team
authored and committed
Default to openpyxl not xlrd for read_excel
xlrd 1.2 fails if defusedxml (needed for odf) is installed.

Bug: pandas-dev/pandas#35029
Bug-Debian: https://bugs.debian.org/976620
Origin: upstream b3a3932af6aafaa2fd41f17e9b7995643e5f92eb
Author: Robert de Vries, Rebecca N. Palmer <[email protected]>
Forwarded: not-needed
Gbp-Pq: Name xlrd_976620.patch
1 parent a400950 commit ea08ae2

File tree

5 files changed

+159
-8
lines changed

5 files changed

+159
-8
lines changed

doc/source/whatsnew/v1.1.5.rst

+10
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,16 @@ including other versions of pandas.
88

99
{{ header }}
1010

11+
.. warning::
12+
13+
Previously, the default argument ``engine=None`` to ``pd.read_excel``
14+
would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in
15+
many cases. The engine ``xlrd`` is no longer maintained, and may not work if ``defusedxml``
16+
is installed. Hence, from version 1.1.5 in Debian and 1.2.0 upstream,
17+
if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
18+
many of these cases will now default to using the ``openpyxl`` engine. See the
19+
:func:`read_excel` documentation for more details.
20+
1121
.. ---------------------------------------------------------------------------
1222
1323
.. _whatsnew_115.regressions:

pandas/io/excel/_base.py

+91-4
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,16 @@
11
import abc
22
import datetime
3+
import inspect
34
from io import BufferedIOBase, BytesIO, RawIOBase
45
import os
56
from textwrap import fill
67
from typing import Union
8+
import warnings
79

810
from pandas._config import config
911

1012
from pandas._libs.parsers import STR_NA_VALUES
13+
from pandas.compat._optional import import_optional_dependency
1114
from pandas.errors import EmptyDataError
1215
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
1316

@@ -104,12 +107,32 @@
104107
of dtype conversion.
105108
engine : str, default None
106109
If io is not a buffer or path, this must be set to identify io.
107-
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd".
110+
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
108111
Engine compatibility :
112+
109113
- "xlrd" supports most old/new Excel file formats.
110114
- "openpyxl" supports newer Excel file formats.
111115
- "odf" supports OpenDocument file formats (.odf, .ods, .odt).
112116
- "pyxlsb" supports Binary Excel files.
117+
118+
.. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream
119+
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
120+
is no longer maintained, and is not supported with
121+
python >= 3.9. When ``engine=None``, the following logic will be
122+
used to determine the engine.
123+
124+
- If ``io`` is an OpenDocument format (.odf, .ods, .odt),
125+
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
126+
- Otherwise if ``io`` is a bytes stream, the file has the
127+
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
128+
be used.
129+
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
130+
then ``openpyxl`` will be used.
131+
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be issued.
132+
133+
Specifying ``engine="xlrd"`` will continue to be allowed for the
134+
indefinite future, but may require uninstalling (python3-)defusedxml.
135+
113136
converters : dict, default None
114137
Dict of functions for converting values in certain columns. Keys can
115138
either be integers or column labels, values are functions that take one
@@ -823,13 +846,32 @@ class ExcelFile:
823846
.xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
824847
engine : str, default None
825848
If io is not a buffer or path, this must be set to identify io.
826-
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
827-
default ``xlrd``.
849+
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``.
828850
Engine compatibility :
851+
829852
- ``xlrd`` supports most old/new Excel file formats.
830853
- ``openpyxl`` supports newer Excel file formats.
831854
- ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
832855
- ``pyxlsb`` supports Binary Excel files.
856+
857+
.. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream
858+
859+
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
860+
is no longer maintained, and is not supported with
861+
python >= 3.9. When ``engine=None``, the following logic will be
862+
used to determine the engine.
863+
864+
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
865+
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
866+
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
867+
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
868+
will be used.
869+
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
870+
then ``openpyxl`` will be used.
871+
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be issued.
872+
873+
Specifying ``engine="xlrd"`` will continue to be allowed for the
874+
indefinite future, but may require uninstalling (python3-)defusedxml.
833875
"""
834876

835877
from pandas.io.excel._odfreader import _ODFReader
@@ -846,14 +888,59 @@ class ExcelFile:
846888

847889
def __init__(self, path_or_buffer, engine=None):
848890
if engine is None:
849-
engine = "xlrd"
891+
# Determine ext and use odf for ods stream/file
850892
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
893+
ext = None
851894
if _is_ods_stream(path_or_buffer):
852895
engine = "odf"
853896
else:
854897
ext = os.path.splitext(str(path_or_buffer))[-1]
855898
if ext == ".ods":
856899
engine = "odf"
900+
901+
if (
902+
import_optional_dependency(
903+
"xlrd", raise_on_missing=False, on_version="ignore"
904+
)
905+
is not None
906+
):
907+
from xlrd import Book
908+
909+
if isinstance(path_or_buffer, Book):
910+
engine = "xlrd"
911+
912+
# GH 35029 - Prefer openpyxl except for xls files
913+
if engine is None:
914+
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
915+
engine = "xlrd"
916+
elif (
917+
import_optional_dependency(
918+
"openpyxl", raise_on_missing=False, on_version="ignore"
919+
)
920+
is not None
921+
):
922+
engine = "openpyxl"
923+
else:
924+
caller = inspect.stack()[1]
925+
if (
926+
caller.filename.endswith("pandas/io/excel/_base.py")
927+
and caller.function == "read_excel"
928+
):
929+
stacklevel = 4
930+
else:
931+
stacklevel = 2
932+
warnings.warn(
933+
"The xlrd engine is no longer maintained and is not "
934+
"supported when using pandas with python >= 3.9. However, "
935+
"the engine xlrd will continue to be allowed for the "
936+
"indefinite future. The "
937+
"openpyxl engine will be used if it is installed and the "
938+
"engine argument is not specified. Either install openpyxl "
939+
"or specify engine='xlrd' to silence this warning.",
940+
FutureWarning,
941+
stacklevel=stacklevel,
942+
)
943+
engine = "xlrd"
857944
if engine not in self._engines:
858945
raise ValueError(f"Unknown engine: {engine}")
859946

pandas/tests/io/excel/test_readers.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -599,6 +599,10 @@ def test_date_conversion_overflow(self, read_ext):
599599
if pd.read_excel.keywords["engine"] == "openpyxl":
600600
pytest.xfail("Maybe not supported by openpyxl")
601601

602+
if pd.read_excel.keywords["engine"] is None:
603+
# GH 35029
604+
pytest.xfail("Defaults to openpyxl, maybe not supported")
605+
602606
result = pd.read_excel("testdateoverflow" + read_ext)
603607
tm.assert_frame_equal(result, expected)
604608

@@ -1153,12 +1157,13 @@ def test_excel_read_binary(self, engine, read_ext):
11531157
actual = pd.read_excel(data, engine=engine)
11541158
tm.assert_frame_equal(expected, actual)
11551159

1160+
@td.skip_if_no("xlrd")
11561161
def test_excel_high_surrogate(self, engine):
11571162
# GH 23809
11581163
expected = pd.DataFrame(["\udc88"], columns=["Column1"])
11591164

11601165
# should not produce a segmentation violation
1161-
actual = pd.read_excel("high_surrogate.xlsx")
1166+
actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
11621167
tm.assert_frame_equal(expected, actual)
11631168

11641169
@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])

pandas/tests/io/excel/test_writers.py

+7-2
Original file line number | Diff line number | Diff line change
@@ -351,12 +351,15 @@ def test_excel_sheet_by_name_raise(self, path, engine):
351351
msg = "sheet 0 not found"
352352
with pytest.raises(ValueError, match=msg):
353353
pd.read_excel(xl, "0")
354-
else:
354+
elif engine == "xlwt":
355355
import xlrd
356356

357357
msg = "No sheet named <'0'>"
358358
with pytest.raises(xlrd.XLRDError, match=msg):
359359
pd.read_excel(xl, sheet_name="0")
360+
else:
361+
with pytest.raises(KeyError, match="Worksheet 0 does not exist."):
362+
pd.read_excel(xl, sheet_name="0")
360363

361364
def test_excel_writer_context_manager(self, frame, path):
362365
with ExcelWriter(path) as writer:
@@ -1195,7 +1198,9 @@ def test_datetimes(self, path):
11951198

11961199
write_frame = DataFrame({"A": datetimes})
11971200
write_frame.to_excel(path, "Sheet1")
1198-
read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
1201+
# GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
1202+
engine = "odf" if path.endswith("ods") else "xlrd"
1203+
read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)
11991204

12001205
tm.assert_series_equal(write_frame["A"], read_frame["A"])
12011206

pandas/tests/io/excel/test_xlrd.py

+45-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
import pytest
22

3+
from pandas.compat._optional import import_optional_dependency
4+
35
import pandas as pd
46
import pandas._testing as tm
57

@@ -38,6 +40,48 @@ def test_read_xlrd_book(read_ext, frame):
3840
# TODO: test for openpyxl as well
3941
def test_excel_table_sheet_by_index(datapath, read_ext):
4042
path = datapath("io", "data", "excel", f"test1{read_ext}")
41-
with pd.ExcelFile(path) as excel:
43+
with pd.ExcelFile(path, engine="xlrd") as excel:
4244
with pytest.raises(xlrd.XLRDError):
4345
pd.read_excel(excel, sheet_name="asdf")
46+
47+
48+
def test_excel_file_warning_with_xlsx_file(datapath):
49+
# GH 29375
50+
path = datapath("io", "data", "excel", "test1.xlsx")
51+
has_openpyxl = (
52+
import_optional_dependency(
53+
"openpyxl", raise_on_missing=False, on_version="ignore"
54+
)
55+
is not None
56+
)
57+
if not has_openpyxl:
58+
with tm.assert_produces_warning(
59+
FutureWarning,
60+
raise_on_extra_warnings=False,
61+
match="The xlrd engine is no longer maintained",
62+
):
63+
ExcelFile(path, engine=None)
64+
else:
65+
with tm.assert_produces_warning(None):
66+
pd.read_excel(path, "Sheet1", engine=None)
67+
68+
69+
def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
70+
# GH 29375
71+
path = datapath("io", "data", "excel", "test1.xlsx")
72+
has_openpyxl = (
73+
import_optional_dependency(
74+
"openpyxl", raise_on_missing=False, on_version="ignore"
75+
)
76+
is not None
77+
)
78+
if not has_openpyxl:
79+
with tm.assert_produces_warning(
80+
FutureWarning,
81+
raise_on_extra_warnings=False,
82+
match="The xlrd engine is no longer maintained",
83+
):
84+
pd.read_excel(path, "Sheet1", engine=None)
85+
else:
86+
with tm.assert_produces_warning(None):
87+
pd.read_excel(path, "Sheet1", engine=None)

0 commit comments

Comments
 (0)