Skip to content

Commit 45e8193

Browse files
cruzzoe authored and roberthdevries committed
Deprecate using xlrd engine and change default engine to read excel files to openpyxl
1 parent eca6068 commit 45e8193

File tree

6 files changed

+89
-20
lines changed

6 files changed

+89
-20
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
143143
Deprecations
144144
~~~~~~~~~~~~
145145
- Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`)
146+
- The default engine of :func:`read_excel` is changed from "xlrd" to "openpyxl" because "xlrd" is deprecated; "xlrd" is still used by default for ``.xls`` files and "odf" for ``.ods`` files (:issue:`28547`).
146147
-
147148
-
148149

pandas/io/excel/_base.py

+14-4
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import os
55
from textwrap import fill
66
from typing import Any, Mapping, Union
7+
import warnings
78

89
from pandas._config import config
910

@@ -825,8 +826,7 @@ def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
825826
class ExcelFile:
826827
"""
827828
Class for parsing tabular excel sheets into DataFrame objects.
828-
829-
Uses xlrd engine by default. See read_excel for more documentation
829+
Uses xlrd, openpyxl or odf. See read_excel for more documentation
830830
831831
Parameters
832832
----------
@@ -837,7 +837,7 @@ class ExcelFile:
837837
engine : str, default None
838838
If io is not a buffer or path, this must be set to identify io.
839839
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
840-
default ``xlrd``.
840+
default ``openpyxl``, ``xlrd`` for .xls files, ``odf`` for .ods files.
841841
Engine compatibility :
842842
- ``xlrd`` supports most old/new Excel file formats.
843843
- ``openpyxl`` supports newer Excel file formats.
@@ -861,14 +861,24 @@ def __init__(
861861
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
862862
):
863863
if engine is None:
864-
engine = "xlrd"
864+
engine = "openpyxl"
865865
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
866866
if _is_ods_stream(path_or_buffer):
867867
engine = "odf"
868868
else:
869869
ext = os.path.splitext(str(path_or_buffer))[-1]
870870
if ext == ".ods":
871871
engine = "odf"
872+
elif ext == ".xls":
873+
engine = "xlrd"
874+
875+
elif engine == "xlrd":
876+
warnings.warn(
877+
'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. '
878+
'Specify engine="openpyxl" to suppress this warning.',
879+
FutureWarning,
880+
stacklevel=2,
881+
)
872882
if engine not in self._engines:
873883
raise ValueError(f"Unknown engine: {engine}")
874884

pandas/io/excel/_openpyxl.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
from typing import List
23

34
import numpy as np
@@ -517,7 +518,11 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
517518

518519
# TODO: replace with openpyxl constants
519520
if cell.is_date:
520-
return cell.value
521+
try:
522+
# workaround for inaccurate timestamp notation in excel
523+
return datetime.fromtimestamp(round(cell.value.timestamp()))
524+
except (AttributeError, OSError):
525+
return cell.value
521526
elif cell.data_type == "e":
522527
return np.nan
523528
elif cell.data_type == "b":

pandas/tests/io/excel/test_readers.py

+25-12
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,9 @@
2222
marks=[
2323
td.skip_if_no("xlrd"),
2424
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
25+
pytest.mark.filterwarnings(
26+
'ignore:The Excel reader engine "xlrd" is deprecated,'
27+
),
2528
],
2629
),
2730
pytest.param(
@@ -34,8 +37,8 @@
3437
pytest.param(
3538
None,
3639
marks=[
37-
td.skip_if_no("xlrd"),
38-
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
40+
td.skip_if_no("openpyxl"),
41+
pytest.mark.filterwarnings("ignore:.*html argument"),
3942
],
4043
),
4144
pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
@@ -51,6 +54,8 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
5154
engine = engine.values[0]
5255
if engine == "openpyxl" and read_ext == ".xls":
5356
return False
57+
if engine is None and read_ext == ".xls":
58+
return False
5459
if engine == "odf" and read_ext != ".ods":
5560
return False
5661
if read_ext == ".ods" and engine != "odf":
@@ -559,7 +564,7 @@ def test_date_conversion_overflow(self, read_ext):
559564
columns=["DateColWithBigInt", "StringCol"],
560565
)
561566

562-
if pd.read_excel.keywords["engine"] == "openpyxl":
567+
if pd.read_excel.keywords["engine"] in ["openpyxl", None]:
563568
pytest.xfail("Maybe not supported by openpyxl")
564569

565570
result = pd.read_excel("testdateoverflow" + read_ext)
@@ -942,7 +947,10 @@ def test_read_excel_squeeze(self, read_ext):
942947
expected = pd.Series([1, 2, 3], name="a")
943948
tm.assert_series_equal(actual, expected)
944949

945-
def test_deprecated_kwargs(self, read_ext):
950+
def test_deprecated_kwargs(self, engine, read_ext):
951+
if engine == "xlrd":
952+
pytest.skip("Use of xlrd engine produces a FutureWarning as well")
953+
946954
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
947955
pd.read_excel("test1" + read_ext, "Sheet1", 0)
948956

@@ -961,6 +969,19 @@ def test_no_header_with_list_index_col(self, read_ext):
961969
)
962970
tm.assert_frame_equal(expected, result)
963971

972+
def test_excel_high_surrogate(self, engine, read_ext):
973+
# GH 23809
974+
if read_ext != ".xlsx":
975+
pytest.skip("Test is only applicable to .xlsx file")
976+
if engine in ["openpyxl", None]:
977+
pytest.skip("Test does not work for openpyxl")
978+
979+
expected = pd.DataFrame(["\udc88"], columns=["Column1"])
980+
981+
# should not produce a segmentation violation
982+
actual = pd.read_excel("high_surrogate.xlsx")
983+
tm.assert_frame_equal(expected, actual)
984+
964985

965986
class TestExcelFileRead:
966987
@pytest.fixture(autouse=True)
@@ -1116,14 +1137,6 @@ def test_excel_read_binary(self, engine, read_ext):
11161137
actual = pd.read_excel(data, engine=engine)
11171138
tm.assert_frame_equal(expected, actual)
11181139

1119-
def test_excel_high_surrogate(self, engine):
1120-
# GH 23809
1121-
expected = pd.DataFrame(["\udc88"], columns=["Column1"])
1122-
1123-
# should not produce a segmentation violation
1124-
actual = pd.read_excel("high_surrogate.xlsx")
1125-
tm.assert_frame_equal(expected, actual)
1126-
11271140
@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
11281141
def test_header_with_index_col(self, engine, filename):
11291142
# GH 33476

pandas/tests/io/excel/test_writers.py

+16-2
Original file line number | Diff line number | Diff line change
@@ -351,12 +351,16 @@ def test_excel_sheet_by_name_raise(self, path, engine):
351351
msg = "sheet 0 not found"
352352
with pytest.raises(ValueError, match=msg):
353353
pd.read_excel(xl, "0")
354-
else:
354+
elif engine == "xlwt":
355355
import xlrd
356356

357357
msg = "No sheet named <'0'>"
358358
with pytest.raises(xlrd.XLRDError, match=msg):
359359
pd.read_excel(xl, sheet_name="0")
360+
else: # openpyxl
361+
msg = "Worksheet 0 does not exist."
362+
with pytest.raises(KeyError, match=msg):
363+
pd.read_excel(xl, sheet_name="0")
360364

361365
def test_excel_writer_context_manager(self, frame, path):
362366
with ExcelWriter(path) as writer:
@@ -1199,6 +1203,9 @@ def test_datetimes(self, path):
11991203

12001204
tm.assert_series_equal(write_frame["A"], read_frame["A"])
12011205

1206+
@pytest.mark.filterwarnings(
1207+
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
1208+
)
12021209
def test_bytes_io(self, engine):
12031210
# see gh-7074
12041211
bio = BytesIO()
@@ -1209,8 +1216,15 @@ def test_bytes_io(self, engine):
12091216
df.to_excel(writer)
12101217
writer.save()
12111218

1219+
if engine == "xlwt":
1220+
read_engine = "xlrd"
1221+
elif engine == "xlsxwriter":
1222+
read_engine = "openpyxl"
1223+
else:
1224+
read_engine = engine
1225+
12121226
bio.seek(0)
1213-
reread_df = pd.read_excel(bio, index_col=0)
1227+
reread_df = pd.read_excel(bio, index_col=0, engine=read_engine)
12141228
tm.assert_frame_equal(df, reread_df)
12151229

12161230
def test_write_lists_dict(self, path):

pandas/tests/io/excel/test_xlrd.py

+27-1
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,9 @@ def skip_ods_and_xlsb_files(read_ext):
1717
pytest.skip("Not valid for xlrd")
1818

1919

20+
@pytest.mark.filterwarnings(
21+
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
22+
)
2023
def test_read_xlrd_book(read_ext, frame):
2124
df = frame
2225

@@ -36,8 +39,31 @@ def test_read_xlrd_book(read_ext, frame):
3639

3740

3841
# TODO: test for openpyxl as well
42+
@pytest.mark.filterwarnings(
43+
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
44+
)
3945
def test_excel_table_sheet_by_index(datapath, read_ext):
4046
path = datapath("io", "data", "excel", f"test1{read_ext}")
41-
with pd.ExcelFile(path) as excel:
47+
with pd.ExcelFile(path, engine="xlrd") as excel:
4248
with pytest.raises(xlrd.XLRDError):
4349
pd.read_excel(excel, sheet_name="asdf")
50+
51+
52+
def test_excel_file_warning_with_xlsx_file(datapath):
53+
# GH 29375
54+
path = datapath("io", "data", "excel", "test1.xlsx")
55+
with tm.assert_produces_warning(
56+
FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False
57+
) as w:
58+
pd.ExcelFile(path, engine="xlrd")
59+
assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message)
60+
61+
62+
def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
63+
# GH 29375
64+
path = datapath("io", "data", "excel", "test1.xlsx")
65+
with tm.assert_produces_warning(
66+
FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
67+
) as w:
68+
pd.read_excel(path, "Sheet1", engine="xlrd")
69+
assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message)

0 commit comments

Comments
 (0)