Skip to content

Commit fad02a5

Browse files
cruzzoe authored and roberthdevries committed
Deprecate using xlrd engine and change default engine to read excel files to openpyxl
1 parent 7e25af8 commit fad02a5

File tree

6 files changed, with 89 additions and 20 deletions.

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,7 @@ Deprecations
820820
precision through the ``rtol``, and ``atol`` parameters, thus deprecating the
821821
``check_less_precise`` parameter. (:issue:`13357`).
822822
- :func:`DataFrame.melt` accepting a value_name that already exists is deprecated, and will be removed in a future version (:issue:`34731`)
823+
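- :func:`read_excel` now uses "openpyxl" as its default engine instead of "xlrd" (``.xls`` files still default to "xlrd" and ``.ods`` files to "odf"); explicitly passing ``engine="xlrd"`` is deprecated and emits a ``FutureWarning`` (:issue:`28547`).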
823824

824825
.. ---------------------------------------------------------------------------
825826

pandas/io/excel/_base.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
from textwrap import fill
66
from typing import Union
7+
import warnings
78

89
from pandas._config import config
910

@@ -810,8 +811,7 @@ def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
810811
class ExcelFile:
811812
"""
812813
Class for parsing tabular excel sheets into DataFrame objects.
813-
814-
Uses xlrd engine by default. See read_excel for more documentation
814+
Uses xlrd, openpyxl or odf. See read_excel for more documentation
815815
816816
Parameters
817817
----------
@@ -822,7 +822,7 @@ class ExcelFile:
822822
engine : str, default None
823823
If io is not a buffer or path, this must be set to identify io.
824824
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
825-
default ``xlrd``.
825+
default ``openpyxl``, ``xlrd`` for .xls files, ``odf`` for .ods files.
826826
Engine compatibility :
827827
- ``xlrd`` supports most old/new Excel file formats.
828828
- ``openpyxl`` supports newer Excel file formats.
@@ -844,14 +844,24 @@ class ExcelFile:
844844

845845
def __init__(self, path_or_buffer, engine=None):
846846
if engine is None:
847-
engine = "xlrd"
847+
engine = "openpyxl"
848848
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
849849
if _is_ods_stream(path_or_buffer):
850850
engine = "odf"
851851
else:
852852
ext = os.path.splitext(str(path_or_buffer))[-1]
853853
if ext == ".ods":
854854
engine = "odf"
855+
elif ext == ".xls":
856+
engine = "xlrd"
857+
858+
elif engine == "xlrd":
859+
warnings.warn(
860+
'The Excel reader engine "xlrd" is deprecated, use "openpyxl" instead. '
861+
'Specify engine="openpyxl" to suppress this warning.',
862+
FutureWarning,
863+
stacklevel=2,
864+
)
855865
if engine not in self._engines:
856866
raise ValueError(f"Unknown engine: {engine}")
857867

pandas/io/excel/_openpyxl.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
from typing import List
23

34
import numpy as np
@@ -511,7 +512,11 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
511512

512513
# TODO: replace with openpyxl constants
513514
if cell.is_date:
514-
return cell.value
515+
try:
516+
# workaround for inaccurate timestamp notation in excel
517+
return datetime.fromtimestamp(round(cell.value.timestamp()))
518+
except (AttributeError, OSError):
519+
return cell.value
515520
elif cell.data_type == "e":
516521
return np.nan
517522
elif cell.data_type == "b":

pandas/tests/io/excel/test_readers.py

+25-12
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ def ignore_xlrd_time_clock_warning():
4040
marks=[
4141
td.skip_if_no("xlrd"),
4242
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
43+
pytest.mark.filterwarnings(
44+
'ignore:The Excel reader engine "xlrd" is deprecated,'
45+
),
4346
],
4447
),
4548
pytest.param(
@@ -52,8 +55,8 @@ def ignore_xlrd_time_clock_warning():
5255
pytest.param(
5356
None,
5457
marks=[
55-
td.skip_if_no("xlrd"),
56-
pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
58+
td.skip_if_no("openpyxl"),
59+
pytest.mark.filterwarnings("ignore:.*html argument"),
5760
],
5861
),
5962
pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
@@ -69,6 +72,8 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
6972
engine = engine.values[0]
7073
if engine == "openpyxl" and read_ext == ".xls":
7174
return False
75+
if engine is None and read_ext == ".xls":
76+
return False
7277
if engine == "odf" and read_ext != ".ods":
7378
return False
7479
if read_ext == ".ods" and engine != "odf":
@@ -579,7 +584,7 @@ def test_date_conversion_overflow(self, read_ext):
579584
columns=["DateColWithBigInt", "StringCol"],
580585
)
581586

582-
if pd.read_excel.keywords["engine"] == "openpyxl":
587+
if pd.read_excel.keywords["engine"] in ["openpyxl", None]:
583588
pytest.xfail("Maybe not supported by openpyxl")
584589

585590
result = pd.read_excel("testdateoverflow" + read_ext)
@@ -962,12 +967,28 @@ def test_read_excel_squeeze(self, read_ext):
962967
expected = pd.Series([1, 2, 3], name="a")
963968
tm.assert_series_equal(actual, expected)
964969

965-
def test_deprecated_kwargs(self, read_ext):
970+
def test_deprecated_kwargs(self, engine, read_ext):
971+
if engine == "xlrd":
972+
pytest.skip("Use of xlrd engine produces a FutureWarning as well")
973+
966974
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
967975
pd.read_excel("test1" + read_ext, "Sheet1", 0)
968976

969977
pd.read_excel("test1" + read_ext)
970978

979+
def test_excel_high_surrogate(self, engine, read_ext):
980+
# GH 23809
981+
if read_ext != ".xlsx":
982+
pytest.skip("Test is only applicable to .xlsx file")
983+
if engine in ["openpyxl", None]:
984+
pytest.skip("Test does not work for openpyxl")
985+
986+
expected = pd.DataFrame(["\udc88"], columns=["Column1"])
987+
988+
# should not produce a segmentation violation
989+
actual = pd.read_excel("high_surrogate.xlsx")
990+
tm.assert_frame_equal(expected, actual)
991+
971992

972993
class TestExcelFileRead:
973994
@pytest.fixture(autouse=True)
@@ -1123,14 +1144,6 @@ def test_excel_read_binary(self, engine, read_ext):
11231144
actual = pd.read_excel(data, engine=engine)
11241145
tm.assert_frame_equal(expected, actual)
11251146

1126-
def test_excel_high_surrogate(self, engine):
1127-
# GH 23809
1128-
expected = pd.DataFrame(["\udc88"], columns=["Column1"])
1129-
1130-
# should not produce a segmentation violation
1131-
actual = pd.read_excel("high_surrogate.xlsx")
1132-
tm.assert_frame_equal(expected, actual)
1133-
11341147
@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
11351148
def test_header_with_index_col(self, engine, filename):
11361149
# GH 33476

pandas/tests/io/excel/test_writers.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -351,12 +351,16 @@ def test_excel_sheet_by_name_raise(self, path, engine):
351351
msg = "sheet 0 not found"
352352
with pytest.raises(ValueError, match=msg):
353353
pd.read_excel(xl, "0")
354-
else:
354+
elif engine == "xlwt":
355355
import xlrd
356356

357357
msg = "No sheet named <'0'>"
358358
with pytest.raises(xlrd.XLRDError, match=msg):
359359
pd.read_excel(xl, sheet_name="0")
360+
else: # openpyxl
361+
msg = "Worksheet 0 does not exist."
362+
with pytest.raises(KeyError, match=msg):
363+
pd.read_excel(xl, sheet_name="0")
360364

361365
def test_excel_writer_context_manager(self, frame, path):
362366
with ExcelWriter(path) as writer:
@@ -1199,6 +1203,9 @@ def test_datetimes(self, path):
11991203

12001204
tm.assert_series_equal(write_frame["A"], read_frame["A"])
12011205

1206+
@pytest.mark.filterwarnings(
1207+
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
1208+
)
12021209
def test_bytes_io(self, engine):
12031210
# see gh-7074
12041211
bio = BytesIO()
@@ -1209,8 +1216,15 @@ def test_bytes_io(self, engine):
12091216
df.to_excel(writer)
12101217
writer.save()
12111218

1219+
if engine == "xlwt":
1220+
read_engine = "xlrd"
1221+
elif engine == "xlsxwriter":
1222+
read_engine = "openpyxl"
1223+
else:
1224+
read_engine = engine
1225+
12121226
bio.seek(0)
1213-
reread_df = pd.read_excel(bio, index_col=0)
1227+
reread_df = pd.read_excel(bio, index_col=0, engine=read_engine)
12141228
tm.assert_frame_equal(df, reread_df)
12151229

12161230
def test_write_lists_dict(self, path):

pandas/tests/io/excel/test_xlrd.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ def skip_ods_and_xlsb_files(read_ext):
1717
pytest.skip("Not valid for xlrd")
1818

1919

20+
@pytest.mark.filterwarnings(
21+
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
22+
)
2023
def test_read_xlrd_book(read_ext, frame):
2124
df = frame
2225

@@ -36,8 +39,31 @@ def test_read_xlrd_book(read_ext, frame):
3639

3740

3841
# TODO: test for openpyxl as well
42+
@pytest.mark.filterwarnings(
43+
'ignore:The Excel reader engine "xlrd" is deprecated:FutureWarning'
44+
)
3945
def test_excel_table_sheet_by_index(datapath, read_ext):
4046
path = datapath("io", "data", "excel", f"test1{read_ext}")
41-
with pd.ExcelFile(path) as excel:
47+
with pd.ExcelFile(path, engine="xlrd") as excel:
4248
with pytest.raises(xlrd.XLRDError):
4349
pd.read_excel(excel, sheet_name="asdf")
50+
51+
52+
def test_excel_file_warning_with_xlsx_file(datapath):
53+
# GH 29375
54+
path = datapath("io", "data", "excel", "test1.xlsx")
55+
with tm.assert_produces_warning(
56+
FutureWarning, check_stacklevel=True, raise_on_extra_warnings=False
57+
) as w:
58+
pd.ExcelFile(path, engine="xlrd")
59+
assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message)
60+
61+
62+
def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
63+
# GH 29375
64+
path = datapath("io", "data", "excel", "test1.xlsx")
65+
with tm.assert_produces_warning(
66+
FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
67+
) as w:
68+
pd.read_excel(path, "Sheet1", engine="xlrd")
69+
assert '"xlrd" is deprecated, use "openpyxl" instead.' in str(w[0].message)

0 commit comments

Comments
 (0)