Skip to content

Commit 1222a46

Browse files
Backport PR pandas-dev#38571: DEPR: Adjust read excel behavior for xlrd >= 2.0 (pandas-dev#38670)
Co-authored-by: Richard Shadrach <[email protected]>
1 parent b4cb528 commit 1222a46

File tree

9 files changed

+235
-97
lines changed

9 files changed

+235
-97
lines changed

ci/deps/azure-38-slow.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ dependencies:
3030
- moto>=1.3.14
3131
- scipy
3232
- sqlalchemy
33-
- xlrd<2.0
33+
- xlrd>=2.0
3434
- xlsxwriter
3535
- xlwt
3636
- moto

ci/deps/azure-windows-37.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ dependencies:
3333
- s3fs>=0.4.2
3434
- scipy
3535
- sqlalchemy
36-
- xlrd<2.0
36+
- xlrd>=2.0
3737
- xlsxwriter
3838
- xlwt
3939
- pyreadstat

doc/source/whatsnew/v1.2.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ including other versions of pandas.
2727
**Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
2828
This is no longer supported, switch to using ``openpyxl`` instead.
2929

30-
Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning``
30+
Attempting to use the ``xlwt`` engine will raise a ``FutureWarning``
3131
unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``.
3232
While this option is now deprecated and will also raise a ``FutureWarning``,
3333
it can be globally set and the warning suppressed. Users are recommended to

pandas/io/excel/_base.py

+155-84
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,26 @@
11
import abc
22
import datetime
3+
from distutils.version import LooseVersion
34
import inspect
45
from io import BufferedIOBase, BytesIO, RawIOBase
56
import os
67
from textwrap import fill
7-
from typing import Any, Dict, Mapping, Union, cast
8+
from typing import IO, Any, Dict, Mapping, Optional, Union, cast
89
import warnings
10+
import zipfile
911

1012
from pandas._config import config
1113

1214
from pandas._libs.parsers import STR_NA_VALUES
1315
from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions
1416
from pandas.compat._optional import import_optional_dependency
1517
from pandas.errors import EmptyDataError
16-
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
18+
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc
1719

1820
from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like
1921

2022
from pandas.core.frame import DataFrame
23+
from pandas.core.shared_docs import _shared_docs
2124

2225
from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg
2326
from pandas.io.excel._util import (
@@ -116,17 +119,15 @@
116119
When ``engine=None``, the following logic will be
117120
used to determine the engine:
118121
119-
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
120-
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
121-
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
122-
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
123-
be used.
124-
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
125-
then ``openpyxl`` will be used.
126-
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
127-
128-
Specifying ``engine="xlrd"`` will continue to be allowed for the
129-
indefinite future.
122+
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
123+
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
124+
- Otherwise if ``path_or_buffer`` is an xls format,
125+
``xlrd`` will be used.
126+
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
127+
then ``openpyxl`` will be used.
128+
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
129+
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This
130+
case will raise a ``ValueError`` in a future version of pandas.
130131
131132
converters : dict, default None
132133
Dict of functions for converting values in certain columns. Keys can
@@ -888,39 +889,92 @@ def close(self):
888889
return content
889890

890891

891-
def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
892+
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
893+
ZIP_SIGNATURE = b"PK\x03\x04"
894+
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))
895+
896+
897+
@doc(storage_options=_shared_docs["storage_options"])
898+
def inspect_excel_format(
899+
path: Optional[str] = None,
900+
content: Union[None, BufferedIOBase, RawIOBase, bytes] = None,
901+
storage_options: StorageOptions = None,
902+
) -> str:
892903
"""
893-
Check if the stream is an OpenDocument Spreadsheet (.ods) file
904+
Inspect the path or content of an excel file and get its format.
905+
906+
At least one of path or content must not be None. If both are not None,
907+
content will take precedence.
894908
895-
It uses magic values inside the stream
909+
Adopted from xlrd: https://github.com/python-excel/xlrd.
896910
897911
Parameters
898912
----------
899-
stream : Union[BufferedIOBase, RawIOBase]
900-
IO stream with data which might be an ODS file
913+
path : str, optional
914+
Path to file to inspect. May be a URL.
915+
content : file-like object, optional
916+
Content of file to inspect.
917+
{storage_options}
901918
902919
Returns
903920
-------
904-
is_ods : bool
905-
Boolean indication that this is indeed an ODS file or not
921+
str
922+
Format of file.
923+
924+
Raises
925+
------
926+
ValueError
927+
If resulting stream is empty, or has neither an XLS nor a ZIP signature.
928+
BadZipFile
929+
If resulting stream has a ZIP signature but is not a valid zipfile.
906930
"""
907-
stream.seek(0)
908-
is_ods = False
909-
if stream.read(4) == b"PK\003\004":
910-
stream.seek(30)
911-
is_ods = (
912-
stream.read(54) == b"mimetype"
913-
b"application/vnd.oasis.opendocument.spreadsheet"
914-
)
915-
stream.seek(0)
916-
return is_ods
931+
content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]]
932+
if isinstance(content, bytes):
933+
content_or_path = BytesIO(content)
934+
else:
935+
content_or_path = content or path
936+
assert content_or_path is not None
937+
938+
with get_handle(
939+
content_or_path, "rb", storage_options=storage_options, is_text=False
940+
) as handle:
941+
stream = handle.handle
942+
stream.seek(0)
943+
buf = stream.read(PEEK_SIZE)
944+
if buf is None:
945+
raise ValueError("stream is empty")
946+
else:
947+
assert isinstance(buf, bytes)
948+
peek = buf
949+
stream.seek(0)
950+
951+
if peek.startswith(XLS_SIGNATURE):
952+
return "xls"
953+
elif not peek.startswith(ZIP_SIGNATURE):
954+
raise ValueError("File is not a recognized excel file")
955+
956+
# ZipFile typing is overly-strict
957+
# https://github.com/python/typeshed/issues/4212
958+
zf = zipfile.ZipFile(stream) # type: ignore[arg-type]
959+
960+
# Workaround for some third party files that use forward slashes and
961+
# lower case names.
962+
component_names = [name.replace("\\", "/").lower() for name in zf.namelist()]
963+
964+
if "xl/workbook.xml" in component_names:
965+
return "xlsx"
966+
if "xl/workbook.bin" in component_names:
967+
return "xlsb"
968+
if "content.xml" in component_names:
969+
return "ods"
970+
return "zip"
917971

918972

919973
class ExcelFile:
920974
"""
921975
Class for parsing tabular excel sheets into DataFrame objects.
922976
923-
See read_excel for more documentation
977+
See read_excel for more documentation.
924978
925979
Parameters
926980
----------
@@ -947,12 +1001,13 @@ class ExcelFile:
9471001
9481002
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
9491003
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
950-
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
951-
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
952-
will be used.
1004+
- Otherwise if ``path_or_buffer`` is an xls format,
1005+
``xlrd`` will be used.
9531006
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
9541007
then ``openpyxl`` will be used.
1008+
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
9551009
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
1010+
This case will raise a ``ValueError`` in a future version of pandas.
9561011
9571012
.. warning::
9581013
@@ -975,71 +1030,87 @@ class ExcelFile:
9751030
def __init__(
9761031
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
9771032
):
978-
if engine is None:
979-
# Determine ext and use odf for ods stream/file
980-
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
981-
ext = None
982-
if _is_ods_stream(path_or_buffer):
983-
engine = "odf"
984-
else:
985-
ext = os.path.splitext(str(path_or_buffer))[-1]
986-
if ext == ".ods":
987-
engine = "odf"
1033+
if engine is not None and engine not in self._engines:
1034+
raise ValueError(f"Unknown engine: {engine}")
9881035

989-
if (
990-
import_optional_dependency(
991-
"xlrd", raise_on_missing=False, on_version="ignore"
992-
)
993-
is not None
994-
):
995-
from xlrd import Book
1036+
# Could be a str, ExcelFile, Book, etc.
1037+
self.io = path_or_buffer
1038+
# Always a string
1039+
self._io = stringify_path(path_or_buffer)
9961040

997-
if isinstance(path_or_buffer, Book):
998-
engine = "xlrd"
1041+
# Determine xlrd version if installed
1042+
if (
1043+
import_optional_dependency(
1044+
"xlrd", raise_on_missing=False, on_version="ignore"
1045+
)
1046+
is None
1047+
):
1048+
xlrd_version = None
1049+
else:
1050+
import xlrd
9991051

1000-
# GH 35029 - Prefer openpyxl except for xls files
1001-
if engine is None:
1002-
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
1003-
engine = "xlrd"
1004-
elif (
1052+
xlrd_version = LooseVersion(xlrd.__version__)
1053+
1054+
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)):
1055+
ext = inspect_excel_format(
1056+
content=path_or_buffer, storage_options=storage_options
1057+
)
1058+
elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
1059+
ext = "xls"
1060+
else:
1061+
# path_or_buffer is path-like, use stringified path
1062+
ext = inspect_excel_format(
1063+
path=str(self._io), storage_options=storage_options
1064+
)
1065+
1066+
if engine is None:
1067+
if ext == "ods":
1068+
engine = "odf"
1069+
elif ext == "xls":
1070+
engine = "xlrd"
1071+
else:
1072+
# GH 35029 - Prefer openpyxl except for xls files
1073+
if (
10051074
import_optional_dependency(
10061075
"openpyxl", raise_on_missing=False, on_version="ignore"
10071076
)
10081077
is not None
10091078
):
10101079
engine = "openpyxl"
10111080
else:
1012-
caller = inspect.stack()[1]
1013-
if (
1014-
caller.filename.endswith("pandas/io/excel/_base.py")
1015-
and caller.function == "read_excel"
1016-
):
1017-
stacklevel = 4
1018-
else:
1019-
stacklevel = 2
1020-
warnings.warn(
1021-
"The xlrd engine is no longer maintained and is not "
1022-
"supported when using pandas with python >= 3.9. However, "
1023-
"the engine xlrd will continue to be allowed for the "
1024-
"indefinite future. Beginning with pandas 1.2.0, the "
1025-
"openpyxl engine will be used if it is installed and the "
1026-
"engine argument is not specified. Either install openpyxl "
1027-
"or specify engine='xlrd' to silence this warning.",
1028-
FutureWarning,
1029-
stacklevel=stacklevel,
1030-
)
10311081
engine = "xlrd"
1032-
if engine not in self._engines:
1033-
raise ValueError(f"Unknown engine: {engine}")
1082+
1083+
if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
1084+
if xlrd_version >= "2":
1085+
raise ValueError(
1086+
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
1087+
f"only the xls format is supported. Install openpyxl instead."
1088+
)
1089+
else:
1090+
caller = inspect.stack()[1]
1091+
if (
1092+
caller.filename.endswith(
1093+
os.path.join("pandas", "io", "excel", "_base.py")
1094+
)
1095+
and caller.function == "read_excel"
1096+
):
1097+
stacklevel = 4
1098+
else:
1099+
stacklevel = 2
1100+
warnings.warn(
1101+
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
1102+
f"only the xls format is supported. As a result, the "
1103+
f"openpyxl engine will be used if it is installed and the "
1104+
f"engine argument is not specified. Install "
1105+
f"openpyxl instead.",
1106+
FutureWarning,
1107+
stacklevel=stacklevel,
1108+
)
1109+
assert engine in self._engines, f"Engine {engine} not recognized"
10341110

10351111
self.engine = engine
10361112
self.storage_options = storage_options
10371113

1038-
# Could be a str, ExcelFile, Book, etc.
1039-
self.io = path_or_buffer
1040-
# Always a string
1041-
self._io = stringify_path(path_or_buffer)
1042-
10431114
self._reader = self._engines[engine](self._io, storage_options=storage_options)
10441115

10451116
def __fspath__(self):

pandas/tests/io/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,8 @@
1414
r"Use 'tree.iter\(\)' or 'list\(tree.iter\(\)\)' instead."
1515
":PendingDeprecationWarning"
1616
),
17+
# GH 26552
18+
pytest.mark.filterwarnings(
19+
"ignore:As the xlwt package is no longer maintained:FutureWarning"
20+
),
1721
]

pandas/tests/io/excel/__init__.py

+19
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
1+
from distutils.version import LooseVersion
2+
13
import pytest
24

5+
from pandas.compat._optional import import_optional_dependency
6+
37
pytestmark = [
48
pytest.mark.filterwarnings(
59
# Looks like tree.getiterator is deprecated in favor of tree.iter
@@ -13,4 +17,19 @@
1317
pytest.mark.filterwarnings(
1418
"ignore:As the xlwt package is no longer maintained:FutureWarning"
1519
),
20+
# GH 38571
21+
pytest.mark.filterwarnings(
22+
"ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning"
23+
),
1624
]
25+
26+
27+
if (
28+
import_optional_dependency("xlrd", raise_on_missing=False, on_version="ignore")
29+
is None
30+
):
31+
xlrd_version = None
32+
else:
33+
import xlrd
34+
35+
xlrd_version = LooseVersion(xlrd.__version__)

0 commit comments

Comments
 (0)