diff --git a/ci/deps/azure-38-slow.yaml b/ci/deps/azure-38-slow.yaml
index 2b4339cf12658..9651837f26114 100644
--- a/ci/deps/azure-38-slow.yaml
+++ b/ci/deps/azure-38-slow.yaml
@@ -30,7 +30,7 @@ dependencies:
- moto>=1.3.14
- scipy
- sqlalchemy
- - xlrd<2.0
+ - xlrd>=2.0
- xlsxwriter
- xlwt
- moto
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index ad72b9c8577e9..e7ac4c783b855 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -33,7 +33,7 @@ dependencies:
- s3fs>=0.4.2
- scipy
- sqlalchemy
- - xlrd<2.0
+ - xlrd>=2.0
- xlsxwriter
- xlwt
- pyreadstat
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 151b41705f9a5..6dc0d0cfaf80b 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -27,7 +27,7 @@ including other versions of pandas.
**Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.**
This is no longer supported, switch to using ``openpyxl`` instead.
- Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning``
+ Attempting to use the ``xlwt`` engine will raise a ``FutureWarning``
unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``.
While this option is now deprecated and will also raise a ``FutureWarning``,
it can be globally set and the warning suppressed. Users are recommended to
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index c72f294bf6ac8..221e8b9ccfb14 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1,11 +1,13 @@
import abc
import datetime
+from distutils.version import LooseVersion
import inspect
from io import BufferedIOBase, BytesIO, RawIOBase
import os
from textwrap import fill
-from typing import Any, Dict, Mapping, Union, cast
+from typing import IO, Any, Dict, Mapping, Optional, Union, cast
import warnings
+import zipfile
from pandas._config import config
@@ -13,11 +15,12 @@
from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions
from pandas.compat._optional import import_optional_dependency
from pandas.errors import EmptyDataError
-from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
+from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc
from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like
from pandas.core.frame import DataFrame
+from pandas.core.shared_docs import _shared_docs
from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg
from pandas.io.excel._util import (
@@ -116,17 +119,15 @@
When ``engine=None``, the following logic will be
used to determine the engine:
- - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
- then `odf `_ will be used.
- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
- be used.
- - Otherwise if `openpyxl `_ is installed,
- then ``openpyxl`` will be used.
- - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
-
- Specifying ``engine="xlrd"`` will continue to be allowed for the
- indefinite future.
+ - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+ then `odf `_ will be used.
+ - Otherwise if ``path_or_buffer`` is an xls format,
+ ``xlrd`` will be used.
+ - Otherwise if `openpyxl `_ is installed,
+ then ``openpyxl`` will be used.
+ - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
+ - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This
+ case will raise a ``ValueError`` in a future version of pandas.
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
@@ -888,39 +889,92 @@ def close(self):
return content
-def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
+XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
+ZIP_SIGNATURE = b"PK\x03\x04"
+PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))
+
+
+@doc(storage_options=_shared_docs["storage_options"])
+def inspect_excel_format(
+ path: Optional[str] = None,
+ content: Union[None, BufferedIOBase, RawIOBase, bytes] = None,
+ storage_options: StorageOptions = None,
+) -> str:
"""
- Check if the stream is an OpenDocument Spreadsheet (.ods) file
+ Inspect the path or content of an excel file and get its format.
+
+ At least one of path or content must be not None. If both are not None,
+ content will take precedence.
- It uses magic values inside the stream
+ Adopted from xlrd: https://github.com/python-excel/xlrd.
Parameters
----------
- stream : Union[BufferedIOBase, RawIOBase]
- IO stream with data which might be an ODS file
+ path : str, optional
+ Path to file to inspect. May be a URL.
+ content : file-like object, optional
+ Content of file to inspect.
+ {storage_options}
Returns
-------
- is_ods : bool
- Boolean indication that this is indeed an ODS file or not
+ str
+ Format of file.
+
+ Raises
+ ------
+ ValueError
+ If resulting stream is empty.
+ BadZipFile
+ If resulting stream does not have an XLS signature and is not a valid zipfile.
"""
- stream.seek(0)
- is_ods = False
- if stream.read(4) == b"PK\003\004":
- stream.seek(30)
- is_ods = (
- stream.read(54) == b"mimetype"
- b"application/vnd.oasis.opendocument.spreadsheet"
- )
- stream.seek(0)
- return is_ods
+ content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]]
+ if isinstance(content, bytes):
+ content_or_path = BytesIO(content)
+ else:
+ content_or_path = content or path
+ assert content_or_path is not None
+
+ with get_handle(
+ content_or_path, "rb", storage_options=storage_options, is_text=False
+ ) as handle:
+ stream = handle.handle
+ stream.seek(0)
+ buf = stream.read(PEEK_SIZE)
+ if buf is None:
+ raise ValueError("stream is empty")
+ else:
+ assert isinstance(buf, bytes)
+ peek = buf
+ stream.seek(0)
+
+ if peek.startswith(XLS_SIGNATURE):
+ return "xls"
+ elif not peek.startswith(ZIP_SIGNATURE):
+ raise ValueError("File is not a recognized excel file")
+
+ # ZipFile typing is overly-strict
+ # https://github.com/python/typeshed/issues/4212
+ zf = zipfile.ZipFile(stream) # type: ignore[arg-type]
+
+ # Workaround for some third party files that use forward slashes and
+ # lower case names.
+ component_names = [name.replace("\\", "/").lower() for name in zf.namelist()]
+
+ if "xl/workbook.xml" in component_names:
+ return "xlsx"
+ if "xl/workbook.bin" in component_names:
+ return "xlsb"
+ if "content.xml" in component_names:
+ return "ods"
+ return "zip"
class ExcelFile:
"""
Class for parsing tabular excel sheets into DataFrame objects.
- See read_excel for more documentation
+ See read_excel for more documentation.
Parameters
----------
@@ -947,12 +1001,13 @@ class ExcelFile:
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
then `odf `_ will be used.
- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
- will be used.
+ - Otherwise if ``path_or_buffer`` is an xls format,
+ ``xlrd`` will be used.
- Otherwise if `openpyxl `_ is installed,
then ``openpyxl`` will be used.
+ - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
+ This case will raise a ``ValueError`` in a future version of pandas.
.. warning::
@@ -975,33 +1030,47 @@ class ExcelFile:
def __init__(
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
- if engine is None:
- # Determine ext and use odf for ods stream/file
- if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
- ext = None
- if _is_ods_stream(path_or_buffer):
- engine = "odf"
- else:
- ext = os.path.splitext(str(path_or_buffer))[-1]
- if ext == ".ods":
- engine = "odf"
+ if engine is not None and engine not in self._engines:
+ raise ValueError(f"Unknown engine: {engine}")
- if (
- import_optional_dependency(
- "xlrd", raise_on_missing=False, on_version="ignore"
- )
- is not None
- ):
- from xlrd import Book
+ # Could be a str, ExcelFile, Book, etc.
+ self.io = path_or_buffer
+ # Always a string
+ self._io = stringify_path(path_or_buffer)
- if isinstance(path_or_buffer, Book):
- engine = "xlrd"
+ # Determine xlrd version if installed
+ if (
+ import_optional_dependency(
+ "xlrd", raise_on_missing=False, on_version="ignore"
+ )
+ is None
+ ):
+ xlrd_version = None
+ else:
+ import xlrd
- # GH 35029 - Prefer openpyxl except for xls files
- if engine is None:
- if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
- engine = "xlrd"
- elif (
+ xlrd_version = LooseVersion(xlrd.__version__)
+
+ if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)):
+ ext = inspect_excel_format(
+ content=path_or_buffer, storage_options=storage_options
+ )
+ elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
+ ext = "xls"
+ else:
+ # path_or_buffer is path-like, use stringified path
+ ext = inspect_excel_format(
+ path=str(self._io), storage_options=storage_options
+ )
+
+ if engine is None:
+ if ext == "ods":
+ engine = "odf"
+ elif ext == "xls":
+ engine = "xlrd"
+ else:
+ # GH 35029 - Prefer openpyxl except for xls files
+ if (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
@@ -1009,37 +1078,39 @@ def __init__(
):
engine = "openpyxl"
else:
- caller = inspect.stack()[1]
- if (
- caller.filename.endswith("pandas/io/excel/_base.py")
- and caller.function == "read_excel"
- ):
- stacklevel = 4
- else:
- stacklevel = 2
- warnings.warn(
- "The xlrd engine is no longer maintained and is not "
- "supported when using pandas with python >= 3.9. However, "
- "the engine xlrd will continue to be allowed for the "
- "indefinite future. Beginning with pandas 1.2.0, the "
- "openpyxl engine will be used if it is installed and the "
- "engine argument is not specified. Either install openpyxl "
- "or specify engine='xlrd' to silence this warning.",
- FutureWarning,
- stacklevel=stacklevel,
- )
engine = "xlrd"
- if engine not in self._engines:
- raise ValueError(f"Unknown engine: {engine}")
+
+ if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
+ if xlrd_version >= "2":
+ raise ValueError(
+ f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
+ f"only the xls format is supported. Install openpyxl instead."
+ )
+ else:
+ caller = inspect.stack()[1]
+ if (
+ caller.filename.endswith(
+ os.path.join("pandas", "io", "excel", "_base.py")
+ )
+ and caller.function == "read_excel"
+ ):
+ stacklevel = 4
+ else:
+ stacklevel = 2
+ warnings.warn(
+ f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
+ f"only the xls format is supported. As a result, the "
+ f"openpyxl engine will be used if it is installed and the "
+ f"engine argument is not specified. Install "
+ f"openpyxl instead.",
+ FutureWarning,
+ stacklevel=stacklevel,
+ )
+ assert engine in self._engines, f"Engine {engine} not recognized"
self.engine = engine
self.storage_options = storage_options
- # Could be a str, ExcelFile, Book, etc.
- self.io = path_or_buffer
- # Always a string
- self._io = stringify_path(path_or_buffer)
-
self._reader = self._engines[engine](self._io, storage_options=storage_options)
def __fspath__(self):
diff --git a/pandas/tests/io/__init__.py b/pandas/tests/io/__init__.py
index c5e867f45b92d..39474dedba78c 100644
--- a/pandas/tests/io/__init__.py
+++ b/pandas/tests/io/__init__.py
@@ -14,4 +14,8 @@
r"Use 'tree.iter\(\)' or 'list\(tree.iter\(\)\)' instead."
":PendingDeprecationWarning"
),
+ # GH 26552
+ pytest.mark.filterwarnings(
+ "ignore:As the xlwt package is no longer maintained:FutureWarning"
+ ),
]
diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py
index 384f1006c44df..b7ceb28573484 100644
--- a/pandas/tests/io/excel/__init__.py
+++ b/pandas/tests/io/excel/__init__.py
@@ -1,5 +1,9 @@
+from distutils.version import LooseVersion
+
import pytest
+from pandas.compat._optional import import_optional_dependency
+
pytestmark = [
pytest.mark.filterwarnings(
# Looks like tree.getiterator is deprecated in favor of tree.iter
@@ -13,4 +17,19 @@
pytest.mark.filterwarnings(
"ignore:As the xlwt package is no longer maintained:FutureWarning"
),
+ # GH 38571
+ pytest.mark.filterwarnings(
+ "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning"
+ ),
]
+
+
+if (
+ import_optional_dependency("xlrd", raise_on_missing=False, on_version="ignore")
+ is None
+):
+ xlrd_version = None
+else:
+ import xlrd
+
+ xlrd_version = LooseVersion(xlrd.__version__)
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 98a55ae39bd77..df1250cee8b00 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -11,6 +11,7 @@
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
import pandas._testing as tm
+from pandas.tests.io.excel import xlrd_version
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
engine_params = [
@@ -57,6 +58,13 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
return False
if read_ext == ".xlsb" and engine != "pyxlsb":
return False
+ if (
+ engine == "xlrd"
+ and xlrd_version is not None
+ and xlrd_version >= "2"
+ and read_ext != ".xls"
+ ):
+ return False
return True
@@ -614,6 +622,19 @@ def test_bad_engine_raises(self, read_ext):
with pytest.raises(ValueError, match="Unknown engine: foo"):
pd.read_excel("", engine=bad_engine)
+ def test_missing_file_raises(self, read_ext):
+ bad_file = f"foo{read_ext}"
+ # CI tests with zh_CN.utf8, translates to "No such file or directory"
+ with pytest.raises(
+ FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)"
+ ):
+ pd.read_excel(bad_file)
+
+ def test_corrupt_bytes_raises(self, read_ext, engine):
+ bad_stream = b"foo"
+ with pytest.raises(ValueError, match="File is not a recognized excel file"):
+ pd.read_excel(bad_stream)
+
@tm.network
def test_read_from_http_url(self, read_ext):
url = (
@@ -1158,6 +1179,19 @@ def test_excel_read_binary(self, engine, read_ext):
actual = pd.read_excel(data, engine=engine)
tm.assert_frame_equal(expected, actual)
+ def test_excel_read_binary_via_read_excel(self, read_ext, engine):
+ # GH 38424
+ if read_ext == ".xlsb" and engine == "pyxlsb":
+ pytest.xfail("GH 38667 - should default to pyxlsb but doesn't")
+ with open("test1" + read_ext, "rb") as f:
+ result = pd.read_excel(f)
+ expected = pd.read_excel("test1" + read_ext, engine=engine)
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.skipif(
+ xlrd_version is not None and xlrd_version >= "2",
+ reason="xlrd no longer supports xlsx",
+ )
def test_excel_high_surrogate(self, engine):
# GH 23809
expected = DataFrame(["\udc88"], columns=["Column1"])
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
index 80ebeb4c03d89..197738330efe1 100644
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -1195,9 +1195,9 @@ def test_datetimes(self, path):
write_frame = DataFrame({"A": datetimes})
write_frame.to_excel(path, "Sheet1")
- # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
- engine = "odf" if path.endswith("ods") else "xlrd"
- read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)
+ if path.endswith("xlsx") or path.endswith("xlsm"):
+ pytest.skip("Defaults to openpyxl and fails - GH #38644")
+ read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
tm.assert_series_equal(write_frame["A"], read_frame["A"])
diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py
index f2fbcbc2e2f04..2a1114a9570f0 100644
--- a/pandas/tests/io/excel/test_xlrd.py
+++ b/pandas/tests/io/excel/test_xlrd.py
@@ -4,6 +4,7 @@
import pandas as pd
import pandas._testing as tm
+from pandas.tests.io.excel import xlrd_version
from pandas.io.excel import ExcelFile
@@ -17,6 +18,8 @@ def skip_ods_and_xlsb_files(read_ext):
pytest.skip("Not valid for xlrd")
if read_ext == ".xlsb":
pytest.skip("Not valid for xlrd")
+ if read_ext in (".xlsx", ".xlsm") and xlrd_version >= "2":
+ pytest.skip("Not valid for xlrd >= 2.0")
def test_read_xlrd_book(read_ext, frame):
@@ -66,7 +69,7 @@ def test_excel_file_warning_with_xlsx_file(datapath):
pd.read_excel(path, "Sheet1", engine=None)
-def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
+def test_read_excel_warning_with_xlsx_file(datapath):
# GH 29375
path = datapath("io", "data", "excel", "test1.xlsx")
has_openpyxl = (
@@ -76,12 +79,19 @@ def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
is not None
)
if not has_openpyxl:
- with tm.assert_produces_warning(
- FutureWarning,
- raise_on_extra_warnings=False,
- match="The xlrd engine is no longer maintained",
- ):
- pd.read_excel(path, "Sheet1", engine=None)
+ if xlrd_version >= "2":
+ with pytest.raises(
+ ValueError,
+ match="Your version of xlrd is ",
+ ):
+ pd.read_excel(path, "Sheet1", engine=None)
+ else:
+ with tm.assert_produces_warning(
+ FutureWarning,
+ raise_on_extra_warnings=False,
+ match="The xlrd engine is no longer maintained",
+ ):
+ pd.read_excel(path, "Sheet1", engine=None)
else:
with tm.assert_produces_warning(None):
pd.read_excel(path, "Sheet1", engine=None)