Skip to content

Commit b3e3352

Browse files
ENH: loosen XLS signature (#41321)
1 parent 269a3e4 commit b3e3352

File tree

4 files changed

+54
-13
lines changed

4 files changed

+54
-13
lines changed

doc/source/whatsnew/v1.3.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ Other enhancements
197197
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
198198
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
199199
- Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
200-
- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`)
200+
- :func:`pandas.read_excel` can now auto-detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`)
201201
- :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`)
202202
- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`)
203203
- :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`)
@@ -850,6 +850,8 @@ I/O
850850
- Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`)
851851
- Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`)
852852
- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
853+
- Bug in :func:`read_excel` that would raise an error when pandas could not determine the file type even when the user specified the ``engine`` argument (:issue:`41225`)
854+
-
853855

854856
Period
855857
^^^^^^

pandas/io/excel/_base.py

+20-10
Original file line numberDiff line numberDiff line change
@@ -1014,16 +1014,21 @@ def close(self):
10141014
return content
10151015

10161016

1017-
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
1017+
XLS_SIGNATURES = (
1018+
b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2
1019+
b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3
1020+
b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4
1021+
b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary
1022+
)
10181023
ZIP_SIGNATURE = b"PK\x03\x04"
1019-
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))
1024+
PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,)))
10201025

10211026

10221027
@doc(storage_options=_shared_docs["storage_options"])
10231028
def inspect_excel_format(
10241029
content_or_path: FilePathOrBuffer,
10251030
storage_options: StorageOptions = None,
1026-
) -> str:
1031+
) -> str | None:
10271032
"""
10281033
Inspect the path or content of an excel file and get its format.
10291034
@@ -1037,8 +1042,8 @@ def inspect_excel_format(
10371042
10381043
Returns
10391044
-------
1040-
str
1041-
Format of file.
1045+
str or None
1046+
Format of file if it can be determined.
10421047
10431048
Raises
10441049
------
@@ -1063,10 +1068,10 @@ def inspect_excel_format(
10631068
peek = buf
10641069
stream.seek(0)
10651070

1066-
if peek.startswith(XLS_SIGNATURE):
1071+
if any(peek.startswith(sig) for sig in XLS_SIGNATURES):
10671072
return "xls"
10681073
elif not peek.startswith(ZIP_SIGNATURE):
1069-
raise ValueError("File is not a recognized excel file")
1074+
return None
10701075

10711076
# ZipFile typing is overly-strict
10721077
# https://github.com/python/typeshed/issues/4212
@@ -1174,8 +1179,12 @@ def __init__(
11741179
ext = inspect_excel_format(
11751180
content_or_path=path_or_buffer, storage_options=storage_options
11761181
)
1182+
if ext is None:
1183+
raise ValueError(
1184+
"Excel file format cannot be determined, you must specify "
1185+
"an engine manually."
1186+
)
11771187

1178-
# ext will always be valid, otherwise inspect_excel_format would raise
11791188
engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
11801189
if engine == "auto":
11811190
engine = get_default_engine(ext, mode="reader")
@@ -1190,12 +1199,13 @@ def __init__(
11901199
path_or_buffer, storage_options=storage_options
11911200
)
11921201

1193-
if ext != "xls" and xlrd_version >= Version("2"):
1202+
# Pass through if ext is None, otherwise check whether ext is valid for xlrd
1203+
if ext and ext != "xls" and xlrd_version >= Version("2"):
11941204
raise ValueError(
11951205
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
11961206
f"only the xls format is supported. Install openpyxl instead."
11971207
)
1198-
elif ext != "xls":
1208+
elif ext and ext != "xls":
11991209
caller = inspect.stack()[1]
12001210
if (
12011211
caller.filename.endswith(

pandas/tests/io/excel/test_readers.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -727,9 +727,20 @@ def test_missing_file_raises(self, read_ext):
727727

728728
def test_corrupt_bytes_raises(self, read_ext, engine):
729729
bad_stream = b"foo"
730-
if engine is None or engine == "xlrd":
730+
if engine is None:
731731
error = ValueError
732-
msg = "File is not a recognized excel file"
732+
msg = (
733+
"Excel file format cannot be determined, you must "
734+
"specify an engine manually."
735+
)
736+
elif engine == "xlrd":
737+
from xlrd import XLRDError
738+
739+
error = XLRDError
740+
msg = (
741+
"Unsupported format, or corrupt file: Expected BOF "
742+
"record; found b'foo'"
743+
)
733744
else:
734745
error = BadZipFile
735746
msg = "File is not a zip file"

pandas/tests/io/excel/test_xlrd.py

+18
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import io
2+
13
import pytest
24

35
from pandas.compat._optional import import_optional_dependency
@@ -8,6 +10,7 @@
810
from pandas.util.version import Version
911

1012
from pandas.io.excel import ExcelFile
13+
from pandas.io.excel._base import inspect_excel_format
1114

1215
xlrd = pytest.importorskip("xlrd")
1316
xlwt = pytest.importorskip("xlwt")
@@ -78,3 +81,18 @@ def test_read_excel_warning_with_xlsx_file(datapath):
7881
else:
7982
with tm.assert_produces_warning(None):
8083
pd.read_excel(path, "Sheet1", engine=None)
84+
85+
86+
@pytest.mark.parametrize(
87+
"file_header",
88+
[
89+
b"\x09\x00\x04\x00\x07\x00\x10\x00",
90+
b"\x09\x02\x06\x00\x00\x00\x10\x00",
91+
b"\x09\x04\x06\x00\x00\x00\x10\x00",
92+
b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1",
93+
],
94+
)
95+
def test_read_old_xls_files(file_header):
96+
# GH 41226
97+
f = io.BytesIO(file_header)
98+
assert inspect_excel_format(f) == "xls"

0 commit comments

Comments
 (0)