Skip to content

Commit 3ebbc77

Browse files
Backport PR #39586: REG: read_excel with engine specified raises on non-path/non-buffer (#39652)
Co-authored-by: Richard Shadrach <[email protected]>
1 parent b206877 commit 3ebbc77

File tree

5 files changed

+62
-19
lines changed

5 files changed

+62
-19
lines changed

doc/source/whatsnew/v1.2.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Fixed regressions
2323
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
2424
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
2525
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
26+
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was neither a path nor a buffer and the ``engine`` argument was specified (:issue:`39528`)
2627
-
2728

2829
.. ---------------------------------------------------------------------------

pandas/io/excel/_base.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -1062,14 +1062,16 @@ def __init__(
10621062

10631063
xlrd_version = LooseVersion(get_version(xlrd))
10641064

1065-
if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
1066-
ext = "xls"
1067-
else:
1068-
ext = inspect_excel_format(
1069-
content=path_or_buffer, storage_options=storage_options
1070-
)
1071-
1065+
ext = None
10721066
if engine is None:
1067+
# Only determine ext if it is needed
1068+
if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
1069+
ext = "xls"
1070+
else:
1071+
ext = inspect_excel_format(
1072+
content=path_or_buffer, storage_options=storage_options
1073+
)
1074+
10731075
if ext == "ods":
10741076
engine = "odf"
10751077
elif ext == "xls":
@@ -1086,13 +1088,22 @@ def __init__(
10861088
else:
10871089
engine = "xlrd"
10881090

1089-
if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
1090-
if xlrd_version >= "2":
1091+
if engine == "xlrd" and xlrd_version is not None:
1092+
if ext is None:
1093+
# Need ext to determine ext in order to raise/warn
1094+
if isinstance(path_or_buffer, xlrd.Book):
1095+
ext = "xls"
1096+
else:
1097+
ext = inspect_excel_format(
1098+
content=path_or_buffer, storage_options=storage_options
1099+
)
1100+
1101+
if ext != "xls" and xlrd_version >= "2":
10911102
raise ValueError(
10921103
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
10931104
f"only the xls format is supported. Install openpyxl instead."
10941105
)
1095-
else:
1106+
elif ext != "xls":
10961107
caller = inspect.stack()[1]
10971108
if (
10981109
caller.filename.endswith(

pandas/io/excel/_openpyxl.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -531,15 +531,19 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
531531

532532
version = LooseVersion(get_version(openpyxl))
533533

534-
if version >= "3.0.0":
534+
# There is no good way of determining if a sheet is read-only
535+
# https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
536+
is_readonly = hasattr(sheet, "reset_dimensions")
537+
538+
if version >= "3.0.0" and is_readonly:
535539
sheet.reset_dimensions()
536540

537541
data: List[List[Scalar]] = []
538542
for row_number, row in enumerate(sheet.rows):
539543
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
540544
data.append(converted_row)
541545

542-
if version >= "3.0.0" and len(data) > 0:
546+
if version >= "3.0.0" and is_readonly and len(data) > 0:
543547
# With dimension reset, openpyxl no longer pads rows
544548
max_width = max(len(data_row) for data_row in data)
545549
if min(len(data_row) for data_row in data) < max_width:

pandas/tests/io/excel/test_openpyxl.py

+26-6
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,17 @@ def test_to_excel_with_openpyxl_engine(ext):
122122
styled.to_excel(filename, engine="openpyxl")
123123

124124

125+
@pytest.mark.parametrize("read_only", [True, False])
126+
def test_read_workbook(datapath, ext, read_only):
127+
# GH 39528
128+
filename = datapath("io", "data", "excel", "test1" + ext)
129+
wb = openpyxl.load_workbook(filename, read_only=read_only)
130+
result = pd.read_excel(wb, engine="openpyxl")
131+
wb.close()
132+
expected = pd.read_excel(filename)
133+
tm.assert_frame_equal(result, expected)
134+
135+
125136
@pytest.mark.parametrize(
126137
"header, expected_data",
127138
[
@@ -139,13 +150,22 @@ def test_to_excel_with_openpyxl_engine(ext):
139150
@pytest.mark.parametrize(
140151
"filename", ["dimension_missing", "dimension_small", "dimension_large"]
141152
)
142-
@pytest.mark.xfail(
143-
LooseVersion(get_version(openpyxl)) < "3.0.0",
144-
reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
145-
)
146-
def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
153+
# When read_only is None, use read_excel instead of a workbook
154+
@pytest.mark.parametrize("read_only", [True, False, None])
155+
def test_read_with_bad_dimension(
156+
datapath, ext, header, expected_data, filename, read_only, request
157+
):
147158
# GH 38956, 39001 - no/incorrect dimension information
159+
version = LooseVersion(get_version(openpyxl))
160+
if (read_only or read_only is None) and version < "3.0.0":
161+
msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
162+
request.node.add_marker(pytest.mark.xfail(reason=msg))
148163
path = datapath("io", "data", "excel", f"{filename}{ext}")
149-
result = pd.read_excel(path, header=header)
164+
if read_only is None:
165+
result = pd.read_excel(path, header=header)
166+
else:
167+
wb = openpyxl.load_workbook(path, read_only=read_only)
168+
result = pd.read_excel(wb, engine="openpyxl", header=header)
169+
wb.close()
150170
expected = DataFrame(expected_data)
151171
tm.assert_frame_equal(result, expected)

pandas/tests/io/excel/test_readers.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from functools import partial
33
import os
44
from urllib.error import URLError
5+
from zipfile import BadZipFile
56

67
import numpy as np
78
import pytest
@@ -642,7 +643,13 @@ def test_missing_file_raises(self, read_ext):
642643

643644
def test_corrupt_bytes_raises(self, read_ext, engine):
644645
bad_stream = b"foo"
645-
with pytest.raises(ValueError, match="File is not a recognized excel file"):
646+
if engine is None or engine == "xlrd":
647+
error = ValueError
648+
msg = "File is not a recognized excel file"
649+
else:
650+
error = BadZipFile
651+
msg = "File is not a zip file"
652+
with pytest.raises(error, match=msg):
646653
pd.read_excel(bad_stream)
647654

648655
@tm.network

0 commit comments

Comments
 (0)