Skip to content

Backport PR #39586: REG: read_excel with engine specified raises on non-path/non-buffer #39652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Fixed regressions
- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
-

.. ---------------------------------------------------------------------------
Expand Down
31 changes: 21 additions & 10 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1062,14 +1062,16 @@ def __init__(

xlrd_version = LooseVersion(get_version(xlrd))

if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content=path_or_buffer, storage_options=storage_options
)

ext = None
if engine is None:
# Only determine ext if it is needed
if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content=path_or_buffer, storage_options=storage_options
)

if ext == "ods":
engine = "odf"
elif ext == "xls":
Expand All @@ -1086,13 +1088,22 @@ def __init__(
else:
engine = "xlrd"

if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
if xlrd_version >= "2":
if engine == "xlrd" and xlrd_version is not None:
if ext is None:
# Need ext to determine ext in order to raise/warn
if isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content=path_or_buffer, storage_options=storage_options
)

if ext != "xls" and xlrd_version >= "2":
raise ValueError(
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
f"only the xls format is supported. Install openpyxl instead."
)
else:
elif ext != "xls":
caller = inspect.stack()[1]
if (
caller.filename.endswith(
Expand Down
8 changes: 6 additions & 2 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,15 +531,19 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:

version = LooseVersion(get_version(openpyxl))

if version >= "3.0.0":
# There is no good way of determining if a sheet is read-only
# https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
is_readonly = hasattr(sheet, "reset_dimensions")

if version >= "3.0.0" and is_readonly:
sheet.reset_dimensions()

data: List[List[Scalar]] = []
for row_number, row in enumerate(sheet.rows):
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
data.append(converted_row)

if version >= "3.0.0" and len(data) > 0:
if version >= "3.0.0" and is_readonly and len(data) > 0:
# With dimension reset, openpyxl no longer pads rows
max_width = max(len(data_row) for data_row in data)
if min(len(data_row) for data_row in data) < max_width:
Expand Down
32 changes: 26 additions & 6 deletions pandas/tests/io/excel/test_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,17 @@ def test_to_excel_with_openpyxl_engine(ext):
styled.to_excel(filename, engine="openpyxl")


@pytest.mark.parametrize("read_only", [True, False])
def test_read_workbook(datapath, ext, read_only):
# GH 39528
filename = datapath("io", "data", "excel", "test1" + ext)
wb = openpyxl.load_workbook(filename, read_only=read_only)
result = pd.read_excel(wb, engine="openpyxl")
wb.close()
expected = pd.read_excel(filename)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"header, expected_data",
[
Expand All @@ -139,13 +150,22 @@ def test_to_excel_with_openpyxl_engine(ext):
@pytest.mark.parametrize(
"filename", ["dimension_missing", "dimension_small", "dimension_large"]
)
@pytest.mark.xfail(
LooseVersion(get_version(openpyxl)) < "3.0.0",
reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
)
def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_bad_dimension(
datapath, ext, header, expected_data, filename, read_only, request
):
# GH 38956, 39001 - no/incorrect dimension information
version = LooseVersion(get_version(openpyxl))
if (read_only or read_only is None) and version < "3.0.0":
msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
request.node.add_marker(pytest.mark.xfail(reason=msg))
path = datapath("io", "data", "excel", f"{filename}{ext}")
result = pd.read_excel(path, header=header)
if read_only is None:
result = pd.read_excel(path, header=header)
else:
wb = openpyxl.load_workbook(path, read_only=read_only)
result = pd.read_excel(wb, engine="openpyxl", header=header)
wb.close()
expected = DataFrame(expected_data)
tm.assert_frame_equal(result, expected)
9 changes: 8 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import partial
import os
from urllib.error import URLError
from zipfile import BadZipFile

import numpy as np
import pytest
Expand Down Expand Up @@ -642,7 +643,13 @@ def test_missing_file_raises(self, read_ext):

def test_corrupt_bytes_raises(self, read_ext, engine):
bad_stream = b"foo"
with pytest.raises(ValueError, match="File is not a recognized excel file"):
if engine is None or engine == "xlrd":
error = ValueError
msg = "File is not a recognized excel file"
else:
error = BadZipFile
msg = "File is not a zip file"
with pytest.raises(error, match=msg):
pd.read_excel(bad_stream)

@tm.network
Expand Down