Skip to content

Commit 69d09e9

Browse files
Backport PR pandas-dev#39486: BUG: read_excel with openpyxl and missing dimension
1 parent ee8c1ff commit 69d09e9

File tree

7 files changed

+63
-7
lines changed

7 files changed

+63
-7
lines changed

doc/source/whatsnew/v1.2.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Bug fixes
3131
~~~~~~~~~
3232

3333
- :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`)
34+
- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the Excel file has missing or incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, and prior versions may still fail (:issue:`38956`, :issue:`39001`)
3435
-
3536

3637
.. ---------------------------------------------------------------------------

pandas/compat/_optional.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"matplotlib": "2.2.3",
1616
"numexpr": "2.6.8",
1717
"odfpy": "1.3.0",
18-
"openpyxl": "2.5.7",
18+
"openpyxl": "2.6.0",
1919
"pandas_gbq": "0.12.0",
2020
"pyarrow": "0.15.0",
2121
"pytest": "5.0.1",

pandas/io/excel/_openpyxl.py

+28-6
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
from distutils.version import LooseVersion
12
from typing import TYPE_CHECKING, Dict, List, Optional
23

34
import numpy as np
45

56
from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
6-
from pandas.compat._optional import import_optional_dependency
7+
from pandas.compat._optional import get_version, import_optional_dependency
78

89
from pandas.io.excel._base import BaseExcelReader, ExcelWriter
910
from pandas.io.excel._util import validate_freeze_panes
@@ -503,14 +504,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
503504

504505
from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC
505506

506-
if cell.is_date:
507+
if cell.value is None:
508+
return "" # compat with xlrd
509+
elif cell.is_date:
507510
return cell.value
508511
elif cell.data_type == TYPE_ERROR:
509512
return np.nan
510513
elif cell.data_type == TYPE_BOOL:
511514
return bool(cell.value)
512-
elif cell.value is None:
513-
return "" # compat with xlrd
514515
elif cell.data_type == TYPE_NUMERIC:
515516
# GH5394
516517
if convert_float:
@@ -523,8 +524,29 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
523524
return cell.value
524525

525526
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
527+
# GH 39001
528+
# Reading of excel file depends on dimension data being correct but
529+
# writers sometimes omit or get it wrong
530+
import openpyxl
531+
532+
version = LooseVersion(get_version(openpyxl))
533+
534+
if version >= "3.0.0":
535+
sheet.reset_dimensions()
536+
526537
data: List[List[Scalar]] = []
527-
for row in sheet.rows:
528-
data.append([self._convert_cell(cell, convert_float) for cell in row])
538+
for row_number, row in enumerate(sheet.rows):
539+
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
540+
data.append(converted_row)
541+
542+
if version >= "3.0.0" and len(data) > 0:
543+
# With dimension reset, openpyxl no longer pads rows
544+
max_width = max(len(data_row) for data_row in data)
545+
if min(len(data_row) for data_row in data) < max_width:
546+
empty_cell: List[Scalar] = [""]
547+
data = [
548+
data_row + (max_width - len(data_row)) * empty_cell
549+
for data_row in data
550+
]
529551

530552
return data
4.78 KB
Binary file not shown.
Binary file not shown.
4.78 KB
Binary file not shown.

pandas/tests/io/excel/test_openpyxl.py

+33
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
1+
from distutils.version import LooseVersion
2+
13
import numpy as np
24
import pytest
35

6+
from pandas.compat._optional import get_version
7+
48
import pandas as pd
59
from pandas import DataFrame
610
import pandas._testing as tm
@@ -116,3 +120,32 @@ def test_to_excel_with_openpyxl_engine(ext):
116120
).highlight_max()
117121

118122
styled.to_excel(filename, engine="openpyxl")
123+
124+
125+
@pytest.mark.parametrize(
126+
"header, expected_data",
127+
[
128+
(
129+
0,
130+
{
131+
"Title": [np.nan, "A", 1, 2, 3],
132+
"Unnamed: 1": [np.nan, "B", 4, 5, 6],
133+
"Unnamed: 2": [np.nan, "C", 7, 8, 9],
134+
},
135+
),
136+
(2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}),
137+
],
138+
)
139+
@pytest.mark.parametrize(
140+
"filename", ["dimension_missing", "dimension_small", "dimension_large"]
141+
)
142+
@pytest.mark.xfail(
143+
LooseVersion(get_version(openpyxl)) < "3.0.0",
144+
reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
145+
)
146+
def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
147+
# GH 38956, 39001 - no/incorrect dimension information
148+
path = datapath("io", "data", "excel", f"{filename}{ext}")
149+
result = pd.read_excel(path, header=header)
150+
expected = DataFrame(expected_data)
151+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)