diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 0ee1abaa2a0eb..cc5653fe2f360 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -31,6 +31,7 @@ Bug fixes ~~~~~~~~~ - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`) - .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 4ed9df2c97fdb..2bde42357b96c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,7 +15,7 @@ "matplotlib": "2.2.3", "numexpr": "2.6.8", "odfpy": "1.3.0", - "openpyxl": "2.5.7", + "openpyxl": "2.6.0", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", "pytest": "5.0.1", diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 583baf3b239d8..4f02aff2eb992 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,9 +1,10 @@ +from distutils.version import LooseVersion from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import get_version, import_optional_dependency from pandas.io.excel._base import BaseExcelReader, ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -503,14 +504,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC - if cell.is_date: + if cell.value is None: + return "" # compat with xlrd + elif cell.is_date: return cell.value elif cell.data_type == TYPE_ERROR: return np.nan elif cell.data_type == TYPE_BOOL: return bool(cell.value) - elif cell.value is None: - return "" # compat with xlrd elif cell.data_type == TYPE_NUMERIC: # GH5394 if convert_float: @@ -523,8 +524,29 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + # GH 39001 + # Reading of excel file depends on dimension data being correct but + # writers sometimes omit or get it wrong + import openpyxl + + version = LooseVersion(get_version(openpyxl)) + + if version >= "3.0.0": + sheet.reset_dimensions() + data: List[List[Scalar]] = [] - for row in sheet.rows: - data.append([self._convert_cell(cell, convert_float) for cell in row]) + for row_number, row in enumerate(sheet.rows): + converted_row = [self._convert_cell(cell, convert_float) for cell in row] + data.append(converted_row) + + if version >= "3.0.0" and len(data) > 0: + # With dimension reset, openpyxl no longer pads rows + max_width = max(len(data_row) for data_row in data) + if min(len(data_row) for data_row in data) < max_width: + empty_cell: List[Scalar] = [""] + data = [ + data_row + (max_width - len(data_row)) * empty_cell + for data_row in data + ] return data diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx new file mode 100644 index 0000000000000..d57abdf2fbbae Binary files /dev/null and b/pandas/tests/io/data/excel/dimension_large.xlsx differ diff --git a/pandas/tests/io/data/excel/dimension_missing.xlsx b/pandas/tests/io/data/excel/dimension_missing.xlsx new file mode 100644 index 0000000000000..9274896689a72 Binary files /dev/null and b/pandas/tests/io/data/excel/dimension_missing.xlsx differ diff --git a/pandas/tests/io/data/excel/dimension_small.xlsx b/pandas/tests/io/data/excel/dimension_small.xlsx new file mode 100644 index 0000000000000..78ce4723ebef4 Binary files /dev/null and b/pandas/tests/io/data/excel/dimension_small.xlsx differ diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3155e22d3ff5d..640501baffc62 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -1,6 +1,10 @@ +from distutils.version import LooseVersion + import numpy as np import pytest +from pandas.compat._optional import get_version + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -116,3 +120,32 @@ def test_to_excel_with_openpyxl_engine(ext): ).highlight_max() styled.to_excel(filename, engine="openpyxl") + + +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +@pytest.mark.xfail( + LooseVersion(get_version(openpyxl)) < "3.0.0", + reason="openpyxl read-only sheet is incorrect when dimension data is wrong", +) +def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): + # GH 38956, 39001 - no/incorrect dimension information + path = datapath("io", "data", "excel", f"{filename}{ext}") + result = pd.read_excel(path, header=header) + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected)