pandas-dev · jreback · Feb 5, 2021 · Jan 30, 2021 · Jan 30, 2021 · Jan 31, 2021
diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
@@ -31,6 +31,7 @@ Bug fixes
 ~~~~~~~~~
 
 - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`)
+- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the ``openpyxl`` engine is used and the Excel file has missing or incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, and earlier versions may still fail (:issue:`38956`, :issue:`39001`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -17,7 +17,7 @@
     "matplotlib": "2.2.3",
     "numexpr": "2.6.8",
     "odfpy": "1.3.0",
-    "openpyxl": "2.5.7",
+    "openpyxl": "2.6.0",
     "pandas_gbq": "0.12.0",
     "pyarrow": "0.15.0",
     "pytest": "5.0.1",

diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
+from distutils.version import LooseVersion
 from typing import TYPE_CHECKING, Dict, List, Optional
 
 import numpy as np
 
 from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
-from pandas.compat._optional import import_optional_dependency
+from pandas.compat._optional import get_version, import_optional_dependency
 
 from pandas.io.excel._base import BaseExcelReader, ExcelWriter
 from pandas.io.excel._util import validate_freeze_panes
@@ -505,14 +506,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC
 
-        if cell.is_date:
+        if cell.value is None:
+            return ""  # compat with xlrd
+        elif cell.is_date:
             return cell.value
         elif cell.data_type == TYPE_ERROR:
             return np.nan
         elif cell.data_type == TYPE_BOOL:
             return bool(cell.value)
-        elif cell.value is None:
-            return ""  # compat with xlrd
         elif cell.data_type == TYPE_NUMERIC:
             # GH5394
             if convert_float:
@@ -525,8 +526,29 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
         return cell.value
 
     def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+        # GH 39001
+        # Reading an Excel file relies on its dimension data being correct, but
+        # some writers omit that data or write it incorrectly
+        import openpyxl
+
+        version = LooseVersion(get_version(openpyxl))
+
+        if version >= "3.0.0":
+            sheet.reset_dimensions()
+
         data: List[List[Scalar]] = []
-        for row in sheet.rows:
-            data.append([self._convert_cell(cell, convert_float) for cell in row])
+        for row_number, row in enumerate(sheet.rows):
+            converted_row = [self._convert_cell(cell, convert_float) for cell in row]
+            data.append(converted_row)
+
+        if version >= "3.0.0" and len(data) > 0:
+            # With dimension reset, openpyxl no longer pads rows
+            max_width = max(len(data_row) for data_row in data)
+            if min(len(data_row) for data_row in data) < max_width:
+                empty_cell: List[Scalar] = [""]
+                data = [
+                    data_row + (max_width - len(data_row)) * empty_cell
+                    for data_row in data
+                ]
 
         return data
diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx
diff --git a/pandas/tests/io/data/excel/dimension_missing.xlsx b/pandas/tests/io/data/excel/dimension_missing.xlsx
diff --git a/pandas/tests/io/data/excel/dimension_small.xlsx b/pandas/tests/io/data/excel/dimension_small.xlsx
diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
@@ -1,6 +1,10 @@
+from distutils.version import LooseVersion
+
 import numpy as np
 import pytest
 
+from pandas.compat._optional import get_version
+
 import pandas as pd
 from pandas import DataFrame
 import pandas._testing as tm
@@ -116,3 +120,32 @@ def test_to_excel_with_openpyxl_engine(ext):
         ).highlight_max()
 
         styled.to_excel(filename, engine="openpyxl")
+
+
+@pytest.mark.parametrize(
+    "header, expected_data",
+    [
+        (
+            0,
+            {
+                "Title": [np.nan, "A", 1, 2, 3],
+                "Unnamed: 1": [np.nan, "B", 4, 5, 6],
+                "Unnamed: 2": [np.nan, "C", 7, 8, 9],
+            },
+        ),
+        (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}),
+    ],
+)
+@pytest.mark.parametrize(
+    "filename", ["dimension_missing", "dimension_small", "dimension_large"]
+)
+@pytest.mark.xfail(
+    LooseVersion(get_version(openpyxl)) < "3.0.0",
+    reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
+)
+def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
+    # GH 38956, 39001 - no/incorrect dimension information
+    path = datapath("io", "data", "excel", f"{filename}{ext}")
+    result = pd.read_excel(path, header=header)
+    expected = DataFrame(expected_data)
+    tm.assert_frame_equal(result, expected)