diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 46023e2ce08f5..88bceb7a11cb9 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -35,6 +35,7 @@ Bug fixes - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) - Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`) +- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 205d248bf6768..be1587dbc010c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -544,10 +544,16 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: sheet.reset_dimensions() data: List[List[Scalar]] = [] + last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): converted_row = [self._convert_cell(cell, convert_float) for cell in row] + if not all(cell == "" for cell in converted_row): + last_row_with_data = row_number data.append(converted_row) + # Trim trailing empty rows + data = data[: last_row_with_data + 1] + if version >= "3.0.0" and is_readonly and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) diff --git a/pandas/tests/io/data/excel/empty_trailing_rows.xlsx b/pandas/tests/io/data/excel/empty_trailing_rows.xlsx new file mode 100644 index 0000000000000..920b03915a3c8 Binary files /dev/null and b/pandas/tests/io/data/excel/empty_trailing_rows.xlsx differ diff --git a/pandas/tests/io/data/excel/empty_with_blank_row.xlsx b/pandas/tests/io/data/excel/empty_with_blank_row.xlsx new file mode 100644 index 0000000000000..fe3bcfcc269d7 Binary files /dev/null and b/pandas/tests/io/data/excel/empty_with_blank_row.xlsx differ diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 04a484c3edc0d..0962b719efd4d 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -189,3 +189,43 @@ def test_append_mode_file(ext): second = data.find(b"docProps/app.xml", first + 1) third = data.find(b"docProps/app.xml", second + 1) assert second != -1 and third == -1 + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_empty_trailing_rows(datapath, ext, read_only, request): + # GH 39181 + version = LooseVersion(get_version(openpyxl)) + if (read_only or read_only is None) and version < "3.0.0": + msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = DataFrame( + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + } + ) + tm.assert_frame_equal(result, expected) + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_empty_with_blank_row(datapath, ext, read_only): + # GH 39547 - empty excel file with a row that has no data + path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = DataFrame() + tm.assert_frame_equal(result, expected)