Skip to content

Commit a354a5c

Browse files
Backport PR #39547: BUG: read_excel with openpyxl produces trailing rows of nan (#39679)
Co-authored-by: Richard Shadrach <[email protected]>
1 parent 1b8a4eb commit a354a5c

File tree

5 files changed

+47
-0
lines changed

5 files changed

+47
-0
lines changed

doc/source/whatsnew/v1.2.2.rst

+1
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ Bug fixes
3737

3838
- :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`)
3939
- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`)
40+
- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`)
4041

4142
.. ---------------------------------------------------------------------------
4243

pandas/io/excel/_openpyxl.py

+6
Original file line number | Diff line number | Diff line change
@@ -544,10 +544,16 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
544544
sheet.reset_dimensions()
545545

546546
data: List[List[Scalar]] = []
547+
last_row_with_data = -1
547548
for row_number, row in enumerate(sheet.rows):
548549
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
550+
if not all(cell == "" for cell in converted_row):
551+
last_row_with_data = row_number
549552
data.append(converted_row)
550553

554+
# Trim trailing empty rows
555+
data = data[: last_row_with_data + 1]
556+
551557
if version >= "3.0.0" and is_readonly and len(data) > 0:
552558
# With dimension reset, openpyxl no longer pads rows
553559
max_width = max(len(data_row) for data_row in data)
Binary file not shown.
Binary file not shown.

pandas/tests/io/excel/test_openpyxl.py

+40
Original file line number | Diff line number | Diff line change
@@ -189,3 +189,43 @@ def test_append_mode_file(ext):
189189
second = data.find(b"docProps/app.xml", first + 1)
190190
third = data.find(b"docProps/app.xml", second + 1)
191191
assert second != -1 and third == -1
192+
193+
194+
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_empty_trailing_rows(datapath, ext, read_only, request):
    # GH 39181
    # Reads the ``empty_trailing_rows`` fixture either straight from its path
    # (read_only is None) or through a pre-opened openpyxl workbook, and checks
    # the resulting frame against the expected five-row frame below.
    version = LooseVersion(get_version(openpyxl))
    if (read_only or read_only is None) and version < "3.0.0":
        # Mark as an expected failure on older openpyxl for the read-only
        # and path-based cases; the reason string explains why.
        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
    if read_only is None:
        result = pd.read_excel(path)
    else:
        wb = openpyxl.load_workbook(path, read_only=read_only)
        try:
            result = pd.read_excel(wb, engine="openpyxl")
        finally:
            # Close the workbook even when read_excel raises (e.g. in the
            # xfail case above) so the handle is not left open.
            wb.close()
    expected = DataFrame(
        {
            "Title": [np.nan, "A", 1, 2, 3],
            "Unnamed: 1": [np.nan, "B", 4, 5, 6],
            "Unnamed: 2": [np.nan, "C", 7, 8, 9],
        }
    )
    tm.assert_frame_equal(result, expected)
217+
218+
219+
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_empty_with_blank_row(datapath, ext, read_only):
    # GH 39547 - empty excel file with a row that has no data
    # Reads the ``empty_with_blank_row`` fixture either straight from its path
    # (read_only is None) or through a pre-opened openpyxl workbook, and
    # expects an empty DataFrame in every case.
    path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}")
    if read_only is None:
        result = pd.read_excel(path)
    else:
        wb = openpyxl.load_workbook(path, read_only=read_only)
        try:
            result = pd.read_excel(wb, engine="openpyxl")
        finally:
            # Close the workbook even when read_excel raises so the handle
            # is not left open.
            wb.close()
    expected = DataFrame()
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)