Skip to content

Commit ae5fe34

Browse files
authored
BUG: read_excel trailing blank rows and columns (#41227)
1 parent e4ee3d3 commit ae5fe34

File tree

9 files changed

+40
-7
lines changed

9 files changed

+40
-7
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,7 @@ I/O
829829
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
830830
- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
831831
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
832+
- Bug in :func:`read_excel` loading trailing empty rows/columns for some filetypes (:issue:`41167`)
832833
- Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`)
833834
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
834835
- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)

pandas/io/excel/_openpyxl.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -571,15 +571,18 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
571571
last_row_with_data = -1
572572
for row_number, row in enumerate(sheet.rows):
573573
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
574-
if not all(cell == "" for cell in converted_row):
574+
while converted_row and converted_row[-1] == "":
575+
# trim trailing empty elements
576+
converted_row.pop()
577+
if converted_row:
575578
last_row_with_data = row_number
576579
data.append(converted_row)
577580

578581
# Trim trailing empty rows
579582
data = data[: last_row_with_data + 1]
580583

581-
if self.book.read_only and len(data) > 0:
582-
# With dimension reset, openpyxl no longer pads rows
584+
if len(data) > 0:
585+
# extend rows to max width
583586
max_width = max(len(data_row) for data_row in data)
584587
if min(len(data_row) for data_row in data) < max_width:
585588
empty_cell: list[Scalar] = [""]

pandas/io/excel/_pyxlsb.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,27 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
7575
return cell.v
7676

7777
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
    """
    Read a worksheet into a rectangular list of rows.

    Trailing empty cells are trimmed from every row, rows that contain no
    data after trimming are dropped when they trail the last data row, and
    rows skipped between two data rows are restored as blank rows. All
    rows are finally padded with ``""`` to the width of the widest row.

    Parameters
    ----------
    sheet
        Worksheet object; only ``sheet.rows(sparse=True)`` is called on it.
    convert_float : bool
        Forwarded unchanged to ``self._convert_cell`` for every cell.

    Returns
    -------
    list of list of Scalar
        One list per row, all of equal length; empty when the sheet holds
        no non-empty cell.
    """
    data: list[list[Scalar]] = []
    previous_row_number = -1
    # When sparse=True the rows can have different lengths and empty rows are
    # not returned. The cells are namedtuples of row, col, value (r, c, v).
    for row in sheet.rows(sparse=True):
        if not row:
            # No cell to read the row number from. Skipping is safe: the gap
            # is restored as blank rows when the next data row is seen.
            continue
        row_number = row[0].r
        converted_row = [self._convert_cell(cell, convert_float) for cell in row]
        while converted_row and converted_row[-1] == "":
            # trim trailing empty elements
            converted_row.pop()
        if converted_row:
            # Re-insert the rows skipped since the last data row. Each gets
            # its own list object so no two rows alias each other.
            data.extend([] for _ in range(row_number - previous_row_number - 1))
            data.append(converted_row)
            previous_row_number = row_number
    if data:
        # extend rows to max_width
        max_width = max(len(data_row) for data_row in data)
        if min(len(data_row) for data_row in data) < max_width:
            empty_cell: list[Scalar] = [""]
            data = [
                data_row + (max_width - len(data_row)) * empty_cell
                for data_row in data
            ]
    return data
2.99 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
8.62 KB
Binary file not shown.
Binary file not shown.

pandas/tests/io/excel/test_readers.py

+9
Original file line numberDiff line numberDiff line change
@@ -1205,6 +1205,15 @@ def test_multiheader_two_blank_lines(self, read_ext):
12051205
)
12061206
tm.assert_frame_equal(result, expected)
12071207

1208+
def test_trailing_blanks(self, read_ext):
1209+
"""
1210+
Sheets can contain blank cells with no data. Some of our readers
1211+
were including those cells, creating many empty rows and columns.
1212+
"""
1213+
file_name = "trailing_blanks" + read_ext
1214+
result = pd.read_excel(file_name)
1215+
assert result.shape == (3, 3)
1216+
12081217

12091218
class TestExcelFileRead:
12101219
@pytest.fixture(autouse=True)

0 commit comments

Comments
 (0)