diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 74710ca48308c..1bad9401ba312 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -795,6 +795,7 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) +- Bug in :func:`read_excel` loading trailing empty rows/columns for some filetypes (:issue:`41167`) - Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a99f8e2625602..2071076d04a24 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -571,15 +571,18 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): converted_row = [self._convert_cell(cell, convert_float) for cell in row] - if not all(cell == "" for cell in converted_row): + while converted_row and converted_row[-1] == "": + # trim trailing empty elements + converted_row.pop() + if converted_row: last_row_with_data = row_number data.append(converted_row) # Trim trailing empty rows data = data[: last_row_with_data + 1] - if self.book.read_only and len(data) > 0: - # With dimension reset, openpyxl no longer 
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
    """
    Read a worksheet into a rectangular list of converted rows.

    Trailing empty cells of every row and trailing empty rows of the sheet
    are dropped. The rows that remain are padded with ``""`` so that each
    one is as wide as the widest row.

    Parameters
    ----------
    sheet
        Worksheet object; it is iterated via ``sheet.rows(sparse=True)``.
    convert_float : bool
        Passed through unchanged to ``self._convert_cell`` for every cell.

    Returns
    -------
    list of list of Scalar
        One inner list per sheet row, all of equal length. Empty when no
        cell converts to something other than ``""``.
    """
    data: list[list[Scalar]] = []
    previous_row_number = -1
    # When sparse=True the rows can have different lengths and empty rows are
    # not returned. The cells are namedtuples of row, col, value (r, c, v).
    for row in sheet.rows(sparse=True):
        if not row:
            # No cell to take the row number from; the gap this leaves is
            # filled in when the next non-empty row is appended.
            continue
        row_number = row[0].r
        converted_row = [self._convert_cell(cell, convert_float) for cell in row]
        while converted_row and converted_row[-1] == "":
            # trim trailing empty elements
            converted_row.pop()
        if converted_row:
            # Re-insert the rows skipped since the last kept row, each as
            # its own list so that no two rows share the same object.
            data.extend([] for _ in range(row_number - previous_row_number - 1))
            data.append(converted_row)
            previous_row_number = row_number
    if data:
        # extend rows to max_width
        max_width = max(len(data_row) for data_row in data)
        if min(len(data_row) for data_row in data) < max_width:
            empty_cell: list[Scalar] = [""]
            data = [
                data_row + (max_width - len(data_row)) * empty_cell
                for data_row in data
            ]
    return data
def test_trailing_blanks(self, read_ext):
    """
    Blank cells that lie beyond the real data must not be loaded.

    Some readers used to return those cells, which produced many empty
    rows and columns in the resulting frame.
    """
    shape = pd.read_excel("trailing_blanks" + read_ext).shape
    assert shape == (3, 3)