diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index d20f347e54d6b..68bca2842df9b 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -565,7 +565,13 @@ def get_sheet_by_index(self, index: int):
         pass
 
     @abc.abstractmethod
-    def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None):
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        rows: int | None = None,
+        offset: int | None = None,
+    ):
         pass
 
     def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -740,7 +746,15 @@ def parse(
                 sheet = self.get_sheet_by_index(asheetname)
 
             file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
-            data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
+            # Only an integer ``skiprows`` denotes a contiguous block of leading
+            # rows, so only then may the reader itself start at an offset.
+            file_offset_needed = None
+            if header is None and is_integer(skiprows):
+                file_offset_needed = skiprows
+
+            data = self.get_sheet_data(
+                sheet, convert_float, file_rows_needed, file_offset_needed
+            )
             if hasattr(sheet, "close"):
                 # pyxlsb opens two TemporaryFiles
                 sheet.close()
@@ -817,6 +831,11 @@ def parse(
 
             # GH 12292 : error when read one empty column from excel file
             try:
+                if file_offset_needed is not None:
+                    # The reader already dropped the leading rows.
+                    # NOTE(review): this rebinding persists if parse() handles
+                    # several sheets in one call - confirm.
+                    skiprows = None
                 parser = TextParser(
                     data,
                     names=names,
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index 075590f3535fe..292093663715d 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -90,7 +90,11 @@ def get_sheet_by_name(self, name: str):
         raise ValueError(f"sheet {name} not found")
 
     def get_sheet_data(
-        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+        self,
+        sheet,
+        convert_float: bool,
+        file_rows_needed: int | None = None,
+        file_offset_needed: int | None = None,
     ) -> list[list[Scalar | NaTType]]:
         """
         Parse an ODF Table into a list of lists
@@ -111,7 +115,12 @@ def get_sheet_data(
 
         table: list[list[Scalar | NaTType]] = []
 
-        for sheet_row in sheet_rows:
+        # Slicing clamps to the list bounds, so short sheets need no special case.
+        # NOTE(review): assumes one list element per spreadsheet row - confirm.
+        first_row = file_offset_needed or 0
+        last_row = first_row + file_rows_needed + 1 if file_rows_needed else None
+
+        for sheet_row in sheet_rows[first_row:last_row]:
             sheet_cells = [
                 x
                 for x in sheet_row.childNodes
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index 87cc07d3fd21d..177651843d25a 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -595,15 +595,31 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
         return cell.value
 
     def get_sheet_data(
-        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+        self,
+        sheet,
+        convert_float: bool,
+        file_rows_needed: int | None = None,
+        file_offset_needed: int | None = None,
     ) -> list[list[Scalar]]:
 
         if self.book.read_only:
             sheet.reset_dimensions()
 
+        # openpyxl row indices are 1-based.
+        min_row = max_row = None
+        if file_offset_needed:
+            min_row = file_offset_needed + 1
+        if file_rows_needed is not None:
+            max_row = (file_offset_needed or 0) + file_rows_needed + 1
+
+        rows = sheet.rows
+        if min_row is not None or max_row is not None:
+            # Restrict reading to the requested window of rows.
+            rows = sheet.iter_rows(min_row=min_row, max_row=max_row)
+
         data: list[list[Scalar]] = []
         last_row_with_data = -1
-        for row_number, row in enumerate(sheet.rows):
+        for row_number, row in enumerate(rows):
             converted_row = [self._convert_cell(cell, convert_float) for cell in row]
             while converted_row and converted_row[-1] == "":
                 # trim trailing empty elements
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
index 5d40ccdf2f8f3..109c01f330188 100644
--- a/pandas/io/excel/_pyxlsb.py
+++ b/pandas/io/excel/_pyxlsb.py
@@ -84,21 +84,27 @@ def get_sheet_data(
         sheet,
         convert_float: bool,
         file_rows_needed: int | None = None,
+        file_offset_needed: int | None = None,
     ) -> list[list[Scalar]]:
         data: list[list[Scalar]] = []
-        prevous_row_number = -1
+        first_row = file_offset_needed or 0
+        # Missing (empty) rows are padded relative to the first requested row.
+        previous_row_number = first_row - 1
         # When sparse=True the rows can have different lengths and empty rows are
         # not returned. The cells are namedtuples of row, col, value (r, c, v).
         for row in sheet.rows(sparse=True):
             row_number = row[0].r
+            if row_number < first_row:
+                # Rows are sparse, so skip by row number rather than by position.
+                continue
             converted_row = [self._convert_cell(cell, convert_float) for cell in row]
             while converted_row and converted_row[-1] == "":
                 # trim trailing empty elements
                 converted_row.pop()
             if converted_row:
-                data.extend([[]] * (row_number - prevous_row_number - 1))
+                data.extend([[]] * (row_number - previous_row_number - 1))
                 data.append(converted_row)
-                prevous_row_number = row_number
+                previous_row_number = row_number
             if file_rows_needed is not None and len(data) >= file_rows_needed:
                 break
         if data:
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
index 0bf3ac6134cf6..07fd5ce0e2e4b 100644
--- a/pandas/io/excel/_xlrd.py
+++ b/pandas/io/excel/_xlrd.py
@@ -62,7 +62,11 @@ def get_sheet_by_index(self, index):
         return self.book.sheet_by_index(index)
 
     def get_sheet_data(
-        self, sheet, convert_float: bool, file_rows_needed: int | None = None
+        self,
+        sheet,
+        convert_float: bool,
+        file_rows_needed: int | None = None,
+        file_offset_needed: int | None = None,
     ) -> list[list[Scalar]]:
         from xlrd import (
             XL_CELL_BOOLEAN,
@@ -115,9 +119,11 @@ def _parse_cell(cell_contents, cell_typ):
         data = []
 
         nrows = sheet.nrows
+        first_row = file_offset_needed or 0
         if file_rows_needed is not None:
-            nrows = min(nrows, file_rows_needed)
-        for i in range(nrows):
+            # Clamp to the sheet so the loop never indexes past its last row.
+            nrows = min(nrows, first_row + file_rows_needed)
+        for i in range(first_row, nrows):
             row = [
                 _parse_cell(value, typ)
                 for value, typ in zip(sheet.row_values(i), sheet.row_types(i))