PERF: pandas read_excel perf optimisations #47376

Closed · wants to merge 1 commit
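The change targets reads that skip a fixed block of leading rows and/or cap the number of rows, with no header row to locate. Roughly the call shape this optimisation is aimed at; the file name, sheet name and row counts below are placeholders, not taken from this PR:

import pandas as pd

# header=None plus an integer skiprows is the pattern the new fast path keys on:
# the offset and row limit can be handed down to the engine so it can avoid
# converting the skipped rows at all.
df = pd.read_excel(
    "large_report.xlsx",   # placeholder path
    sheet_name="data",     # placeholder sheet name
    header=None,
    skiprows=10_000,       # rows the engine may now skip while reading
    nrows=500,             # cap on rows actually materialised
)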
20 changes: 18 additions & 2 deletions pandas/io/excel/_base.py
@@ -565,7 +565,13 @@ def get_sheet_by_index(self, index: int):
pass

@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None):
def get_sheet_data(
self,
sheet,
convert_float: bool,
rows: int | None = None,
offset: int | None = None,
):
pass

def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -740,7 +746,16 @@ def parse(
sheet = self.get_sheet_by_index(asheetname)

file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
file_offset_needed = None
if header is None:
if is_integer(skiprows):
file_offset_needed = skiprows
elif is_list_like(skiprows):
file_offset_needed = skiprows[0]

data = self.get_sheet_data(
sheet, convert_float, file_rows_needed, file_offset_needed
)
if hasattr(sheet, "close"):
# pyxlsb opens two TemporaryFiles
sheet.close()
@@ -817,6 +832,7 @@ def parse(

# GH 12292 : error when read one empty column from excel file
try:
skiprows = None if is_integer(skiprows) and header is None else skiprows
parser = TextParser(
data,
names=names,
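For reference, the offset derivation added to parse() above, restated as a standalone sketch. Plain isinstance checks stand in for pandas' is_integer/is_list_like, and the helper name is invented for this illustration:

from typing import Optional


def calc_file_offset(header, skiprows) -> Optional[int]:
    # Only applicable when there is no header row to locate: with header=None,
    # an integer skiprows (or the first entry of a list-like skiprows, taken as
    # the start of a leading block) maps directly onto a raw row offset that
    # the engine can skip while reading.
    if header is not None:
        return None
    if isinstance(skiprows, int) and not isinstance(skiprows, bool):
        return skiprows
    if isinstance(skiprows, (list, tuple, range)) and len(skiprows):
        return skiprows[0]
    return None


assert calc_file_offset(header=None, skiprows=10) == 10
assert calc_file_offset(header=None, skiprows=[3, 4, 5]) == 3
assert calc_file_offset(header=0, skiprows=10) is None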
18 changes: 16 additions & 2 deletions pandas/io/excel/_odfreader.py
@@ -90,7 +90,11 @@ def get_sheet_by_name(self, name: str):
raise ValueError(f"sheet {name} not found")

def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
self,
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
file_offset_needed: int | None = None,
) -> list[list[Scalar | NaTType]]:
"""
Parse an ODF Table into a list of lists
@@ -111,7 +115,17 @@ def get_sheet_data(

table: list[list[Scalar | NaTType]] = []

for sheet_row in sheet_rows:
loop_on = sheet_rows
if file_rows_needed:
loop_on = sheet_rows[: file_rows_needed + 1]
if file_offset_needed:
loop_on = sheet_rows[
file_offset_needed : file_offset_needed + file_rows_needed + 1
]
elif file_offset_needed:
loop_on = sheet_rows[file_offset_needed:]

for sheet_row in loop_on:
sheet_cells = [
x
for x in sheet_row.childNodes
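The same window-slicing pattern, pulled out into a throwaway helper so the branching is easier to follow. select_rows and the sample list are invented for this sketch and only apply to row sources that support slicing:

def select_rows(sheet_rows, file_rows_needed=None, file_offset_needed=None):
    # Mirrors the branches above: keep everything when no limits are given,
    # otherwise slice a window starting at the offset and keeping
    # file_rows_needed + 1 rows, matching the slice bounds used in the reader.
    if file_rows_needed:
        start = file_offset_needed or 0
        return sheet_rows[start : start + file_rows_needed + 1]
    if file_offset_needed:
        return sheet_rows[file_offset_needed:]
    return sheet_rows


rows = list(range(10))
assert select_rows(rows) == rows
assert select_rows(rows, file_rows_needed=3) == [0, 1, 2, 3]
assert select_rows(rows, file_rows_needed=3, file_offset_needed=4) == [4, 5, 6, 7]
assert select_rows(rows, file_offset_needed=7) == [7, 8, 9]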
77 changes: 52 additions & 25 deletions pandas/io/excel/_openpyxl.py
@@ -595,36 +595,63 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
return cell.value

def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
self,
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
file_offset_needed: int | None = None,
) -> list[list[Scalar]]:

if self.book.read_only:
sheet.reset_dimensions()

data: list[list[Scalar]] = []
last_row_with_data = -1
for row_number, row in enumerate(sheet.rows):
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
while converted_row and converted_row[-1] == "":
# trim trailing empty elements
converted_row.pop()
if converted_row:
last_row_with_data = row_number
data.append(converted_row)
if file_rows_needed is not None and len(data) >= file_rows_needed:
break

# Trim trailing empty rows
data = data[: last_row_with_data + 1]

if len(data) > 0:
# extend rows to max width
max_width = max(len(data_row) for data_row in data)
if min(len(data_row) for data_row in data) < max_width:
empty_cell: list[Scalar] = [""]
data = [
data_row + (max_width - len(data_row)) * empty_cell
for data_row in data
def _loop_rows(iterator):
data: list[list[Scalar]] = []
last_row_with_data = -1
for row_number, row in iterator:
converted_row = [
self._convert_cell(cell, convert_float) for cell in row
]
while converted_row and converted_row[-1] == "":
# trim trailing empty elements
converted_row.pop()
if converted_row:
last_row_with_data = row_number
data.append(converted_row)
if file_rows_needed is not None and len(data) >= file_rows_needed:
break

# Trim trailing empty rows
data = data[: last_row_with_data + 1]

if data:
# extend rows to max width
max_width = max(len(data_row) for data_row in data)
if min(len(data_row) for data_row in data) < max_width:
empty_cell: list[Scalar] = [""]
data = [
data_row + (max_width - len(data_row)) * empty_cell
for data_row in data
]

return data

data: list[list[Scalar]] = []
loop_on = sheet.rows
if file_rows_needed or file_offset_needed:
min_row = max_row = None
# +1 because openpyxl uses 1-based row indexing
if file_rows_needed:
max_row = file_rows_needed + 1
if file_offset_needed:
max_row += file_offset_needed

if file_offset_needed:
min_row = file_offset_needed + 1

# Materialize the bounded row window for the loop below
loop_on = list(sheet.iter_rows(min_row=min_row, max_row=max_row))

data = _loop_rows(enumerate(loop_on))

return data
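openpyxl's iter_rows accepts 1-based, inclusive min_row/max_row bounds, which is what the windowing above leans on. A self-contained demonstration on an in-memory workbook (the values are made up):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
for i in range(10):
    ws.append([f"r{i}", i])

# 1-based, inclusive bounds: physical rows 5..8 of the sheet.
window = list(ws.iter_rows(min_row=5, max_row=8, values_only=True))
print(window)  # [('r4', 4), ('r5', 5), ('r6', 6), ('r7', 7)]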
20 changes: 16 additions & 4 deletions pandas/io/excel/_pyxlsb.py
Expand Up @@ -84,21 +84,33 @@ def get_sheet_data(
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
file_offset_needed: int | None = None,
) -> list[list[Scalar]]:
data: list[list[Scalar]] = []
prevous_row_number = -1
loop_on = sheet.rows(sparse=True)
previous_row_number = -1

if file_rows_needed:
loop_on = loop_on[: file_rows_needed + 1]
if file_offset_needed:
loop_on = loop_on[
file_offset_needed : file_offset_needed + file_rows_needed + 1
]
elif file_offset_needed:
loop_on = loop_on[file_offset_needed:]

# When sparse=True the rows can have different lengths and empty rows are
# not returned. The cells are namedtuples of row, col, value (r, c, v).
for row in sheet.rows(sparse=True):
for row in loop_on:
row_number = row[0].r
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
while converted_row and converted_row[-1] == "":
# trim trailing empty elements
converted_row.pop()
if converted_row:
data.extend([[]] * (row_number - prevous_row_number - 1))
data.extend([[]] * (row_number - previous_row_number - 1))
data.append(converted_row)
prevous_row_number = row_number
previous_row_number = row_number
if file_rows_needed is not None and len(data) >= file_rows_needed:
break
if data:
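If an engine exposes its rows as a lazy iterator rather than a list, the same window can be taken without materialising everything up front by using itertools.islice. A minimal sketch on a stand-in generator (the names here are invented):

from itertools import islice


def fake_rows():
    # Stand-in for an engine's lazy row iterator.
    for i in range(1_000_000):
        yield [f"cell {i}"]


file_offset_needed, file_rows_needed = 100, 3
stop = file_offset_needed + file_rows_needed + 1
window = list(islice(fake_rows(), file_offset_needed, stop))
print(len(window))  # 4 rows, starting at physical row 100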
35 changes: 26 additions & 9 deletions pandas/io/excel/_xlrd.py
@@ -62,7 +62,11 @@ def get_sheet_by_index(self, index):
return self.book.sheet_by_index(index)

def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
self,
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
file_offset_needed: int | None = None,
) -> list[list[Scalar]]:
from xlrd import (
XL_CELL_BOOLEAN,
@@ -115,13 +119,26 @@ def _parse_cell(cell_contents, cell_typ):
data = []

nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
]
data.append(row)
nrows_range = range(nrows)
if file_rows_needed:
nrows_range = range(min(nrows, file_rows_needed))
if file_offset_needed:
nrows_range = range(
file_offset_needed, file_offset_needed + file_rows_needed
)
elif file_offset_needed:
nrows_range = range(file_offset_needed, nrows)

data = []

for i in nrows_range:
try:
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
]
data.append(row)
except IndexError:
break

return data
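An equivalent way to build the row range above is to clamp the stop bound to sheet.nrows up front rather than catching IndexError inside the loop; a small sketch, with the helper name invented for this illustration:

def row_range(nrows, file_rows_needed=None, file_offset_needed=None):
    # Clamp the window to the number of rows the sheet actually has.
    start = file_offset_needed or 0
    if file_rows_needed is None:
        stop = nrows
    else:
        stop = min(nrows, start + file_rows_needed)
    return range(start, stop)


assert list(row_range(5, file_rows_needed=10)) == [0, 1, 2, 3, 4]
assert list(row_range(10, file_rows_needed=3, file_offset_needed=4)) == [4, 5, 6]
assert list(row_range(10, file_offset_needed=8)) == [8, 9]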