Skip to content

PERF: Optimize read_excel nrows #46894

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jun 5, 2022
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,15 @@ def time_read_excel(self, engine):
read_excel(fname, engine=engine)


class ReadExcelNRows(ReadExcel):
    """Variant of the ``ReadExcel`` timing case that reads only 10 rows."""

    def time_read_excel(self, engine):
        """Time ``read_excel(..., nrows=10)`` on the file for ``engine``."""
        # Engines with a dedicated fixture file; every other engine falls
        # back to ``fname_excel``.
        fixture_attr = {"xlrd": "fname_excel_xls", "odf": "fname_odf"}
        path = getattr(self, fixture_attr.get(engine, "fname_excel"))
        read_excel(path, engine=engine, nrows=10)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ Performance improvements
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
- Performance improvement in :func:`factorize` (:issue:`46109`)
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
- Performance improvement in :func:`read_excel` when the ``nrows`` argument is provided (:issue:`32727`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.bug_fixes:
Expand Down
50 changes: 48 additions & 2 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ def get_sheet_by_index(self, index: int):
pass

@abc.abstractmethod
def get_sheet_data(
    self, sheet, convert_float: bool, file_rows_needed: int | None = None
):
    """
    Return the contents of ``sheet``; implemented by each engine reader.

    Parameters
    ----------
    sheet : object
        Engine-specific sheet handle.
    convert_float : bool
        Forwarded to the engine's cell conversion.
    file_rows_needed : int or None, default None
        Upper bound on the number of leading rows the caller needs; ``None``
        means no limit was requested. Named ``file_rows_needed`` to match
        the concrete reader implementations.
    """
    pass

def raise_if_bad_sheet_by_index(self, index: int) -> None:
Expand All @@ -577,6 +577,51 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None:
if name not in self.sheet_names:
raise ValueError(f"Worksheet named '{name}' not found")

def _check_skiprows_func(self, skiprows, rows_to_use):
    """
    Count the leading file rows needed when `skiprows` is callable.

    `skiprows` is called with successive row positions starting at 0; a
    falsy result means the row is kept. The return value is the number of
    positions examined once `rows_to_use` rows have been kept.
    """
    position = 0
    still_needed = rows_to_use
    while still_needed > 0:
        if not skiprows(position):
            still_needed -= 1
        position += 1
    return position

def _calc_rows(self, header, index_col, skiprows, nrows):
    """
    If nrows specified, find the number of rows needed from the file.

    Parameters
    ----------
    header : int, list-like of int, or None
        Header row position(s); for a list-like the last element is taken
        as the final header row.
    index_col : object or None
        Only compared against ``None`` here.
    skiprows : int, list-like, callable, or None
        Rows to skip: a count, a collection of row positions, or a
        predicate called with a row position.
    nrows : int or None
        Number of data rows requested.

    Returns
    -------
    int or None
        Number of leading file rows to read, or ``None`` when `nrows` is
        ``None`` or `skiprows` has a type this method does not handle (the
        caller then applies no row limit).

    Raises
    ------
    ValueError
        If `nrows` is not a non-negative integer.
    """
    # Local import keeps this method self-contained.
    from pandas.api.types import is_integer

    if nrows is None:
        return None
    # is_integer rather than isinstance(..., int), so that NumPy integer
    # scalars are accepted as well as Python ints.
    if not is_integer(nrows) or nrows < 0:
        raise ValueError("'nrows' must be an integer >=0")
    nrows = int(nrows)

    if header is None:
        header_rows = 1
    elif is_integer(header):
        header_rows = 1 + int(header)
    else:
        header_rows = 1 + header[-1]
    # If there is a MultiIndex header and an index then there is also
    # a row containing just the index name(s)
    if is_list_like(header) and len(header) > 1 and index_col is not None:
        header_rows += 1

    rows_to_use = header_rows + nrows
    if skiprows is None:
        return rows_to_use
    # is_integer again: a NumPy integer skiprows must not fall through.
    if is_integer(skiprows):
        return rows_to_use + int(skiprows)
    if is_list_like(skiprows):
        return self._check_skiprows_func(
            lambda x: x in skiprows,
            rows_to_use,
        )
    if callable(skiprows):
        return self._check_skiprows_func(
            skiprows,
            rows_to_use,
        )
    # Unrecognised skiprows type: report no limit.
    return None

def parse(
self,
sheet_name: str | int | list[int] | list[str] | None = 0,
Expand Down Expand Up @@ -643,7 +688,8 @@ def parse(
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

data = self.get_sheet_data(sheet, convert_float)
file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
if hasattr(sheet, "close"):
# pyxlsb opens two TemporaryFiles
sheet.close()
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
raise ValueError(f"sheet {name} not found")

def get_sheet_data(
self, sheet, convert_float: bool
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar | NaTType]]:
"""
Parse an ODF Table into a list of lists
Expand Down Expand Up @@ -148,6 +148,8 @@ def get_sheet_data(
empty_rows = 0
for _ in range(row_repeat):
table.append(table_row)
if file_rows_needed is not None and len(table) >= file_rows_needed:
break

# Make our table square
for row in table:
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.value

def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar]]:

if self.book.read_only:
sheet.reset_dimensions()
Expand All @@ -603,6 +605,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
if converted_row:
last_row_with_data = row_number
data.append(converted_row)
if file_rows_needed is not None and len(data) >= file_rows_needed:
break

# Trim trailing empty rows
data = data[: last_row_with_data + 1]
Expand Down
9 changes: 8 additions & 1 deletion pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.v

def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
def get_sheet_data(
self,
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
) -> list[list[Scalar]]:
data: list[list[Scalar]] = []
prevous_row_number = -1
# When sparse=True the rows can have different lengths and empty rows are
Expand All @@ -94,6 +99,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
data.extend([[]] * (row_number - prevous_row_number - 1))
data.append(converted_row)
prevous_row_number = row_number
if file_rows_needed is not None and len(data) >= file_rows_needed:
break
if data:
# extend rows to max_width
max_width = max(len(data_row) for data_row in data)
Expand Down
16 changes: 13 additions & 3 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from __future__ import annotations

from datetime import time

import numpy as np

from pandas._typing import StorageOptions
from pandas._typing import (
Scalar,
StorageOptions,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc

Expand Down Expand Up @@ -56,7 +61,9 @@ def get_sheet_by_index(self, index):
self.raise_if_bad_sheet_by_index(index)
return self.book.sheet_by_index(index)

def get_sheet_data(self, sheet, convert_float):
def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar]]:
from xlrd import (
XL_CELL_BOOLEAN,
XL_CELL_DATE,
Expand Down Expand Up @@ -107,7 +114,10 @@ def _parse_cell(cell_contents, cell_typ):

data = []

for i in range(sheet.nrows):
nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
Expand Down
91 changes: 91 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1219,6 +1219,97 @@ def test_read_excel_nrows_non_integer_parameter(self, read_ext):
with pytest.raises(ValueError, match=msg):
pd.read_excel("test1" + read_ext, nrows="5")

def test_read_excel_nrows_mi_column(self, read_ext):
    # nrows=3 must give the same frame as a full read cut to three rows
    # when the sheet has a two-row header and one index column.
    path = "testmultiindex" + read_ext

    def read(**extra):
        return pd.read_excel(
            path,
            sheet_name="mi_column",
            header=[0, 1],
            index_col=0,
            **extra,
        )

    expected = read().iloc[:3]
    actual = read(nrows=3)
    tm.assert_frame_equal(actual, expected)

def test_read_excel_nrows_mi_index(self, read_ext):
    # nrows=3 must give the same frame as a full read cut to three rows
    # when two columns form the index.
    path = "testmultiindex" + read_ext

    def read(**extra):
        return pd.read_excel(
            path,
            sheet_name="mi_index",
            index_col=[0, 1],
            **extra,
        )

    expected = read().iloc[:3]
    actual = read(nrows=3)
    tm.assert_frame_equal(actual, expected)

def test_read_excel_nrows_mi_both(self, read_ext):
    # nrows=3 must give the same frame as a full read cut to three rows
    # when both the header and the index span two levels.
    path = "testmultiindex" + read_ext

    def read(**extra):
        return pd.read_excel(
            path,
            sheet_name="both",
            header=[0, 1],
            index_col=[0, 1],
            **extra,
        )

    expected = read().iloc[:3]
    actual = read(nrows=3)
    tm.assert_frame_equal(actual, expected)

def test_read_excel_nrows_mi_column_name(self, read_ext):
    # nrows=3 must give the same frame as a full read cut to three rows
    # on the "mi_column_name" sheet (two-row header, one index column).
    path = "testmultiindex" + read_ext

    def read(**extra):
        return pd.read_excel(
            path,
            sheet_name="mi_column_name",
            header=[0, 1],
            index_col=0,
            **extra,
        )

    expected = read().iloc[:3]
    actual = read(nrows=3)
    tm.assert_frame_equal(actual, expected)

def test_read_excel_nrows_skiprows_list(self, read_ext):
    # nrows=3 must give the same frame as a full read cut to three rows
    # when skiprows is a list of row positions.
    path = "testskiprows" + read_ext

    def read(**extra):
        return pd.read_excel(
            path,
            sheet_name="skiprows_list",
            skiprows=[0, 2],
            **extra,
        )

    expected = read().iloc[:3]
    actual = read(nrows=3)
    tm.assert_frame_equal(actual, expected)

def test_read_excel_nrows_skiprows_func(self, read_ext):
    # nrows=3 must give the same frame as a full read cut to three rows
    # when skiprows is a callable predicate.
    def func(x):
        return x == 0 or x == 2

    path = "testskiprows" + read_ext

    def read(**extra):
        return pd.read_excel(
            path,
            sheet_name="skiprows_list",
            skiprows=func,
            **extra,
        )

    expected = read().iloc[:3]
    actual = read(nrows=3)
    tm.assert_frame_equal(actual, expected)

def test_read_excel_squeeze(self, read_ext):
# GH 12157
f = "test_squeeze" + read_ext
Expand Down