Skip to content

Commit 9e10206

Browse files
authored
PERF: Optimize read_excel nrows (#46894)
1 parent 9b77039 commit 9e10206

File tree

10 files changed

+210
-33
lines changed

10 files changed

+210
-33
lines changed

asv_bench/benchmarks/io/excel.py

+11
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,15 @@ def time_read_excel(self, engine):
8686
read_excel(fname, engine=engine)
8787

8888

89+
class ReadExcelNRows(ReadExcel):
    """Benchmark ``read_excel`` when only the first ten rows are requested."""

    def time_read_excel(self, engine):
        """Time a ten-row ``read_excel`` call using the file for ``engine``."""
        # Engines without a dedicated entry fall back to ``fname_excel``.
        fname_attr = {
            "xlrd": "fname_excel_xls",
            "odf": "fname_odf",
        }.get(engine, "fname_excel")
        read_excel(getattr(self, fname_attr), engine=engine, nrows=10)
98+
99+
89100
from ..pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,7 @@ Performance improvements
703703
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
704704
- Performance improvement in :func:`factorize` (:issue:`46109`)
705705
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
706+
- Performance improvement in :func:`read_excel` when the ``nrows`` argument is provided (:issue:`32727`)
706707

707708
.. ---------------------------------------------------------------------------
708709
.. _whatsnew_150.bug_fixes:

pandas/io/common.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
Generic,
3131
Literal,
3232
Mapping,
33+
Sequence,
3334
TypeVar,
3435
cast,
3536
overload,
@@ -58,7 +59,12 @@
5859
from pandas.util._decorators import doc
5960
from pandas.util._exceptions import find_stack_level
6061

61-
from pandas.core.dtypes.common import is_file_like
62+
from pandas.core.dtypes.common import (
63+
is_bool,
64+
is_file_like,
65+
is_integer,
66+
is_list_like,
67+
)
6268

6369
from pandas.core.shared_docs import _shared_docs
6470

@@ -175,12 +181,32 @@ def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:
175181

176182

177183
def validate_header_arg(header: object) -> None:
    """
    Validate the ``header`` argument and raise if it cannot be used.

    ``None``, a non-negative integer, and a list-like (sets excluded) of
    non-negative integers are accepted; everything else is rejected.

    Parameters
    ----------
    header : object
        The value to validate.

    Raises
    ------
    TypeError
        If ``header`` is a bool.
    ValueError
        If ``header`` is a negative integer, a list-like holding a
        non-integer or a negative entry, or any other unsupported value.
    """
    if header is None:
        return
    if is_integer(header):
        header = cast(int, header)
        if header < 0:
            # GH 27779
            raise ValueError(
                "Passing negative integer to header is invalid. "
                "For no header, use header=None instead"
            )
        return
    if is_list_like(header, allow_sets=False):
        # Materialize once: both checks below must see every entry, which
        # would not hold for a one-shot iterator traversed twice.
        entries = list(cast(Sequence, header))
        if not all(map(is_integer, entries)):
            raise ValueError("header must be integer or list of integers")
        if any(i < 0 for i in entries):
            raise ValueError(
                "cannot specify multi-index header with negative integers"
            )
        return
    if is_bool(header):
        raise TypeError(
            "Passing a bool to header is invalid. Use header=None for no header or "
            "header=int or list-like of ints to specify "
            "the row(s) making up the column names"
        )
    # GH 16338
    raise ValueError("header must be integer or list of integers")
184210

185211

186212
@overload

pandas/io/excel/_base.py

+99-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import abc
44
import datetime
5+
from functools import partial
56
from io import BytesIO
67
import os
78
from textwrap import fill
@@ -70,6 +71,7 @@
7071
pop_header_name,
7172
)
7273
from pandas.io.parsers import TextParser
74+
from pandas.io.parsers.readers import validate_integer
7375

7476
_read_excel_doc = (
7577
"""
@@ -563,7 +565,7 @@ def get_sheet_by_index(self, index: int):
563565
pass
564566

565567
@abc.abstractmethod
def get_sheet_data(
    self, sheet, convert_float: bool, file_rows_needed: int | None = None
):
    """
    Return the cell data of ``sheet``; implemented by each engine reader.

    Parameters
    ----------
    sheet : object
        Engine-specific sheet object.
    convert_float : bool
        Forwarded to the engine's cell conversion.
    file_rows_needed : int or None, default None
        Number of leading file rows the caller needs. ``None`` means no
        limit was computed and the sheet should be read as usual.
    """
568570

569571
def raise_if_bad_sheet_by_index(self, index: int) -> None:
@@ -577,6 +579,99 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None:
577579
if name not in self.sheet_names:
578580
raise ValueError(f"Worksheet named '{name}' not found")
579581

582+
def _check_skiprows_func(
    self,
    skiprows: Callable,
    rows_to_use: int,
    max_scan: int = 1_048_576,
) -> int | None:
    """
    Determine how many file rows are required to obtain `rows_to_use`
    non-skipped rows when `skiprows` is a function.

    Parameters
    ----------
    skiprows : function
        The function passed to read_excel by the user. It is called with
        consecutive row numbers starting at 0; a truthy result marks the
        row as skipped.
    rows_to_use : int
        The number of rows that will be needed for the header and
        the data.
    max_scan : int, default 1_048_576
        Upper bound on the row numbers probed. The default equals the
        documented worksheet row limit of Excel; it exists so that a
        function which skips every remaining row cannot make this loop
        run forever.

    Returns
    -------
    int or None
        The number of leading file rows to read, or None when
        `rows_to_use` kept rows were not found within `max_scan` rows.
        None tells the caller not to limit the rows read.
    """
    rows_kept = 0
    for row_number in range(max_scan):
        if rows_kept >= rows_to_use:
            return row_number
        if not skiprows(row_number):
            rows_kept += 1
    # The scan bound was reached: report it only if the final probed row
    # completed the request, otherwise signal "no usable limit".
    return max_scan if rows_kept >= rows_to_use else None
610+
611+
def _calc_rows(
    self,
    header: int | Sequence[int] | None,
    index_col: int | Sequence[int] | None,
    skiprows: Sequence[int] | int | Callable[[int], object] | None,
    nrows: int | None,
) -> int | None:
    """
    If nrows specified, find the number of rows needed from the
    file, otherwise return None.

    Parameters
    ----------
    header : int, list of int, or None
        See read_excel docstring.
    index_col : int, list of int, or None
        See read_excel docstring.
    skiprows : list-like, int, callable, or None
        See read_excel docstring.
    nrows : int or None
        See read_excel docstring.

    Returns
    -------
    int or None
        Number of leading file rows to read, or None when no limit can
        be derived (nrows missing, empty header list, or an unexpected
        skiprows type).
    """
    if nrows is None:
        return None
    if header is None:
        header_rows = 1
    elif is_integer(header):
        header_rows = 1 + cast(int, header)
    else:
        levels = list(cast(Sequence, header))
        if not levels:
            # No header position to count from: do not limit the read.
            return None
        # Use the largest position rather than the last one so that an
        # unsorted header list cannot under-count the rows required.
        header_rows = 1 + max(levels)
        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        if is_list_like(header) and index_col is not None and len(levels) > 1:
            header_rows += 1
    if skiprows is None:
        return header_rows + nrows
    if is_integer(skiprows):
        return header_rows + nrows + cast(int, skiprows)
    if is_list_like(skiprows):
        # Build the membership container once; fall back to the original
        # object if its entries cannot be hashed.
        try:
            skipped = frozenset(cast(Sequence, skiprows))
        except TypeError:
            skipped = skiprows

        def is_skipped(x: int) -> bool:
            return x in skipped

        return self._check_skiprows_func(is_skipped, header_rows + nrows)
    if callable(skiprows):
        return self._check_skiprows_func(
            skiprows,
            header_rows + nrows,
        )
    # else unexpected skiprows type: read_excel will not optimize
    # the number of rows read from file
    return None
674+
580675
def parse(
581676
self,
582677
sheet_name: str | int | list[int] | list[str] | None = 0,
@@ -613,6 +708,7 @@ def parse(
613708
)
614709

615710
validate_header_arg(header)
711+
validate_integer("nrows", nrows)
616712

617713
ret_dict = False
618714

@@ -643,7 +739,8 @@ def parse(
643739
else: # assume an integer if not a string
644740
sheet = self.get_sheet_by_index(asheetname)
645741

646-
data = self.get_sheet_data(sheet, convert_float)
742+
file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
743+
data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
647744
if hasattr(sheet, "close"):
648745
# pyxlsb opens two TemporaryFiles
649746
sheet.close()

pandas/io/excel/_odfreader.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
9090
raise ValueError(f"sheet {name} not found")
9191

9292
def get_sheet_data(
93-
self, sheet, convert_float: bool
93+
self, sheet, convert_float: bool, file_rows_needed: int | None = None
9494
) -> list[list[Scalar | NaTType]]:
9595
"""
9696
Parse an ODF Table into a list of lists
@@ -148,6 +148,8 @@ def get_sheet_data(
148148
empty_rows = 0
149149
for _ in range(row_repeat):
150150
table.append(table_row)
151+
if file_rows_needed is not None and len(table) >= file_rows_needed:
152+
break
151153

152154
# Make our table square
153155
for row in table:

pandas/io/excel/_openpyxl.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
588588

589589
return cell.value
590590

591-
def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
591+
def get_sheet_data(
592+
self, sheet, convert_float: bool, file_rows_needed: int | None = None
593+
) -> list[list[Scalar]]:
592594

593595
if self.book.read_only:
594596
sheet.reset_dimensions()
@@ -603,6 +605,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
603605
if converted_row:
604606
last_row_with_data = row_number
605607
data.append(converted_row)
608+
if file_rows_needed is not None and len(data) >= file_rows_needed:
609+
break
606610

607611
# Trim trailing empty rows
608612
data = data[: last_row_with_data + 1]

pandas/io/excel/_pyxlsb.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,12 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
7979

8080
return cell.v
8181

82-
def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
82+
def get_sheet_data(
83+
self,
84+
sheet,
85+
convert_float: bool,
86+
file_rows_needed: int | None = None,
87+
) -> list[list[Scalar]]:
8388
data: list[list[Scalar]] = []
8489
prevous_row_number = -1
8590
# When sparse=True the rows can have different lengths and empty rows are
@@ -94,6 +99,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
9499
data.extend([[]] * (row_number - prevous_row_number - 1))
95100
data.append(converted_row)
96101
prevous_row_number = row_number
102+
if file_rows_needed is not None and len(data) >= file_rows_needed:
103+
break
97104
if data:
98105
# extend rows to max_width
99106
max_width = max(len(data_row) for data_row in data)

pandas/io/excel/_xlrd.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1+
from __future__ import annotations
2+
13
from datetime import time
24

35
import numpy as np
46

5-
from pandas._typing import StorageOptions
7+
from pandas._typing import (
8+
Scalar,
9+
StorageOptions,
10+
)
611
from pandas.compat._optional import import_optional_dependency
712
from pandas.util._decorators import doc
813

@@ -56,7 +61,9 @@ def get_sheet_by_index(self, index):
5661
self.raise_if_bad_sheet_by_index(index)
5762
return self.book.sheet_by_index(index)
5863

59-
def get_sheet_data(self, sheet, convert_float):
64+
def get_sheet_data(
65+
self, sheet, convert_float: bool, file_rows_needed: int | None = None
66+
) -> list[list[Scalar]]:
6067
from xlrd import (
6168
XL_CELL_BOOLEAN,
6269
XL_CELL_DATE,
@@ -107,7 +114,10 @@ def _parse_cell(cell_contents, cell_typ):
107114

108115
data = []
109116

110-
for i in range(sheet.nrows):
117+
nrows = sheet.nrows
118+
if file_rows_needed is not None:
119+
nrows = min(nrows, file_rows_needed)
120+
for i in range(nrows):
111121
row = [
112122
_parse_cell(value, typ)
113123
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))

pandas/io/parsers/base_parser.py

+6-23
Original file line numberDiff line numberDiff line change
@@ -121,13 +121,7 @@ def __init__(self, kwds) -> None:
121121

122122
# validate header options for mi
123123
self.header = kwds.get("header")
124-
if isinstance(self.header, (list, tuple, np.ndarray)):
125-
if not all(map(is_integer, self.header)):
126-
raise ValueError("header must be integer or list of integers")
127-
if any(i < 0 for i in self.header):
128-
raise ValueError(
129-
"cannot specify multi-index header with negative integers"
130-
)
124+
if is_list_like(self.header, allow_sets=False):
131125
if kwds.get("usecols"):
132126
raise ValueError(
133127
"cannot specify usecols when specifying a multi-index header"
@@ -139,31 +133,20 @@ def __init__(self, kwds) -> None:
139133

140134
# validate index_col that only contains integers
141135
if self.index_col is not None:
142-
is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
143136
if not (
144-
is_sequence
137+
is_list_like(self.index_col, allow_sets=False)
145138
and all(map(is_integer, self.index_col))
146139
or is_integer(self.index_col)
147140
):
148141
raise ValueError(
149142
"index_col must only contain row numbers "
150143
"when specifying a multi-index header"
151144
)
152-
elif self.header is not None:
145+
elif self.header is not None and self.prefix is not None:
153146
# GH 27394
154-
if self.prefix is not None:
155-
raise ValueError(
156-
"Argument prefix must be None if argument header is not None"
157-
)
158-
# GH 16338
159-
elif not is_integer(self.header):
160-
raise ValueError("header must be integer or list of integers")
161-
# GH 27779
162-
elif self.header < 0:
163-
raise ValueError(
164-
"Passing negative integer to header is invalid. "
165-
"For no header, use header=None instead"
166-
)
147+
raise ValueError(
148+
"Argument prefix must be None if argument header is not None"
149+
)
167150

168151
self._name_processed = False
169152

0 commit comments

Comments
 (0)