Skip to content

Commit e975f3d

Browse files
authored
ENH: Optimize nrows in read_excel (#35974)
1 parent f59700b commit e975f3d

File tree

8 files changed

+93
-14
lines changed

8 files changed

+93
-14
lines changed

asv_bench/benchmarks/io/excel.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
def _generate_dataframe():
14-
N = 2000
14+
N = 20000
1515
C = 5
1616
df = DataFrame(
1717
np.random.randn(N, C),
@@ -69,5 +69,9 @@ def time_read_excel(self, engine):
6969
fname = self.fname_odf if engine == "odf" else self.fname_excel
7070
read_excel(fname, engine=engine)
7171

72+
def time_read_excel_nrows(self, engine):
73+
fname = self.fname_odf if engine == "odf" else self.fname_excel
74+
read_excel(fname, engine=engine, nrows=1)
75+
7276

7377
from ..pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ Performance improvements
224224

225225
- Performance improvements when creating a DataFrame or Series with dtype ``str`` or :class:`StringDtype` from an array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
226226
- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
227+
- Performance improvement in :func:`read_excel` when ``nrows`` is much smaller than the number of rows in the sheet (:issue:`33281`)
227228
- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
228229
- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
229230
- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)

pandas/io/excel/_base.py

+26-4
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
from io import BufferedIOBase, BytesIO, RawIOBase
44
import os
55
from textwrap import fill
6-
from typing import Any, Mapping, Union
6+
from typing import Any, List, Mapping, Optional, Union
77

88
from pandas._config import config
99

1010
from pandas._libs.parsers import STR_NA_VALUES
11-
from pandas._typing import StorageOptions
11+
from pandas._typing import Scalar, StorageOptions
1212
from pandas.errors import EmptyDataError
1313
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
1414

@@ -398,7 +398,14 @@ def get_sheet_by_index(self, index):
398398
pass
399399

400400
@abc.abstractmethod
401-
def get_sheet_data(self, sheet, convert_float):
401+
def get_sheet_data(
402+
self,
403+
sheet,
404+
convert_float: bool,
405+
header_nrows: int,
406+
skiprows_nrows: int,
407+
nrows: Optional[int],
408+
) -> List[List[Scalar]]:
402409
pass
403410

404411
def parse(
@@ -454,7 +461,22 @@ def parse(
454461
else: # assume an integer if not a string
455462
sheet = self.get_sheet_by_index(asheetname)
456463

457-
data = self.get_sheet_data(sheet, convert_float)
464+
if isinstance(header, int):
465+
header_nrows = header
466+
elif header is None:
467+
header_nrows = 0
468+
else:
469+
header_nrows = max(header)
470+
if isinstance(skiprows, int):
471+
skiprows_nrows = skiprows
472+
elif skiprows is None:
473+
skiprows_nrows = 0
474+
else:
475+
skiprows_nrows = len(skiprows)
476+
477+
data = self.get_sheet_data(
478+
sheet, convert_float, header_nrows, skiprows_nrows, nrows
479+
)
458480
usecols = maybe_convert_usecols(usecols)
459481

460482
if not data:

pandas/io/excel/_odfreader.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, cast
1+
from typing import List, Optional, cast
22

33
import numpy as np
44

@@ -71,7 +71,14 @@ def get_sheet_by_name(self, name: str):
7171

7272
raise ValueError(f"sheet {name} not found")
7373

74-
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
74+
def get_sheet_data(
75+
self,
76+
sheet,
77+
convert_float: bool,
78+
header_nrows: int,
79+
skiprows_nrows: int,
80+
nrows: Optional[int],
81+
) -> List[List[Scalar]]:
7582
"""
7683
Parse an ODF Table into a list of lists
7784
"""
@@ -87,6 +94,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
8794

8895
table: List[List[Scalar]] = []
8996

97+
if isinstance(nrows, int):
98+
sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1]
9099
for i, sheet_row in enumerate(sheet_rows):
91100
sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
92101
empty_cells = 0

pandas/io/excel/_openpyxl.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
508508

509509
return cell.value
510510

511-
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
511+
def get_sheet_data(
512+
self,
513+
sheet,
514+
convert_float: bool,
515+
header_nrows: int,
516+
skiprows_nrows: int,
517+
nrows: Optional[int],
518+
) -> List[List[Scalar]]:
512519
data: List[List[Scalar]] = []
513520
for row in sheet.rows:
514521
data.append([self._convert_cell(cell, convert_float) for cell in row])

pandas/io/excel/_pyxlsb.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List
1+
from typing import List, Optional
22

33
from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
44
from pandas.compat._optional import import_optional_dependency
@@ -68,7 +68,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
6868

6969
return cell.v
7070

71-
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
71+
def get_sheet_data(
72+
self,
73+
sheet,
74+
convert_float: bool,
75+
header_nrows: int,
76+
skiprows_nrows: int,
77+
nrows: Optional[int],
78+
) -> List[List[Scalar]]:
7279
return [
7380
[self._convert_cell(c, convert_float) for c in r]
7481
for r in sheet.rows(sparse=False)

pandas/io/excel/_xlrd.py

+17-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from datetime import time
2+
from typing import List, Optional
23

34
import numpy as np
45

5-
from pandas._typing import StorageOptions
6+
from pandas._typing import Scalar, StorageOptions
67
from pandas.compat._optional import import_optional_dependency
78

89
from pandas.io.excel._base import BaseExcelReader
@@ -49,7 +50,14 @@ def get_sheet_by_name(self, name):
4950
def get_sheet_by_index(self, index):
5051
return self.book.sheet_by_index(index)
5152

52-
def get_sheet_data(self, sheet, convert_float):
53+
def get_sheet_data(
54+
self,
55+
sheet,
56+
convert_float: bool,
57+
header_nrows: int,
58+
skiprows_nrows: int,
59+
nrows: Optional[int],
60+
) -> List[List[Scalar]]:
5361
from xlrd import (
5462
XL_CELL_BOOLEAN,
5563
XL_CELL_DATE,
@@ -98,9 +106,14 @@ def _parse_cell(cell_contents, cell_typ):
98106
cell_contents = val
99107
return cell_contents
100108

101-
data = []
109+
data: List[List[Scalar]] = []
102110

103-
for i in range(sheet.nrows):
111+
sheet_nrows = sheet.nrows
112+
113+
if isinstance(nrows, int):
114+
sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows)
115+
116+
for i in range(sheet_nrows):
104117
row = [
105118
_parse_cell(value, typ)
106119
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))

pandas/tests/io/excel/test_readers.py

+16
Original file line numberDiff line numberDiff line change
@@ -1195,5 +1195,21 @@ def test_read_datetime_multiindex(self, engine, read_ext):
11951195
],
11961196
)
11971197
expected = pd.DataFrame([], columns=expected_column_index)
1198+
tm.assert_frame_equal(expected, actual)
11981199

1200+
@pytest.mark.parametrize(
1201+
"header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)]
1202+
)
1203+
@td.check_file_leaks
1204+
def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows):
1205+
# GH 32727
1206+
data = pd.read_excel("test1" + read_ext, engine=engine)
1207+
expected = (
1208+
DataFrame(data.iloc[3:6])
1209+
.reset_index(drop=True)
1210+
.rename(columns=data.iloc[2].rename(None))
1211+
)
1212+
actual = pd.read_excel(
1213+
"test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3
1214+
)
11991215
tm.assert_frame_equal(expected, actual)

0 commit comments

Comments
 (0)