Skip to content

Commit 9f43fb4

Browse files
jbrockmendel authored and Kevin D Smith committed
Revert "ENH: Optimize nrows in read_excel (pandas-dev#35974)" (pandas-dev#36537)
This reverts commit e975f3d.
1 parent 1d19da3 commit 9f43fb4

File tree

8 files changed: +14 −93 lines changed

asv_bench/benchmarks/io/excel.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
def _generate_dataframe():
14-
N = 20000
14+
N = 2000
1515
C = 5
1616
df = DataFrame(
1717
np.random.randn(N, C),
@@ -69,9 +69,5 @@ def time_read_excel(self, engine):
6969
fname = self.fname_odf if engine == "odf" else self.fname_excel
7070
read_excel(fname, engine=engine)
7171

72-
def time_read_excel_nrows(self, engine):
73-
fname = self.fname_odf if engine == "odf" else self.fname_excel
74-
read_excel(fname, engine=engine, nrows=1)
75-
7672

7773
from ..pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v1.2.0.rst

-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,6 @@ Performance improvements
224224

225225
- Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
226226
- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
227-
- Performance improvement in `read_excel` for when ``nrows`` is much smaller than the length of the file (:issue:`33281`).
228227
- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
229228
- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
230229
- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)

pandas/io/excel/_base.py

+4-26
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
from io import BufferedIOBase, BytesIO, RawIOBase
44
import os
55
from textwrap import fill
6-
from typing import Any, List, Mapping, Optional, Union
6+
from typing import Any, Mapping, Union
77

88
from pandas._config import config
99

1010
from pandas._libs.parsers import STR_NA_VALUES
11-
from pandas._typing import Scalar, StorageOptions
11+
from pandas._typing import StorageOptions
1212
from pandas.errors import EmptyDataError
1313
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
1414

@@ -398,14 +398,7 @@ def get_sheet_by_index(self, index):
398398
pass
399399

400400
@abc.abstractmethod
401-
def get_sheet_data(
402-
self,
403-
sheet,
404-
convert_float: bool,
405-
header_nrows: int,
406-
skiprows_nrows: int,
407-
nrows: Optional[int],
408-
) -> List[List[Scalar]]:
401+
def get_sheet_data(self, sheet, convert_float):
409402
pass
410403

411404
def parse(
@@ -461,22 +454,7 @@ def parse(
461454
else: # assume an integer if not a string
462455
sheet = self.get_sheet_by_index(asheetname)
463456

464-
if isinstance(header, int):
465-
header_nrows = header
466-
elif header is None:
467-
header_nrows = 0
468-
else:
469-
header_nrows = max(header)
470-
if isinstance(skiprows, int):
471-
skiprows_nrows = skiprows
472-
elif skiprows is None:
473-
skiprows_nrows = 0
474-
else:
475-
skiprows_nrows = len(skiprows)
476-
477-
data = self.get_sheet_data(
478-
sheet, convert_float, header_nrows, skiprows_nrows, nrows
479-
)
457+
data = self.get_sheet_data(sheet, convert_float)
480458
usecols = maybe_convert_usecols(usecols)
481459

482460
if not data:

pandas/io/excel/_odfreader.py

+2-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, cast
1+
from typing import List, cast
22

33
import numpy as np
44

@@ -71,14 +71,7 @@ def get_sheet_by_name(self, name: str):
7171

7272
raise ValueError(f"sheet {name} not found")
7373

74-
def get_sheet_data(
75-
self,
76-
sheet,
77-
convert_float: bool,
78-
header_nrows: int,
79-
skiprows_nrows: int,
80-
nrows: Optional[int],
81-
) -> List[List[Scalar]]:
74+
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
8275
"""
8376
Parse an ODF Table into a list of lists
8477
"""
@@ -94,8 +87,6 @@ def get_sheet_data(
9487

9588
table: List[List[Scalar]] = []
9689

97-
if isinstance(nrows, int):
98-
sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1]
9990
for i, sheet_row in enumerate(sheet_rows):
10091
sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
10192
empty_cells = 0

pandas/io/excel/_openpyxl.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -508,14 +508,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
508508

509509
return cell.value
510510

511-
def get_sheet_data(
512-
self,
513-
sheet,
514-
convert_float: bool,
515-
header_nrows: int,
516-
skiprows_nrows: int,
517-
nrows: Optional[int],
518-
) -> List[List[Scalar]]:
511+
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
519512
data: List[List[Scalar]] = []
520513
for row in sheet.rows:
521514
data.append([self._convert_cell(cell, convert_float) for cell in row])

pandas/io/excel/_pyxlsb.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional
1+
from typing import List
22

33
from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
44
from pandas.compat._optional import import_optional_dependency
@@ -68,14 +68,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
6868

6969
return cell.v
7070

71-
def get_sheet_data(
72-
self,
73-
sheet,
74-
convert_float: bool,
75-
header_nrows: int,
76-
skiprows_nrows: int,
77-
nrows: Optional[int],
78-
) -> List[List[Scalar]]:
71+
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
7972
return [
8073
[self._convert_cell(c, convert_float) for c in r]
8174
for r in sheet.rows(sparse=False)

pandas/io/excel/_xlrd.py

+4-17
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
from datetime import time
2-
from typing import List, Optional
32

43
import numpy as np
54

6-
from pandas._typing import Scalar, StorageOptions
5+
from pandas._typing import StorageOptions
76
from pandas.compat._optional import import_optional_dependency
87

98
from pandas.io.excel._base import BaseExcelReader
@@ -50,14 +49,7 @@ def get_sheet_by_name(self, name):
5049
def get_sheet_by_index(self, index):
5150
return self.book.sheet_by_index(index)
5251

53-
def get_sheet_data(
54-
self,
55-
sheet,
56-
convert_float: bool,
57-
header_nrows: int,
58-
skiprows_nrows: int,
59-
nrows: Optional[int],
60-
) -> List[List[Scalar]]:
52+
def get_sheet_data(self, sheet, convert_float):
6153
from xlrd import (
6254
XL_CELL_BOOLEAN,
6355
XL_CELL_DATE,
@@ -106,14 +98,9 @@ def _parse_cell(cell_contents, cell_typ):
10698
cell_contents = val
10799
return cell_contents
108100

109-
data: List[List[Scalar]] = []
101+
data = []
110102

111-
sheet_nrows = sheet.nrows
112-
113-
if isinstance(nrows, int):
114-
sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows)
115-
116-
for i in range(sheet_nrows):
103+
for i in range(sheet.nrows):
117104
row = [
118105
_parse_cell(value, typ)
119106
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))

pandas/tests/io/excel/test_readers.py

-16
Original file line numberDiff line numberDiff line change
@@ -1195,21 +1195,5 @@ def test_read_datetime_multiindex(self, engine, read_ext):
11951195
],
11961196
)
11971197
expected = pd.DataFrame([], columns=expected_column_index)
1198-
tm.assert_frame_equal(expected, actual)
11991198

1200-
@pytest.mark.parametrize(
1201-
"header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)]
1202-
)
1203-
@td.check_file_leaks
1204-
def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows):
1205-
# GH 32727
1206-
data = pd.read_excel("test1" + read_ext, engine=engine)
1207-
expected = (
1208-
DataFrame(data.iloc[3:6])
1209-
.reset_index(drop=True)
1210-
.rename(columns=data.iloc[2].rename(None))
1211-
)
1212-
actual = pd.read_excel(
1213-
"test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3
1214-
)
12151199
tm.assert_frame_equal(expected, actual)

0 commit comments

Comments
 (0)