-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Optimize nrows in read_excel #35974
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 87 commits
900afff
df55b51
8177024
79b34c3
f0a2b8d
70ac234
27cae3a
4248f8c
70f46b3
cdfc05d
502b5e3
4c8a42a
19bb927
6c2a3b5
b865c88
49276da
393a622
e00fff1
b14642b
dfc794a
7b501de
3292f6b
bdd5780
1867088
3c1eb10
88c3117
dc60055
6fdedfd
ba7175c
d884803
368544a
4be3d27
95e3e02
38520e2
0b21e61
c03b46a
aa8cfe9
c9a622d
aa92783
ee16c15
547787a
368b77c
4d69922
b4ae85f
81f8674
7938fcf
7f31a30
ae8b84f
a4f5009
d7a2892
c94b45e
0ab450b
54c7304
ca388dc
9ddc41c
1920905
6d72a34
5ba54a6
2766270
91176ca
f748b78
ac823f5
f4a805d
6f188fe
ba314fe
f923bfd
008add5
c04c494
596806c
2226050
9216210
d9aa319
094d5f7
234dcc6
0afb1b1
33cb733
06003a8
c08709b
c9a2c75
4e5d048
a0c0111
a2531c0
2ce28c2
6979ae6
8d2dd01
d4720bb
d0a8a68
8f6ead4
98e4093
b49e8a9
04ed5e8
0eb3200
e88fe1e
28e51e6
2d9ee8d
bdb5630
b242ca3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,12 +3,12 @@ | |
from io import BufferedIOBase, BytesIO, RawIOBase | ||
import os | ||
from textwrap import fill | ||
from typing import Any, Mapping, Union | ||
from typing import Any, List, Mapping, Optional, Sequence, Union | ||
|
||
from pandas._config import config | ||
|
||
from pandas._libs.parsers import STR_NA_VALUES | ||
from pandas._typing import StorageOptions | ||
from pandas._typing import Scalar, StorageOptions | ||
from pandas.errors import EmptyDataError | ||
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments | ||
|
||
|
@@ -397,9 +397,47 @@ def get_sheet_by_index(self, index): | |
pass | ||
|
||
@abc.abstractmethod | ||
def get_sheet_data(self, sheet, convert_float): | ||
def get_sheet_data( | ||
self, | ||
sheet, | ||
convert_float: bool, | ||
header: Optional[Union[int, Sequence[int]]], | ||
skiprows: Optional[Union[int, Sequence[int]]], | ||
nrows: Optional[int], | ||
) -> List[List[Scalar]]: | ||
pass | ||
|
||
def should_skip_row( | ||
self, | ||
index: int, | ||
header: Optional[Union[int, Sequence[int]]], | ||
skiprows: Optional[Union[int, Sequence[int]]], | ||
nrows: Optional[int], | ||
) -> bool: | ||
""" | ||
Determines whether row should be skipped. | ||
|
||
Parameters | ||
---------- | ||
index : int | ||
Index of row. | ||
header : int, list of int | ||
Rows used as column labels. | ||
skiprows : int, list of int | ||
Rows to skip at the beginning. | ||
nrows : int | ||
Number of rows to parse. | ||
|
||
Returns | ||
------- | ||
bool | ||
Determines if row should be skipped. | ||
""" | ||
if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): | ||
if index < header + skiprows - 1: | ||
return True | ||
return False | ||
|
||
def parse( | ||
self, | ||
sheet_name=0, | ||
|
@@ -453,7 +491,20 @@ def parse( | |
else: # assume an integer if not a string | ||
sheet = self.get_sheet_by_index(asheetname) | ||
|
||
data = self.get_sheet_data(sheet, convert_float) | ||
get_sheet_data_header = 0 if header is None else header | ||
get_sheet_data_skiprows = 0 if skiprows is None else skiprows | ||
get_sheet_data_nrows = nrows if isinstance(nrows, int) else None | ||
if isinstance(get_sheet_data_header, list) or isinstance( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should be possible to just take the maximum in that list, right? |
||
get_sheet_data_skiprows, list | ||
): | ||
get_sheet_data_nrows = None | ||
data = self.get_sheet_data( | ||
sheet, | ||
convert_float, | ||
get_sheet_data_header, | ||
get_sheet_data_skiprows, | ||
get_sheet_data_nrows, | ||
) | ||
usecols = _maybe_convert_usecols(usecols) | ||
|
||
if not data: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
from datetime import time | ||
from typing import List, Optional, Sequence | ||
|
||
import numpy as np | ||
|
||
from pandas._typing import StorageOptions | ||
from pandas._typing import Scalar, StorageOptions, Union | ||
from pandas.compat._optional import import_optional_dependency | ||
|
||
from pandas.io.excel._base import _BaseExcelReader | ||
|
@@ -49,7 +50,14 @@ def get_sheet_by_name(self, name): | |
def get_sheet_by_index(self, index): | ||
return self.book.sheet_by_index(index) | ||
|
||
def get_sheet_data(self, sheet, convert_float): | ||
def get_sheet_data( | ||
self, | ||
sheet, | ||
convert_float: bool, | ||
header: Optional[Union[int, Sequence[int]]], | ||
skiprows: Optional[Union[int, Sequence[int]]], | ||
nrows: Optional[int], | ||
) -> List[List[Scalar]]: | ||
from xlrd import ( | ||
XL_CELL_BOOLEAN, | ||
XL_CELL_DATE, | ||
|
@@ -98,9 +106,16 @@ def _parse_cell(cell_contents, cell_typ): | |
cell_contents = val | ||
return cell_contents | ||
|
||
data = [] | ||
data: List[List[Scalar]] = [] | ||
|
||
for i in range(sheet.nrows): | ||
sheet_nrows = sheet.nrows | ||
if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): | ||
sheet_nrows = min(header + skiprows + nrows + 1, sheet_nrows) | ||
|
||
for i in range(sheet_nrows): | ||
if self.should_skip_row(i, header, skiprows, nrows): | ||
data.append([]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to append the empty list here? Would be preferable to just continue. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I removed |
||
continue | ||
row = [ | ||
_parse_cell(value, typ) | ||
for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this required? Looks like there are already checks within the subsequent functions for int values, no?