Skip to content

ENH: Optimize nrows in read_excel #33281

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 79 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
900afff
ENH: Skip rows while reading excel file with engine=openpyxl
mproszewska Mar 27, 2020
df55b51
ENH: Skiping rows with odf engine
mproszewska Apr 4, 2020
8177024
ENH: Optimize nrows in read_excel
mproszewska Apr 4, 2020
79b34c3
Reformatted
mproszewska Apr 4, 2020
f0a2b8d
Fix linting
mproszewska Apr 4, 2020
70ac234
Add annotation to variable
mproszewska Apr 4, 2020
27cae3a
Add imports
mproszewska Apr 4, 2020
4248f8c
Add types
mproszewska Apr 4, 2020
70f46b3
ENH: Fix
mproszewska Apr 9, 2020
cdfc05d
ENH: Mark variables as optional
mproszewska Apr 9, 2020
502b5e3
Merge branch 'master' into excel
mproszewska Apr 9, 2020
4c8a42a
ENH: Move nrows variable check
mproszewska Apr 10, 2020
19bb927
ENH: Remove unused imports
mproszewska Apr 10, 2020
6c2a3b5
ENH: Move repeated code to base
mproszewska Apr 16, 2020
b865c88
ENH: Remove import
mproszewska Apr 16, 2020
49276da
ENH: Lint
mproszewska Apr 16, 2020
393a622
ENH: Lint
mproszewska Apr 17, 2020
e00fff1
ENH: Add docstring to should_read_row
mproszewska Apr 17, 2020
b14642b
ENH: Lint
mproszewska Apr 17, 2020
dfc794a
ENH: Lint
mproszewska Apr 17, 2020
7b501de
ENH: Move nrows value check
mproszewska Apr 17, 2020
3292f6b
ENH: Remove nrows validation
mproszewska Apr 17, 2020
bdd5780
Run tests
mproszewska Apr 24, 2020
1867088
ENH: Fix reading rows in openpyxl
mproszewska Apr 24, 2020
3c1eb10
ENH: Fix lint
mproszewska Apr 24, 2020
88c3117
Fix max_row variable definition
mproszewska Apr 24, 2020
dc60055
Fix max_row variable definition
mproszewska Apr 24, 2020
6fdedfd
Add typed in should_read_row function
mproszewska Apr 25, 2020
ba7175c
Add types and tests
mproszewska Apr 25, 2020
d884803
Add whatsnew
mproszewska Apr 25, 2020
368544a
Resolve conflict
mproszewska Apr 25, 2020
4be3d27
Fix import
mproszewska May 4, 2020
95e3e02
Update doc/source/whatsnew/v1.1.0.rst
mproszewska May 4, 2020
38520e2
Add index type
mproszewska May 4, 2020
0b21e61
Merge branch 'excel' of https://github.com/mproszewska/pandas into excel
mproszewska May 4, 2020
c03b46a
Parametrize test
mproszewska May 4, 2020
aa8cfe9
Fix lint
mproszewska May 5, 2020
c9a622d
Add decorator to test
mproszewska May 5, 2020
aa92783
Fix types defintion
mproszewska May 8, 2020
ee16c15
Change function name
mproszewska May 8, 2020
547787a
Leaks in test fix attempt
mproszewska May 8, 2020
368b77c
Reverse changes
mproszewska May 8, 2020
4d69922
Change skiping rows in openpyxl
mproszewska May 8, 2020
b4ae85f
Run tests again
mproszewska May 9, 2020
81f8674
Remove all changes in openpyxl because of leaks
mproszewska May 9, 2020
7938fcf
Run tests
mproszewska May 9, 2020
7f31a30
Fix
mproszewska May 9, 2020
ae8b84f
Add types
mproszewska May 10, 2020
a4f5009
Run tests
mproszewska May 10, 2020
d7a2892
Run tests again because of conda error
mproszewska May 10, 2020
c94b45e
PERF: Remove unnecessary copies in sorting functions
mproszewska May 15, 2020
0ab450b
Run tests
mproszewska May 16, 2020
54c7304
Run tests
mproszewska May 16, 2020
ca388dc
Merge branch 'master' into excel
mproszewska May 22, 2020
9ddc41c
Resolve conflicts
mproszewska May 22, 2020
1920905
Resolve conflicts
mproszewska May 22, 2020
6d72a34
Add asv
mproszewska May 22, 2020
5ba54a6
Run black
mproszewska May 22, 2020
2766270
Remove asv
mproszewska May 22, 2020
91176ca
Merge branch 'perf'
mproszewska May 24, 2020
f748b78
Merge remote-tracking branch 'upstream/master'
mproszewska May 28, 2020
ac823f5
Resolve conflict
mproszewska May 31, 2020
f4a805d
Resolve conflict
mproszewska May 31, 2020
6f188fe
Revert change
mproszewska May 31, 2020
ba314fe
Change should_skip_row function
mproszewska Jun 1, 2020
f923bfd
Fix return type
mproszewska Jun 1, 2020
008add5
Remove import
mproszewska Jun 1, 2020
c04c494
Merge remote-tracking branch 'upstream/master'
mproszewska Jun 1, 2020
596806c
Resolve conflict
mproszewska Jun 1, 2020
2226050
Run tests
mproszewska Jun 2, 2020
9216210
Add asv
mproszewska Jun 5, 2020
d9aa319
Add asv
mproszewska Jun 5, 2020
094d5f7
Merge branch 'master' into excel
mproszewska Jun 5, 2020
234dcc6
Resolve conflict
mproszewska Jun 5, 2020
0afb1b1
Fix
mproszewska Jun 5, 2020
33cb733
Merge branch 'master' into excel
mproszewska Jun 5, 2020
06003a8
Fix asv
mproszewska Jun 8, 2020
c08709b
Fix asv
mproszewska Jun 8, 2020
c9a2c75
Fix asv
mproszewska Jun 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def _generate_dataframe():
N = 2000
N = 20000
C = 5
df = DataFrame(
np.random.randn(N, C),
Expand Down Expand Up @@ -69,5 +69,9 @@ def time_read_excel(self, engine):
fname = self.fname_odf if engine == "odf" else self.fname_excel
read_excel(fname, engine=engine)

def time_read_excel_nrows(self, engine):
    # Benchmark ``read_excel`` with ``nrows=1`` for the given engine.
    # The "odf" engine reads the ODF fixture; every other engine reads the
    # Excel fixture.
    if engine == "odf":
        fname = self.fname_odf
    else:
        fname = self.fname_excel
    read_excel(fname, engine=engine, nrows=1)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,7 @@ Performance improvements
- Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`)
- Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`)
- Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`)
- Performance improvement in :func:`read_excel` for integer ``header``, ``skiprows``, and ``nrows`` (:issue:`33281`)

.. ---------------------------------------------------------------------------

Expand Down
52 changes: 50 additions & 2 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
from io import BytesIO
import os
from textwrap import fill
from typing import List, Optional, Sequence

from pandas._config import config

from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import Scalar, Union
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments

Expand Down Expand Up @@ -383,9 +385,46 @@ def get_sheet_by_index(self, index):
pass

@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float):
def get_sheet_data(
    self,
    sheet,
    convert_float: bool,
    header: Optional[Union[int, Sequence[int]]],
    skiprows: Optional[Union[int, Sequence[int]]],
    nrows: Optional[int],
) -> List[List[Scalar]]:
    """
    Return the contents of ``sheet`` as a list of rows of scalars.

    This base version has no implementation and returns ``None``; it only
    defines the signature.

    Parameters
    ----------
    sheet
        Sheet object to read.
    convert_float : bool
        Float-conversion flag.
    header : int, list of int, optional
        Rows used as column labels.
    skiprows : int, list of int, optional
        Rows to skip at the beginning.
    nrows : int, optional
        Number of rows to parse.
    """

def should_skip_row(
    self,
    index: int,
    header: Optional[Union[int, Sequence[int]]],
    skiprows: Optional[Union[int, Sequence[int]]],
    nrows: Optional[int],
) -> bool:
    """
    Determine whether a row can be skipped instead of being read.

    A row is only ever reported as skippable when ``nrows`` is given and
    both ``header`` and ``skiprows`` are integers. In every other case the
    row must be read.

    Parameters
    ----------
    index : int
        Index of row.
    header : int, list of int
        Rows used as column labels.
    skiprows : int, list of int
        Rows to skip at the beginning.
    nrows : int
        Number of rows to parse.

    Returns
    -------
    bool
        True if the row should be skipped, False if it should be read.
    """
    if nrows is None or not isinstance(header, int) or not isinstance(skiprows, int):
        return False
    # Only rows strictly before index ``header + skiprows - 1`` are skipped.
    return index < header + skiprows - 1

def parse(
self,
sheet_name=0,
Expand Down Expand Up @@ -439,7 +478,16 @@ def parse(
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

data = self.get_sheet_data(sheet, convert_float)
gsd_header = 0 if header is None else header
gsd_skiprows = 0 if skiprows is None else skiprows
gsd_nrows = nrows if isinstance(nrows, int) else None

if isinstance(gsd_header, list) or isinstance(gsd_skiprows, list):
gsd_nrows = None

data = self.get_sheet_data(
sheet, convert_float, gsd_header, gsd_skiprows, gsd_nrows
)
usecols = _maybe_convert_usecols(usecols)

if not data:
Expand Down
21 changes: 18 additions & 3 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List
from typing import List, Optional, Sequence

from pandas._typing import FilePathOrBuffer, Scalar
from pandas._typing import FilePathOrBuffer, Scalar, Union
from pandas.compat._optional import import_optional_dependency

import pandas as pd
Expand Down Expand Up @@ -63,7 +63,14 @@ def get_sheet_by_name(self, name: str):

raise ValueError(f"sheet {name} not found")

def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
def get_sheet_data(
self,
sheet,
convert_float: bool,
header: Optional[Union[int, Sequence[int]]],
skiprows: Optional[Union[int, Sequence[int]]],
nrows: Optional[int],
) -> List[List[Scalar]]:
"""
Parse an ODF Table into a list of lists
"""
Expand All @@ -79,7 +86,15 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:

table: List[List[Scalar]] = []

if nrows is not None and isinstance(header, int) and isinstance(skiprows, int):
sheet_rows = sheet_rows[0 : header + skiprows + nrows + 1]

for i, sheet_row in enumerate(sheet_rows):

if self.should_skip_row(i, header, skiprows, nrows):
table.append([])
continue

sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
empty_cells = 0
table_row: List[Scalar] = []
Expand Down
15 changes: 12 additions & 3 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import List
from typing import List, Optional, Sequence

import numpy as np

from pandas._typing import FilePathOrBuffer, Scalar
from pandas._typing import FilePathOrBuffer, Scalar, Union
from pandas.compat._optional import import_optional_dependency

from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
Expand Down Expand Up @@ -529,9 +529,18 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.value

def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
def get_sheet_data(
    self,
    sheet,
    convert_float: bool,
    header: Optional[Union[int, Sequence[int]]],
    skiprows: Optional[Union[int, Sequence[int]]],
    nrows: Optional[int],
) -> List[List[Scalar]]:
    """
    Convert every cell of every row in ``sheet.rows`` to a scalar.

    Each cell is passed through ``self._convert_cell`` together with
    ``convert_float``. ``header``, ``skiprows`` and ``nrows`` are not used
    by this implementation, so all rows of the sheet are read.

    Returns
    -------
    list of list
        One inner list of converted cell values per row of the sheet.
    """
    convert = self._convert_cell
    return [[convert(cell, convert_float) for cell in row] for row in sheet.rows]
13 changes: 10 additions & 3 deletions pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List
from typing import List, Optional, Sequence

from pandas._typing import FilePathOrBuffer, Scalar
from pandas._typing import FilePathOrBuffer, Scalar, Union
from pandas.compat._optional import import_optional_dependency

from pandas.io.excel._base import _BaseExcelReader
Expand Down Expand Up @@ -62,7 +62,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.v

def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
def get_sheet_data(
self,
sheet,
convert_float: bool,
header: Optional[Union[int, Sequence[int]]],
skiprows: Optional[Union[int, Sequence[int]]],
nrows: Optional[int],
) -> List[List[Scalar]]:
return [
[self._convert_cell(c, convert_float) for c in r]
for r in sheet.rows(sparse=False)
Expand Down
24 changes: 21 additions & 3 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from datetime import time
from typing import List, Optional, Sequence

import numpy as np

from pandas._typing import Scalar, Union
from pandas.compat._optional import import_optional_dependency

from pandas.io.excel._base import _BaseExcelReader
Expand Down Expand Up @@ -46,7 +48,14 @@ def get_sheet_by_name(self, name):
def get_sheet_by_index(self, index):
return self.book.sheet_by_index(index)

def get_sheet_data(self, sheet, convert_float):
def get_sheet_data(
self,
sheet,
convert_float: bool,
header: Optional[Union[int, Sequence[int]]],
skiprows: Optional[Union[int, Sequence[int]]],
nrows: Optional[int],
) -> List[List[Scalar]]:
from xlrd import (
xldate,
XL_CELL_DATE,
Expand Down Expand Up @@ -95,9 +104,18 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = val
return cell_contents

data = []
data: List[List[Scalar]] = []

sheet_nrows = sheet.nrows
if nrows is not None and isinstance(header, int) and isinstance(skiprows, int):
sheet_nrows = min(header + skiprows + nrows + 1, sheet_nrows)

for i in range(sheet_nrows):

if self.should_skip_row(i, header, skiprows, nrows):
data.append([])
continue

for i in range(sheet.nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1143,3 +1143,18 @@ def test_header_with_index_col(self, engine, filename):
filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
)
tm.assert_frame_equal(expected, result)

@pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)])
@td.check_file_leaks
def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows):
    # GH 32727
    # Reading with integer header/skiprows/nrows must give the same frame as
    # slicing a full read of the same file.
    path = "test1" + read_ext
    full = pd.read_excel(path, engine=engine)

    body = DataFrame(full.iloc[3:6]).reset_index(drop=True)
    column_labels = full.iloc[2].rename(None)
    expected = body.rename(columns=column_labels)

    actual = pd.read_excel(
        path, engine=engine, header=header, skiprows=skiprows, nrows=3
    )
    tm.assert_frame_equal(expected, actual)