Skip to content

ENH: Optimize nrows in read_excel #33281

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 79 commits into from
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
900afff
ENH: Skip rows while reading excel file with engine=openpyxl
mproszewska Mar 27, 2020
df55b51
ENH: Skiping rows with odf engine
mproszewska Apr 4, 2020
8177024
ENH: Optimize nrows in read_excel
mproszewska Apr 4, 2020
79b34c3
Reformatted
mproszewska Apr 4, 2020
f0a2b8d
Fix linting
mproszewska Apr 4, 2020
70ac234
Add annotation to variable
mproszewska Apr 4, 2020
27cae3a
Add imports
mproszewska Apr 4, 2020
4248f8c
Add types
mproszewska Apr 4, 2020
70f46b3
ENH: Fix
mproszewska Apr 9, 2020
cdfc05d
ENH: Mark variables as optional
mproszewska Apr 9, 2020
502b5e3
Merge branch 'master' into excel
mproszewska Apr 9, 2020
4c8a42a
ENH: Move nrows variable check
mproszewska Apr 10, 2020
19bb927
ENH: Remove unused imports
mproszewska Apr 10, 2020
6c2a3b5
ENH: Move repeated code to base
mproszewska Apr 16, 2020
b865c88
ENH: Remove import
mproszewska Apr 16, 2020
49276da
ENH: Lint
mproszewska Apr 16, 2020
393a622
ENH: Lint
mproszewska Apr 17, 2020
e00fff1
ENH: Add docstring to should_read_row
mproszewska Apr 17, 2020
b14642b
ENH: Lint
mproszewska Apr 17, 2020
dfc794a
ENH: Lint
mproszewska Apr 17, 2020
7b501de
ENH: Move nrows value check
mproszewska Apr 17, 2020
3292f6b
ENH: Remove nrows validation
mproszewska Apr 17, 2020
bdd5780
Run tests
mproszewska Apr 24, 2020
1867088
ENH: Fix reading rows in openpyxl
mproszewska Apr 24, 2020
3c1eb10
ENH: Fix lint
mproszewska Apr 24, 2020
88c3117
Fix max_row variable definition
mproszewska Apr 24, 2020
dc60055
Fix max_row variable definition
mproszewska Apr 24, 2020
6fdedfd
Add typed in should_read_row function
mproszewska Apr 25, 2020
ba7175c
Add types and tests
mproszewska Apr 25, 2020
d884803
Add whatsnew
mproszewska Apr 25, 2020
368544a
Resolve conflict
mproszewska Apr 25, 2020
4be3d27
Fix import
mproszewska May 4, 2020
95e3e02
Update doc/source/whatsnew/v1.1.0.rst
mproszewska May 4, 2020
38520e2
Add index type
mproszewska May 4, 2020
0b21e61
Merge branch 'excel' of https://github.com/mproszewska/pandas into excel
mproszewska May 4, 2020
c03b46a
Parametrize test
mproszewska May 4, 2020
aa8cfe9
Fix lint
mproszewska May 5, 2020
c9a622d
Add decorator to test
mproszewska May 5, 2020
aa92783
Fix types defintion
mproszewska May 8, 2020
ee16c15
Change function name
mproszewska May 8, 2020
547787a
Leaks in test fix attempt
mproszewska May 8, 2020
368b77c
Reverse changes
mproszewska May 8, 2020
4d69922
Change skiping rows in openpyxl
mproszewska May 8, 2020
b4ae85f
Run tests again
mproszewska May 9, 2020
81f8674
Remove all changes in openpyxl because of leaks
mproszewska May 9, 2020
7938fcf
Run tests
mproszewska May 9, 2020
7f31a30
Fix
mproszewska May 9, 2020
ae8b84f
Add types
mproszewska May 10, 2020
a4f5009
Run tests
mproszewska May 10, 2020
d7a2892
Run tests again because of conda error
mproszewska May 10, 2020
c94b45e
PERF: Remove unnecessary copies in sorting functions
mproszewska May 15, 2020
0ab450b
Run tests
mproszewska May 16, 2020
54c7304
Run tests
mproszewska May 16, 2020
ca388dc
Merge branch 'master' into excel
mproszewska May 22, 2020
9ddc41c
Resolve conflicts
mproszewska May 22, 2020
1920905
Resolve conflicts
mproszewska May 22, 2020
6d72a34
Add asv
mproszewska May 22, 2020
5ba54a6
Run black
mproszewska May 22, 2020
2766270
Remove asv
mproszewska May 22, 2020
91176ca
Merge branch 'perf'
mproszewska May 24, 2020
f748b78
Merge remote-tracking branch 'upstream/master'
mproszewska May 28, 2020
ac823f5
Resolve conflict
mproszewska May 31, 2020
f4a805d
Resolve conflict
mproszewska May 31, 2020
6f188fe
Revert change
mproszewska May 31, 2020
ba314fe
Change should_skip_row function
mproszewska Jun 1, 2020
f923bfd
Fix return type
mproszewska Jun 1, 2020
008add5
Remove import
mproszewska Jun 1, 2020
c04c494
Merge remote-tracking branch 'upstream/master'
mproszewska Jun 1, 2020
596806c
Resolve conflict
mproszewska Jun 1, 2020
2226050
Run tests
mproszewska Jun 2, 2020
9216210
Add asv
mproszewska Jun 5, 2020
d9aa319
Add asv
mproszewska Jun 5, 2020
094d5f7
Merge branch 'master' into excel
mproszewska Jun 5, 2020
234dcc6
Resolve conflict
mproszewska Jun 5, 2020
0afb1b1
Fix
mproszewska Jun 5, 2020
33cb733
Merge branch 'master' into excel
mproszewska Jun 5, 2020
06003a8
Fix asv
mproszewska Jun 8, 2020
c08709b
Fix asv
mproszewska Jun 8, 2020
c9a2c75
Fix asv
mproszewska Jun 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ def get_sheet_by_index(self, index):
pass

@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float):
def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows):
# Abstract hook with no base implementation: the body is only `pass`.
# parse() forwards its own header, skiprows and nrows arguments here and
# keeps the return value as `data`.
pass

def parse(
Expand Down Expand Up @@ -436,7 +436,7 @@ def parse(
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

data = self.get_sheet_data(sheet, convert_float)
data = self.get_sheet_data(sheet, convert_float, header, skiprows, nrows)
usecols = _maybe_convert_usecols(usecols)

if not data:
Expand Down
36 changes: 33 additions & 3 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from typing import List
from typing import List, Optional, Sequence

from pandas._typing import FilePathOrBuffer, Scalar
from pandas._typing import FilePathOrBuffer, Scalar, Union
from pandas.compat._optional import import_optional_dependency

import pandas as pd

from pandas.io.excel._base import _BaseExcelReader
from pandas.io.parsers import _validate_integer


class _ODFReader(_BaseExcelReader):
Expand Down Expand Up @@ -63,7 +64,14 @@ def get_sheet_by_name(self, name: str):

raise ValueError(f"sheet {name} not found")

def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
def get_sheet_data(
self,
sheet,
convert_float: bool,
header: Optional[Union[int, Sequence[int]]],
skiprows: Optional[Union[int, Sequence[int]]],
nrows: Optional[int],
) -> List[List[Scalar]]:
"""
Parse an ODF Table into a list of lists
"""
Expand All @@ -79,7 +87,29 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:

table: List[List[Scalar]] = []

if nrows is not None:
_validate_integer("nrows", nrows)
header = 0 if header is None else header
skiprows = 0 if skiprows is None else skiprows
if isinstance(header, list) or isinstance(skiprows, list):
nrows = None

for i, sheet_row in enumerate(sheet_rows):

if nrows is not None:
if header > 1:
header -= 1
table.append([])
continue
elif skiprows > 0:
skiprows -= 1
table.append([])
continue
if nrows >= 0:
nrows -= 1
else:
break

sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
empty_cells = 0
table_row: List[Scalar] = []
Expand Down
37 changes: 34 additions & 3 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from typing import List
from typing import List, Optional, Sequence

import numpy as np

from pandas._typing import FilePathOrBuffer, Scalar
from pandas._typing import FilePathOrBuffer, Scalar, Union
from pandas.compat._optional import import_optional_dependency

from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
from pandas.io.excel._util import _validate_freeze_panes
from pandas.io.parsers import _validate_integer


class _OpenpyxlWriter(ExcelWriter):
Expand Down Expand Up @@ -524,9 +525,39 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.value

def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
def get_sheet_data(
    self,
    sheet,
    convert_float: bool,
    header: Optional[Union[int, Sequence[int]]],
    skiprows: Optional[Union[int, Sequence[int]]],
    nrows: Optional[int],
) -> List[List[Scalar]]:
    """
    Convert the rows of a worksheet into a list of lists of cell values.

    When ``nrows`` is given and both ``header`` and ``skiprows`` are
    integers (or None), conversion stops as soon as enough rows have been
    read. Rows located before the header row are not converted; an empty
    list is emitted for each of them so that later rows keep their
    position in the result. In every other case the whole sheet is
    converted.

    Parameters
    ----------
    sheet
        Object whose ``rows`` attribute yields iterables of cells.
    convert_float : bool
        Passed through unchanged to ``self._convert_cell`` for each cell.
    header : int, sequence of int or None
        Position of the header row; None is treated as 0 when ``nrows``
        is set. A non-integer value disables the early stop.
    skiprows : int, sequence of int or None
        Number of leading rows to skip; None is treated as 0 when
        ``nrows`` is set. A non-integer value disables the early stop.
    nrows : int or None
        Number of data rows wanted after the header row. It is checked
        with ``_validate_integer``. None means "read everything".

    Returns
    -------
    list of list of Scalar
        One inner list per row read, in sheet order.
    """
    if nrows is not None:
        _validate_integer("nrows", nrows)
        header = 0 if header is None else header
        skiprows = 0 if skiprows is None else skiprows
        # The row arithmetic below is only meaningful for integer offsets.
        # Lists, tuples, other sequences or callables fall back to reading
        # the full sheet instead of failing on an int comparison.
        int_types = (int, np.integer)
        if not (isinstance(header, int_types) and isinstance(skiprows, int_types)):
            nrows = None

    if nrows is None:
        return [
            [self._convert_cell(cell, convert_float) for cell in row]
            for row in sheet.rows
        ]

    data: List[List[Scalar]] = []

    # Every row ahead of the header row (``skiprows`` skipped rows plus
    # ``header`` rows above the header) only needs a placeholder.
    placeholder_rows = max(header, 0) + max(skiprows, 0)
    # The header row itself sits at index ``placeholder_rows``; the nrows
    # data rows follow it, so this is the index of the last row needed.
    last_row = placeholder_rows + nrows

    for position, row in enumerate(sheet.rows):
        if position > last_row:
            break
        if position < placeholder_rows:
            data.append([])
        else:
            data.append([self._convert_cell(cell, convert_float) for cell in row])

    return data
4 changes: 3 additions & 1 deletion pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.v

def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
def get_sheet_data(
self, sheet, convert_float: bool, header, skiprows, nrows
) -> List[List[Scalar]]:
return [
[self._convert_cell(c, convert_float) for c in r]
for r in sheet.rows(sparse=False)
Expand Down
35 changes: 33 additions & 2 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from datetime import time
from typing import List, Optional, Sequence

import numpy as np

from pandas._typing import Scalar, Union
from pandas.compat._optional import import_optional_dependency

from pandas.io.excel._base import _BaseExcelReader
from pandas.io.parsers import _validate_integer


class _XlrdReader(_BaseExcelReader):
Expand Down Expand Up @@ -46,7 +49,14 @@ def get_sheet_by_name(self, name):
def get_sheet_by_index(self, index):
return self.book.sheet_by_index(index)

def get_sheet_data(self, sheet, convert_float):
def get_sheet_data(
self,
sheet,
convert_float,
header: Optional[Union[int, Sequence[int]]],
skiprows: Optional[Union[int, Sequence[int]]],
nrows: Optional[int],
) -> List[List[Scalar]]:
from xlrd import (
xldate,
XL_CELL_DATE,
Expand Down Expand Up @@ -95,9 +105,30 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = val
return cell_contents

data = []
data: List[List[Scalar]] = []

if nrows is not None:
_validate_integer("nrows", nrows)
header = 0 if header is None else header
skiprows = 0 if skiprows is None else skiprows
if isinstance(header, list) or isinstance(skiprows, list):
nrows = None
for i in range(sheet.nrows):

if nrows is not None:
if header > 1:
header -= 1
data.append([])
continue
elif skiprows > 0:
skiprows -= 1
data.append([])
continue
if nrows >= 0:
nrows -= 1
else:
break

row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
Expand Down