Skip to content

Revert "ENH: Optimize nrows in read_excel" #36537

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged 1 commit on Sep 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def _generate_dataframe():
N = 20000
N = 2000
C = 5
df = DataFrame(
np.random.randn(N, C),
Expand Down Expand Up @@ -69,9 +69,5 @@ def time_read_excel(self, engine):
fname = self.fname_odf if engine == "odf" else self.fname_excel
read_excel(fname, engine=engine)

def time_read_excel_nrows(self, engine):
fname = self.fname_odf if engine == "odf" else self.fname_excel
read_excel(fname, engine=engine, nrows=1)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 0 additions & 1 deletion doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,6 @@ Performance improvements

- Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
- Performance improvement in `read_excel` for when ``nrows`` is much smaller than the length of the file (:issue:`33281`).
- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
Expand Down
30 changes: 4 additions & 26 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from io import BufferedIOBase, BytesIO, RawIOBase
import os
from textwrap import fill
from typing import Any, List, Mapping, Optional, Union
from typing import Any, Mapping, Union

from pandas._config import config

from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import Scalar, StorageOptions
from pandas._typing import StorageOptions
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments

Expand Down Expand Up @@ -398,14 +398,7 @@ def get_sheet_by_index(self, index):
pass

@abc.abstractmethod
def get_sheet_data(
self,
sheet,
convert_float: bool,
header_nrows: int,
skiprows_nrows: int,
nrows: Optional[int],
) -> List[List[Scalar]]:
def get_sheet_data(self, sheet, convert_float):
pass

def parse(
Expand Down Expand Up @@ -461,22 +454,7 @@ def parse(
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

if isinstance(header, int):
header_nrows = header
elif header is None:
header_nrows = 0
else:
header_nrows = max(header)
if isinstance(skiprows, int):
skiprows_nrows = skiprows
elif skiprows is None:
skiprows_nrows = 0
else:
skiprows_nrows = len(skiprows)

data = self.get_sheet_data(
sheet, convert_float, header_nrows, skiprows_nrows, nrows
)
data = self.get_sheet_data(sheet, convert_float)
usecols = maybe_convert_usecols(usecols)

if not data:
Expand Down
13 changes: 2 additions & 11 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional, cast
from typing import List, cast

import numpy as np

Expand Down Expand Up @@ -71,14 +71,7 @@ def get_sheet_by_name(self, name: str):

raise ValueError(f"sheet {name} not found")

def get_sheet_data(
self,
sheet,
convert_float: bool,
header_nrows: int,
skiprows_nrows: int,
nrows: Optional[int],
) -> List[List[Scalar]]:
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
"""
Parse an ODF Table into a list of lists
"""
Expand All @@ -94,8 +87,6 @@ def get_sheet_data(

table: List[List[Scalar]] = []

if isinstance(nrows, int):
sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1]
for i, sheet_row in enumerate(sheet_rows):
sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
empty_cells = 0
Expand Down
9 changes: 1 addition & 8 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,14 +508,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.value

def get_sheet_data(
self,
sheet,
convert_float: bool,
header_nrows: int,
skiprows_nrows: int,
nrows: Optional[int],
) -> List[List[Scalar]]:
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
data: List[List[Scalar]] = []
for row in sheet.rows:
data.append([self._convert_cell(cell, convert_float) for cell in row])
Expand Down
11 changes: 2 additions & 9 deletions pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional
from typing import List

from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
from pandas.compat._optional import import_optional_dependency
Expand Down Expand Up @@ -68,14 +68,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.v

def get_sheet_data(
self,
sheet,
convert_float: bool,
header_nrows: int,
skiprows_nrows: int,
nrows: Optional[int],
) -> List[List[Scalar]]:
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
return [
[self._convert_cell(c, convert_float) for c in r]
for r in sheet.rows(sparse=False)
Expand Down
21 changes: 4 additions & 17 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from datetime import time
from typing import List, Optional

import numpy as np

from pandas._typing import Scalar, StorageOptions
from pandas._typing import StorageOptions
from pandas.compat._optional import import_optional_dependency

from pandas.io.excel._base import BaseExcelReader
Expand Down Expand Up @@ -50,14 +49,7 @@ def get_sheet_by_name(self, name):
def get_sheet_by_index(self, index):
return self.book.sheet_by_index(index)

def get_sheet_data(
self,
sheet,
convert_float: bool,
header_nrows: int,
skiprows_nrows: int,
nrows: Optional[int],
) -> List[List[Scalar]]:
def get_sheet_data(self, sheet, convert_float):
from xlrd import (
XL_CELL_BOOLEAN,
XL_CELL_DATE,
Expand Down Expand Up @@ -106,14 +98,9 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = val
return cell_contents

data: List[List[Scalar]] = []
data = []

sheet_nrows = sheet.nrows

if isinstance(nrows, int):
sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows)

for i in range(sheet_nrows):
for i in range(sheet.nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
Expand Down
16 changes: 0 additions & 16 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1195,21 +1195,5 @@ def test_read_datetime_multiindex(self, engine, read_ext):
],
)
expected = pd.DataFrame([], columns=expected_column_index)
tm.assert_frame_equal(expected, actual)

@pytest.mark.parametrize(
"header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)]
)
@td.check_file_leaks
def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows):
# GH 32727
data = pd.read_excel("test1" + read_ext, engine=engine)
expected = (
DataFrame(data.iloc[3:6])
.reset_index(drop=True)
.rename(columns=data.iloc[2].rename(None))
)
actual = pd.read_excel(
"test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3
)
tm.assert_frame_equal(expected, actual)