From 728b37744cddb1604b4bc7e801de918ef8b688a1 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 21 Sep 2020 18:41:14 -0700
Subject: [PATCH] Revert "ENH: Optimize nrows in read_excel (#35974)"

This reverts commit e975f3def1ff430d5801fbe241c52d7206c79956.
---
 asv_bench/benchmarks/io/excel.py      |  6 +-----
 doc/source/whatsnew/v1.2.0.rst        |  1 -
 pandas/io/excel/_base.py              | 30 ++++--------------------------
 pandas/io/excel/_odfreader.py         | 13 ++-----------
 pandas/io/excel/_openpyxl.py          |  9 +--------
 pandas/io/excel/_pyxlsb.py            | 11 ++---------
 pandas/io/excel/_xlrd.py              | 21 ++++-----------------
 pandas/tests/io/excel/test_readers.py | 16 ----------------
 8 files changed, 14 insertions(+), 93 deletions(-)

diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
index 1eaccb9f2d897..80af2cff41769 100644
--- a/asv_bench/benchmarks/io/excel.py
+++ b/asv_bench/benchmarks/io/excel.py
@@ -11,7 +11,7 @@
 
 
 def _generate_dataframe():
-    N = 20000
+    N = 2000
     C = 5
     df = DataFrame(
         np.random.randn(N, C),
@@ -69,9 +69,5 @@ def time_read_excel(self, engine):
         fname = self.fname_odf if engine == "odf" else self.fname_excel
         read_excel(fname, engine=engine)
 
-    def time_read_excel_nrows(self, engine):
-        fname = self.fname_odf if engine == "odf" else self.fname_excel
-        read_excel(fname, engine=engine, nrows=1)
-
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 19a563be0a568..18940b574b517 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -224,7 +224,6 @@ Performance improvements
 - Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
-- Performance improvement in `read_excel` for when ``nrows`` is much smaller than the length of the file (:issue:`33281`).
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
 - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 667f37f47e188..604b7e12ec243 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -3,12 +3,12 @@
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
 from textwrap import fill
-from typing import Any, List, Mapping, Optional, Union
+from typing import Any, Mapping, Union
 
 from pandas._config import config
 
 from pandas._libs.parsers import STR_NA_VALUES
-from pandas._typing import Scalar, StorageOptions
+from pandas._typing import StorageOptions
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
 
@@ -398,14 +398,7 @@ def get_sheet_by_index(self, index):
         pass
 
     @abc.abstractmethod
-    def get_sheet_data(
-        self,
-        sheet,
-        convert_float: bool,
-        header_nrows: int,
-        skiprows_nrows: int,
-        nrows: Optional[int],
-    ) -> List[List[Scalar]]:
+    def get_sheet_data(self, sheet, convert_float):
         pass
 
     def parse(
@@ -461,22 +454,7 @@ def parse(
             else:  # assume an integer if not a string
                 sheet = self.get_sheet_by_index(asheetname)
 
-            if isinstance(header, int):
-                header_nrows = header
-            elif header is None:
-                header_nrows = 0
-            else:
-                header_nrows = max(header)
-            if isinstance(skiprows, int):
-                skiprows_nrows = skiprows
-            elif skiprows is None:
-                skiprows_nrows = 0
-            else:
-                skiprows_nrows = len(skiprows)
-
-            data = self.get_sheet_data(
-                sheet, convert_float, header_nrows, skiprows_nrows, nrows
-            )
+            data = self.get_sheet_data(sheet, convert_float)
             usecols = maybe_convert_usecols(usecols)
 
             if not data:
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index 07d2f9a593b96..4f9f8a29c0010 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, cast
+from typing import List, cast
 
 import numpy as np
 
@@ -71,14 +71,7 @@ def get_sheet_by_name(self, name: str):
 
         raise ValueError(f"sheet {name} not found")
 
-    def get_sheet_data(
-        self,
-        sheet,
-        convert_float: bool,
-        header_nrows: int,
-        skiprows_nrows: int,
-        nrows: Optional[int],
-    ) -> List[List[Scalar]]:
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
         """
         Parse an ODF Table into a list of lists
         """
@@ -94,8 +87,6 @@ def get_sheet_data(
 
         table: List[List[Scalar]] = []
 
-        if isinstance(nrows, int):
-            sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1]
         for i, sheet_row in enumerate(sheet_rows):
             sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
             empty_cells = 0
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index bc7b168eeaaa2..a5cadf4d93389 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -508,14 +508,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.value
 
-    def get_sheet_data(
-        self,
-        sheet,
-        convert_float: bool,
-        header_nrows: int,
-        skiprows_nrows: int,
-        nrows: Optional[int],
-    ) -> List[List[Scalar]]:
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
         data: List[List[Scalar]] = []
         for row in sheet.rows:
             data.append([self._convert_cell(cell, convert_float) for cell in row])
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
index cf3dcebdff6eb..ac94f4dd3df74 100644
--- a/pandas/io/excel/_pyxlsb.py
+++ b/pandas/io/excel/_pyxlsb.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List
 
 from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
 from pandas.compat._optional import import_optional_dependency
@@ -68,14 +68,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.v
 
-    def get_sheet_data(
-        self,
-        sheet,
-        convert_float: bool,
-        header_nrows: int,
-        skiprows_nrows: int,
-        nrows: Optional[int],
-    ) -> List[List[Scalar]]:
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
         return [
             [self._convert_cell(c, convert_float) for c in r]
             for r in sheet.rows(sparse=False)
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
index e5d0d66f9570a..dfd5dde0329ae 100644
--- a/pandas/io/excel/_xlrd.py
+++ b/pandas/io/excel/_xlrd.py
@@ -1,9 +1,8 @@
 from datetime import time
-from typing import List, Optional
 
 import numpy as np
 
-from pandas._typing import Scalar, StorageOptions
+from pandas._typing import StorageOptions
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import BaseExcelReader
@@ -50,14 +49,7 @@ def get_sheet_by_name(self, name):
     def get_sheet_by_index(self, index):
         return self.book.sheet_by_index(index)
 
-    def get_sheet_data(
-        self,
-        sheet,
-        convert_float: bool,
-        header_nrows: int,
-        skiprows_nrows: int,
-        nrows: Optional[int],
-    ) -> List[List[Scalar]]:
+    def get_sheet_data(self, sheet, convert_float):
         from xlrd import (
             XL_CELL_BOOLEAN,
             XL_CELL_DATE,
@@ -106,14 +98,9 @@ def _parse_cell(cell_contents, cell_typ):
                     cell_contents = val
             return cell_contents
 
-        data: List[List[Scalar]] = []
+        data = []
 
-        sheet_nrows = sheet.nrows
-
-        if isinstance(nrows, int):
-            sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows)
-
-        for i in range(sheet_nrows):
+        for i in range(sheet.nrows):
             row = [
                 _parse_cell(value, typ)
                 for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 4fb1ef8fa0c15..4bdcc5b327fa7 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1195,21 +1195,5 @@ def test_read_datetime_multiindex(self, engine, read_ext):
             ],
         )
         expected = pd.DataFrame([], columns=expected_column_index)
-        tm.assert_frame_equal(expected, actual)
 
-    @pytest.mark.parametrize(
-        "header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)]
-    )
-    @td.check_file_leaks
-    def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows):
-        # GH 32727
-        data = pd.read_excel("test1" + read_ext, engine=engine)
-        expected = (
-            DataFrame(data.iloc[3:6])
-            .reset_index(drop=True)
-            .rename(columns=data.iloc[2].rename(None))
-        )
-        actual = pd.read_excel(
-            "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3
-        )
         tm.assert_frame_equal(expected, actual)
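Note: the optimization being reverted capped how many physical rows each engine's
get_sheet_data had to read by deriving a single row limit from ``header``,
``skiprows`` and ``nrows``. A minimal sketch of that computation, paraphrased from
the removed code in BaseExcelReader.parse and XlrdReader.get_sheet_data above (the
standalone helper name ``_row_limit`` is hypothetical, not a pandas API):

    from typing import List, Optional, Union

    def _row_limit(
        header: Union[int, List[int], None],
        skiprows: Union[int, List[int], None],
        nrows: Optional[int],
        sheet_nrows: int,
    ) -> int:
        # Rows consumed by the header: an int is a row index, a list of row
        # indices contributes its largest entry, None contributes nothing.
        if isinstance(header, int):
            header_nrows = header
        elif header is None:
            header_nrows = 0
        else:
            header_nrows = max(header)
        # Rows skipped before parsing: an int is a count, a list of row
        # indices contributes its length, None skips nothing.
        if isinstance(skiprows, int):
            skiprows_nrows = skiprows
        elif skiprows is None:
            skiprows_nrows = 0
        else:
            skiprows_nrows = len(skiprows)
        # Without nrows, read the whole sheet; otherwise stop after the header
        # rows, the skipped rows, and the requested data rows (+1).
        if nrows is None:
            return sheet_nrows
        return min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows)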