ENH: Optimize nrows in read_excel (#35974)

MarcoGorelli · web-flow · commit e975f3def1ff · 2020-09-21T14:47:14.000-07:00
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
@@ -11,7 +11,7 @@
 
 
 def _generate_dataframe():
-    N = 2000
+    N = 20000
     C = 5
     df = DataFrame(
         np.random.randn(N, C),
@@ -69,5 +69,9 @@ def time_read_excel(self, engine):
         fname = self.fname_odf if engine == "odf" else self.fname_excel
         read_excel(fname, engine=engine)
 
+    def time_read_excel_nrows(self, engine):
+        fname = self.fname_odf if engine == "odf" else self.fname_excel
+        read_excel(fname, engine=engine, nrows=1)
+
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -224,6 +224,7 @@ Performance improvements
 
 - Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
+- Performance improvement in :func:`read_excel` when ``nrows`` is much smaller than the number of rows in the file (:issue:`33281`)
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
 - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -3,12 +3,12 @@
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
 from textwrap import fill
-from typing import Any, Mapping, Union
+from typing import Any, List, Mapping, Optional, Union
 
 from pandas._config import config
 
 from pandas._libs.parsers import STR_NA_VALUES
-from pandas._typing import StorageOptions
+from pandas._typing import Scalar, StorageOptions
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
 
@@ -398,7 +398,14 @@ def get_sheet_by_index(self, index):
         pass
 
     @abc.abstractmethod
-    def get_sheet_data(self, sheet, convert_float):
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header_nrows: int,
+        skiprows_nrows: int,
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         pass
 
     def parse(
@@ -454,7 +461,22 @@ def parse(
             else:  # assume an integer if not a string
                 sheet = self.get_sheet_by_index(asheetname)
 
-            data = self.get_sheet_data(sheet, convert_float)
+            if isinstance(header, int):
+                header_nrows = header
+            elif header is None:
+                header_nrows = 0
+            else:
+                header_nrows = max(header)
+            if isinstance(skiprows, int):
+                skiprows_nrows = skiprows
+            elif skiprows is None:
+                skiprows_nrows = 0
+            else:
+                skiprows_nrows = len(skiprows)
+
+            data = self.get_sheet_data(
+                sheet, convert_float, header_nrows, skiprows_nrows, nrows
+            )
             usecols = maybe_convert_usecols(usecols)
 
             if not data:
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -1,4 +1,4 @@
-from typing import List, cast
+from typing import List, Optional, cast
 
 import numpy as np
 
@@ -71,7 +71,14 @@ def get_sheet_by_name(self, name: str):
 
         raise ValueError(f"sheet {name} not found")
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header_nrows: int,
+        skiprows_nrows: int,
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         """
         Parse an ODF Table into a list of lists
         """
@@ -87,6 +94,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
 
         table: List[List[Scalar]] = []
 
+        if isinstance(nrows, int):
+            sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1]
         for i, sheet_row in enumerate(sheet_rows):
             sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
             empty_cells = 0
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -508,7 +508,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.value
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header_nrows: int,
+        skiprows_nrows: int,
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         data: List[List[Scalar]] = []
         for row in sheet.rows:
             data.append([self._convert_cell(cell, convert_float) for cell in row])
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
 from pandas.compat._optional import import_optional_dependency
@@ -68,7 +68,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
 
         return cell.v
 
-    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header_nrows: int,
+        skiprows_nrows: int,
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         return [
             [self._convert_cell(c, convert_float) for c in r]
             for r in sheet.rows(sparse=False)
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -1,8 +1,9 @@
 from datetime import time
+from typing import List, Optional
 
 import numpy as np
 
-from pandas._typing import StorageOptions
+from pandas._typing import Scalar, StorageOptions
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import BaseExcelReader
@@ -49,7 +50,14 @@ def get_sheet_by_name(self, name):
     def get_sheet_by_index(self, index):
         return self.book.sheet_by_index(index)
 
-    def get_sheet_data(self, sheet, convert_float):
+    def get_sheet_data(
+        self,
+        sheet,
+        convert_float: bool,
+        header_nrows: int,
+        skiprows_nrows: int,
+        nrows: Optional[int],
+    ) -> List[List[Scalar]]:
         from xlrd import (
             XL_CELL_BOOLEAN,
             XL_CELL_DATE,
@@ -98,9 +106,14 @@ def _parse_cell(cell_contents, cell_typ):
                     cell_contents = val
             return cell_contents
 
-        data = []
+        data: List[List[Scalar]] = []
 
-        for i in range(sheet.nrows):
+        sheet_nrows = sheet.nrows
+
+        if isinstance(nrows, int):
+            sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows)
+
+        for i in range(sheet_nrows):
             row = [
                 _parse_cell(value, typ)
                 for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -1195,5 +1195,21 @@ def test_read_datetime_multiindex(self, engine, read_ext):
             ],
         )
         expected = pd.DataFrame([], columns=expected_column_index)
+        tm.assert_frame_equal(expected, actual)
 
+    @pytest.mark.parametrize(
+        "header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)]
+    )
+    @td.check_file_leaks
+    def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows):
+        # GH 32727: nrows counts data rows after skiprows and header are applied
+        data = pd.read_excel("test1" + read_ext, engine=engine)
+        expected = (
+            DataFrame(data.iloc[3:6])
+            .reset_index(drop=True)
+            .rename(columns=data.iloc[2].rename(None))
+        )
+        actual = pd.read_excel(
+            "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3
+        )
         tm.assert_frame_equal(expected, actual)