From 6fd0422513c0c1874714206e542fd84ae3006461 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Wed, 27 Apr 2022 21:18:15 -0600 Subject: [PATCH 01/14] Check for nrows in read_excel --- pandas/io/excel/_base.py | 50 +++++++++++++++++++++++++++++++++-- pandas/io/excel/_odfreader.py | 4 ++- pandas/io/excel/_openpyxl.py | 6 ++++- pandas/io/excel/_pyxlsb.py | 6 ++++- pandas/io/excel/_xlrd.py | 11 ++++++-- 5 files changed, 70 insertions(+), 7 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 030ae9fefda98..ac265ad221ff9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -563,7 +563,7 @@ def get_sheet_by_index(self, index: int): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float: bool): + def get_sheet_data(self, sheet, convert_float: bool, rows: int | None = None): pass def raise_if_bad_sheet_by_index(self, index: int) -> None: @@ -577,6 +577,51 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None: if name not in self.sheet_names: raise ValueError(f"Worksheet named '{name}' not found") + def _check_skiprows_func(self, skiprows, rows_to_use): + """ + See how many file rows are required when `skiprows` is callable + """ + i = 0 + rows_used_so_far = 0 + while rows_used_so_far < rows_to_use: + if not f(i): + rows_used_so_far += 1 + i += 1 + return i + + def _calc_rows(self, header, index_col, skiprows, nrows): + """ + If nrows specifed, find the number of rows needed from the file + """ + if nrows is None: + return + if not isinstance(nrows, int) or nrows < 0: + raise ValueError("'nrows' must be an integer >=0") + if header is None: + header_rows = 1 + elif isinstance(header, int): + header_rows = 1 + header + else: + header_rows = 1 + header[-1] + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + if is_list_like(header) and len(header) > 1 and index_col is not None: + header_rows += 1 + if skiprows is None: + return header_rows + nrows + if isinstance(skiprows, int): + return header_rows + nrows + skiprows + if is_list_like(skiprows): + return self._check_skiprows_func( + lambda x: x in skiprows, + header_rows + nrows, + ) + if callable(skiprows): + return self._check_skiprows_func( + skiprows, + header_rows + nrows, + ) + def parse( self, sheet_name: str | int | list[int] | list[str] | None = 0, @@ -643,7 +688,8 @@ def parse( else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) - data = self.get_sheet_data(sheet, convert_float) + file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows) + data = self.get_sheet_data(sheet, convert_float, file_rows_needed) if hasattr(sheet, "close"): # pyxlsb opens two TemporaryFiles sheet.close() diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 5a7e5b0d8d325..075590f3535fe 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") def get_sheet_data( - self, sheet, convert_float: bool + self, sheet, convert_float: bool, file_rows_needed: int | None = None ) -> list[list[Scalar | NaTType]]: """ Parse an ODF Table into a list of lists @@ -148,6 +148,8 @@ def get_sheet_data( empty_rows = 0 for _ in range(row_repeat): table.append(table_row) + if file_rows_needed is not None and len(table) >= file_rows_needed: + break # Make our table square for row in table: diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 6d70b3f319f37..8f4201d0befff 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -588,7 +588,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value - def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: + def get_sheet_data( + self, sheet, convert_float: bool, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: if self.book.read_only: sheet.reset_dimensions() @@ -603,6 +605,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: if converted_row: last_row_with_data = row_number data.append(converted_row) + if file_rows_needed is not None and len(data) >= file_rows_needed: + break # Trim trailing empty rows data = data[: last_row_with_data + 1] diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 36e2645560078..f05a9f975b0bf 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -79,7 +79,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v - def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: + def get_sheet_data( + self, sheet, convert_float: bool, file_rows_needed: int | None = None, + ) -> list[list[Scalar]]: data: list[list[Scalar]] = [] prevous_row_number = -1 # When sparse=True the rows can have different lengths and empty rows are @@ -94,6 +96,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]: data.extend([[]] * (row_number - prevous_row_number - 1)) data.append(converted_row) prevous_row_number = row_number + if file_rows_needed is not None and len(data) >= file_rows_needed: + break if data: # extend rows to max_width max_width = max(len(data_row) for data_row in data) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index f38a05e7a4e64..bd971837c7d89 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import time import numpy as np @@ -56,7 +58,9 @@ def get_sheet_by_index(self, index): self.raise_if_bad_sheet_by_index(index) return self.book.sheet_by_index(index) - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data( + self, sheet, convert_float: bool, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: from xlrd import ( XL_CELL_BOOLEAN, XL_CELL_DATE, @@ -107,7 +111,10 @@ def _parse_cell(cell_contents, cell_typ): data = [] - for i in range(sheet.nrows): + nrows = sheet.nrows + if file_rows_needed is not None: + nrows = min(nrows, file_rows_needed) + for i in range(nrows): row = [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) From e4d52d8864ba95952918cb31ea1910360ad9d6a8 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 28 Apr 2022 21:44:42 -0600 Subject: [PATCH 02/14] Add nrows tests for multiindex and skiprows --- pandas/io/excel/_base.py | 2 +- pandas/tests/io/excel/test_readers.py | 91 +++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ac265ad221ff9..0966882c95a4c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -584,7 +584,7 @@ def _check_skiprows_func(self, skiprows, rows_to_use): i = 0 rows_used_so_far = 0 while rows_used_so_far < rows_to_use: - if not f(i): + if not skiprows(i): rows_used_so_far += 1 i += 1 return i diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 1e0f74ea41453..9eed3e5518e65 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1219,6 +1219,97 @@ def test_read_excel_nrows_non_integer_parameter(self, read_ext): with pytest.raises(ValueError, match=msg): pd.read_excel("test1" + read_ext, nrows="5") + def test_read_excel_nrows_mi_column(self, read_ext): + expected = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="mi_column", + header=[0, 1], + index_col=0, + ).iloc[:3] + actual = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="mi_column", + header=[0, 1], + index_col=0, + nrows=3 + ) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_mi_index(self, read_ext): + expected = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="mi_index", + index_col=[0, 1], + ).iloc[:3] + actual = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="mi_index", + index_col=[0, 1], + nrows=3 + ) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_mi_both(self, read_ext): + expected = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="both", + header=[0, 1], + index_col=[0, 1], + ).iloc[:3] + actual = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="both", + header=[0, 1], + index_col=[0, 1], + nrows=3 + ) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_mi_column_name(self, read_ext): + expected = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="mi_column_name", + header=[0, 1], + index_col=0, + ).iloc[:3] + actual = pd.read_excel( + "testmultiindex" + read_ext, + sheet_name="mi_column_name", + header=[0, 1], + index_col=0, + nrows=3 + ) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_skiprows_list(self, read_ext): + expected = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=[0, 2], + ).iloc[:3] + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=[0, 2], + nrows=3, + ) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_skiprows_func(self, read_ext): + func = lambda x: x == 0 or x == 2 + expected = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=[0, 2], + ).iloc[:3] + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=[0, 2], + nrows=3, + ) + tm.assert_frame_equal(actual, expected) + def test_read_excel_squeeze(self, read_ext): # GH 12157 f = "test_squeeze" + read_ext From eb83a74fb2f4a9ef8bb507ff52b6652e6fb49342 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 28 Apr 2022 21:57:34 -0600 Subject: [PATCH 03/14] code linting and fix a bug in a test --- pandas/io/excel/_base.py | 4 ++-- pandas/io/excel/_pyxlsb.py | 5 ++++- pandas/io/excel/_xlrd.py | 5 ++++- pandas/tests/io/excel/test_readers.py | 24 ++++++++++++------------ 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 0966882c95a4c..44d140a9e88c1 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -591,12 +591,12 @@ def _check_skiprows_func(self, skiprows, rows_to_use): def _calc_rows(self, header, index_col, skiprows, nrows): """ - If nrows specifed, find the number of rows needed from the file + If nrows specified, find the number of rows needed from the file """ if nrows is None: return if not isinstance(nrows, int) or nrows < 0: - raise ValueError("'nrows' must be an integer >=0") + raise ValueError("'nrows' must be an integer >=0") if header is None: header_rows = 1 elif isinstance(header, int): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index f05a9f975b0bf..5d40ccdf2f8f3 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -80,7 +80,10 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v def get_sheet_data( - self, sheet, convert_float: bool, file_rows_needed: int | None = None, + self, + sheet, + convert_float: bool, + file_rows_needed: int | None = None, ) -> list[list[Scalar]]: data: list[list[Scalar]] = [] prevous_row_number = -1 diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index bd971837c7d89..0bf3ac6134cf6 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -4,7 +4,10 @@ import numpy as np -from pandas._typing import StorageOptions +from pandas._typing import ( + Scalar, + StorageOptions, +) from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 9eed3e5518e65..23d24b0428499 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1225,13 +1225,13 @@ def test_read_excel_nrows_mi_column(self, read_ext): sheet_name="mi_column", header=[0, 1], index_col=0, - ).iloc[:3] + ).iloc[:3] actual = pd.read_excel( "testmultiindex" + read_ext, sheet_name="mi_column", header=[0, 1], index_col=0, - nrows=3 + nrows=3, ) tm.assert_frame_equal(actual, expected) @@ -1240,12 +1240,12 @@ def test_read_excel_nrows_mi_index(self, read_ext): "testmultiindex" + read_ext, sheet_name="mi_index", index_col=[0, 1], - ).iloc[:3] + ).iloc[:3] actual = pd.read_excel( "testmultiindex" + read_ext, sheet_name="mi_index", index_col=[0, 1], - nrows=3 + nrows=3, ) tm.assert_frame_equal(actual, expected) @@ -1255,13 +1255,13 @@ def test_read_excel_nrows_mi_both(self, read_ext): sheet_name="both", header=[0, 1], index_col=[0, 1], - ).iloc[:3] + ).iloc[:3] actual = pd.read_excel( "testmultiindex" + read_ext, sheet_name="both", header=[0, 1], index_col=[0, 1], - nrows=3 + nrows=3, ) tm.assert_frame_equal(actual, expected) @@ -1271,13 +1271,13 @@ def test_read_excel_nrows_mi_column_name(self, read_ext): sheet_name="mi_column_name", header=[0, 1], index_col=0, - ).iloc[:3] + ).iloc[:3] actual = pd.read_excel( "testmultiindex" + read_ext, sheet_name="mi_column_name", header=[0, 1], index_col=0, - nrows=3 + nrows=3, ) tm.assert_frame_equal(actual, expected) @@ -1286,7 +1286,7 @@ def test_read_excel_nrows_skiprows_list(self, read_ext): "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2], - ).iloc[:3] + ).iloc[:3] actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", @@ -1300,12 +1300,12 @@ def test_read_excel_nrows_skiprows_func(self, read_ext): expected = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", - skiprows=[0, 2], - ).iloc[:3] + skiprows=func, + ).iloc[:3] actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", - skiprows=[0, 2], + skiprows=func, nrows=3, ) tm.assert_frame_equal(actual, expected) From 2c643c734f9387327e3ef393482d9de217463282 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 28 Apr 2022 22:04:06 -0600 Subject: [PATCH 04/14] What's new entry --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4920622a15f3f..9700b4f841066 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -451,6 +451,7 @@ Performance improvements - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`) - Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) +- Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`) .. --------------------------------------------------------------------------- .. _whatsnew_150.bug_fixes: From dc14ac2dd4e0f3ce880fcc92890e1928e1a9058c Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 28 Apr 2022 22:52:23 -0600 Subject: [PATCH 05/14] Add asv test with nrows=10 --- asv_bench/benchmarks/io/excel.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 3363b43f29b78..a2d989e787e0f 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -86,4 +86,15 @@ def time_read_excel(self, engine): read_excel(fname, engine=engine) +class ReadExcelNRows(ReadExcel): + def time_read_excel(self, engine): + if engine == "xlrd": + fname = self.fname_excel_xls + elif engine == "odf": + fname = self.fname_odf + else: + fname = self.fname_excel + read_excel(fname, engine=engine, nrows=10) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip From 2e79141e8663a9152dad1e7e1da9137a39c21458 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 5 May 2022 21:06:36 -0600 Subject: [PATCH 06/14] Parametrize new tests --- pandas/tests/io/excel/test_readers.py | 112 +++++++------------------- 1 file changed, 28 insertions(+), 84 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 23d24b0428499..91782f0ecc0b6 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1219,93 +1219,37 @@ def test_read_excel_nrows_non_integer_parameter(self, read_ext): with pytest.raises(ValueError, match=msg): pd.read_excel("test1" + read_ext, nrows="5") - def test_read_excel_nrows_mi_column(self, read_ext): - expected = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="mi_column", - header=[0, 1], - index_col=0, - ).iloc[:3] - actual = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="mi_column", - header=[0, 1], - index_col=0, - nrows=3, - ) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_mi_index(self, read_ext): - expected = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="mi_index", - index_col=[0, 1], - ).iloc[:3] - actual = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="mi_index", - index_col=[0, 1], - nrows=3, - ) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_mi_both(self, read_ext): - expected = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="both", - header=[0, 1], - index_col=[0, 1], - ).iloc[:3] - actual = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="both", - header=[0, 1], - index_col=[0, 1], - nrows=3, - ) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_mi_column_name(self, read_ext): - expected = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="mi_column_name", - header=[0, 1], - index_col=0, - ).iloc[:3] - actual = pd.read_excel( - "testmultiindex" + read_ext, - sheet_name="mi_column_name", - header=[0, 1], - index_col=0, - nrows=3, - ) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_skiprows_list(self, read_ext): - expected = pd.read_excel( - "testskiprows" + read_ext, - sheet_name="skiprows_list", - skiprows=[0, 2], - ).iloc[:3] - actual = pd.read_excel( - "testskiprows" + read_ext, - sheet_name="skiprows_list", - skiprows=[0, 2], - nrows=3, - ) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_skiprows_func(self, read_ext): - func = lambda x: x == 0 or x == 2 + @pytest.mark.parametrize( + "filename,sheet_name,header,index_col,skiprows", + [ + ("testmultiindex", "mi_column", [0, 1], 0, None), + ("testmultiindex", "mi_index", None, [0, 1], None), + ("testmultiindex", "both", [0, 1], [0, 1], None), + ("testmultiindex", "mi_column_name", [0, 1], 0, None), + ("testskiprows", "skiprows_list", None, None, [0, 2]), + ("testskiprows", "skiprows_list", None, None, lambda x: x == 0 or x == 2), + ], + ) + def test_read_excel_nrows_params( + self, read_ext, filename, sheet_name, header, index_col, skiprows + ): + """ + For vaious parameters, we should get the same result whether we + limit the rows during load (nrows=3) or after (df.iloc[:3]). + """ expected = pd.read_excel( - "testskiprows" + read_ext, - sheet_name="skiprows_list", - skiprows=func, + filename + read_ext, + sheet_name=sheet_name, + header=header, + index_col=index_col, + skiprows=skiprows, ).iloc[:3] actual = pd.read_excel( - "testskiprows" + read_ext, - sheet_name="skiprows_list", - skiprows=func, + filename + read_ext, + sheet_name=sheet_name, + header=header, + index_col=index_col, + skiprows=skiprows, nrows=3, ) tm.assert_frame_equal(actual, expected) From 943f866967ba349c7249136c16f723f23bdf0269 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 5 May 2022 21:06:56 -0600 Subject: [PATCH 07/14] Docstrings and type hints --- pandas/io/excel/_base.py | 48 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 44d140a9e88c1..506ab134da351 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -577,9 +577,26 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None: if name not in self.sheet_names: raise ValueError(f"Worksheet named '{name}' not found") - def _check_skiprows_func(self, skiprows, rows_to_use): + def _check_skiprows_func( + self, + skiprows: Callable, + rows_to_use: int, + ) -> int: """ - See how many file rows are required when `skiprows` is callable + Determine how many file rows are required to obtain `nrows` data + rows when `skiprows` is a function. + + Parameters + ---------- + skiprows : function + The function passed to read_excel by the user. + rows_to_use : int + The number of rows that will be needed for the header and + the data. + + Returns + ------- + int """ i = 0 rows_used_so_far = 0 @@ -589,9 +606,32 @@ def _check_skiprows_func(self, skiprows, rows_to_use): i += 1 return i - def _calc_rows(self, header, index_col, skiprows, nrows): + def _calc_rows( + self, + header: int | Sequence[int] | None, + index_col: int | Sequence[int] | None, + skiprows: Sequence[int] | int | Callable[[int], object] | None, + nrows: int | None, + ) -> int | None: """ - If nrows specified, find the number of rows needed from the file + If nrows specified, find the number of rows needed from the + file, otherwise return None. + + + Parameters + ---------- + header : int, list of int, default 0 + See read_excel docstring. + index_col : int, list of int, default None + See read_excel docstring. + skiprows : list-like, int, or callable, optional + See read_excel docstring. + nrows : int, default None + See read_excel docstring. + + Returns + ------- + int or None """ if nrows is None: return From 10a8a3e025adb763c9b184ac0bff6bd089135ce6 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 5 May 2022 21:09:44 -0600 Subject: [PATCH 08/14] Add PR # --- pandas/tests/io/excel/test_readers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 91782f0ecc0b6..c58896e9e1baf 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1234,9 +1234,10 @@ def test_read_excel_nrows_params( self, read_ext, filename, sheet_name, header, index_col, skiprows ): """ - For vaious parameters, we should get the same result whether we + For various parameters, we should get the same result whether we limit the rows during load (nrows=3) or after (df.iloc[:3]). """ + # GH 46894 expected = pd.read_excel( filename + read_ext, sheet_name=sheet_name, From 30a280ce543f9adbb852e452ad6417241f015ab5 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Thu, 5 May 2022 21:31:13 -0600 Subject: [PATCH 09/14] Use is_integer and validate_integer --- pandas/io/excel/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 506ab134da351..429ca6eba2dc2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -70,6 +70,7 @@ pop_header_name, ) from pandas.io.parsers import TextParser +from pandas.io.parsers.readers import validate_integer _read_excel_doc = ( """ @@ -633,13 +634,12 @@ def _calc_rows( ------- int or None """ + validate_integer("nrows", nrows) if nrows is None: return - if not isinstance(nrows, int) or nrows < 0: - raise ValueError("'nrows' must be an integer >=0") if header is None: header_rows = 1 - elif isinstance(header, int): + elif is_integer(header): header_rows = 1 + header else: header_rows = 1 + header[-1] @@ -649,7 +649,7 @@ def _calc_rows( header_rows += 1 if skiprows is None: return header_rows + nrows - if isinstance(skiprows, int): + if is_integer(skiprows): return header_rows + nrows + skiprows if is_list_like(skiprows): return self._check_skiprows_func( From bbc1f1d51ebb5f33494a0f1f32d00fd6db829d69 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Fri, 6 May 2022 11:13:19 -0600 Subject: [PATCH 10/14] Consolidate header arg validation --- pandas/io/common.py | 27 +++++++++++++++++++++++++-- pandas/io/excel/_base.py | 12 +++++++----- pandas/io/parsers/base_parser.py | 29 ++++++----------------------- 3 files changed, 38 insertions(+), 30 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 57015924ce77f..6721bcbe98fe2 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -54,7 +54,12 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import is_file_like +from pandas.core.dtypes.common import ( + is_bool, + is_file_like, + is_integer, + is_list_like, +) from pandas.core.shared_docs import _shared_docs @@ -175,12 +180,30 @@ def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT: def validate_header_arg(header: object) -> None: - if isinstance(header, bool): + if header is None: + return + if is_integer(header): + if header < 0: + # GH 27779 + raise ValueError( + "Passing negative integer to header is invalid. " + "For no header, use header=None instead" + ) + return + if is_list_like(header): + if not all(map(is_integer, header)): + raise ValueError("header must be integer or list of integers") + if any(i < 0 for i in header): + raise ValueError("cannot specify multi-index header with negative integers") + return + if is_bool(header): raise TypeError( "Passing a bool to header is invalid. Use header=None for no header or " "header=int or list-like of ints to specify " "the row(s) making up the column names" ) + # GH 16338 + raise ValueError("header must be integer or list of integers") @overload diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 429ca6eba2dc2..302b7efb46df9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -621,20 +621,19 @@ def _calc_rows( Parameters ---------- - header : int, list of int, default 0 + header : int, list of int, or None See read_excel docstring. - index_col : int, list of int, default None + index_col : int, list of int, or None See read_excel docstring. - skiprows : list-like, int, or callable, optional + skiprows : list-like, int, callable, or None See read_excel docstring. - nrows : int, default None + nrows : int or None See read_excel docstring. Returns ------- int or None """ - validate_integer("nrows", nrows) if nrows is None: return if header is None: @@ -661,6 +660,8 @@ def _calc_rows( skiprows, header_rows + nrows, ) + # else unexpected skiprows type: read_excel will not optimize + # the number of rows read from file def parse( self, @@ -698,6 +699,7 @@ def parse( ) validate_header_arg(header) + validate_integer("nrows", nrows) ret_dict = False diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2851ea36c8a33..7be895fd2b942 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -120,13 +120,7 @@ def __init__(self, kwds) -> None: # validate header options for mi self.header = kwds.get("header") - if isinstance(self.header, (list, tuple, np.ndarray)): - if not all(map(is_integer, self.header)): - raise ValueError("header must be integer or list of integers") - if any(i < 0 for i in self.header): - raise ValueError( - "cannot specify multi-index header with negative integers" - ) + if is_list_like(self.header): if kwds.get("usecols"): raise ValueError( "cannot specify usecols when specifying a multi-index header" @@ -138,9 +132,8 @@ def __init__(self, kwds) -> None: # validate index_col that only contains integers if self.index_col is not None: - is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) if not ( - is_sequence + is_list_like(self.index_col) and all(map(is_integer, self.index_col)) or is_integer(self.index_col) ): @@ -148,21 +141,11 @@ def __init__(self, kwds) -> None: "index_col must only contain row numbers " "when specifying a multi-index header" ) - elif self.header is not None: + elif self.header is not None and self.prefix is not None: # GH 27394 - if self.prefix is not None: - raise ValueError( - "Argument prefix must be None if argument header is not None" - ) - # GH 16338 - elif not is_integer(self.header): - raise ValueError("header must be integer or list of integers") - # GH 27779 - elif self.header < 0: - raise ValueError( - "Passing negative integer to header is invalid. " - "For no header, use header=None instead" - ) + raise ValueError( + "Argument prefix must be None if argument header is not None" + ) self._name_processed = False From 2018d26319652c3e81d4cb82565a044a1816b293 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Fri, 6 May 2022 18:11:52 -0600 Subject: [PATCH 11/14] Attempting to placate mypy with assert statements --- pandas/io/common.py | 3 +++ pandas/io/excel/_base.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 6721bcbe98fe2..6c8d491f2d870 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -26,6 +26,7 @@ Generic, Literal, Mapping, + Sequence, TypeVar, cast, overload, @@ -183,6 +184,7 @@ def validate_header_arg(header: object) -> None: if header is None: return if is_integer(header): + assert isinstance(header, int) if header < 0: # GH 27779 raise ValueError( @@ -191,6 +193,7 @@ def validate_header_arg(header: object) -> None: ) return if is_list_like(header): + assert isinstance(header, Sequence) if not all(map(is_integer, header)): raise ValueError("header must be integer or list of integers") if any(i < 0 for i in header): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 302b7efb46df9..c62d835eb217a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -635,22 +635,28 @@ def _calc_rows( int or None """ if nrows is None: - return + return None if header is None: header_rows = 1 elif is_integer(header): + assert isinstance(header, int) header_rows = 1 + header else: + assert isinstance(header, Sequence) header_rows = 1 + header[-1] # If there is a MultiIndex header and an index then there is also # a row containing just the index name(s) - if is_list_like(header) and len(header) > 1 and index_col is not None: - header_rows += 1 + if is_list_like(header) and index_col is not None: + assert isinstance(header, Sequence) + if len(header) > 1: + header_rows += 1 if skiprows is None: return header_rows + nrows if is_integer(skiprows): + assert isinstance(skiprows, int) return header_rows + nrows + skiprows if is_list_like(skiprows): + assert isinstance(skiprows, Sequence) return self._check_skiprows_func( lambda x: x in skiprows, header_rows + nrows, @@ -662,6 +668,7 @@ def _calc_rows( ) # else unexpected skiprows type: read_excel will not optimize # the number of rows read from file + return None def parse( self, From c108a97ae0bfeb9e2e36fa4f485c05f2d62960f7 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Mon, 9 May 2022 22:09:26 -0600 Subject: [PATCH 12/14] make type checks pass with typing.cast --- pandas/io/common.py | 4 ++-- pandas/io/excel/_base.py | 20 +++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 6c8d491f2d870..77f17f12d9d3a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -184,7 +184,7 @@ def validate_header_arg(header: object) -> None: if header is None: return if is_integer(header): - assert isinstance(header, int) + header = cast(int, header) if header < 0: # GH 27779 raise ValueError( @@ -193,7 +193,7 @@ def validate_header_arg(header: object) -> None: ) return if is_list_like(header): - assert isinstance(header, Sequence) + header = cast(Sequence, header) if not all(map(is_integer, header)): raise ValueError("header must be integer or list of integers") if any(i < 0 for i in header): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c62d835eb217a..d20f347e54d6b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -2,6 +2,7 @@ import abc import datetime +from functools import partial from io import BytesIO import os from textwrap import fill @@ -639,28 +640,29 @@ def _calc_rows( if header is None: header_rows = 1 elif is_integer(header): - assert isinstance(header, int) + header = cast(int, header) header_rows = 1 + header else: - assert isinstance(header, Sequence) + header = cast(Sequence, header) header_rows = 1 + header[-1] # If there is a MultiIndex header and an index then there is also # a row containing just the index name(s) if is_list_like(header) and index_col is not None: - assert isinstance(header, Sequence) + header = cast(Sequence, header) if len(header) > 1: header_rows += 1 if skiprows is None: return header_rows + nrows if is_integer(skiprows): - assert isinstance(skiprows, int) + skiprows = cast(int, skiprows) return header_rows + nrows + skiprows if is_list_like(skiprows): - assert isinstance(skiprows, Sequence) - return self._check_skiprows_func( - lambda x: x in skiprows, - header_rows + nrows, - ) + + def f(skiprows: Sequence, x: int) -> bool: + return x in skiprows + + skiprows = cast(Sequence, skiprows) + return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows) if callable(skiprows): return self._check_skiprows_func( skiprows, From 3aad8350982498d3192532b72c6d9a114cd608f5 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Tue, 10 May 2022 22:22:59 -0600 Subject: [PATCH 13/14] Do not allow sets for header arg Sets for skiprows currently work, so I've left that as is. --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 77f17f12d9d3a..8f9994ee3952b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -192,7 +192,7 @@ def validate_header_arg(header: object) -> None: "For no header, use header=None instead" ) return - if is_list_like(header): + if is_list_like(header, allow_sets=False): header = cast(Sequence, header) if not all(map(is_integer, header)): raise ValueError("header must be integer or list of integers") From d6e1df34d10ed40faf3db29a6d06147f963496a1 Mon Sep 17 00:00:00 2001 From: Andrew Hawryluk Date: Wed, 25 May 2022 13:32:13 -0600 Subject: [PATCH 14/14] Two more allow_set=False --- pandas/io/parsers/base_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 7be895fd2b942..e9c39d5ff1996 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -120,7 +120,7 @@ def __init__(self, kwds) -> None: # validate header options for mi self.header = kwds.get("header") - if is_list_like(self.header): + if is_list_like(self.header, allow_sets=False): if kwds.get("usecols"): raise ValueError( "cannot specify usecols when specifying a multi-index header" @@ -133,7 +133,7 @@ def __init__(self, kwds) -> None: # validate index_col that only contains integers if self.index_col is not None: if not ( - is_list_like(self.index_col) + is_list_like(self.index_col, allow_sets=False) and all(map(is_integer, self.index_col)) or is_integer(self.index_col) ):