From 900afff344ad9e2df7d46c74cc23b2f787b149f9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 27 Mar 2020 19:56:24 +0100 Subject: [PATCH 01/66] ENH: Skip rows while reading excel file with engine=openpyxl --- pandas/io/excel/_base.py | 4 ++-- pandas/io/excel/_odfreader.py | 2 +- pandas/io/excel/_openpyxl.py | 22 +++++++++++++++++++--- pandas/io/excel/_pyxlsb.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f98d9501f1f73..09977039521c7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -380,7 +380,7 @@ def get_sheet_by_index(self, index): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): pass def parse( @@ -436,7 +436,7 @@ def parse( else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) - data = self.get_sheet_data(sheet, convert_float) + data = self.get_sheet_data(sheet, convert_float, header, skiprows, nrows) usecols = _maybe_convert_usecols(usecols) if not data: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 7af776dc1a10f..edd57a4aba0be 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -63,7 +63,7 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists """ diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a96c0f814e2d8..61a563c05bd56 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -524,9 +524,25 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value - def get_sheet_data(self, sheet, 
convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - for row in sheet.rows: - data.append([self._convert_cell(cell, convert_float) for cell in row]) + skiprows = 0 if skiprows is None else skiprows + header = 0 if header is None else header + + if nrows is not None: + for row in sheet.rows: + if header > 1: + header -= 1 + data.append(["", ""]) + elif skiprows > 0: + skiprows -= 1 + data.append(["", ""]) + elif nrows >= 0: + nrows -= 1 + else: + break + else: + for row in sheet.rows: + data.append([self._convert_cell(cell, convert_float) for cell in row]) return data diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 0d96c8c4acdb8..23385bcf60d9c 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -62,7 +62,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] for r in sheet.rows(sparse=False) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..afdd40d747c49 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -46,7 +46,7 @@ def get_sheet_by_name(self, name): def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): from xlrd import ( xldate, XL_CELL_DATE, From df55b51320eb695830ed9846b2d03c283727dfe5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 4 Apr 2020 02:07:03 +0200 Subject: [PATCH 02/66] ENH: Skipping rows with odf engine --- pandas/io/excel/_odfreader.py | 17 +++++++++++++++++ pandas/io/excel/_openpyxl.py | 34 
++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index edd57a4aba0be..e79a186e30673 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -78,8 +78,25 @@ def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> max_row_len = 0 table: List[List[Scalar]] = [] + header = 0 if header is None else header + skiprows = 0 if skiprows is None else skiprows + nrows = 0 if nrows is None else nrows for i, sheet_row in enumerate(sheet_rows): + + if header > 1: + header -= 1 + table.append([]) + continue + elif skiprows > 0: + skiprows -= 1 + table.append([]) + continue + elif nrows >= 0: + nrows -= 1 + else: + break + sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 table_row: List[Scalar] = [] diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 61a563c05bd56..2e8c478132890 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -526,23 +526,25 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - skiprows = 0 if skiprows is None else skiprows header = 0 if header is None else header + skiprows = 0 if skiprows is None else skiprows + nrows = 0 if nrows is None else nrows + + for row in sheet.rows: + + if header > 1: + header -= 1 + data.append([]) + continue + elif skiprows > 0: + skiprows -= 1 + data.append([]) + continue + elif nrows >= 0: + nrows -= 1 + else: + break - if nrows is not None: - for row in sheet.rows: - if header > 1: - header -= 1 - data.append(["", ""]) - elif skiprows > 0: - skiprows -= 1 - data.append(["", ""]) - elif nrows >= 0: - nrows -= 1 - else: - break - else: - for row in sheet.rows: - data.append([self._convert_cell(cell, convert_float) for cell in row]) + 
data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From 817702435117118a7182afc7899d585fdc479861 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 4 Apr 2020 03:02:42 +0200 Subject: [PATCH 03/66] ENH: Optimize nrows in read_excel --- pandas/io/excel/_odfreader.py | 31 ++++++++++++++++++------------- pandas/io/excel/_openpyxl.py | 32 ++++++++++++++++++-------------- pandas/io/excel/_xlrd.py | 22 +++++++++++++++++++++- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index e79a186e30673..70bdba1942aa2 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -6,6 +6,7 @@ import pandas as pd from pandas.io.excel._base import _BaseExcelReader +from pandas.io.parsers import _validate_integer class _ODFReader(_BaseExcelReader): @@ -78,24 +79,28 @@ def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> max_row_len = 0 table: List[List[Scalar]] = [] + + if nrows is not None: _validate_integer("nrows", nrows) header = 0 if header is None else header skiprows = 0 if skiprows is None else skiprows - nrows = 0 if nrows is None else nrows + if isinstance(header, list) or isinstance(skiprows, list): + nrows = None for i, sheet_row in enumerate(sheet_rows): - if header > 1: - header -= 1 - table.append([]) - continue - elif skiprows > 0: - skiprows -= 1 - table.append([]) - continue - elif nrows >= 0: - nrows -= 1 - else: - break + if nrows is not None: + if header > 1: + header -= 1 + data.append([]) + continue + elif skiprows > 0: + skiprows -= 1 + data.append([]) + continue + if nrows >= 0: + nrows -= 1 + else: + break sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 2e8c478132890..58e096f1269d6 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -7,7 +7,7 @@ from 
pandas.io.excel._base import ExcelWriter, _BaseExcelReader from pandas.io.excel._util import _validate_freeze_panes - +from pandas.io.parsers import _validate_integer class _OpenpyxlWriter(ExcelWriter): engine = "openpyxl" @@ -526,24 +526,28 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: data: List[List[Scalar]] = [] + + if nrows is not None: _validate_integer("nrows", nrows) header = 0 if header is None else header skiprows = 0 if skiprows is None else skiprows - nrows = 0 if nrows is None else nrows + if isinstance(header, list) or isinstance(skiprows, list): + nrows = None for row in sheet.rows: - if header > 1: - header -= 1 - data.append([]) - continue - elif skiprows > 0: - skiprows -= 1 - data.append([]) - continue - elif nrows >= 0: - nrows -= 1 - else: - break + if nrows is not None: + if header > 1: + header -= 1 + data.append([]) + continue + elif skiprows > 0: + skiprows -= 1 + data.append([]) + continue + if nrows >= 0: + nrows -= 1 + else: + break data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index afdd40d747c49..f3089e44d6faf 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -5,7 +5,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader - +from pandas.io.parsers import _validate_integer class _XlrdReader(_BaseExcelReader): def __init__(self, filepath_or_buffer): @@ -97,7 +97,27 @@ def _parse_cell(cell_contents, cell_typ): data = [] + if nrows is not None: _validate_integer("nrows", nrows) + header = 0 if header is None else header + skiprows = 0 if skiprows is None else skiprows + if isinstance(header, list) or isinstance(skiprows, list): + nrows = None for i in range(sheet.nrows): + + if nrows is not None: + if header > 1: + header -= 1 + data.append([]) + 
continue + elif skiprows > 0: + skiprows -= 1 + data.append([]) + continue + if nrows >= 0: + nrows -= 1 + else: + break + row = [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) From 79b34c3851375d1b46dc2e0215c3e7f5089632d5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 4 Apr 2020 03:09:17 +0200 Subject: [PATCH 04/66] Reformatted --- pandas/io/excel/_odfreader.py | 9 ++++++--- pandas/io/excel/_openpyxl.py | 14 +++++++++----- pandas/io/excel/_xlrd.py | 6 ++++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 70bdba1942aa2..51e22d156663b 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -64,7 +64,9 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") - def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: + def get_sheet_data( + self, sheet, convert_float: bool, header, skiprows, nrows + ) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists """ @@ -80,7 +82,8 @@ def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> table: List[List[Scalar]] = [] - if nrows is not None: _validate_integer("nrows", nrows) + if nrows is not None: + _validate_integer("nrows", nrows) header = 0 if header is None else header skiprows = 0 if skiprows is None else skiprows if isinstance(header, list) or isinstance(skiprows, list): @@ -99,7 +102,7 @@ def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> continue if nrows >= 0: nrows -= 1 - else: + else: break sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 58e096f1269d6..2fad546681a2c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -9,6 +9,7 @@ from pandas.io.excel._util import _validate_freeze_panes 
from pandas.io.parsers import _validate_integer + class _OpenpyxlWriter(ExcelWriter): engine = "openpyxl" supported_extensions = (".xlsx", ".xlsm") @@ -524,16 +525,19 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value - def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: + def get_sheet_data( + self, sheet, convert_float: bool, header, skiprows, nrows + ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - if nrows is not None: _validate_integer("nrows", nrows) + if nrows is not None: + _validate_integer("nrows", nrows) header = 0 if header is None else header skiprows = 0 if skiprows is None else skiprows if isinstance(header, list) or isinstance(skiprows, list): nrows = None - - for row in sheet.rows: + + for row in sheet.rows: if nrows is not None: if header > 1: @@ -546,7 +550,7 @@ def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> continue if nrows >= 0: nrows -= 1 - else: + else: break data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index f3089e44d6faf..72ff7ceaa37a3 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -7,6 +7,7 @@ from pandas.io.excel._base import _BaseExcelReader from pandas.io.parsers import _validate_integer + class _XlrdReader(_BaseExcelReader): def __init__(self, filepath_or_buffer): """ @@ -97,7 +98,8 @@ def _parse_cell(cell_contents, cell_typ): data = [] - if nrows is not None: _validate_integer("nrows", nrows) + if nrows is not None: + _validate_integer("nrows", nrows) header = 0 if header is None else header skiprows = 0 if skiprows is None else skiprows if isinstance(header, list) or isinstance(skiprows, list): @@ -115,7 +117,7 @@ def _parse_cell(cell_contents, cell_typ): continue if nrows >= 0: nrows -= 1 - else: + else: break row = [ From f0a2b8d912b5bb7f5384e1159787a5b7fd46e0e1 Mon Sep 17 00:00:00 2001 From: 
mproszewska Date: Sat, 4 Apr 2020 13:54:25 +0200 Subject: [PATCH 05/66] Fix linting --- pandas/io/excel/_odfreader.py | 4 ++-- pandas/io/excel/_pyxlsb.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 51e22d156663b..bd32d6c2a4d8f 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -94,11 +94,11 @@ def get_sheet_data( if nrows is not None: if header > 1: header -= 1 - data.append([]) + table.append([]) continue elif skiprows > 0: skiprows -= 1 - data.append([]) + table.append([]) continue if nrows >= 0: nrows -= 1 diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 23385bcf60d9c..2948eff15a2c0 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -62,7 +62,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v - def get_sheet_data(self, sheet, convert_float: bool, header, skiprows, nrows) -> List[List[Scalar]]: + def get_sheet_data( + self, sheet, convert_float: bool, header, skiprows, nrows + ) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] for r in sheet.rows(sparse=False) From 70ac23483e22752ab8332b5acaf4dabfb0bcae41 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 4 Apr 2020 14:17:09 +0200 Subject: [PATCH 06/66] Add annotation to variable --- pandas/io/excel/_xlrd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 72ff7ceaa37a3..55822a40d94d9 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -96,7 +96,7 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] + data: List[List[Scalar]] = [] if nrows is not None: _validate_integer("nrows", nrows) From 27cae3ad72361f96656743618de32f1a73ef711d Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 4 Apr 2020 14:37:49 +0200 Subject: [PATCH 07/66] Add 
imports --- pandas/io/excel/_xlrd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 55822a40d94d9..4926540aa5720 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,7 +1,9 @@ from datetime import time +from typing import List import numpy as np +from pandas._typing import Scalar from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader From 4248f8c3311b4542fc8edf87a83c04ff45de86be Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 5 Apr 2020 01:21:27 +0200 Subject: [PATCH 08/66] Add types --- pandas/io/excel/_odfreader.py | 7 ++++--- pandas/io/excel/_openpyxl.py | 7 ++++--- pandas/io/excel/_xlrd.py | 9 ++++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index bd32d6c2a4d8f..13b2a95dc30e8 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,6 +1,6 @@ -from typing import List +from typing import List, Sequence -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, Union from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -65,7 +65,8 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") def get_sheet_data( - self, sheet, convert_float: bool, header, skiprows, nrows + self, sheet, convert_float: bool, header: Union[int, Sequence[int]], + skiprows: Union[int, Sequence[int]], nrows: int ) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 2fad546681a2c..3c47944d8b397 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,8 +1,8 @@ -from typing import List +from typing import List, Sequence import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from 
pandas._typing import FilePathOrBuffer, Scalar, Union from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader @@ -526,7 +526,8 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data( - self, sheet, convert_float: bool, header, skiprows, nrows + self, sheet, convert_float: bool, header: Union[int, Sequence[int]], + skiprows: Union[int, Sequence[int]], nrows: int ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 4926540aa5720..9ff62f781039b 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,9 +1,9 @@ from datetime import time -from typing import List +from typing import List, Sequence import numpy as np -from pandas._typing import Scalar +from pandas._typing import Scalar, Union from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader @@ -49,7 +49,10 @@ def get_sheet_by_name(self, name): def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) - def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): + def get_sheet_data( + self, sheet, convert_float, header: Union[int, Sequence[int]], + skiprows: Union[int, Sequence[int]], nrows: int + ) -> List[List[Scalar]]: from xlrd import ( xldate, XL_CELL_DATE, From 70f46b3850bd02549b350e94a97dbe41a9159ef7 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 9 Apr 2020 20:37:59 +0200 Subject: [PATCH 09/66] ENH: Fix --- pandas/io/excel/_odfreader.py | 8 ++++++-- pandas/io/excel/_openpyxl.py | 8 ++++++-- pandas/io/excel/_xlrd.py | 8 ++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 13b2a95dc30e8..9ee93e55625a2 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -65,8 +65,12 @@ def get_sheet_by_name(self, 
name: str): raise ValueError(f"sheet {name} not found") def get_sheet_data( - self, sheet, convert_float: bool, header: Union[int, Sequence[int]], - skiprows: Union[int, Sequence[int]], nrows: int + self, + sheet, + convert_float: bool, + header: Union[int, Sequence[int]], + skiprows: Union[int, Sequence[int]], + nrows: int, ) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 3c47944d8b397..d73382c8cccef 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -526,8 +526,12 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data( - self, sheet, convert_float: bool, header: Union[int, Sequence[int]], - skiprows: Union[int, Sequence[int]], nrows: int + self, + sheet, + convert_float: bool, + header: Union[int, Sequence[int]], + skiprows: Union[int, Sequence[int]], + nrows: int, ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 9ff62f781039b..b35b5e0d5b667 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -50,8 +50,12 @@ def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) def get_sheet_data( - self, sheet, convert_float, header: Union[int, Sequence[int]], - skiprows: Union[int, Sequence[int]], nrows: int + self, + sheet, + convert_float, + header: Union[int, Sequence[int]], + skiprows: Union[int, Sequence[int]], + nrows: int, ) -> List[List[Scalar]]: from xlrd import ( xldate, From cdfc05dea1de5478ffcb75860724b479c70c4872 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 9 Apr 2020 21:38:37 +0200 Subject: [PATCH 10/66] ENH: Mark variables as optional --- pandas/io/excel/_odfreader.py | 8 ++++---- pandas/io/excel/_openpyxl.py | 8 ++++---- pandas/io/excel/_xlrd.py | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/io/excel/_odfreader.py 
b/pandas/io/excel/_odfreader.py index 9ee93e55625a2..717f87aa15f13 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,4 +1,4 @@ -from typing import List, Sequence +from typing import List, Optional, Sequence from pandas._typing import FilePathOrBuffer, Scalar, Union from pandas.compat._optional import import_optional_dependency @@ -68,9 +68,9 @@ def get_sheet_data( self, sheet, convert_float: bool, - header: Union[int, Sequence[int]], - skiprows: Union[int, Sequence[int]], - nrows: int, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], ) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index d73382c8cccef..2c99690ed5fc4 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,4 +1,4 @@ -from typing import List, Sequence +from typing import List, Optional, Sequence import numpy as np @@ -529,9 +529,9 @@ def get_sheet_data( self, sheet, convert_float: bool, - header: Union[int, Sequence[int]], - skiprows: Union[int, Sequence[int]], - nrows: int, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index b35b5e0d5b667..76d5e1337a755 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,5 +1,5 @@ from datetime import time -from typing import List, Sequence +from typing import List, Optional, Sequence import numpy as np @@ -53,9 +53,9 @@ def get_sheet_data( self, sheet, convert_float, - header: Union[int, Sequence[int]], - skiprows: Union[int, Sequence[int]], - nrows: int, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], ) -> List[List[Scalar]]: from xlrd import ( xldate, From 
4c8a42a95dac8a48cf97c055e6b8e9196726e4b9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 10 Apr 2020 02:52:46 +0200 Subject: [PATCH 11/66] ENH: Move nrows variable check --- pandas/io/excel/_base.py | 6 ++++-- pandas/io/excel/_odfreader.py | 2 -- pandas/io/excel/_openpyxl.py | 2 -- pandas/io/excel/_pyxlsb.py | 7 ++++--- pandas/io/excel/_xlrd.py | 3 +-- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index efb9cc1da1cc0..82e663e48518d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -28,7 +28,7 @@ _pop_header_name, get_writer, ) -from pandas.io.parsers import TextParser +from pandas.io.parsers import TextParser, _validate_integer _read_excel_doc = ( """ @@ -307,6 +307,9 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) + if nrows is not None: + _validate_integer("nrows", nrows) + return io.parse( sheet_name=sheet_name, header=header, @@ -333,7 +336,6 @@ def read_excel( **kwds, ) - class _BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 80ff3a907cd49..e3a51eb8bcf20 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -87,8 +87,6 @@ def get_sheet_data( table: List[List[Scalar]] = [] - if nrows is not None: - _validate_integer("nrows", nrows) header = 0 if header is None else header skiprows = 0 if skiprows is None else skiprows if isinstance(header, list) or isinstance(skiprows, list): diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 100a44ac8e36c..366574f0ddcda 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -540,8 +540,6 @@ def get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - if nrows is not None: - _validate_integer("nrows", nrows) header = 0 if header is None 
else header skiprows = 0 if skiprows is None else skiprows if isinstance(header, list) or isinstance(skiprows, list): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 2948eff15a2c0..51b5d9bb73211 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,6 +1,6 @@ -from typing import List +from typing import List, Optional, Sequence -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, Union from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader @@ -63,7 +63,8 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v def get_sheet_data( - self, sheet, convert_float: bool, header, skiprows, nrows + self, sheet, convert_float: bool, header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], nrows: Optional[int] ) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 76d5e1337a755..7915acfb98b46 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -107,12 +107,11 @@ def _parse_cell(cell_contents, cell_typ): data: List[List[Scalar]] = [] - if nrows is not None: - _validate_integer("nrows", nrows) header = 0 if header is None else header skiprows = 0 if skiprows is None else skiprows if isinstance(header, list) or isinstance(skiprows, list): nrows = None + for i in range(sheet.nrows): if nrows is not None: From 19bb9275e76f0b270a6f9fa72bd2a03a19d130cb Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 10 Apr 2020 03:31:28 +0200 Subject: [PATCH 12/66] ENH: Remove unused imports --- pandas/io/excel/_base.py | 1 + pandas/io/excel/_odfreader.py | 1 - pandas/io/excel/_openpyxl.py | 1 - pandas/io/excel/_pyxlsb.py | 8 ++++++-- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 
82e663e48518d..6156487dca755 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -336,6 +336,7 @@ def read_excel( **kwds, ) + class _BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index e3a51eb8bcf20..d15266a2b41a0 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -6,7 +6,6 @@ import pandas as pd from pandas.io.excel._base import _BaseExcelReader -from pandas.io.parsers import _validate_integer class _ODFReader(_BaseExcelReader): diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 366574f0ddcda..58f82cbfb1e7f 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -7,7 +7,6 @@ from pandas.io.excel._base import ExcelWriter, _BaseExcelReader from pandas.io.excel._util import _validate_freeze_panes -from pandas.io.parsers import _validate_integer class _OpenpyxlWriter(ExcelWriter): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 51b5d9bb73211..a7962572775b5 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -63,8 +63,12 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v def get_sheet_data( - self, sheet, convert_float: bool, header: Optional[Union[int, Sequence[int]]], - skiprows: Optional[Union[int, Sequence[int]]], nrows: Optional[int] + self, + sheet, + convert_float: bool, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], ) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] From 6c2a3b59897af655d883eba1edf7e29497be5134 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 01:36:58 +0200 Subject: [PATCH 13/66] ENH: Move repeated code to base --- pandas/io/excel/_base.py | 19 ++++++++++++++++++- 
pandas/io/excel/_odfreader.py | 26 ++++++++------------------ pandas/io/excel/_openpyxl.py | 30 ++++++++++-------------------- pandas/io/excel/_xlrd.py | 26 ++++++++------------------ 4 files changed, 44 insertions(+), 57 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6156487dca755..d98b11bdb20d6 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -389,6 +389,15 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): pass + def should_read_row(self, index, header, skiprows, nrows): + if nrows is not None: + if index <= header - 1 + skiprows: + return True, False + if index <= header - 1 + skiprows + nrows + 1: + return False, False + return False, True + return False, False + def parse( self, sheet_name=0, @@ -442,7 +451,15 @@ def parse( else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) - data = self.get_sheet_data(sheet, convert_float, header, skiprows, nrows) + gsd_header = 0 if header is None else header + gsd_skiprows = 0 if skiprows is None else skiprows + gsd_nrows = nrows + if isinstance(gsd_header, list) or isinstance(gsd_skiprows, list): + gsd_nrows = None + + data = self.get_sheet_data( + sheet, convert_float, gsd_header, gsd_skiprows, gsd_nrows + ) usecols = _maybe_convert_usecols(usecols) if not data: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index d15266a2b41a0..4094e9da223ea 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -86,26 +86,16 @@ def get_sheet_data( table: List[List[Scalar]] = [] - header = 0 if header is None else header - skiprows = 0 if skiprows is None else skiprows - if isinstance(header, list) or isinstance(skiprows, list): - nrows = None - for i, sheet_row in enumerate(sheet_rows): - if nrows is not None: - if header > 1: - header -= 1 - table.append([]) - continue - elif skiprows > 0: - skiprows -= 1 - table.append([]) - 
continue - if nrows >= 0: - nrows -= 1 - else: - break + should_continue, should_break = self.should_read_row( + i, header, skiprows, nrows + ) + if should_continue: + table.append([]) + continue + if should_break: + break sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 58f82cbfb1e7f..3cfc7aaa55837 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,26 +539,16 @@ def get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - header = 0 if header is None else header - skiprows = 0 if skiprows is None else skiprows - if isinstance(header, list) or isinstance(skiprows, list): - nrows = None - - for row in sheet.rows: - - if nrows is not None: - if header > 1: - header -= 1 - data.append([]) - continue - elif skiprows > 0: - skiprows -= 1 - data.append([]) - continue - if nrows >= 0: - nrows -= 1 - else: - break + for i, row in enumerate(sheet.rows): + + should_continue, should_break = self.should_read_row( + i, header, skiprows, nrows + ) + if should_continue: + data.append([]) + continue + if should_break: + break data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 7915acfb98b46..5c7ae62febe3f 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -107,26 +107,16 @@ def _parse_cell(cell_contents, cell_typ): data: List[List[Scalar]] = [] - header = 0 if header is None else header - skiprows = 0 if skiprows is None else skiprows - if isinstance(header, list) or isinstance(skiprows, list): - nrows = None - for i in range(sheet.nrows): - if nrows is not None: - if header > 1: - header -= 1 - data.append([]) - continue - elif skiprows > 0: - skiprows -= 1 - data.append([]) - continue - if nrows >= 0: - nrows -= 1 - else: - break + should_continue, should_break = self.should_read_row( + i, header, 
skiprows, nrows + ) + if should_continue: + table.append([]) + continue + if should_break: + break row = [ _parse_cell(value, typ) From b865c8814bc848522ee8a08002f40e8b643e76e0 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 01:42:49 +0200 Subject: [PATCH 14/66] ENH: Remove import --- pandas/io/excel/_xlrd.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 5c7ae62febe3f..ad3eb665b1818 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -7,8 +7,6 @@ from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader -from pandas.io.parsers import _validate_integer - class _XlrdReader(_BaseExcelReader): def __init__(self, filepath_or_buffer): @@ -109,9 +107,7 @@ def _parse_cell(cell_contents, cell_typ): for i in range(sheet.nrows): - should_continue, should_break = self.should_read_row( - i, header, skiprows, nrows - ) + should_continue, should_break = self.should_read_row(i, header, skiprows, nrows) if should_continue: table.append([]) continue From 49276daccfa7e1b21bef5a7f3bf6ae1eeb0b3276 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 01:43:59 +0200 Subject: [PATCH 15/66] ENH: Lint --- pandas/io/excel/_xlrd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index ad3eb665b1818..5fd948cfe4518 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -8,6 +8,7 @@ from pandas.io.excel._base import _BaseExcelReader + class _XlrdReader(_BaseExcelReader): def __init__(self, filepath_or_buffer): """ @@ -107,7 +108,9 @@ def _parse_cell(cell_contents, cell_typ): for i in range(sheet.nrows): - should_continue, should_break = self.should_read_row(i, header, skiprows, nrows) + should_continue, should_break = self.should_read_row( + i, header, skiprows, nrows + ) if should_continue: table.append([]) continue 
From 393a622f2617b9c22635a278a96d19814b0b72ab Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 02:02:52 +0200 Subject: [PATCH 16/66] ENH: Lint --- pandas/io/excel/_xlrd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 5fd948cfe4518..8dc065cf92034 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -112,7 +112,7 @@ def _parse_cell(cell_contents, cell_typ): i, header, skiprows, nrows ) if should_continue: - table.append([]) + data.append([]) continue if should_break: break From e00fff18f2d1709115d9d1fa918f20070b8a772e Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 02:48:33 +0200 Subject: [PATCH 17/66] ENH: Add docstring to should_read_row --- pandas/io/excel/_base.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d98b11bdb20d6..35e7d4dc7ba72 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -390,6 +390,24 @@ def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): pass def should_read_row(self, index, header, skiprows, nrows): + """ + Determines whether row should be read. + + Parameters + ---------- + index : int + Index of row. + header : int + Row used as column labels. + skiprows : int + Rows to skip at the begining. + nrows : int + Number of rows to parse. + + Returns + ------- + Tuple with the first bool element determining if row should be skipped and second bool element determining if reading should be stopped. 
+ """ if nrows is not None: if index <= header - 1 + skiprows: return True, False From b14642bc395fd2c8aeaa485ac54d4ae2b45473da Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 03:13:27 +0200 Subject: [PATCH 18/66] ENH: Lint --- pandas/io/excel/_base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 35e7d4dc7ba72..a737410db8469 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -392,8 +392,8 @@ def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): def should_read_row(self, index, header, skiprows, nrows): """ Determines whether row should be read. - - Parameters + + Parameters ---------- index : int Index of row. @@ -406,7 +406,8 @@ def should_read_row(self, index, header, skiprows, nrows): Returns ------- - Tuple with the first bool element determining if row should be skipped and second bool element determining if reading should be stopped. + Tuple with the first bool element determining if row should be + skipped and second bool element determining if reading should be stopped. """ if nrows is not None: if index <= header - 1 + skiprows: From dfc794a6defbfc6d9a47a7d4af4e59f87a83b58a Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 03:34:36 +0200 Subject: [PATCH 19/66] ENH: Lint --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a737410db8469..e6a8c14c51f72 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -406,7 +406,7 @@ def should_read_row(self, index, header, skiprows, nrows): Returns ------- - Tuple with the first bool element determining if row should be + Tuple with the first bool element determining if row should be skipped and second bool element determining if reading should be stopped. 
""" if nrows is not None: From 7b501de9affffe93649d365204fed00ce8915cc7 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 19:36:05 +0200 Subject: [PATCH 20/66] ENH: Move nrows value check --- pandas/io/excel/_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e6a8c14c51f72..5fa822265f57a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -307,9 +307,6 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) - if nrows is not None: - _validate_integer("nrows", nrows) - return io.parse( sheet_name=sheet_name, header=header, @@ -443,6 +440,8 @@ def parse( ): validate_header_arg(header) + if nrows is not None: + _validate_integer("nrows", nrows) ret_dict = False From 3292f6ba80a3851640c9faf24c499b8f170e08a6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 17 Apr 2020 20:15:16 +0200 Subject: [PATCH 21/66] ENH: Remove nrows validation --- pandas/io/excel/_base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 5fa822265f57a..10ca5ed43040d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -28,7 +28,7 @@ _pop_header_name, get_writer, ) -from pandas.io.parsers import TextParser, _validate_integer +from pandas.io.parsers import TextParser _read_excel_doc = ( """ @@ -440,8 +440,6 @@ def parse( ): validate_header_arg(header) - if nrows is not None: - _validate_integer("nrows", nrows) ret_dict = False @@ -471,7 +469,7 @@ def parse( gsd_header = 0 if header is None else header gsd_skiprows = 0 if skiprows is None else skiprows - gsd_nrows = nrows + gsd_nrows = nrows if isinstance(nrows, int) else None if isinstance(gsd_header, list) or isinstance(gsd_skiprows, list): gsd_nrows = None From bdd5780608c12bdffe7ad2260f7844a05eba490a Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 24 Apr 2020 14:08:37 +0200 Subject: [PATCH 22/66] 
Run tests --- pandas/io/excel/_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 10ca5ed43040d..45ec69b6b687a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -470,6 +470,7 @@ def parse( gsd_header = 0 if header is None else header gsd_skiprows = 0 if skiprows is None else skiprows gsd_nrows = nrows if isinstance(nrows, int) else None + if isinstance(gsd_header, list) or isinstance(gsd_skiprows, list): gsd_nrows = None From 1867088b14ecb0a7920e82ec67dcec0bf828b214 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 24 Apr 2020 21:47:18 +0200 Subject: [PATCH 23/66] ENH: Fix reading rows in openpyxl --- pandas/io/excel/_openpyxl.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 3cfc7aaa55837..e79f4e3712b59 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,17 +539,14 @@ def get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - for i, row in enumerate(sheet.rows): - - should_continue, should_break = self.should_read_row( + max_row = None if nrows is None else header + skiprows + nrows + 1 + for i, row in enumerate(sheet.iter_rows(max_row=max_row)): + should_continue, _ = self.should_read_row( i, header, skiprows, nrows ) if should_continue: data.append([]) - continue - if should_break: - break - + continue data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From 3c1eb10e1f6044d8502bd85548986ddb254184d6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 24 Apr 2020 21:48:13 +0200 Subject: [PATCH 24/66] ENH: Fix lint --- pandas/io/excel/_openpyxl.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index e79f4e3712b59..fbe1ac529ab9c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ 
-540,13 +540,11 @@ def get_sheet_data( data: List[List[Scalar]] = [] max_row = None if nrows is None else header + skiprows + nrows + 1 - for i, row in enumerate(sheet.iter_rows(max_row=max_row)): - should_continue, _ = self.should_read_row( - i, header, skiprows, nrows - ) + for i, row in enumerate(sheet.iter_rows(max_row=max_row)): + should_continue, _ = self.should_read_row(i, header, skiprows, nrows) if should_continue: data.append([]) - continue + continue data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From 88c3117d284105d73ad77ead46ad64d67cf11790 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 24 Apr 2020 22:17:27 +0200 Subject: [PATCH 25/66] Fix max_row variable definition --- pandas/io/excel/_openpyxl.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index fbe1ac529ab9c..e4715ac658a7f 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,7 +539,15 @@ def get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - max_row = None if nrows is None else header + skiprows + nrows + 1 + if ( + isinstance(header, int) + and isinstance(skiprows, int) + and isinstance(nrows, int) + ): + max_row = header + skiprows + nrows + 1 + else: + max_row = None + for i, row in enumerate(sheet.iter_rows(max_row=max_row)): should_continue, _ = self.should_read_row(i, header, skiprows, nrows) if should_continue: From dc600552b8f6ff5df0cd959f69bd45b15ab69d2b Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 24 Apr 2020 22:38:01 +0200 Subject: [PATCH 26/66] Fix max_row variable definition --- pandas/io/excel/_openpyxl.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index e4715ac658a7f..a91075957e036 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,14 +539,15 @@ def 
get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - if ( - isinstance(header, int) - and isinstance(skiprows, int) - and isinstance(nrows, int) - ): - max_row = header + skiprows + nrows + 1 - else: - max_row = None + max_row = ( + header + skiprows + nrows + 1 + if ( + isinstance(header, int) + and isinstance(skiprows, int) + and isinstance(nrows, int) + ) + else None + ) for i, row in enumerate(sheet.iter_rows(max_row=max_row)): should_continue, _ = self.should_read_row(i, header, skiprows, nrows) From 6fdedfd8aa20558926072b4e4eb4c199a144932c Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 25 Apr 2020 04:02:29 +0200 Subject: [PATCH 27/66] Add typed in should_read_row function --- pandas/io/excel/_base.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 45ec69b6b687a..885627d1dbc52 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,10 +3,12 @@ from io import BytesIO import os from textwrap import fill +from typing import List, Optional, Sequence, Tuple from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import Union from pandas.errors import EmptyDataError from pandas.util._decorators import Appender @@ -386,7 +388,13 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): pass - def should_read_row(self, index, header, skiprows, nrows): + def should_read_row( + self, + index, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> Tuple[bool, bool]: """ Determines whether row should be read. 
From ba7175cb380d30c24ec387b56eab77c6582b0b1c Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 25 Apr 2020 12:06:44 +0200 Subject: [PATCH 28/66] Add types and tests --- pandas/io/excel/_base.py | 4 ++-- pandas/tests/io/excel/test_readers.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 885627d1dbc52..2d718eb18532d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -415,9 +415,9 @@ def should_read_row( skipped and second bool element determining if reading should be stopped. """ if nrows is not None: - if index <= header - 1 + skiprows: + if index < header + skiprows - 1: return True, False - if index <= header - 1 + skiprows + nrows + 1: + if index <= header + skiprows + nrows: return False, False return False, True return False, False diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 99447c03e89af..116a440902443 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1084,3 +1084,17 @@ def test_excel_high_surrogate(self, engine): # should not produce a segmentation violation actual = pd.read_excel("high_surrogate.xlsx") tm.assert_frame_equal(expected, actual) + + def test_header_skiprows_nrows(self, engine, read_ext): + data = pd.read_excel("test1" + read_ext, engine=engine) + expected = ( + DataFrame(data.iloc[3:6]) + .reset_index(drop=True) + .rename(columns=data.iloc[2].rename(None)) + ) + actual = pd.read_excel( + "test1" + read_ext, engine=engine, header=1, skiprows=2, nrows=3 + ) + tm.assert_frame_equal(expected, actual) + actual = pd.read_excel("test1" + read_ext, engine=engine, skiprows=3, nrows=3) + tm.assert_frame_equal(expected, actual) From d884803f8259c9cc058e00a33ba041c4d3dd9167 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 25 Apr 2020 12:14:16 +0200 Subject: [PATCH 29/66] Add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 
pandas/io/excel/_xlrd.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5c39377899a20..ade05436fa76a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -357,6 +357,7 @@ Performance improvements :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). +- Performance improvement in `read_excel` for integer header, skiprows and nrows (:issue:`33281`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8dc065cf92034..d8c31b6eb9a9d 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -51,7 +51,7 @@ def get_sheet_by_index(self, index): def get_sheet_data( self, sheet, - convert_float, + convert_float: bool, header: Optional[Union[int, Sequence[int]]], skiprows: Optional[Union[int, Sequence[int]]], nrows: Optional[int], From 4be3d275d9b08eefd0582152632301660232b526 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 4 May 2020 20:46:29 +0200 Subject: [PATCH 30/66] Fix import --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2d718eb18532d..532697ab8f17f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,7 +3,7 @@ from io import BytesIO import os from textwrap import fill -from typing import List, Optional, Sequence, Tuple +from typing import Optional, Sequence, Tuple from pandas._config import config From 95e3e029b6854b7ffe6c15b4c1db3d07bca08980 Mon Sep 17 00:00:00 2001 From: mproszewska <38814059+mproszewska@users.noreply.github.com> Date: Mon, 4 May 2020 20:50:15 +0200 Subject: [PATCH 31/66] Update 
doc/source/whatsnew/v1.1.0.rst Co-authored-by: gfyoung --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 19a6b988b0d00..82600f4503151 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -427,7 +427,7 @@ Performance improvements :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). -- Performance improvement in `read_excel` for integer header, skiprows and nrows (:issue:`33281`). +- Performance improvement in `read_excel` for integer ``header``, ``skiprows``, and ``nrows`` (:issue:`33281`). .. --------------------------------------------------------------------------- From 38520e26bc657d14dc2d125ddcf31bfe14158872 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 4 May 2020 20:51:11 +0200 Subject: [PATCH 32/66] Add index type --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 532697ab8f17f..f57157aa8fb12 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -390,7 +390,7 @@ def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): def should_read_row( self, - index, + index: int, header: Optional[Union[int, Sequence[int]]], skiprows: Optional[Union[int, Sequence[int]]], nrows: Optional[int], From c03b46a96821af7cdd2282d0dfda0f79f476d9ec Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 4 May 2020 20:55:37 +0200 Subject: [PATCH 33/66] Parametrize test --- pandas/tests/io/excel/test_readers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 
116a440902443..63d6850a46f45 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1085,7 +1085,8 @@ def test_excel_high_surrogate(self, engine): actual = pd.read_excel("high_surrogate.xlsx") tm.assert_frame_equal(expected, actual) - def test_header_skiprows_nrows(self, engine, read_ext): + @pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)]) + def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): data = pd.read_excel("test1" + read_ext, engine=engine) expected = ( DataFrame(data.iloc[3:6]) @@ -1093,8 +1094,6 @@ def test_header_skiprows_nrows(self, engine, read_ext): .rename(columns=data.iloc[2].rename(None)) ) actual = pd.read_excel( - "test1" + read_ext, engine=engine, header=1, skiprows=2, nrows=3 + "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3 ) tm.assert_frame_equal(expected, actual) - actual = pd.read_excel("test1" + read_ext, engine=engine, skiprows=3, nrows=3) - tm.assert_frame_equal(expected, actual) From aa8cfe9a90af53f7fb7c1a366b732e37c3cb70a5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 03:05:42 +0200 Subject: [PATCH 34/66] Fix lint --- pandas/tests/io/excel/test_readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 63d6850a46f45..9b62a643429d7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1085,7 +1085,7 @@ def test_excel_high_surrogate(self, engine): actual = pd.read_excel("high_surrogate.xlsx") tm.assert_frame_equal(expected, actual) - @pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)]) + @pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)]) def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): data = pd.read_excel("test1" + read_ext, engine=engine) expected = ( From c9a622d5f51b3b74cb8840aef2bb4d0f151dfcf1 Mon Sep 17 00:00:00 2001 
From: mproszewska Date: Tue, 5 May 2020 12:02:56 +0200 Subject: [PATCH 35/66] Add decorator to test --- pandas/tests/io/excel/test_readers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 9b62a643429d7..846818705c5dc 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1086,6 +1086,7 @@ def test_excel_high_surrogate(self, engine): tm.assert_frame_equal(expected, actual) @pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)]) + @td.check_file_leaks def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): data = pd.read_excel("test1" + read_ext, engine=engine) expected = ( From aa927839d65c83a34a4cc89fc20b0b3cee2973c1 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 8 May 2020 14:00:01 +0200 Subject: [PATCH 36/66] Fix types defintion --- pandas/io/excel/_base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f57157aa8fb12..9920cf439c21d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -388,7 +388,7 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): pass - def should_read_row( + def should_skip_row( self, index: int, header: Optional[Union[int, Sequence[int]]], @@ -402,9 +402,9 @@ def should_read_row( ---------- index : int Index of row. - header : int - Row used as column labels. - skiprows : int + header : int, list of int + Rows used as column labels. + skiprows : int, list of int Rows to skip at the begining. nrows : int Number of rows to parse. @@ -414,7 +414,7 @@ def should_read_row( Tuple with the first bool element determining if row should be skipped and second bool element determining if reading should be stopped. 
""" - if nrows is not None: + if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): if index < header + skiprows - 1: return True, False if index <= header + skiprows + nrows: From ee16c159766545f22f302ae88cebea3aa5aef624 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 8 May 2020 14:41:36 +0200 Subject: [PATCH 37/66] Change function name --- pandas/io/excel/_odfreader.py | 2 +- pandas/io/excel/_openpyxl.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 4094e9da223ea..2e5f05dfda3d4 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -88,7 +88,7 @@ def get_sheet_data( for i, sheet_row in enumerate(sheet_rows): - should_continue, should_break = self.should_read_row( + should_continue, should_break = self.should_skip_row( i, header, skiprows, nrows ) if should_continue: diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a91075957e036..5d69807f58e71 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -550,7 +550,7 @@ def get_sheet_data( ) for i, row in enumerate(sheet.iter_rows(max_row=max_row)): - should_continue, _ = self.should_read_row(i, header, skiprows, nrows) + should_continue, _ = self.should_skip_row(i, header, skiprows, nrows) if should_continue: data.append([]) continue diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index d8c31b6eb9a9d..c620c960c9443 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -108,7 +108,7 @@ def _parse_cell(cell_contents, cell_typ): for i in range(sheet.nrows): - should_continue, should_break = self.should_read_row( + should_continue, should_break = self.should_skip_row( i, header, skiprows, nrows ) if should_continue: From 547787ae4cc3721ed4058c2fbc41e743c3b667f6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 8 May 2020 15:18:10 +0200 Subject: [PATCH 
38/66] Leaks in test fix attempt --- pandas/io/excel/_odfreader.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 2e5f05dfda3d4..96fb7d50ee233 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -95,7 +95,7 @@ def get_sheet_data( table.append([]) continue if should_break: - break + continue sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index c620c960c9443..57aaec4bd803e 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -115,7 +115,7 @@ def _parse_cell(cell_contents, cell_typ): data.append([]) continue if should_break: - break + continue row = [ _parse_cell(value, typ) From 368b77c696c72cc6b5989bae42672692b1076661 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 8 May 2020 16:42:36 +0200 Subject: [PATCH 39/66] Reverse changes --- pandas/io/excel/_odfreader.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 96fb7d50ee233..2e5f05dfda3d4 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -95,7 +95,7 @@ def get_sheet_data( table.append([]) continue if should_break: - continue + break sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 57aaec4bd803e..c620c960c9443 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -115,7 +115,7 @@ def _parse_cell(cell_contents, cell_typ): data.append([]) continue if should_break: - continue + break row = [ _parse_cell(value, typ) From 4d69922a6f91632e5045d36784d78240787b5539 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 8 May 2020 17:12:05 +0200 Subject: [PATCH 40/66] 
Change skiping rows in openpyxl --- pandas/io/excel/_openpyxl.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 5d69807f58e71..8c23a4290e6d2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -550,10 +550,6 @@ def get_sheet_data( ) for i, row in enumerate(sheet.iter_rows(max_row=max_row)): - should_continue, _ = self.should_skip_row(i, header, skiprows, nrows) - if should_continue: - data.append([]) - continue data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From b4ae85f456e3018b929141dd336fefc4c668499a Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 9 May 2020 23:35:15 +0200 Subject: [PATCH 41/66] Run tests again --- pandas/io/excel/_openpyxl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 8c23a4290e6d2..d77b3db0abf39 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -550,6 +550,7 @@ def get_sheet_data( ) for i, row in enumerate(sheet.iter_rows(max_row=max_row)): + data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From 81f86742e053414894df9fb6fc9a9455298e0329 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 00:08:34 +0200 Subject: [PATCH 42/66] Remove all changes in openpyxl because of leaks --- pandas/io/excel/_openpyxl.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index d77b3db0abf39..4cd9e07e1be12 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,17 +539,7 @@ def get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - max_row = ( - header + skiprows + nrows + 1 - if ( - isinstance(header, int) - and isinstance(skiprows, int) - and isinstance(nrows, int) - ) - else None - ) - - for i, row in 
enumerate(sheet.iter_rows(max_row=max_row)): + for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) From 7938fcfe7d3921fa68d3850fdd101951c56507b6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 01:18:37 +0200 Subject: [PATCH 43/66] Run tests --- pandas/io/excel/_openpyxl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 4cd9e07e1be12..b23ed0d494fe3 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,8 +539,7 @@ def get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - for row in sheet.rows: - + for row in (sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From 7f31a308e6cd35a7de39e5cf6bd2599810a7dd09 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 01:36:46 +0200 Subject: [PATCH 44/66] Fix --- pandas/io/excel/_openpyxl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index b23ed0d494fe3..4cd9e07e1be12 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,7 +539,8 @@ def get_sheet_data( ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] - for row in (sheet.rows: + for row in sheet.rows: + data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From ae8b84f70903becbdfd5e873db2f1e84efeae446 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 16:22:59 +0200 Subject: [PATCH 45/66] Add types --- pandas/io/excel/_base.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9920cf439c21d..a0a4175818b3b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,12 +3,12 @@ from io import BytesIO import os from textwrap import fill -from typing import Optional, 
Sequence, Tuple +from typing import List, Optional, Sequence, Tuple from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import Union +from pandas._typing import Scalar, Union from pandas.errors import EmptyDataError from pandas.util._decorators import Appender @@ -385,7 +385,14 @@ def get_sheet_by_index(self, index): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float, header, skiprows, nrows): + def get_sheet_data( + self, + sheet, + convert_float: bool, + header: Optional[Union[int, Sequence[int]]], + skiprows: Optional[Union[int, Sequence[int]]], + nrows: Optional[int], + ) -> List[List[Scalar]]: pass def should_skip_row( From a4f50097caa0b210315425bf31a15ee1a184c38e Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 16:56:35 +0200 Subject: [PATCH 46/66] Run tests --- pandas/io/excel/_openpyxl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 4cd9e07e1be12..7e1c9f16278c2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -540,7 +540,6 @@ def get_sheet_data( data: List[List[Scalar]] = [] for row in sheet.rows: - data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From d7a28921ee3245b746946570f663575b4e19f54c Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 18:43:44 +0200 Subject: [PATCH 47/66] Run tests again because of conda error --- pandas/io/excel/_openpyxl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7e1c9f16278c2..4cd9e07e1be12 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -540,6 +540,7 @@ def get_sheet_data( data: List[List[Scalar]] = [] for row in sheet.rows: + data.append([self._convert_cell(cell, convert_float) for cell in row]) return data From c94b45e1edd4494eee2a8885c25e041f6100eba6 Mon Sep 17 00:00:00 2001 From: 
mproszewska Date: Fri, 15 May 2020 17:38:04 +0200 Subject: [PATCH 48/66] PERF: Remove unnecessary copies in sorting functions --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 25312b180dba1..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values.copy() + return values if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From 0ab450b9ea5f38582d09acbcd8f697ac62f37919 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:06:23 +0200 Subject: [PATCH 49/66] Run tests --- pandas/core/sorting.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..2943714a5d015 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,7 +386,6 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values - if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -404,7 +403,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError: + except TypeError:opy() raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." 
From 54c7304d585c60dd148e3e47aa28514100289eb5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 16 May 2020 19:07:12 +0200 Subject: [PATCH 50/66] Run tests --- pandas/core/sorting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 2943714a5d015..da9cbe1023599 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -386,6 +386,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): if not key: return values + if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) @@ -403,7 +404,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): else: type_of_values = type(values) result = type_of_values(result) # try to revert to original type otherwise - except TypeError:opy() + except TypeError: raise TypeError( f"User-provided `key` function returned an invalid type {type(result)} \ which could not be converted to {type(values)}." From 192090595464ce62174100ccb5960c86a66c1fff Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 20:35:37 +0200 Subject: [PATCH 51/66] Resolve conflicts --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index ddd7d028be337..482af4f3e7c82 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -614,8 +614,8 @@ Performance improvements and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`) - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). -- Performance improvement in `read_excel` for integer ``header``, ``skiprows``, and ``nrows`` (:issue:`33281`). 
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) +- Performance improvement in `read_excel` for integer ``header``, ``skiprows``, and ``nrows`` (:issue:`33281`). .. --------------------------------------------------------------------------- From 6d72a346770fc93778a83e171daceec52b60e6d4 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:19:58 +0200 Subject: [PATCH 52/66] Add asv --- asv_bench/benchmarks/algorithms.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..a96d9bc924308 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,4 +174,13 @@ def time_argsort(self, N): self.array.argsort() +class SortIndexSeries: + def setup(self): + N = 10 ** 5 + idx = pd.date_range(start="1/1/2000", periods=N, freq="s") + self.s = pd.Series(np.random.randn(N), index=idx) + + def time_sort_index(self): + self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 5ba54a6039d3981a4187b38e11b479e53f8dcdd1 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 22 May 2020 23:20:53 +0200 Subject: [PATCH 53/66] Run black --- asv_bench/benchmarks/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index a96d9bc924308..7afa97f9aa394 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -183,4 +183,5 @@ def setup(self): def time_sort_index(self): self.s.sort_index() + from .pandas_vb_common import setup # noqa: F401 isort:skip From 276627019d8000792473742c0a9036cf59b5f3cb Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sat, 23 May 2020 00:28:24 +0200 Subject: [PATCH 54/66] Remove asv --- asv_bench/benchmarks/algorithms.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py 
b/asv_bench/benchmarks/algorithms.py index 7afa97f9aa394..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -174,14 +174,4 @@ def time_argsort(self, N): self.array.argsort() -class SortIndexSeries: - def setup(self): - N = 10 ** 5 - idx = pd.date_range(start="1/1/2000", periods=N, freq="s") - self.s = pd.Series(np.random.randn(N), index=idx) - - def time_sort_index(self): - self.s.sort_index() - - from .pandas_vb_common import setup # noqa: F401 isort:skip From ac823f5298c816cc2c4fef4393f7d5cd7676ddd7 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 31 May 2020 15:20:54 +0200 Subject: [PATCH 55/66] Resolve conflict --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 482af4f3e7c82..e307f71ccafe5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -615,6 +615,8 @@ Performance improvements - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) +- Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) +- Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) - Performance improvement in `read_excel` for integer ``header``, ``skiprows``, and ``nrows`` (:issue:`33281`). .. 
--------------------------------------------------------------------------- From 6f188fe9e9d89b60e4670f9423dedd5006320bf9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 01:40:46 +0200 Subject: [PATCH 56/66] Revert change --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..25312b180dba1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -385,7 +385,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): from pandas.core.indexes.api import Index if not key: - return values + return values.copy() if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) From ba314fe73fee507dd68e2d2ced9ffba79b653197 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 02:41:12 +0200 Subject: [PATCH 57/66] Change should_skip_row function --- pandas/io/excel/_base.py | 10 +++------- pandas/io/excel/_odfreader.py | 10 ++++------ pandas/io/excel/_xlrd.py | 13 ++++++------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a0a4175818b3b..a6a1afdc6e90a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -418,16 +418,12 @@ def should_skip_row( Returns ------- - Tuple with the first bool element determining if row should be - skipped and second bool element determining if reading should be stopped. + Bool determining if row should be skipped. 
""" if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): if index < header + skiprows - 1: - return True, False - if index <= header + skiprows + nrows: - return False, False - return False, True - return False, False + return True + return False def parse( self, diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 2e5f05dfda3d4..e30562ca23c3c 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -86,16 +86,14 @@ def get_sheet_data( table: List[List[Scalar]] = [] + if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): + sheet_rows = sheet_rows[0 : header + skiprows + nrows + 1] + for i, sheet_row in enumerate(sheet_rows): - should_continue, should_break = self.should_skip_row( - i, header, skiprows, nrows - ) - if should_continue: + if self.should_skip_row(i, header, skiprows, nrows): table.append([]) continue - if should_break: - break sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index c620c960c9443..61040c4158240 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -106,16 +106,15 @@ def _parse_cell(cell_contents, cell_typ): data: List[List[Scalar]] = [] - for i in range(sheet.nrows): + sheet_nrows = sheet.nrows + if nrows is not None and isinstance(header, int) and isinstance(skiprows, int): + sheet_nrows = min(header + skiprows + nrows + 1, sheet_nrows) - should_continue, should_break = self.should_skip_row( - i, header, skiprows, nrows - ) - if should_continue: + for i in range(sheet_nrows): + + if self.should_skip_row(i, header, skiprows, nrows): data.append([]) continue - if should_break: - break row = [ _parse_cell(value, typ) From f923bfd33b0f9ed7b129a3d3ee99e499dbdfe76a Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 03:12:36 +0200 Subject: [PATCH 58/66] Fix return type --- pandas/io/excel/_base.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index a6a1afdc6e90a..71f5bb1e0b0f3 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -401,7 +401,7 @@ def should_skip_row( header: Optional[Union[int, Sequence[int]]], skiprows: Optional[Union[int, Sequence[int]]], nrows: Optional[int], - ) -> Tuple[bool, bool]: + ) -> bool: """ Determines whether row should be read. From 008add5969c40f02a3d7c018990b51bb1cba04ac Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 03:43:33 +0200 Subject: [PATCH 59/66] Remove import --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 71f5bb1e0b0f3..987d3d83152a3 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,7 +3,7 @@ from io import BytesIO import os from textwrap import fill -from typing import List, Optional, Sequence, Tuple +from typing import List, Optional, Sequence from pandas._config import config From 2226050d05a3e8666a210a49fcae858b37a7d745 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Wed, 3 Jun 2020 01:30:56 +0200 Subject: [PATCH 60/66] Run tests --- pandas/tests/io/excel/test_readers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 261d53b66aad5..f4e6273904983 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1151,6 +1151,7 @@ def test_excel_high_surrogate(self, engine): @pytest.mark.parametrize("header, skiprows", [(1, 2), (0, 3)]) @td.check_file_leaks def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): + # GH 32727 data = pd.read_excel("test1" + read_ext, engine=engine) expected = ( DataFrame(data.iloc[3:6]) From 9216210dc82953909976391041f8a3c4b682da70 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 5 Jun 2020 02:54:50 +0200 Subject: 
[PATCH 61/66] Add asv --- asv_bench/benchmarks/io/excel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..84c2534a9cbf4 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def nrows_read_excel(self, engine): + name = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip From d9aa31967e1286dc844773abb2be094a0b43a4ac Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 5 Jun 2020 03:51:05 +0200 Subject: [PATCH 62/66] Add asv --- asv_bench/benchmarks/io/excel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..926286ee5fab2 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = DataFrame( np.random.randn(N, C), @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def nrows_read_excel(self, engine): + name = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip From 0afb1b14c359eece44f3885d5f20b40e07a9ccb6 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 5 Jun 2020 12:31:33 +0200 Subject: [PATCH 63/66] Fix --- asv_bench/benchmarks/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 926286ee5fab2..e9776ff2c641e 100644 --- a/asv_bench/benchmarks/io/excel.py +++ 
b/asv_bench/benchmarks/io/excel.py @@ -70,7 +70,7 @@ def time_read_excel(self, engine): read_excel(fname, engine=engine) def nrows_read_excel(self, engine): - name = self.fname_odf if engine == "odf" else self.fname_excel + fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine, nrows=1) From 06003a8b090577d55680cb5271b07e0bc36d2bb5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 8 Jun 2020 17:22:26 +0200 Subject: [PATCH 64/66] Fix asv --- asv_bench/benchmarks/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index e9776ff2c641e..1eaccb9f2d897 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -69,7 +69,7 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) - def nrows_read_excel(self, engine): + def time_read_excel_nrows(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine, nrows=1) From c08709b4aa213fe9ab15fc4b7eae14c53a0507e3 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 8 Jun 2020 17:27:42 +0200 Subject: [PATCH 65/66] Fix asv --- asv_bench/benchmarks/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 1eaccb9f2d897..d4bde565ecdb2 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 20000 + N = 2000 C = 5 df = DataFrame( np.random.randn(N, C), From c9a2c75ceb49ee85a62e926a1f7ee21bf15ea3a5 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 8 Jun 2020 19:07:38 +0200 Subject: [PATCH 66/66] Fix asv --- asv_bench/benchmarks/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py 
b/asv_bench/benchmarks/io/excel.py index d4bde565ecdb2..1eaccb9f2d897 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = DataFrame( np.random.randn(N, C),