diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 3a84a75be838f..634688d65e117 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -74,7 +74,7 @@ jobs: - name: Install pyright # note: keep version in sync with .pre-commit-config.yaml - run: npm install -g pyright@1.1.200 + run: npm install -g pyright@1.1.202 - name: Build Pandas id: build diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19a8a127fa1a5..e3c9be941498f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -78,7 +78,7 @@ repos: types: [python] stages: [manual] # note: keep version in sync with .github/workflows/ci.yml - additional_dependencies: ['pyright@1.1.200'] + additional_dependencies: ['pyright@1.1.202'] - repo: local hooks: - id: flake8-rst diff --git a/pandas/_typing.py b/pandas/_typing.py index eb5bb30238893..159d57fb27c89 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -84,6 +84,7 @@ DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"] PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] Scalar = Union[PythonScalar, PandasScalar] +IntStrT = TypeVar("IntStrT", int, str) # timestamp and timedelta convertible types diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2ff3360d0b808..b490244f7f396 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -8,8 +8,16 @@ from typing import ( IO, Any, + Callable, + Hashable, + Iterable, + List, + Literal, Mapping, + Sequence, + Union, cast, + overload, ) import warnings import zipfile @@ -20,6 +28,7 @@ from pandas._typing import ( DtypeArg, FilePath, + IntStrT, ReadBuffer, StorageOptions, WriteExcelBuffer, @@ -342,37 +351,105 @@ ) +@overload +def read_excel( + io, + # sheet name is str or int -> DataFrame + sheet_name: str | int, + header: int | Sequence[int] | None = ..., + names=..., + index_col: int | Sequence[int] | None = ..., + usecols=..., + squeeze: bool | None = ..., + dtype: DtypeArg | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + converters=..., + true_values: Iterable[Hashable] | None = ..., + false_values: Iterable[Hashable] | None = ..., + skiprows: Sequence[int] | int | Callable[[int], object] | None = ..., + nrows: int | None = ..., + na_values=..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool = ..., + parse_dates=..., + date_parser=..., + thousands: str | None = ..., + decimal: str = ..., + comment: str | None = ..., + skipfooter: int = ..., + convert_float: bool | None = ..., + mangle_dupe_cols: bool = ..., + storage_options: StorageOptions = ..., +) -> DataFrame: + ... + + +@overload +def read_excel( + io, + # sheet name is list or None -> dict[IntStrT, DataFrame] + sheet_name: list[IntStrT] | None, + header: int | Sequence[int] | None = ..., + names=..., + index_col: int | Sequence[int] | None = ..., + usecols=..., + squeeze: bool | None = ..., + dtype: DtypeArg | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + converters=..., + true_values: Iterable[Hashable] | None = ..., + false_values: Iterable[Hashable] | None = ..., + skiprows: Sequence[int] | int | Callable[[int], object] | None = ..., + nrows: int | None = ..., + na_values=..., + keep_default_na: bool = ..., + na_filter: bool = ..., + verbose: bool = ..., + parse_dates=..., + date_parser=..., + thousands: str | None = ..., + decimal: str = ..., + comment: str | None = ..., + skipfooter: int = ..., + convert_float: bool | None = ..., + mangle_dupe_cols: bool = ..., + storage_options: StorageOptions = ..., +) -> dict[IntStrT, DataFrame]: + ... + + @deprecate_nonkeyword_arguments(allowed_args=["io", "sheet_name"], version="2.0") @Appender(_read_excel_doc) def read_excel( io, - sheet_name=0, - header=0, + sheet_name: str | int | list[IntStrT] | None = 0, + header: int | Sequence[int] | None = 0, names=None, - index_col=None, + index_col: int | Sequence[int] | None = None, usecols=None, - squeeze=None, + squeeze: bool | None = None, dtype: DtypeArg | None = None, - engine=None, + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, + keep_default_na: bool = True, + na_filter: bool = True, + verbose: bool = False, parse_dates=False, date_parser=None, - thousands=None, - decimal=".", - comment=None, - skipfooter=0, - convert_float=None, - mangle_dupe_cols=True, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + skipfooter: int = 0, + convert_float: bool | None = None, + mangle_dupe_cols: bool = True, storage_options: StorageOptions = None, -): +) -> DataFrame | dict[IntStrT, DataFrame]: should_close = False if not isinstance(io, ExcelFile): @@ -466,19 +543,19 @@ def close(self): @property @abc.abstractmethod - def sheet_names(self): + def sheet_names(self) -> list[str]: pass @abc.abstractmethod - def get_sheet_by_name(self, name): + def get_sheet_by_name(self, name: str): pass @abc.abstractmethod - def get_sheet_by_index(self, index): + def get_sheet_by_index(self, index: int): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data(self, sheet, convert_float: bool): pass def raise_if_bad_sheet_by_index(self, index: int) -> None: @@ -494,27 +571,27 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None: def parse( self, - sheet_name=0, - header=0, + sheet_name: str | int | list[int] | list[str] | None = 0, + header: int | Sequence[int] | None = 0, names=None, - index_col=None, + index_col: int | Sequence[int] | None = None, usecols=None, - squeeze=None, + squeeze: bool | None = None, dtype: DtypeArg | None = None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, na_values=None, - verbose=False, + verbose: bool = False, parse_dates=False, date_parser=None, - thousands=None, - decimal=".", - comment=None, - skipfooter=0, - convert_float=None, - mangle_dupe_cols=True, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + skipfooter: int = 0, + convert_float: bool | None = None, + mangle_dupe_cols: bool = True, **kwds, ): @@ -532,17 +609,20 @@ def parse( ret_dict = False # Keep sheetname to maintain backwards compatibility. + sheets: list[int] | list[str] if isinstance(sheet_name, list): sheets = sheet_name ret_dict = True elif sheet_name is None: sheets = self.sheet_names ret_dict = True + elif isinstance(sheet_name, str): + sheets = [sheet_name] else: sheets = [sheet_name] # handle same-type duplicates. - sheets = list(dict.fromkeys(sheets).keys()) + sheets = cast(Union[List[int], List[str]], list(dict.fromkeys(sheets).keys())) output = {} @@ -565,17 +645,28 @@ def parse( output[asheetname] = DataFrame() continue - if is_list_like(header) and len(header) == 1: - header = header[0] + is_list_header = False + is_len_one_list_header = False + if is_list_like(header): + assert isinstance(header, Sequence) + is_list_header = True + if len(header) == 1: + is_len_one_list_header = True + + if is_len_one_list_header: + header = cast(Sequence[int], header)[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None and is_list_like(header): + assert isinstance(header, Sequence) + header_names = [] control_row = [True] * len(data[0]) for row in header: if is_integer(skiprows): + assert isinstance(skiprows, int) row += skiprows data[row], control_row = fill_mi_header(data[row], control_row) @@ -587,14 +678,14 @@ def parse( # If there is a MultiIndex header and an index then there is also # a row containing just the index name(s) has_index_names = ( - is_list_like(header) and len(header) > 1 and index_col is not None + is_list_header and not is_len_one_list_header and index_col is not None ) if is_list_like(index_col): # Forward fill values for MultiIndex index. if header is None: offset = 0 - elif not is_list_like(header): + elif isinstance(header, int): offset = 1 + header else: offset = 1 + max(header) @@ -608,6 +699,8 @@ def parse( # Check if we have an empty dataset # before trying to collect data. if offset < len(data): + assert isinstance(index_col, Sequence) + for col in index_col: last = data[offset][col] @@ -875,12 +968,12 @@ class ExcelWriter(metaclass=abc.ABCMeta): def __new__( cls, path: FilePath | WriteExcelBuffer | ExcelWriter, - engine=None, - date_format=None, - datetime_format=None, + engine: str | None = None, + date_format: str | None = None, + datetime_format: str | None = None, mode: str = "w", storage_options: StorageOptions = None, - if_sheet_exists: str | None = None, + if_sheet_exists: Literal["error", "new", "replace", "overlay"] | None = None, engine_kwargs: dict | None = None, **kwargs, ): @@ -928,6 +1021,8 @@ def __new__( stacklevel=find_stack_level(), ) + # for mypy + assert engine is not None cls = get_writer(engine) return object.__new__(cls) @@ -937,7 +1032,7 @@ def __new__( @property @abc.abstractmethod - def supported_extensions(self): + def supported_extensions(self) -> tuple[str, ...] | list[str]: """Extensions that writer engine supports.""" pass @@ -949,8 +1044,13 @@ def engine(self) -> str: @abc.abstractmethod def write_cells( - self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None - ): + self, + cells, + sheet_name: str | None = None, + startrow: int = 0, + startcol: int = 0, + freeze_panes: tuple[int, int] | None = None, + ) -> None: """ Write given formatted cells into Excel an excel sheet @@ -968,7 +1068,7 @@ def write_cells( pass @abc.abstractmethod - def save(self): + def save(self) -> None: """ Save workbook to disk. """ @@ -977,9 +1077,9 @@ def save(self): def __init__( self, path: FilePath | WriteExcelBuffer | ExcelWriter, - engine=None, - date_format=None, - datetime_format=None, + engine: str | None = None, + date_format: str | None = None, + datetime_format: str | None = None, mode: str = "w", storage_options: StorageOptions = None, if_sheet_exists: str | None = None, @@ -1034,14 +1134,14 @@ def __init__( def __fspath__(self): return getattr(self.handles.handle, "name", "") - def _get_sheet_name(self, sheet_name): + def _get_sheet_name(self, sheet_name: str | None) -> str: if sheet_name is None: sheet_name = self.cur_sheet if sheet_name is None: # pragma: no cover raise ValueError("Must pass explicit sheet_name or set cur_sheet property") return sheet_name - def _value_with_fmt(self, val): + def _value_with_fmt(self, val) -> tuple[object, str | None]: """ Convert numpy types to Python types for the Excel writers. @@ -1076,7 +1176,7 @@ def _value_with_fmt(self, val): return val, fmt @classmethod - def check_extension(cls, ext: str): + def check_extension(cls, ext: str) -> Literal[True]: """ checks that path's extension against the Writer's supported extensions. If it isn't supported, raises UnsupportedFiletypeError. @@ -1100,11 +1200,10 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.close() - def close(self): + def close(self) -> None: """synonym for save, to make it more file-like""" - content = self.save() + self.save() self.handles.close() - return content XLS_SIGNATURES = ( @@ -1243,7 +1342,10 @@ class ExcelFile: } def __init__( - self, path_or_buffer, engine=None, storage_options: StorageOptions = None + self, + path_or_buffer, + engine: str | None = None, + storage_options: StorageOptions = None, ): if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") @@ -1310,6 +1412,7 @@ def __init__( stacklevel=stacklevel, ) + assert engine is not None self.engine = engine self.storage_options = storage_options @@ -1320,27 +1423,27 @@ def __fspath__(self): def parse( self, - sheet_name=0, - header=0, + sheet_name: str | int | list[int] | list[str] | None = 0, + header: int | Sequence[int] | None = 0, names=None, - index_col=None, + index_col: int | Sequence[int] | None = None, usecols=None, - squeeze=None, + squeeze: bool | None = None, converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, na_values=None, parse_dates=False, date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=None, - mangle_dupe_cols=True, + thousands: str | None = None, + comment: str | None = None, + skipfooter: int = 0, + convert_float: bool | None = None, + mangle_dupe_cols: bool = True, **kwds, - ): + ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: """ Parse specified sheet(s) into a DataFrame. @@ -1383,7 +1486,7 @@ def book(self): def sheet_names(self): return self._reader.sheet_names - def close(self): + def close(self) -> None: """close io if necessary""" self._reader.close() diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index add95c58cd809..d4fe3683c907e 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -140,7 +140,7 @@ def _make_table_cell_attributes(self, cell) -> dict[str, int | str]: attributes["numbercolumnsspanned"] = cell.mergeend return attributes - def _make_table_cell(self, cell) -> tuple[str, Any]: + def _make_table_cell(self, cell) -> tuple[object, Any]: """Convert cell data to an OpenDocument spreadsheet cell Parameters