Skip to content

CLN/TYP: pandas/io/formats/excel.py #37862

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Nov 24, 2020
Merged
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 60 additions & 65 deletions pandas/io/formats/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from functools import reduce
import itertools
import re
from typing import Callable, Dict, Mapping, Optional, Sequence, Union
from typing import Callable, Dict, Iterator, Mapping, Optional, Sequence, Union, cast
import warnings

import numpy as np
Expand All @@ -14,7 +14,6 @@

from pandas.core.dtypes import missing
from pandas.core.dtypes.common import is_float, is_scalar
from pandas.core.dtypes.generic import ABCIndex

from pandas import DataFrame, Index, MultiIndex, PeriodIndex
import pandas.core.common as com
Expand All @@ -29,7 +28,13 @@ class ExcelCell:
__slots__ = __fields__

def __init__(
self, row: int, col: int, val, style=None, mergestart=None, mergeend=None
self,
row: int,
col: int,
val,
style=None,
mergestart: Optional[int] = None,
mergeend: Optional[int] = None,
):
self.row = row
self.col = col
Expand Down Expand Up @@ -522,16 +527,15 @@ def _format_value(self, val):
)
return val

def _format_header_mi(self):
def _format_header_mi(self) -> Iterator[ExcelCell]:
if self.columns.nlevels > 1:
if not self.index:
raise NotImplementedError(
"Writing to Excel with MultiIndex columns and no "
"index ('index'=False) is not yet implemented."
)

has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
if not (has_aliases or self.header):
if not (self._has_aliases or self.header):
return

columns = self.columns
Expand All @@ -547,28 +551,27 @@ def _format_header_mi(self):

if self.merge_cells:
# Format multi-index as a merged cells.
for lnum in range(len(level_lengths)):
name = columns.names[lnum]
yield ExcelCell(lnum, coloffset, name, self.header_style)
for lnum, name in enumerate(columns.names):
yield ExcelCell(
row=lnum,
col=coloffset,
val=name,
style=self.header_style,
)

for lnum, (spans, levels, level_codes) in enumerate(
zip(level_lengths, columns.levels, columns.codes)
):
values = levels.take(level_codes)
for i in spans:
if spans[i] > 1:
yield ExcelCell(
lnum,
coloffset + i + 1,
values[i],
self.header_style,
lnum,
coloffset + i + spans[i],
)
else:
yield ExcelCell(
lnum, coloffset + i + 1, values[i], self.header_style
)
yield ExcelCell(
row=lnum,
col=coloffset + i + 1,
val=values[i],
style=self.header_style,
mergestart=lnum if spans[i] > 1 else None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe spans[i] > 1 could first be assigned to another appropriately named variable to clarify intent and (slightly) reduce duplication.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit confused what spans[i] > 1 means. I guess it is whether there is a multi-level col. So, I used the variable is_multilevel. Please let me know what you think.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we are always multilevel in this function. maybe needs_merge_cells or spans_multiple_cells

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are good names indeed. Thank you!

mergeend=coloffset + i + spans[i] if spans[i] > 1 else None,
)
else:
# Format in legacy format with dots to indicate levels.
for i, values in enumerate(zip(*level_strs)):
Expand All @@ -577,9 +580,8 @@ def _format_header_mi(self):

self.rowcounter = lnum

def _format_header_regular(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
if has_aliases or self.header:
def _format_header_regular(self) -> Iterator[ExcelCell]:
if self._has_aliases or self.header:
coloffset = 0

if self.index:
Expand All @@ -588,17 +590,11 @@ def _format_header_regular(self):
coloffset = len(self.df.index[0])

colnames = self.columns
if has_aliases:
# pandas\io\formats\excel.py:593: error: Argument 1 to "len"
# has incompatible type "Union[Sequence[Optional[Hashable]],
# bool]"; expected "Sized" [arg-type]
if len(self.header) != len(self.columns): # type: ignore[arg-type]
# pandas\io\formats\excel.py:602: error: Argument 1 to
# "len" has incompatible type
# "Union[Sequence[Optional[Hashable]], bool]"; expected
# "Sized" [arg-type]
if self._has_aliases:
self.header = cast(Sequence, self.header)
if len(self.header) != len(self.columns):
raise ValueError(
f"Writing {len(self.columns)} cols " # type: ignore[arg-type]
f"Writing {len(self.columns)} cols "
f"but got {len(self.header)} aliases"
)
else:
Expand All @@ -609,7 +605,7 @@ def _format_header_regular(self):
self.rowcounter, colindex + coloffset, colname, self.header_style
)

def _format_header(self):
def _format_header(self) -> Iterator[ExcelCell]:
if isinstance(self.columns, MultiIndex):
gen = self._format_header_mi()
else:
Expand All @@ -631,15 +627,14 @@ def _format_header(self):
self.rowcounter += 1
return itertools.chain(gen, gen2)

def _format_body(self):
def _format_body(self) -> Iterator[ExcelCell]:
if isinstance(self.df.index, MultiIndex):
return self._format_hierarchical_rows()
else:
return self._format_regular_rows()

def _format_regular_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
if has_aliases or self.header:
def _format_regular_rows(self) -> Iterator[ExcelCell]:
if self._has_aliases or self.header:
self.rowcounter += 1

# output index and index_label?
Expand Down Expand Up @@ -676,9 +671,8 @@ def _format_regular_rows(self):

yield from self._generate_body(coloffset)

def _format_hierarchical_rows(self):
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
if has_aliases or self.header:
def _format_hierarchical_rows(self) -> Iterator[ExcelCell]:
if self._has_aliases or self.header:
self.rowcounter += 1

gcolidx = 0
Expand Down Expand Up @@ -722,39 +716,40 @@ def _format_hierarchical_rows(self):
)

for i in spans:
if spans[i] > 1:
yield ExcelCell(
self.rowcounter + i,
gcolidx,
values[i],
self.header_style,
self.rowcounter + i + spans[i] - 1,
gcolidx,
)
else:
yield ExcelCell(
self.rowcounter + i,
gcolidx,
values[i],
self.header_style,
)
yield ExcelCell(
row=self.rowcounter + i,
col=gcolidx,
val=values[i],
style=self.header_style,
mergestart=(
self.rowcounter + i + spans[i] - 1
if spans[i] > 1
else None
),
mergeend=gcolidx if spans[i] > 1 else None,
)
gcolidx += 1

else:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
yield ExcelCell(
self.rowcounter + idx,
gcolidx,
indexcolval,
self.header_style,
row=self.rowcounter + idx,
col=gcolidx,
val=indexcolval,
style=self.header_style,
)
gcolidx += 1

yield from self._generate_body(gcolidx)

def _generate_body(self, coloffset: int):
@property
def _has_aliases(self) -> bool:
"""Whether the aliases for column names are present."""
return not isinstance(self.header, bool)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thinking some more, maybe best not to use bool.

we've never really had a discussion on use of bool in type annotations, but sometimes we are actually accepting any truthy value and other times we explicitly check if x is False or if x is True.

the header parameter of the docstring of ExcelFormatter should probably be updated to match. It currently is

    header : boolean or list of string, default True
        Write out column names. If a list of string is given it is
        assumed to be aliases for the column names

even though (tuple, list, np.ndarray, ABCIndex) is the current accepted types on master.

and the current mypy issue is that header is annotated as Union[Sequence[Label], bool]

perhaps use is_list_like here. (ignoring that a str is a Sequence)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I replaced with is_list_like.
Also slightly updated docstring. Will it work?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is still a discrepancy. Can a sequence of any hashable be passed or does it have to be a sequence of strings.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ExcelFormatter is called only from Styler.to_excel and NDFrame.to_excel.
According to these docstrings

header : bool or list of str, default True

I also looked through the test base and noticed that header is either bool or some sequence of strings (on some occasions).

I am not aware of other use cases, when header is a sequence of any hashables rather than strings.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok can leave as is for now.

The docstrings of public methods document the public api, the type annotations should normally be as permissible as possible for argument positions and the docstring and type annotations should match.

we don't want to change the public api without discussion, so can circle back to this.


def _generate_body(self, coloffset: int) -> Iterator[ExcelCell]:
if self.styler is None:
styles = None
else:
Expand All @@ -771,7 +766,7 @@ def _generate_body(self, coloffset: int):
xlstyle = self.style_converter(";".join(styles[i, colidx]))
yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle)

def get_formatted_cells(self):
def get_formatted_cells(self) -> Iterator[ExcelCell]:
for cell in itertools.chain(self._format_header(), self._format_body()):
cell.val = self._format_value(cell.val)
yield cell
Expand Down