Skip to content

Commit 2c7057e

Browse files
immaxchenproost
authored and committed
ENH: Add built-in function for Styler to format the text displayed for missing values (pandas-dev#29118)
* Add built-in function for Styler to format the text displayed for missing values. As described in GH pandas-dev#28358, users who want to control how NA values are printed while applying styles to the output currently have to implement their own formatter (so that the underlying data is not changed and can still be used for styling).
1 parent e293770 commit 2c7057e

File tree

5 files changed

+191
-13
lines changed

5 files changed

+191
-13
lines changed

doc/source/reference/style.rst

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ Style application
4141
Styler.set_caption
4242
Styler.set_properties
4343
Styler.set_uuid
44+
Styler.set_na_rep
4445
Styler.clear
4546
Styler.pipe
4647

doc/source/user_guide/style.ipynb

+60
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
"df = pd.DataFrame({'A': np.linspace(1, 10, 10)})\n",
6868
"df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],\n",
6969
" axis=1)\n",
70+
"df.iloc[3, 3] = np.nan\n",
7071
"df.iloc[0, 2] = np.nan"
7172
]
7273
},
@@ -402,6 +403,38 @@
402403
"df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})"
403404
]
404405
},
406+
{
407+
"cell_type": "markdown",
408+
"metadata": {},
409+
"source": [
410+
"You can format the text displayed for missing values using the `na_rep` argument."
411+
]
412+
},
413+
{
414+
"cell_type": "code",
415+
"execution_count": null,
416+
"metadata": {},
417+
"outputs": [],
418+
"source": [
419+
"df.style.format(\"{:.2%}\", na_rep=\"-\")"
420+
]
421+
},
422+
{
423+
"cell_type": "markdown",
424+
"metadata": {},
425+
"source": [
426+
"These formatting techniques can be used in combination with styling."
427+
]
428+
},
429+
{
430+
"cell_type": "code",
431+
"execution_count": null,
432+
"metadata": {},
433+
"outputs": [],
434+
"source": [
435+
"df.style.highlight_max().format(None, na_rep=\"-\")"
436+
]
437+
},
405438
{
406439
"cell_type": "markdown",
407440
"metadata": {},
@@ -659,6 +692,7 @@
659692
"- precision\n",
660693
"- captions\n",
661694
"- table-wide styles\n",
695+
"- missing values representation\n",
662696
"- hiding the index or columns\n",
663697
"\n",
664698
"Each of these can be specified in two ways:\n",
@@ -800,6 +834,32 @@
800834
"We hope to collect some useful ones either in pandas, or preferably in a new package that [builds on top](#Extensibility) of the tools here."
801835
]
802836
},
837+
{
838+
"cell_type": "markdown",
839+
"metadata": {},
840+
"source": [
841+
"### Missing values"
842+
]
843+
},
844+
{
845+
"cell_type": "markdown",
846+
"metadata": {},
847+
"source": [
848+
"You can control the default representation of missing values for the entire table through the `set_na_rep` method."
849+
]
850+
},
851+
{
852+
"cell_type": "code",
853+
"execution_count": null,
854+
"metadata": {},
855+
"outputs": [],
856+
"source": [
857+
"(df.style\n",
858+
" .set_na_rep(\"FAIL\")\n",
859+
" .format(None, na_rep=\"PASS\", subset=[\"D\"])\n",
860+
" .highlight_null(\"yellow\"))"
861+
]
862+
},
803863
{
804864
"cell_type": "markdown",
805865
"metadata": {},

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ Other enhancements
122122
- Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
123123
- Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
124124
- :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
125+
- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
125126
- Roundtripping DataFrames with nullable integer or string data types to parquet
126127
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
127128
now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).

pandas/io/formats/style.py

+60-13
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import copy
99
from functools import partial
1010
from itertools import product
11-
from typing import Optional
11+
from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple
1212
from uuid import uuid1
1313

1414
import numpy as np
@@ -71,6 +71,11 @@ class Styler:
7171
The ``id`` takes the form ``T_<uuid>_row<num_row>_col<num_col>``
7272
where ``<uuid>`` is the unique identifier, ``<num_row>`` is the row
7373
number and ``<num_col>`` is the column number.
74+
na_rep : str, optional
75+
Representation for missing values.
76+
If ``na_rep`` is None, no special formatting is applied
77+
78+
.. versionadded:: 1.0.0
7479
7580
Attributes
7681
----------
@@ -126,9 +131,10 @@ def __init__(
126131
caption=None,
127132
table_attributes=None,
128133
cell_ids=True,
134+
na_rep: Optional[str] = None,
129135
):
130-
self.ctx = defaultdict(list)
131-
self._todo = []
136+
self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list)
137+
self._todo: List[Tuple[Callable, Tuple, Dict]] = []
132138

133139
if not isinstance(data, (pd.Series, pd.DataFrame)):
134140
raise TypeError("``data`` must be a Series or DataFrame")
@@ -149,19 +155,24 @@ def __init__(
149155
self.precision = precision
150156
self.table_attributes = table_attributes
151157
self.hidden_index = False
152-
self.hidden_columns = []
158+
self.hidden_columns: Sequence[int] = []
153159
self.cell_ids = cell_ids
160+
self.na_rep = na_rep
154161

155162
# display_funcs maps (row, col) -> formatting function
156163

157164
def default_display_func(x):
158-
if is_float(x):
165+
if self.na_rep is not None and pd.isna(x):
166+
return self.na_rep
167+
elif is_float(x):
159168
display_format = "{0:.{precision}f}".format(x, precision=self.precision)
160169
return display_format
161170
else:
162171
return x
163172

164-
self._display_funcs = defaultdict(lambda: default_display_func)
173+
self._display_funcs: DefaultDict[
174+
Tuple[int, int], Callable[[Any], str]
175+
] = defaultdict(lambda: default_display_func)
165176

166177
def _repr_html_(self):
167178
"""
@@ -416,16 +427,22 @@ def format_attr(pair):
416427
table_attributes=table_attr,
417428
)
418429

419-
def format(self, formatter, subset=None):
430+
def format(self, formatter, subset=None, na_rep: Optional[str] = None):
420431
"""
421432
Format the text display value of cells.
422433
423434
Parameters
424435
----------
425-
formatter : str, callable, or dict
436+
formatter : str, callable, dict or None
437+
If ``formatter`` is None, the default formatter is used
426438
subset : IndexSlice
427439
An argument to ``DataFrame.loc`` that restricts which elements
428440
``formatter`` is applied to.
441+
na_rep : str, optional
442+
Representation for missing values.
443+
If ``na_rep`` is None, no special formatting is applied
444+
445+
.. versionadded:: 1.0.0
429446
430447
Returns
431448
-------
@@ -451,6 +468,10 @@ def format(self, formatter, subset=None):
451468
>>> df['c'] = ['a', 'b', 'c', 'd']
452469
>>> df.style.format({'c': str.upper})
453470
"""
471+
if formatter is None:
472+
assert self._display_funcs.default_factory is not None
473+
formatter = self._display_funcs.default_factory()
474+
454475
if subset is None:
455476
row_locs = range(len(self.data))
456477
col_locs = range(len(self.data.columns))
@@ -466,16 +487,16 @@ def format(self, formatter, subset=None):
466487
if is_dict_like(formatter):
467488
for col, col_formatter in formatter.items():
468489
# formatter must be callable, so '{}' are converted to lambdas
469-
col_formatter = _maybe_wrap_formatter(col_formatter)
490+
col_formatter = _maybe_wrap_formatter(col_formatter, na_rep)
470491
col_num = self.data.columns.get_indexer_for([col])[0]
471492

472493
for row_num in row_locs:
473494
self._display_funcs[(row_num, col_num)] = col_formatter
474495
else:
475496
# single scalar to format all cells with
497+
formatter = _maybe_wrap_formatter(formatter, na_rep)
476498
locs = product(*(row_locs, col_locs))
477499
for i, j in locs:
478-
formatter = _maybe_wrap_formatter(formatter)
479500
self._display_funcs[(i, j)] = formatter
480501
return self
481502

@@ -553,6 +574,7 @@ def _copy(self, deepcopy=False):
553574
caption=self.caption,
554575
uuid=self.uuid,
555576
table_styles=self.table_styles,
577+
na_rep=self.na_rep,
556578
)
557579
if deepcopy:
558580
styler.ctx = copy.deepcopy(self.ctx)
@@ -896,6 +918,23 @@ def set_table_styles(self, table_styles):
896918
self.table_styles = table_styles
897919
return self
898920

921+
def set_na_rep(self, na_rep: str) -> "Styler":
922+
"""
923+
Set the missing data representation on a Styler.
924+
925+
.. versionadded:: 1.0.0
926+
927+
Parameters
928+
----------
929+
na_rep : str
930+
931+
Returns
932+
-------
933+
self : Styler
934+
"""
935+
self.na_rep = na_rep
936+
return self
937+
899938
def hide_index(self):
900939
"""
901940
Hide any indices from rendering.
@@ -1487,14 +1526,22 @@ def _get_level_lengths(index, hidden_elements=None):
14871526
return non_zero_lengths
14881527

14891528

1490-
def _maybe_wrap_formatter(formatter):
1529+
def _maybe_wrap_formatter(formatter, na_rep: Optional[str]):
14911530
if isinstance(formatter, str):
1492-
return lambda x: formatter.format(x)
1531+
formatter_func = lambda x: formatter.format(x)
14931532
elif callable(formatter):
1494-
return formatter
1533+
formatter_func = formatter
14951534
else:
14961535
msg = (
14971536
"Expected a template string or callable, got {formatter} "
14981537
"instead".format(formatter=formatter)
14991538
)
15001539
raise TypeError(msg)
1540+
1541+
if na_rep is None:
1542+
return formatter_func
1543+
elif isinstance(na_rep, str):
1544+
return lambda x: na_rep if pd.isna(x) else formatter_func(x)
1545+
else:
1546+
msg = "Expected a string, got {na_rep} instead".format(na_rep=na_rep)
1547+
raise TypeError(msg)

pandas/tests/io/formats/test_style.py

+69
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,75 @@ def test_bar_bad_align_raises(self):
10091009
with pytest.raises(ValueError):
10101010
df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"])
10111011

1012+
def test_format_with_na_rep(self):
1013+
# GH 21527 28358
1014+
df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
1015+
1016+
ctx = df.style.format(None, na_rep="-")._translate()
1017+
assert ctx["body"][0][1]["display_value"] == "-"
1018+
assert ctx["body"][0][2]["display_value"] == "-"
1019+
1020+
ctx = df.style.format("{:.2%}", na_rep="-")._translate()
1021+
assert ctx["body"][0][1]["display_value"] == "-"
1022+
assert ctx["body"][0][2]["display_value"] == "-"
1023+
assert ctx["body"][1][1]["display_value"] == "110.00%"
1024+
assert ctx["body"][1][2]["display_value"] == "120.00%"
1025+
1026+
ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate()
1027+
assert ctx["body"][0][2]["display_value"] == "-"
1028+
assert ctx["body"][1][2]["display_value"] == "120.00%"
1029+
1030+
def test_init_with_na_rep(self):
1031+
# GH 21527 28358
1032+
df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
1033+
1034+
ctx = Styler(df, na_rep="NA")._translate()
1035+
assert ctx["body"][0][1]["display_value"] == "NA"
1036+
assert ctx["body"][0][2]["display_value"] == "NA"
1037+
1038+
def test_set_na_rep(self):
1039+
# GH 21527 28358
1040+
df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
1041+
1042+
ctx = df.style.set_na_rep("NA")._translate()
1043+
assert ctx["body"][0][1]["display_value"] == "NA"
1044+
assert ctx["body"][0][2]["display_value"] == "NA"
1045+
1046+
ctx = (
1047+
df.style.set_na_rep("NA")
1048+
.format(None, na_rep="-", subset=["B"])
1049+
._translate()
1050+
)
1051+
assert ctx["body"][0][1]["display_value"] == "NA"
1052+
assert ctx["body"][0][2]["display_value"] == "-"
1053+
1054+
def test_format_non_numeric_na(self):
1055+
# GH 21527 28358
1056+
df = pd.DataFrame(
1057+
{
1058+
"object": [None, np.nan, "foo"],
1059+
"datetime": [None, pd.NaT, pd.Timestamp("20120101")],
1060+
}
1061+
)
1062+
1063+
ctx = df.style.set_na_rep("NA")._translate()
1064+
assert ctx["body"][0][1]["display_value"] == "NA"
1065+
assert ctx["body"][0][2]["display_value"] == "NA"
1066+
assert ctx["body"][1][1]["display_value"] == "NA"
1067+
assert ctx["body"][1][2]["display_value"] == "NA"
1068+
1069+
ctx = df.style.format(None, na_rep="-")._translate()
1070+
assert ctx["body"][0][1]["display_value"] == "-"
1071+
assert ctx["body"][0][2]["display_value"] == "-"
1072+
assert ctx["body"][1][1]["display_value"] == "-"
1073+
assert ctx["body"][1][2]["display_value"] == "-"
1074+
1075+
def test_format_with_bad_na_rep(self):
1076+
# GH 21527 28358
1077+
df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
1078+
with pytest.raises(TypeError):
1079+
df.style.format(None, na_rep=-1)
1080+
10121081
def test_highlight_null(self, null_color="red"):
10131082
df = pd.DataFrame({"A": [0, np.nan]})
10141083
result = df.style.highlight_null()._compute().ctx

0 commit comments

Comments
 (0)