diff --git a/doc/source/_static/style/des_mean.png b/doc/source/_static/style/des_mean.png new file mode 100644 index 0000000000000..d52bbf55d40ae Binary files /dev/null and b/doc/source/_static/style/des_mean.png differ diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index dd7e2fe7434cd..6a0d727b4f4cb 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -45,6 +45,7 @@ Style application Styler.set_td_classes Styler.set_table_styles Styler.set_table_attributes + Styler.set_descriptors Styler.set_tooltips Styler.set_caption Styler.set_sticky diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index c3ff0ec286968..1188b34bb63ca 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -216,6 +216,47 @@ "weather_df.loc[\"2021-01-04\":\"2021-01-08\"].style.pipe(make_pretty)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Describing Data\n", + "\n", + "The data can also be explored with the ability to add header level calculations. The [.set_descriptors()][descriptors] method is used here. 
We begin with a large DataFrame and reconfigure the `pandas.options` to reduce the rendered size, whilst adding descriptors we wish to calculate on the data.\n", + "\n", + "[descriptors]: ../reference/api/pandas.io.formats.style.Styler.set_descriptors.rst" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.styler.render.max_rows = 5\n", + "df_described = pd.DataFrame({\"A\": np.random.randn(1000), \n", + " \"B\": np.random.randint(low=-10, high=10, size=1000, dtype=\"int64\")})\n", + "df_described.style.set_descriptors([\n", + " \"mean\",\n", + " (\"mean 2dp\", lambda s: f\"{s.mean():.2f}\"),\n", + " (\"std\", pd.Series.std),\n", + " \"nunique\",\n", + " lambda s: s.dtype,\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Hidden cell to reset pandas options \n", + "pd.options.styler.render.max_rows = None" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1661,6 +1702,7 @@ " + `col<n>`, where `n` is the numeric position of the cell.\n", "- Blank cells include `blank`\n", "- Trimmed cells include `col_trim` or `row_trim`\n", + "- Descriptor name cells include `descriptor_name`, descriptor value cells include `descriptor_value`, and both also include `descriptor<j>`, where `j` is the numeric position of the descriptor in the list of descriptors.\n", "\n", "The structure of the `id` is `T_uuid_level_row_col` where `level` is used only on headings, and headings will only have either `row` or `col` whichever is needed. By default we've also prepended each row/column identifier with a UUID unique to each DataFrame so that the style from one doesn't collide with the styling from another within the same notebook or page. 
You can read more about the use of UUIDs in [Optimization](#Optimization).\n", "\n", @@ -1675,7 +1717,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(pd.DataFrame([[1,2],[3,4]], index=['i1', 'i2'], columns=['c1', 'c2']).style.to_html())" + "print(pd.DataFrame([[1,2],[3,4]], index=['i1', 'i2'], columns=['c1', 'c2']).style.set_descriptors([\"mean\"]).to_html())" ] }, { diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b316b4ff2d688..726b761f31e96 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -20,6 +20,7 @@ Styler ^^^^^^ - New method :meth:`.Styler.to_string` for alternative customisable output methods (:issue:`44502`) + - Added a new method :meth:`.Styler.set_descriptors` which allows adding customised header rows to explore and make calculations on the data, e.g. totals and counts etc. (:issue:`43875`) - Various bug fixes, see below. .. _whatsnew_150.enhancements.enhancement2: diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index f97f558fd0e0b..244c66c6ff3db 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -54,6 +54,7 @@ from pandas.io.formats.style_render import ( CSSProperties, CSSStyles, + Descriptor, ExtFormatter, StylerRenderer, Subset, @@ -1435,6 +1436,7 @@ def _copy(self, deepcopy: bool = False) -> Styler: ] deep = [ # nested lists or dicts "css", + "descriptors", "_display_funcs", "_display_funcs_index", "_display_funcs_columns", @@ -1977,6 +1979,9 @@ def export(self) -> dict[str, Any]: Can be applied to a second Styler with ``Styler.use``. + .. versionchanged:: 1.5.0 + Adds ``descriptors`` to the exported items. + Returns ------- styles : dict @@ -1998,6 +2003,7 @@ def export(self) -> dict[str, Any]: - Whether axes and names are hidden from the display, if unambiguous. 
def set_descriptors(
    self, descriptors: list[Descriptor | tuple[str, Descriptor]] | None = None
) -> Styler:
    """
    Add header-level calculations to the output which describe the data.

    Every descriptor produces one additional header row, holding the result
    of the calculation for each column.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    descriptors : list of str, callables or 2-tuples of str and callable, optional
        The complete list of descriptors to display. Any descriptors set
        previously are replaced; ``None`` (the default) removes them all.

        - If a string is given, it must be the name of a Series method,
          e.g. "mean" invokes ``Series.mean()``.
        - If a callable is given, it must accept a Series and return a scalar.
        - If a 2-tuple is given, it must be a string, used as the name of the
          row, and a callable or string as above.

    Returns
    -------
    self : Styler

    Examples
    --------
    >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    >>> def udf_func(s):
    ...     return s.mean()
    >>> styler = df.style.set_descriptors([
    ...     "mean",
    ...     pd.Series.mean,
    ...     ("my-text", "mean"),
    ...     ("my-text2", pd.Series.mean),
    ...     ("my-func", lambda s: s.sum()/2),
    ...     lambda s: s.sum()/2,
    ...     udf_func,
    ... ])  # doctest: +SKIP

    .. figure:: ../../_static/style/des_mean.png
    """
    # Keep a shallow copy: holding a reference to the caller's list would let
    # later mutation of that list silently change what this Styler renders.
    self.descriptors = list(descriptors) if descriptors is not None else []
    return self
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 1fe36a34903ab..1873c16cbca90 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -25,6 +25,11 @@ from pandas._typing import Level from pandas.compat._optional import import_optional_dependency +from pandas.core.dtypes.common import ( + is_complex, + is_float, + is_integer, +) from pandas.core.dtypes.generic import ABCSeries from pandas import ( @@ -46,6 +51,7 @@ CSSPair = Tuple[str, Union[str, int, float]] CSSList = List[CSSPair] CSSProperties = Union[str, CSSList] +Descriptor = Union[str, Callable[[Series], Any]] class CSSDict(TypedDict): @@ -115,6 +121,9 @@ def __init__( "level": "level", "data": "data", "blank": "blank", + "descriptor": "descriptor", + "descriptor_value": "descriptor_value", + "descriptor_name": "descriptor_name", } # add rendering variables @@ -124,6 +133,7 @@ def __init__( self.hide_columns_: list = [False] * self.columns.nlevels self.hidden_rows: Sequence[int] = [] # sequence for specific hidden rows/cols self.hidden_columns: Sequence[int] = [] + self.descriptors: list[Descriptor | tuple[str, Descriptor]] = [] self.ctx: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) self.ctx_index: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) self.ctx_columns: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) @@ -329,7 +339,9 @@ def _translate_header(self, sparsify_cols: bool, max_cols: int): 1) | .. | .. | .. | | index_blanks ... | column_name_n | column_headers (level_n) | +----------------------------+---------------+---------------------------+ - 2) | index_names (level_0 to level_n) ... | column_blanks ... | + 2) | index_blanks ... | descriptor | value by column | + +----------------------------+---------------+---------------------------+ + 3) | index_names (level_0 to level_n) ... | column_blanks ... 
def _generate_descriptor_row(self, iter: tuple, max_cols: int):
    """
    Generate the row containing calculated descriptor values for columns:

    +----------------------------+---------------+---------------------------+
    |  index_blanks ...          | descriptor_i  |  value_i by col           |
    +----------------------------+---------------+---------------------------+

    Parameters
    ----------
    iter : tuple
        Looping variables from outer scope: the position of the descriptor in
        ``self.descriptors`` and the descriptor itself (a string, a callable
        or a 2-tuple of name and string/callable).
    max_cols : int
        Permissible number of columns

    Returns
    -------
    list of elements
    """
    r, descriptor = iter

    # number of index blanks is governed by number of hidden index levels
    index_blanks = [
        _element("th", self.css["blank"], self.css["blank_value"], True)
    ] * (self.index.nlevels - sum(self.hide_index_) - 1)

    # resolve the descriptor into a row label and the callable to apply
    if isinstance(descriptor, str):
        name: str | None = descriptor
        func: Callable = getattr(Series, descriptor)
    elif isinstance(descriptor, tuple):
        name = descriptor[0]
        if isinstance(descriptor[1], str):
            func = getattr(Series, descriptor[1])
        else:
            func = descriptor[1]
    else:
        name, func = getattr(descriptor, "__name__", None), descriptor
        # lambdas report ``__name__ == "<lambda>"``: treat them as nameless so
        # that the label cell is rendered blank
        name = None if name == "<lambda>" else name

    # descriptor values are formatted with the global ``styler.format`` options
    display_func: Callable = _maybe_wrap_formatter(
        decimal=get_option("styler.format.decimal"),
        thousands=get_option("styler.format.thousands"),
        precision=get_option("styler.format.precision"),
        na_rep=get_option("styler.format.na_rep"),
        escape=get_option("styler.format.escape"),
    )

    base_css = f"{self.css['descriptor_name']} {self.css['descriptor']}{r}"
    if name is not None and not self.hide_column_names:
        name_css = base_css
        name_val = name
    else:
        # nameless descriptor, or column names hidden: blank label cell
        name_css = f"{self.css['blank']} {base_css}"
        name_val = self.css["blank_value"]
    descriptor_name = _element("th", name_css, name_val, not all(self.hide_index_))

    value_css = f"{self.css['descriptor_value']} {self.css['descriptor']}{r}"
    descriptor_values, visible_col_count = [], 0
    for c, col in enumerate(self.columns):
        header_element_visible = c not in self.hidden_columns
        if header_element_visible:
            visible_col_count += 1

        if visible_col_count > max_cols:
            # add an extra column with `...` value to indicate trimming; this
            # is checked before calculating so that no descriptor is computed
            # for a column which is not displayed
            descriptor_values.append(
                _element(
                    "th",
                    f"{value_css} {self.css['col_trim']}",
                    "...",
                    True,
                    attributes="",
                )
            )
            break

        if header_element_visible:
            try:
                header_element_value = func(self.data[col])
            except Exception:
                # best effort: a calculation failing on a column renders a
                # blank cell instead of aborting the whole render
                header_element_value = self.css["blank_value"]
        else:
            header_element_value = None

        descriptor_values.append(
            _element(
                "th",
                f"{value_css} {self.css['col']}{c}",
                header_element_value,
                header_element_visible,
                display_value=display_func(header_element_value),
                attributes="",
            )
        )

    return index_blanks + [descriptor_name] + descriptor_values
def test_format_descriptors(styler):
    """Descriptor values are displayed according to the ``styler.format`` options."""
    format_options = (
        "styler.format.precision",
        5,
        "styler.format.decimal",
        "*",
        "styler.format.thousands",
        "_",
    )
    with option_context(*format_options):
        styler.set_descriptors([lambda s: s.sum() + 1000])
        head = styler._translate(True, True)["head"]

    # (position within the second header row, expected raw and displayed value)
    expected_cells = [
        (1, {"value": 1001, "display_value": "1_001"}),
        (2, {"value": 998.163, "display_value": "998*16300"}),
    ]
    for position, expected in expected_cells:
        assert expected.items() <= head[1][position].items()
def test_descriptors(mi_styler):
    """Every accepted descriptor form yields one labelled, classed header row."""

    def udf_mean(s):
        return s.mean()

    mi_styler.set_descriptors(
        [
            "mean",
            Series.mean,
            ("average", "mean"),
            ("my-text", Series.mean),
            ("my-func", lambda s: s.sum() / len(s)),
            lambda s: s.sum() / len(s),
            udf_mean,
        ]
    )
    ctx = mi_styler._translate(True, True)
    assert len(ctx["head"]) == 9  # 2 rows for MultiIndex columns and 7 descriptors

    # a nameless (lambda) descriptor is labelled with the configured blank value
    # rather than a hard-coded literal
    blank = mi_styler.css["blank_value"]
    exp_labels = ["mean", "mean", "average", "my-text", "my-func", blank, "udf_mean"]
    first_descriptor_row = ctx["head"][2]
    for r, row in enumerate(ctx["head"][2:9]):  # iterate after col headers
        for c, col in enumerate(row[2:]):  # iterate after row headers
            result = {k: col[k] for k in ["type", "is_visible", "value"]}
            # every descriptor computes a mean, so all 7 rows must agree with
            # the first descriptor row in value, type and visibility
            assert result.items() <= first_descriptor_row[c + 2].items()
            assert col["class"] == f"descriptor_value descriptor{r} col{c}"  # test css

        assert row[1]["value"] == exp_labels[r]  # test label is printed
        assert f"descriptor_name descriptor{r}" in row[1]["class"]  # test css