Skip to content

BUG: groupby.hist legend should use group keys #33493

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jun 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ Other enhancements
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`)
- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`)
- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example
combining a nullable integer column with a numpy integer column will no longer
result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`).
Expand Down
62 changes: 41 additions & 21 deletions pandas/plotting/_core.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,33 @@
import importlib
from typing import TYPE_CHECKING, Optional, Sequence, Tuple, Union

from pandas._config import get_option

from pandas._typing import Label
from pandas.util._decorators import Appender, Substitution

from pandas.core.dtypes.common import is_integer, is_list_like
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries

from pandas.core.base import PandasObject

if TYPE_CHECKING:
from pandas import DataFrame


def hist_series(
self,
by=None,
ax=None,
grid=True,
xlabelsize=None,
xrot=None,
ylabelsize=None,
yrot=None,
figsize=None,
bins=10,
backend=None,
grid: bool = True,
xlabelsize: Optional[int] = None,
xrot: Optional[float] = None,
ylabelsize: Optional[int] = None,
yrot: Optional[float] = None,
figsize: Optional[Tuple[int, int]] = None,
bins: Union[int, Sequence[int]] = 10,
backend: Optional[str] = None,
legend: bool = False,
**kwargs,
):
"""
Expand Down Expand Up @@ -58,6 +64,11 @@ def hist_series(

.. versionadded:: 1.0.0

legend : bool, default False
Whether to show the legend.

..versionadded:: 1.1.0

**kwargs
To be passed to the actual plotting function.

Expand All @@ -82,26 +93,28 @@ def hist_series(
yrot=yrot,
figsize=figsize,
bins=bins,
legend=legend,
**kwargs,
)


def hist_frame(
data,
column=None,
data: "DataFrame",
column: Union[Label, Sequence[Label]] = None,
by=None,
grid=True,
xlabelsize=None,
xrot=None,
ylabelsize=None,
yrot=None,
grid: bool = True,
xlabelsize: Optional[int] = None,
xrot: Optional[float] = None,
ylabelsize: Optional[int] = None,
yrot: Optional[float] = None,
ax=None,
sharex=False,
sharey=False,
figsize=None,
layout=None,
bins=10,
backend=None,
sharex: bool = False,
sharey: bool = False,
figsize: Optional[Tuple[int, int]] = None,
layout: Optional[Tuple[int, int]] = None,
bins: Union[int, Sequence[int]] = 10,
backend: Optional[str] = None,
legend: bool = False,
**kwargs,
):
"""
Expand Down Expand Up @@ -154,6 +167,7 @@ def hist_frame(
bin edges are calculated and returned. If bins is a sequence, gives
bin edges, including left edge of first bin and right edge of last
bin. In this case, bins is returned unmodified.

backend : str, default None
Backend to use instead of the backend specified in the option
``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
Expand All @@ -162,6 +176,11 @@ def hist_frame(

.. versionadded:: 1.0.0

legend : bool, default False
Whether to show the legend.

..versionadded:: 1.1.0

**kwargs
All other plotting keyword arguments to be passed to
:meth:`matplotlib.pyplot.hist`.
Expand Down Expand Up @@ -203,6 +222,7 @@ def hist_frame(
sharey=sharey,
figsize=figsize,
layout=layout,
legend=legend,
bins=bins,
**kwargs,
)
Expand Down
32 changes: 31 additions & 1 deletion pandas/plotting/_matplotlib/hist.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def _grouped_hist(
xrot=None,
ylabelsize=None,
yrot=None,
legend=False,
**kwargs,
):
"""
Expand All @@ -243,15 +244,26 @@ def _grouped_hist(
sharey : bool, default False
rot : int, default 90
grid : bool, default True
legend: : bool, default False
kwargs : dict, keyword arguments passed to matplotlib.Axes.hist

Returns
-------
collection of Matplotlib Axes
"""
if legend:
assert "label" not in kwargs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is pandas the only one calling this, or could users call it with labels in kwargs?

Users shouldn't see bare asserts like this. It should be a ValueError.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And just to verify the expected behavior, if the user provides a label why wouldn't we want to use that? Can we just do kwargs.setdefault("label", data.name) (or the other cases)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the first question, this function is only called from hist_frame and hist_series, each of which raise a ValueError on this condition. This assert is to maintain this behavior if in the future any other code paths call it.

For the second question, see #33493 (comment)

if data.ndim == 1:
kwargs["label"] = data.name
elif column is None:
kwargs["label"] = data.columns
else:
kwargs["label"] = column

def plot_group(group, ax):
ax.hist(group.dropna().values, bins=bins, **kwargs)
if legend:
ax.legend()

if xrot is None:
xrot = rot
Expand Down Expand Up @@ -290,10 +302,14 @@ def hist_series(
yrot=None,
figsize=None,
bins=10,
legend: bool = False,
**kwds,
):
import matplotlib.pyplot as plt

if legend and "label" in kwds:
raise ValueError("Cannot use both legend and label")

if by is None:
if kwds.get("layout", None) is not None:
raise ValueError("The 'layout' keyword is not supported when 'by' is None")
Expand All @@ -308,8 +324,11 @@ def hist_series(
elif ax.get_figure() != fig:
raise AssertionError("passed axis not bound to passed figure")
values = self.dropna().values

if legend:
kwds["label"] = self.name
ax.hist(values, bins=bins, **kwds)
if legend:
ax.legend()
ax.grid(grid)
axes = np.array([ax])

Expand All @@ -334,6 +353,7 @@ def hist_series(
xrot=xrot,
ylabelsize=ylabelsize,
yrot=yrot,
legend=legend,
**kwds,
)

Expand All @@ -358,8 +378,11 @@ def hist_frame(
figsize=None,
layout=None,
bins=10,
legend: bool = False,
**kwds,
):
if legend and "label" in kwds:
raise ValueError("Cannot use both legend and label")
if by is not None:
axes = _grouped_hist(
data,
Expand All @@ -376,6 +399,7 @@ def hist_frame(
xrot=xrot,
ylabelsize=ylabelsize,
yrot=yrot,
legend=legend,
**kwds,
)
return axes
Expand All @@ -401,11 +425,17 @@ def hist_frame(
)
_axes = _flatten(axes)

can_set_label = "label" not in kwds

for i, col in enumerate(data.columns):
ax = _axes[i]
if legend and can_set_label:
kwds["label"] = col
ax.hist(data[col].dropna().values, bins=bins, **kwds)
ax.set_title(col)
ax.grid(grid)
if legend:
ax.legend()

_set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
Expand Down
49 changes: 48 additions & 1 deletion pandas/tests/plotting/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@


import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import DataFrame, Series
from pandas import DataFrame, Index, Series
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase

Expand Down Expand Up @@ -65,3 +66,49 @@ def test_plot_kwargs(self):

res = df.groupby("z").plot.scatter(x="x", y="y")
assert len(res["a"].collections) == 1

@pytest.mark.parametrize("column, expected_axes_num", [(None, 2), ("b", 1)])
def test_groupby_hist_frame_with_legend(self, column, expected_axes_num):
# GH 6279 - DataFrameGroupBy histogram can have a legend
expected_layout = (1, expected_axes_num)
expected_labels = column or [["a"], ["b"]]

index = Index(15 * ["1"] + 15 * ["2"], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])
g = df.groupby("c")

for axes in g.hist(legend=True, column=column):
self._check_axes_shape(
axes, axes_num=expected_axes_num, layout=expected_layout
)
for ax, expected_label in zip(axes[0], expected_labels):
self._check_legend_labels(ax, expected_label)

@pytest.mark.parametrize("column", [None, "b"])
def test_groupby_hist_frame_with_legend_raises(self, column):
# GH 6279 - DataFrameGroupBy histogram with legend and label raises
index = Index(15 * ["1"] + 15 * ["2"], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])
g = df.groupby("c")

with pytest.raises(ValueError, match="Cannot use both legend and label"):
g.hist(legend=True, column=column, label="d")

def test_groupby_hist_series_with_legend(self):
# GH 6279 - SeriesGroupBy histogram can have a legend
index = Index(15 * ["1"] + 15 * ["2"], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])
g = df.groupby("c")

for ax in g["a"].hist(legend=True):
self._check_axes_shape(ax, axes_num=1, layout=(1, 1))
self._check_legend_labels(ax, ["1", "2"])

def test_groupby_hist_series_with_legend_raises(self):
# GH 6279 - SeriesGroupBy histogram with legend and label raises
index = Index(15 * ["1"] + 15 * ["2"], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])
g = df.groupby("c")

with pytest.raises(ValueError, match="Cannot use both legend and label"):
g.hist(legend=True, label="d")
55 changes: 54 additions & 1 deletion pandas/tests/plotting/test_hist_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pandas.util._test_decorators as td

from pandas import DataFrame, Series
from pandas import DataFrame, Index, Series
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase, _check_plot_works

Expand Down Expand Up @@ -129,6 +129,29 @@ def test_plot_fails_when_ax_differs_from_figure(self):
with pytest.raises(AssertionError):
self.ts.hist(ax=ax1, figure=fig2)

@pytest.mark.parametrize(
"by, expected_axes_num, expected_layout", [(None, 1, (1, 1)), ("b", 2, (1, 2))]
)
def test_hist_with_legend(self, by, expected_axes_num, expected_layout):
# GH 6279 - Series histogram can have a legend
index = 15 * ["1"] + 15 * ["2"]
s = Series(np.random.randn(30), index=index, name="a")
s.index.name = "b"

axes = _check_plot_works(s.hist, legend=True, by=by)
self._check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout)
self._check_legend_labels(axes, "a")

@pytest.mark.parametrize("by", [None, "b"])
def test_hist_with_legend_raises(self, by):
# GH 6279 - Series histogram with legend and label raises
index = 15 * ["1"] + 15 * ["2"]
s = Series(np.random.randn(30), index=index, name="a")
s.index.name = "b"

with pytest.raises(ValueError, match="Cannot use both legend and label"):
s.hist(legend=True, by=by, label="c")


@td.skip_if_no_mpl
class TestDataFramePlots(TestPlotBase):
Expand Down Expand Up @@ -293,6 +316,36 @@ def test_hist_column_order_unchanged(self, column, expected):

assert result == expected

@pytest.mark.parametrize("by", [None, "c"])
@pytest.mark.parametrize("column", [None, "b"])
def test_hist_with_legend(self, by, column):
# GH 6279 - DataFrame histogram can have a legend
expected_axes_num = 1 if by is None and column is not None else 2
expected_layout = (1, expected_axes_num)
expected_labels = column or ["a", "b"]
if by is not None:
expected_labels = [expected_labels] * 2

index = Index(15 * ["1"] + 15 * ["2"], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])

axes = _check_plot_works(df.hist, legend=True, by=by, column=column)
self._check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout)
if by is None and column is None:
axes = axes[0]
for expected_label, ax in zip(expected_labels, axes):
self._check_legend_labels(ax, expected_label)

@pytest.mark.parametrize("by", [None, "c"])
@pytest.mark.parametrize("column", [None, "b"])
def test_hist_with_legend_raises(self, by, column):
# GH 6279 - DataFrame histogram with legend and label raises
index = Index(15 * ["1"] + 15 * ["2"], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])

with pytest.raises(ValueError, match="Cannot use both legend and label"):
df.hist(legend=True, by=by, column=column, label="d")


@td.skip_if_no_mpl
class TestDataFrameGroupByPlots(TestPlotBase):
Expand Down