Skip to content

BUG: groupby.hist legend should use group keys #33493

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jun 22, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pandas/plotting/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def hist_series(
yrot=None,
figsize=None,
bins=10,
legend=False,
backend=None,
**kwargs,
):
Expand Down Expand Up @@ -50,6 +51,8 @@ def hist_series(
bin edges are calculated and returned. If bins is a sequence, gives
bin edges, including left edge of first bin and right edge of last
bin. In this case, bins is returned unmodified.
legend : bool, default False
Whether to show the legend.
backend : str, default None
Backend to use instead of the backend specified in the option
``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
Expand Down Expand Up @@ -82,6 +85,7 @@ def hist_series(
yrot=yrot,
figsize=figsize,
bins=bins,
legend=legend,
**kwargs,
)

Expand All @@ -101,6 +105,7 @@ def hist_frame(
figsize=None,
layout=None,
bins=10,
legend=False,
backend=None,
**kwargs,
):
Expand Down Expand Up @@ -154,6 +159,8 @@ def hist_frame(
bin edges are calculated and returned. If bins is a sequence, gives
bin edges, including left edge of first bin and right edge of last
bin. In this case, bins is returned unmodified.
legend : bool, default False
Whether to show the legend.
backend : str, default None
Backend to use instead of the backend specified in the option
``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
Expand Down Expand Up @@ -203,6 +210,7 @@ def hist_frame(
sharey=sharey,
figsize=figsize,
layout=layout,
legend=legend,
bins=bins,
**kwargs,
)
Expand Down
28 changes: 27 additions & 1 deletion pandas/plotting/_matplotlib/hist.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def _grouped_hist(
xrot=None,
ylabelsize=None,
yrot=None,
legend=False,
**kwargs,
):
"""
Expand All @@ -243,15 +244,27 @@ def _grouped_hist(
sharey : bool, default False
rot : int, default 90
grid : bool, default True
legend: : bool, default False
kwargs : dict, keyword arguments passed to matplotlib.Axes.hist

Returns
-------
collection of Matplotlib Axes
"""

if legend and "label" not in kwargs:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why someone would pass legend and label together? Should this not just raise?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only reason I think is to rename for output:

index = 3 * ['1'] + 3 * ['2']
df = DataFrame([(3, 4)]*3 + [(5, 6)]*3, index=index, columns=['a', 'b'])
df.index.names = ['c']
ret = df.hist(by='c', legend=True, label=['Nice Name 1', 'Nice Name 2'])

image

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should just raise instead

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me that we would then be raising on the only case where label has any effect - or am I missing another case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I don't understand the reason why we would raise here when not raising makes it easier for the end user have different names appear in the legend. What does raising gain?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So my thought is that If a user wants particular labels they should just provide that to the grouper; adding this here is just another way of doing things so just API clout

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would completely agree if this was implemented within pandas. However it's being passed through to matplotlib via kwargs. Disabling certain kwargs from working and allowing others to go through seems hazardous to user expectations. But even if this is unconvincing, I think we should strive to be consistent with various plots:

index = 3 * ['1'] + 3 * ['2']
df = DataFrame([(3, 4)]*3 + [(5, 6)]*3, index=index, columns=['a', 'b'])
df.index.names = ['c']
ret = df.plot(y=['a', 'b'], label=['Nice Name 1', 'Nice Name 2'])

image

Perhaps an issue could be raised as to how labels should be treated across all plots?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well shoot. As a counter-argument to what I just said. DataFrame.plot.hist ignores label altogether. I suppose if there is one plotting function we should be consistent with here, it's that one. What do you think about not raising, but simply ignoring any label kwarg?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think we should raise instead of ignoring; ignoring leads to confusion or surprising behavior, so always better to be explicit

if isinstance(data, ABCDataFrame):
if column is None:
kwargs["label"] = data.columns
else:
kwargs["label"] = column
else:
kwargs["label"] = data.name

def plot_group(group, ax):
ax.hist(group.dropna().values, bins=bins, **kwargs)
if legend:
ax.legend()

if xrot is None:
xrot = rot
Expand Down Expand Up @@ -290,6 +303,7 @@ def hist_series(
yrot=None,
figsize=None,
bins=10,
legend=False,
**kwds,
):
import matplotlib.pyplot as plt
Expand All @@ -308,8 +322,11 @@ def hist_series(
elif ax.get_figure() != fig:
raise AssertionError("passed axis not bound to passed figure")
values = self.dropna().values

if legend and "label" not in kwds:
kwds["label"] = self.name
ax.hist(values, bins=bins, **kwds)
if legend:
ax.legend()
ax.grid(grid)
axes = np.array([ax])

Expand All @@ -334,6 +351,7 @@ def hist_series(
xrot=xrot,
ylabelsize=ylabelsize,
yrot=yrot,
legend=legend,
**kwds,
)

Expand All @@ -358,6 +376,7 @@ def hist_frame(
figsize=None,
layout=None,
bins=10,
legend=False,
**kwds,
):
if by is not None:
Expand All @@ -376,6 +395,7 @@ def hist_frame(
xrot=xrot,
ylabelsize=ylabelsize,
yrot=yrot,
legend=legend,
**kwds,
)
return axes
Expand All @@ -401,11 +421,17 @@ def hist_frame(
)
_axes = _flatten(axes)

can_set_label = "label" not in kwds

for i, col in enumerate(data.columns):
ax = _axes[i]
if legend and can_set_label:
kwds["label"] = col
ax.hist(data[col].dropna().values, bins=bins, **kwds)
ax.set_title(col)
ax.grid(grid)
if legend:
ax.legend()

_set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
Expand Down
18 changes: 17 additions & 1 deletion pandas/tests/plotting/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@


import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import DataFrame, Series
from pandas import DataFrame, Index, Series
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase

Expand Down Expand Up @@ -65,3 +66,18 @@ def test_plot_kwargs(self):

res = df.groupby("z").plot.scatter(x="x", y="y")
assert len(res["a"].collections) == 1


@td.skip_if_no_mpl
@pytest.mark.parametrize("column", [None, "b"])
@pytest.mark.parametrize("label", [None, "d"])
def test_hist_with_legend(column, label):
index = Index(15 * [1] + 15 * [2], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])
g = df.groupby("c")

g.hist(column=column, label=label, legend=True)
tm.close()
if column != "b":
g["a"].hist(label=label, legend=True)
tm.close()
51 changes: 50 additions & 1 deletion pandas/tests/plotting/test_hist_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pandas.util._test_decorators as td

from pandas import DataFrame, Series
from pandas import DataFrame, Index, Series
import pandas._testing as tm
from pandas.tests.plotting.common import TestPlotBase, _check_plot_works

Expand Down Expand Up @@ -293,6 +293,28 @@ def test_hist_column_order_unchanged(self, column, expected):

assert result == expected

@pytest.mark.slow
@pytest.mark.parametrize("by", [None, "b"])
@pytest.mark.parametrize("label", [None, "c"])
def test_hist_with_legend(self, by, label):
expected_labels = label or "a"
expected_axes_num = 1 if by is None else 2
expected_layout = (1, 1) if by is None else (1, 2)

index = 15 * [1] + 15 * [2]
s = Series(np.random.randn(30), index=index, name="a")
s.index.name = "b"

kwargs = {"legend": True, "by": by}
if label is not None:
# Behavior differs if kwargs contains "label": None
kwargs["label"] = label

_check_plot_works(s.hist, **kwargs)
axes = s.hist(**kwargs)
self._check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout)
self._check_legend_labels(axes, expected_labels)


@td.skip_if_no_mpl
class TestDataFrameGroupByPlots(TestPlotBase):
Expand Down Expand Up @@ -484,3 +506,30 @@ def test_axis_share_xy(self):

assert ax1._shared_y_axes.joined(ax1, ax2)
assert ax2._shared_y_axes.joined(ax1, ax2)

@pytest.mark.slow
@pytest.mark.parametrize("by", [None, "c"])
@pytest.mark.parametrize("column", [None, "b"])
@pytest.mark.parametrize("label", [None, "d"])
def test_hist_with_legend(self, by, column, label):
expected_axes_num = 1 if by is None and column is not None else 2
expected_layout = (1, expected_axes_num)
expected_labels = label or column or ["a", "b"]
if by is not None:
expected_labels = [expected_labels] * 2

index = Index(15 * [1] + 15 * [2], name="c")
df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"])

kwargs = {"legend": True, "by": by, "column": column}
if label is not None:
# Behavior differs if kwargs contains "label": None
kwargs["label"] = label

_check_plot_works(df.hist, **kwargs)
axes = df.hist(**kwargs)
self._check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout)
if by is None:
axes = axes[0]
for expected_label, ax in zip(expected_labels, axes):
self._check_legend_labels(ax, expected_label)