diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index e842b73664e6c..06dba0979c7eb 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -168,6 +168,8 @@ previously results in ``Exception`` or ``TypeError`` (:issue:`7812`) - ``Timestamp.__repr__`` displays ``dateutil.tz.tzoffset`` info (:issue:`7907`) +- Histogram from ``DataFrame.plot`` with ``kind='hist'`` (:issue:`7809`); see :ref:`the docs`. + .. _whatsnew_0150.dt: .dt accessor diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 69e04483cb47d..40b5d7c1599c1 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -123,6 +123,7 @@ a handful of values for plots other than the default Line plot. These include: * :ref:`'bar' ` or :ref:`'barh' ` for bar plots +* :ref:`'hist' ` for histograms * :ref:`'kde' ` or ``'density'`` for density plots * :ref:`'area' ` for area plots * :ref:`'scatter' ` for scatter plots @@ -205,6 +206,46 @@ To get horizontal bar plots, pass ``kind='barh'``: Histograms ~~~~~~~~~~ + +.. versionadded:: 0.15.0 + +Histograms can be drawn by specifying ``kind='hist'``. + +.. ipython:: python + + df4 = DataFrame({'a': randn(1000) + 1, 'b': randn(1000), + 'c': randn(1000) - 1}, columns=['a', 'b', 'c']) + + plt.figure(); + + @savefig hist_new.png + df4.plot(kind='hist', alpha=0.5) + +Histograms can be stacked by passing ``stacked=True``. The bin size can be changed with the ``bins`` keyword. + +.. ipython:: python + + plt.figure(); + + @savefig hist_new_stacked.png + df4.plot(kind='hist', stacked=True, bins=20) + +You can pass other keywords supported by matplotlib ``hist``. For example, horizontal and cumulative histograms can be drawn by ``orientation='horizontal'`` and ``cumulative=True``. + +.. ipython:: python + + plt.figure(); + + @savefig hist_new_kwargs.png + df4['a'].plot(kind='hist', orientation='horizontal', cumulative=True) + + +See the :meth:`hist ` method and the +`matplotlib hist documentation `__ for more. 
+ + +The previous interface ``DataFrame.hist`` to plot histogram still can be used. + .. ipython:: python plt.figure(); diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 8dbcb8c542fb3..b3a92263370e8 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -452,7 +452,7 @@ def test_plot(self): _check_plot_works(self.ts.plot, kind='area', stacked=False) _check_plot_works(self.iseries.plot) - for kind in ['line', 'bar', 'barh', 'kde']: + for kind in ['line', 'bar', 'barh', 'kde', 'hist']: if not _ok_for_gaussian_kde(kind): continue _check_plot_works(self.series[:5].plot, kind=kind) @@ -616,7 +616,13 @@ def test_pie_series(self): self._check_text_labels(ax.texts, series.index) @slow - def test_hist(self): + def test_hist_df_kwargs(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.plot(kind='hist', bins=5) + self.assertEqual(len(ax.patches), 10) + + @slow + def test_hist_legacy(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) _check_plot_works(self.ts.hist, figsize=(8, 10)) @@ -637,7 +643,7 @@ def test_hist(self): self.ts.hist(by=self.ts.index, figure=fig) @slow - def test_hist_bins(self): + def test_hist_bins_legacy(self): df = DataFrame(np.random.randn(10, 2)) ax = df.hist(bins=2)[0][0] self.assertEqual(len(ax.patches), 2) @@ -701,13 +707,25 @@ def test_plot_fails_when_ax_differs_from_figure(self): self.ts.hist(ax=ax1, figure=fig2) @slow - def test_kde(self): + def test_hist_kde(self): + ax = self.ts.plot(kind='hist', logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + # ticks are values, thus ticklabels are blank + self._check_text_labels(xlabels, [''] * len(xlabels)) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() _check_plot_works(self.ts.plot, kind='kde') _check_plot_works(self.ts.plot, kind='density') ax = self.ts.plot(kind='kde', 
logy=True) self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [''] * len(xlabels)) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) @slow def test_kde_kwargs(self): @@ -718,9 +736,29 @@ def test_kde_kwargs(self): _check_plot_works(self.ts.plot, kind='density', bw_method=.5, ind=linspace(-100,100,20)) ax = self.ts.plot(kind='kde', logy=True, bw_method=.5, ind=linspace(-100,100,20)) self._check_ax_scales(ax, yaxis='log') + self._check_text_labels(ax.yaxis.get_label(), 'Density') @slow - def test_kde_color(self): + def test_hist_kwargs(self): + ax = self.ts.plot(kind='hist', bins=5) + self.assertEqual(len(ax.patches), 5) + self._check_text_labels(ax.yaxis.get_label(), 'Degree') + tm.close() + + ax = self.ts.plot(kind='hist', orientation='horizontal') + self._check_text_labels(ax.xaxis.get_label(), 'Degree') + tm.close() + + ax = self.ts.plot(kind='hist', align='left', stacked=True) + tm.close() + + @slow + def test_hist_kde_color(self): + ax = self.ts.plot(kind='hist', logy=True, bins=10, color='b') + self._check_ax_scales(ax, yaxis='log') + self.assertEqual(len(ax.patches), 10) + self._check_colors(ax.patches, facecolors=['b'] * 10) + tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() ax = self.ts.plot(kind='kde', logy=True, color='r') @@ -1611,7 +1649,7 @@ def test_boxplot_return_type(self): self._check_box_return_type(result, 'both') @slow - def test_kde(self): + def test_kde_df(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() df = DataFrame(randn(100, 4)) @@ -1630,7 +1668,122 @@ def test_kde(self): self._check_ax_scales(axes, yaxis='log') @slow - def test_hist(self): + def test_hist_df(self): + df = DataFrame(randn(100, 4)) + series = df[0] + + ax = _check_plot_works(df.plot, kind='hist') + expected = [com.pprint_thing(c) for c in df.columns] + self._check_legend_labels(ax, labels=expected) + + axes = _check_plot_works(df.plot, kind='hist', 
subplots=True, logy=True) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + self._check_ax_scales(axes, yaxis='log') + + axes = series.plot(kind='hist', rot=40) + self._check_ticks_props(axes, xrot=40, yrot=0) + tm.close() + + ax = series.plot(kind='hist', normed=True, cumulative=True, bins=4) + # height of last bin (index 5) must be 1.0 + self.assertAlmostEqual(ax.get_children()[5].get_height(), 1.0) + tm.close() + + ax = series.plot(kind='hist', cumulative=True, bins=4) + self.assertAlmostEqual(ax.get_children()[5].get_height(), 100.0) + tm.close() + + # if horizontal, yticklabels are rotated + axes = df.plot(kind='hist', rot=50, fontsize=8, orientation='horizontal') + self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8) + + def _check_box_coord(self, patches, expected_y=None, expected_h=None, + expected_x=None, expected_w=None): + result_y = np.array([p.get_y() for p in patches]) + result_height = np.array([p.get_height() for p in patches]) + result_x = np.array([p.get_x() for p in patches]) + result_width = np.array([p.get_width() for p in patches]) + + if expected_y is not None: + self.assert_numpy_array_equal(result_y, expected_y) + if expected_h is not None: + self.assert_numpy_array_equal(result_height, expected_h) + if expected_x is not None: + self.assert_numpy_array_equal(result_x, expected_x) + if expected_w is not None: + self.assert_numpy_array_equal(result_width, expected_w) + + @slow + def test_hist_df_coord(self): + normal_df = DataFrame({'A': np.repeat(np.array([1, 2, 3, 4, 5]), + np.array([10, 9, 8, 7, 6])), + 'B': np.repeat(np.array([1, 2, 3, 4, 5]), + np.array([8, 8, 8, 8, 8])), + 'C': np.repeat(np.array([1, 2, 3, 4, 5]), + np.array([6, 7, 8, 9, 10]))}, + columns=['A', 'B', 'C']) + + nan_df = DataFrame({'A': np.repeat(np.array([np.nan, 1, 2, 3, 4, 5]), + np.array([3, 10, 9, 8, 7, 6])), + 'B': np.repeat(np.array([1, np.nan, 2, 3, 4, 5]), + np.array([8, 3, 8, 8, 8, 8])), + 'C': np.repeat(np.array([1, 2, 3, np.nan, 4, 5]), + 
np.array([6, 7, 8, 3, 9, 10]))}, + columns=['A', 'B', 'C']) + + for df in [normal_df, nan_df]: + ax = df.plot(kind='hist', bins=5) + self._check_box_coord(ax.patches[:5], expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6])) + self._check_box_coord(ax.patches[5:10], expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8])) + self._check_box_coord(ax.patches[10:], expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10])) + + ax = df.plot(kind='hist', bins=5, stacked=True) + self._check_box_coord(ax.patches[:5], expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6])) + self._check_box_coord(ax.patches[5:10], expected_y=np.array([10, 9, 8, 7, 6]), + expected_h=np.array([8, 8, 8, 8, 8])) + self._check_box_coord(ax.patches[10:], expected_y=np.array([18, 17, 16, 15, 14]), + expected_h=np.array([6, 7, 8, 9, 10])) + + axes = df.plot(kind='hist', bins=5, stacked=True, subplots=True) + self._check_box_coord(axes[0].patches, expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6])) + self._check_box_coord(axes[1].patches, expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8])) + self._check_box_coord(axes[2].patches, expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10])) + + # horizontal + ax = df.plot(kind='hist', bins=5, orientation='horizontal') + self._check_box_coord(ax.patches[:5], expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6])) + self._check_box_coord(ax.patches[5:10], expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8])) + self._check_box_coord(ax.patches[10:], expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10])) + + ax = df.plot(kind='hist', bins=5, stacked=True, orientation='horizontal') + self._check_box_coord(ax.patches[:5], expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6])) + 
self._check_box_coord(ax.patches[5:10], expected_x=np.array([10, 9, 8, 7, 6]), + expected_w=np.array([8, 8, 8, 8, 8])) + self._check_box_coord(ax.patches[10:], expected_x=np.array([18, 17, 16, 15, 14]), + expected_w=np.array([6, 7, 8, 9, 10])) + + axes = df.plot(kind='hist', bins=5, stacked=True, + subplots=True, orientation='horizontal') + self._check_box_coord(axes[0].patches, expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6])) + self._check_box_coord(axes[1].patches, expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8])) + self._check_box_coord(axes[2].patches, expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10])) + + @slow + def test_hist_df_legacy(self): _check_plot_works(self.hist_df.hist) # make sure layout is handled @@ -1849,7 +2002,7 @@ def test_plot_int_columns(self): @slow def test_df_legend_labels(self): - kinds = 'line', 'bar', 'barh', 'kde', 'area' + kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) df2 = DataFrame(rand(3, 3), columns=['d', 'e', 'f']) df3 = DataFrame(rand(3, 3), columns=['g', 'h', 'i']) @@ -1927,7 +2080,7 @@ def test_legend_name(self): @slow def test_no_legend(self): - kinds = 'line', 'bar', 'barh', 'kde', 'area' + kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) for kind in kinds: @@ -2019,6 +2172,56 @@ def test_area_colors(self): poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] self._check_colors(poly, facecolors=rgba_colors) + @slow + def test_hist_colors(self): + default_colors = self.plt.rcParams.get('axes.color_cycle') + + df = DataFrame(randn(5, 5)) + ax = df.plot(kind='hist') + self._check_colors(ax.patches[::10], facecolors=default_colors[:5]) + tm.close() + + custom_colors = 'rgcby' + ax = df.plot(kind='hist', color=custom_colors) + self._check_colors(ax.patches[::10], facecolors=custom_colors) + tm.close() + + from 
matplotlib import cm + # Test str -> colormap functionality + ax = df.plot(kind='hist', colormap='jet') + rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5)) + self._check_colors(ax.patches[::10], facecolors=rgba_colors) + tm.close() + + # Test colormap functionality + ax = df.plot(kind='hist', colormap=cm.jet) + rgba_colors = lmap(cm.jet, np.linspace(0, 1, 5)) + self._check_colors(ax.patches[::10], facecolors=rgba_colors) + tm.close() + + ax = df.ix[:, [0]].plot(kind='hist', color='DodgerBlue') + self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) + + @slow + def test_kde_colors(self): + from matplotlib import cm + + custom_colors = 'rgcby' + df = DataFrame(rand(5, 5)) + + ax = df.plot(kind='kde', color=custom_colors) + self._check_colors(ax.get_lines(), linecolors=custom_colors) + tm.close() + + ax = df.plot(kind='kde', colormap='jet') + rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + tm.close() + + ax = df.plot(kind='kde', colormap=cm.jet) + rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) + self._check_colors(ax.get_lines(), linecolors=rgba_colors) + def test_default_color_cycle(self): import matplotlib.pyplot as plt plt.rcParams['axes.color_cycle'] = list('rgbk') diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 5d85b68234f96..7d0eaea5b36d6 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1359,58 +1359,6 @@ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): return errors -class KdePlot(MPLPlot): - orientation = 'vertical' - - def __init__(self, data, bw_method=None, ind=None, **kwargs): - MPLPlot.__init__(self, data, **kwargs) - self.bw_method=bw_method - self.ind=ind - - def _make_plot(self): - from scipy.stats import gaussian_kde - from scipy import __version__ as spv - from distutils.version import LooseVersion - plotf = self.plt.Axes.plot - colors = self._get_colors() - for i, (label, y) in 
enumerate(self._iter_data()): - ax = self._get_ax(i) - style = self._get_style(i, label) - - label = com.pprint_thing(label) - - if LooseVersion(spv) >= '0.11.0': - gkde = gaussian_kde(y, bw_method=self.bw_method) - else: - gkde = gaussian_kde(y) - if self.bw_method is not None: - msg = ('bw_method was added in Scipy 0.11.0.' + - ' Scipy version in use is %s.' % spv) - warnings.warn(msg) - - sample_range = max(y) - min(y) - - if self.ind is None: - ind = np.linspace(min(y) - 0.5 * sample_range, - max(y) + 0.5 * sample_range, 1000) - else: - ind = self.ind - - ax.set_ylabel("Density") - - y = gkde.evaluate(ind) - kwds = self.kwds.copy() - kwds['label'] = label - self._maybe_add_color(colors, kwds, style, i) - if style is None: - args = (ax, ind, y) - else: - args = (ax, ind, y, style) - - newlines = plotf(*args, **kwds) - self._add_legend_handle(newlines[0], label) - - class ScatterPlot(MPLPlot): def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) @@ -1903,6 +1851,119 @@ def orientation(self): raise NotImplementedError(self.kind) +class HistPlot(LinePlot): + + def __init__(self, data, bins=10, bottom=0, **kwargs): + self.bins = bins # use mpl default + self.bottom = bottom + # Do not call LinePlot.__init__ which may fill nan + MPLPlot.__init__(self, data, **kwargs) + + def _args_adjust(self): + if com.is_integer(self.bins): + # create common bin edge + values = np.ravel(self.data.values) + values = values[~com.isnull(values)] + + hist, self.bins = np.histogram(values, bins=self.bins, + range=self.kwds.get('range', None), + weights=self.kwds.get('weights', None)) + + if com.is_list_like(self.bottom): + self.bottom = np.array(self.bottom) + + def _get_plot_function(self): + def plotf(ax, y, style=None, column_num=None, **kwds): + if column_num == 0: + self._initialize_prior(len(self.bins) - 1) + y = y[~com.isnull(y)] + bottom = self._pos_prior + self.bottom + # ignore style + n, bins, patches = self.plt.Axes.hist(ax, y, bins=self.bins, + 
bottom=bottom, **kwds) + self._update_prior(n) + return patches + return plotf + + def _make_plot(self): + plotf = self._get_plot_function() + colors = self._get_colors() + for i, (label, y) in enumerate(self._iter_data()): + ax = self._get_ax(i) + style = self._get_style(i, label) + label = com.pprint_thing(label) + + kwds = self.kwds.copy() + kwds['label'] = label + self._maybe_add_color(colors, kwds, style, i) + + if style is not None: + kwds['style'] = style + + artists = plotf(ax, y, column_num=i, **kwds) + self._add_legend_handle(artists[0], label) + + def _post_plot_logic(self): + if self.orientation == 'horizontal': + for ax in self.axes: + ax.set_xlabel('Degree') + else: + for ax in self.axes: + ax.set_ylabel('Degree') + + @property + def orientation(self): + if self.kwds.get('orientation', None) == 'horizontal': + return 'horizontal' + else: + return 'vertical' + + +class KdePlot(HistPlot): + orientation = 'vertical' + + def __init__(self, data, bw_method=None, ind=None, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + self.bw_method = bw_method + self.ind = ind + + def _args_adjust(self): + pass + + def _get_ind(self, y): + if self.ind is None: + sample_range = max(y) - min(y) + ind = np.linspace(min(y) - 0.5 * sample_range, + max(y) + 0.5 * sample_range, 1000) + else: + ind = self.ind + return ind + + def _get_plot_function(self): + from scipy.stats import gaussian_kde + from scipy import __version__ as spv + f = MPLPlot._get_plot_function(self) + def plotf(ax, y, style=None, column_num=None, **kwds): + if LooseVersion(spv) >= '0.11.0': + gkde = gaussian_kde(y, bw_method=self.bw_method) + else: + gkde = gaussian_kde(y) + if self.bw_method is not None: + msg = ('bw_method was added in Scipy 0.11.0.' + + ' Scipy version in use is %s.' 
% spv) + warnings.warn(msg) + + ind = self._get_ind(y) + y = gkde.evaluate(ind) + lines = f(ax, ind, y, style=style, **kwds) + return lines + return plotf + + def _post_plot_logic(self): + for ax in self.axes: + ax.set_ylabel('Density') + + class PiePlot(MPLPlot): def __init__(self, data, kind=None, **kwargs): @@ -1964,11 +2025,8 @@ class BoxPlot(MPLPlot): pass -class HistPlot(MPLPlot): - pass - # kinds supported by both dataframe and series -_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area'] +_common_kinds = ['line', 'bar', 'barh', 'kde', 'density', 'area', 'hist'] # kinds supported by dataframe _dataframe_kinds = ['scatter', 'hexbin'] # kinds supported only by series or dataframe single column @@ -1976,7 +2034,7 @@ class HistPlot(MPLPlot): _all_kinds = _common_kinds + _dataframe_kinds + _series_kinds _plot_klass = {'line': LinePlot, 'bar': BarPlot, 'barh': BarPlot, - 'kde': KdePlot, + 'kde': KdePlot, 'hist': HistPlot, 'scatter': ScatterPlot, 'hexbin': HexBinPlot, 'area': AreaPlot, 'pie': PiePlot} @@ -2023,10 +2081,11 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, ax : matplotlib axis object, default None style : list or dict matplotlib line style per column - kind : {'line', 'bar', 'barh', 'kde', 'density', 'area', scatter', 'hexbin'} + kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area', 'scatter', 'hexbin'} line : line plot bar : vertical bar plot barh : horizontal bar plot + hist : histogram kde/density : Kernel Density Estimation plot area : area plot scatter : scatter plot @@ -2170,10 +2229,11 @@ def plot_series(series, label=None, kind='line', use_index=True, rot=None, Parameters ---------- label : label argument to provide to plot - kind : {'line', 'bar', 'barh', 'kde', 'density', 'area'} + kind : {'line', 'bar', 'barh', 'hist', 'kde', 'density', 'area'} line : line plot bar : vertical bar plot barh : horizontal bar plot + hist : histogram kde/density : Kernel Density Estimation plot area : area plot 
use_index : boolean, default True