diff --git a/doc/source/release.rst b/doc/source/release.rst index be6d5d464cb36..35ce6c9359d56 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -53,6 +53,8 @@ pandas 0.14.0 New features ~~~~~~~~~~~~ +- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`) + API Changes ~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index d181df4c6b89b..ea9fbadeeaf4e 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -154,6 +154,7 @@ Enhancements - ``plot(legend='reverse')`` will now reverse the order of legend labels for most plot kinds. (:issue:`6014`) - improve performance of slice indexing on Series with string keys (:issue:`6341`) +- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`) Performance ~~~~~~~~~~~ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 3af83b4d80c8c..081dfd0292cdc 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -414,6 +414,59 @@ setting `kind='kde'`: @savefig kde_plot.png ser.plot(kind='kde') +.. _visualization.hexbin: + +Hexagonal Bin plot +~~~~~~~~~~~~~~~~~~ +*New in 0.14* You can create hexagonal bin plots with ``DataFrame.plot`` and +``kind='hexbin'``. +Hexbin plots can be a useful alternative to scatter plots if your data are +too dense to plot each point individually. + +.. ipython:: python + :suppress: + + plt.figure(); + +.. ipython:: python + + df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df['b'] = df['b'] + np.arange(1000) + + @savefig hexbin_plot.png + df.plot(kind='hexbin', x='a', y='b', gridsize=25) + + +A useful keyword argument is ``gridsize``; it controls the number of hexagons +in the x-direction, and defaults to 100. A larger ``gridsize`` means more, smaller +bins. + +By default, a histogram of the counts around each ``(x, y)`` point is computed. 
+You can specify alternative aggregations by passing values to the ``C`` and +``reduce_C_function`` arguments. ``C`` specifies the value at each ``(x, y)`` point +and ``reduce_C_function`` is a function of one argument that reduces all the +values in a bin to a single number (e.g. ``mean``, ``max``, ``sum``, ``std``). In this +example the positions are given by columns ``a`` and ``b``, while the value is +given by column ``z``. The bins are aggregated with numpy's ``max`` function. + +.. ipython:: python + :suppress: + + plt.figure(); + +.. ipython:: python + + df = DataFrame(randn(1000, 2), columns=['a', 'b']) + df['b'] = df['b'] + np.arange(1000) + df['z'] = np.random.uniform(0, 3, 1000) + + @savefig hexbin_plot_agg.png + df.plot(kind='hexbin', x='a', y='b', C='z', reduce_C_function=np.max, + gridsize=25) + + +See the `matplotlib hexbin documentation <http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hexbin>`__ for more. + .. _visualization.andrews_curves: Andrews Curves diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 2902621a1e944..041920e1de6ea 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -956,6 +956,65 @@ def test_invalid_kind(self): with tm.assertRaises(ValueError): df.plot(kind='aasdf') + @slow + def test_hexbin_basic(self): + df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20)}) + + ax = df.plot(kind='hexbin', x='A', y='B', gridsize=10) + # TODO: need better way to test. This just does existence. 
+ self.assert_(len(ax.collections) == 1) + + @slow + def test_hexbin_with_c(self): + df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20)}) + + ax = df.plot(kind='hexbin', x='A', y='B', C='C') + self.assert_(len(ax.collections) == 1) + + ax = df.plot(kind='hexbin', x='A', y='B', C='C', + reduce_C_function=np.std) + self.assert_(len(ax.collections) == 1) + + @slow + def test_hexbin_cmap(self): + df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20)}) + + # Default to BuGn + ax = df.plot(kind='hexbin', x='A', y='B') + self.assertEquals(ax.collections[0].cmap.name, 'BuGn') + + cm = 'cubehelix' + ax = df.plot(kind='hexbin', x='A', y='B', colormap=cm) + self.assertEquals(ax.collections[0].cmap.name, cm) + + @slow + def test_no_color_bar(self): + df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20)}) + + ax = df.plot(kind='hexbin', x='A', y='B', colorbar=None) + self.assertIs(ax.collections[0].colorbar, None) + + @slow + def test_allow_cmap(self): + df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20)}) + + ax = df.plot(kind='hexbin', x='A', y='B', cmap='YlGn') + self.assertEquals(ax.collections[0].cmap.name, 'YlGn') + + with tm.assertRaises(TypeError): + df.plot(kind='hexbin', x='A', y='B', cmap='YlGn', + colormap='BuGn') + @tm.mplskip class TestDataFrameGroupByPlots(tm.TestCase): diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index c11e2a891ec7a..7038284b6c2a0 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -835,7 +835,14 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, secondary_y = [secondary_y] self.secondary_y = secondary_y - self.colormap = colormap + # ugly TypeError if user 
passes matplotlib's `cmap` name. + # Probably better to accept either. + if 'cmap' in kwds and colormap: + raise TypeError("Only specify one of `cmap` and `colormap`.") + elif 'cmap' in kwds: + self.colormap = kwds.pop('cmap') + else: + self.colormap = colormap self.kwds = kwds @@ -1263,6 +1270,52 @@ def _post_plot_logic(self): ax.set_xlabel(com.pprint_thing(x)) +class HexBinPlot(MPLPlot): + def __init__(self, data, x, y, C=None, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + + if x is None or y is None: + raise ValueError('hexbin requires and x and y column') + if com.is_integer(x) and not self.data.columns.holds_integer(): + x = self.data.columns[x] + if com.is_integer(y) and not self.data.columns.holds_integer(): + y = self.data.columns[y] + + if com.is_integer(C) and not self.data.columns.holds_integer(): + C = self.data.columns[C] + + self.x = x + self.y = y + self.C = C + + def _make_plot(self): + import matplotlib.pyplot as plt + + x, y, data, C = self.x, self.y, self.data, self.C + ax = self.axes[0] + # pandas uses colormap, matplotlib uses cmap. 
+ cmap = self.colormap or 'BuGn' + cmap = plt.cm.get_cmap(cmap) + cb = self.kwds.pop('colorbar', True) + + if C is None: + c_values = None + else: + c_values = data[C].values + + ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, + **self.kwds) + if cb: + img = ax.collections[0] + self.fig.colorbar(img, ax=ax) + + def _post_plot_logic(self): + ax = self.axes[0] + x, y = self.x, self.y + ax.set_ylabel(com.pprint_thing(y)) + ax.set_xlabel(com.pprint_thing(x)) + + class LinePlot(MPLPlot): def __init__(self, data, **kwargs): @@ -1663,11 +1716,12 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, ax : matplotlib axis object, default None style : list or dict matplotlib line style per column - kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter'} + kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter', 'hexbin'} bar : vertical bar plot barh : horizontal bar plot kde/density : Kernel Density Estimation plot scatter: scatter plot + hexbin: hexbin plot logx : boolean, default False For line plots, use log scaling on x axis logy : boolean, default False @@ -1695,6 +1749,17 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, Returns ------- ax_or_axes : matplotlib.AxesSubplot or list of them + + Notes + ----- + + If `kind`='hexbin', you can control the size of the bins with the + `gridsize` argument. By default, a histogram of the counts around each + `(x, y)` point is computed. You can specify alternative aggregations + by passing values to the `C` and `reduce_C_function` arguments. + `C` specifies the value at each `(x, y)` point and `reduce_C_function` + is a function of one argument that reduces all the values in a bin to + a single number (e.g. `mean`, `max`, `sum`, `std`). 
""" kind = _get_standard_kind(kind.lower().strip()) if kind == 'line': @@ -1705,6 +1770,8 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, klass = KdePlot elif kind == 'scatter': klass = ScatterPlot + elif kind == 'hexbin': + klass = HexBinPlot else: raise ValueError('Invalid chart type given %s' % kind) @@ -1717,6 +1784,16 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True, figsize=figsize, logx=logx, logy=logy, sort_columns=sort_columns, secondary_y=secondary_y, **kwds) + elif kind == 'hexbin': + C = kwds.pop('C', None) # remove from kwargs so we can set default + plot_obj = klass(frame, x=x, y=y, kind=kind, subplots=subplots, + rot=rot,legend=legend, ax=ax, style=style, + fontsize=fontsize, use_index=use_index, sharex=sharex, + sharey=sharey, xticks=xticks, yticks=yticks, + xlim=xlim, ylim=ylim, title=title, grid=grid, + figsize=figsize, logx=logx, logy=logy, + sort_columns=sort_columns, secondary_y=secondary_y, + C=C, **kwds) else: if x is not None: if com.is_integer(x) and not frame.columns.holds_integer():