pandas-dev · jreback · Feb 14, 2014 · Nov 9, 2013
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -53,6 +53,8 @@ pandas 0.14.0
 New features
 ~~~~~~~~~~~~
 
+- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
+
 API Changes
 ~~~~~~~~~~~
 

diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -154,6 +154,7 @@ Enhancements
 - ``plot(legend='reverse')`` will now reverse the order of legend labels for
   most plot kinds. (:issue:`6014`)
 - improve performance of slice indexing on Series with string keys (:issue:`6341`)
+- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
 
 Performance
 ~~~~~~~~~~~

diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
@@ -414,6 +414,59 @@ setting `kind='kde'`:
    @savefig kde_plot.png
    ser.plot(kind='kde')
 
+.. _visualization.hexbin:
+
+Hexagonal Bin plot
+~~~~~~~~~~~~~~~~~~
+*New in 0.14* You can create hexagonal bin plots with ``DataFrame.plot`` and
+``kind='hexbin'``.
+Hexbin plots can be a useful alternative to scatter plots if your data are
+too dense to plot each point individually.
+
+.. ipython:: python
+   :suppress:
+
+   plt.figure();
+
+.. ipython:: python
+
+   df = DataFrame(randn(1000, 2), columns=['a', 'b'])
+   df['b'] = df['b'] + np.arange(1000)
+
+   @savefig hexbin_plot.png
+   df.plot(kind='hexbin', x='a', y='b', gridsize=25)
+
+
+A useful keyword argument is ``gridsize``; it controls the number of hexagons
+in the x-direction, and defaults to 100. A larger ``gridsize`` means more, smaller
+bins.
+
+By default, a histogram of the counts around each ``(x, y)`` point is computed.
+You can specify alternative aggregations by passing values to the ``C`` and
+``reduce_C_function`` arguments. ``C`` specifies the value at each ``(x, y)`` point
+and ``reduce_C_function`` is a function of one argument that reduces all the
+values in a bin to a single number (e.g. ``mean``, ``max``, ``sum``, ``std``).  In this
+example the positions are given by columns ``a`` and ``b``, while the value is
+given by column ``z``. The bins are aggregated with numpy's ``max`` function.
+
+.. ipython:: python
+   :suppress:
+
+   plt.figure();
+
+.. ipython:: python
+
+   df = DataFrame(randn(1000, 2), columns=['a', 'b'])
+   df['b'] = df['b'] + np.arange(1000)
+   df['z'] = np.random.uniform(0, 3, 1000)
+
+   @savefig hexbin_plot_agg.png
+   df.plot(kind='hexbin', x='a', y='b', C='z', reduce_C_function=np.max,
+           gridsize=25)
+
+
+See the `matplotlib hexbin documentation <http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hexbin>`__ for more.
+
 .. _visualization.andrews_curves:
 
 Andrews Curves

diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py
@@ -956,6 +956,65 @@ def test_invalid_kind(self):
         with tm.assertRaises(ValueError):
             df.plot(kind='aasdf')
 
+    @slow
+    def test_hexbin_basic(self):
+        # Smoke test: a hexbin plot adds exactly one collection to the axes.
+        df = DataFrame({"A": np.random.uniform(size=20),
+                        "B": np.random.uniform(size=20),
+                        "C": np.arange(20) + np.random.uniform(size=20)})
+        ax = df.plot(kind='hexbin', x='A', y='B', gridsize=10)
+        # TODO: need better way to test. This just does existence.
+        self.assertEqual(len(ax.collections), 1)
+
+    @slow
+    def test_hexbin_with_c(self):
+        # ``C``, with or without a custom reducer, draws one collection.
+        df = DataFrame({"A": np.random.uniform(size=20),
+                        "B": np.random.uniform(size=20),
+                        "C": np.arange(20) + np.random.uniform(size=20)})
+        ax = df.plot(kind='hexbin', x='A', y='B', C='C')
+        self.assertEqual(len(ax.collections), 1)
+
+        ax = df.plot(kind='hexbin', x='A', y='B', C='C',
+                     reduce_C_function=np.std)
+        self.assertEqual(len(ax.collections), 1)
+
+    @slow
+    def test_hexbin_cmap(self):
+        # The colormap defaults to BuGn and honours the ``colormap`` keyword.
+        df = DataFrame({"A": np.random.uniform(size=20),
+                        "B": np.random.uniform(size=20),
+                        "C": np.arange(20) + np.random.uniform(size=20)})
+        # Default to BuGn
+        ax = df.plot(kind='hexbin', x='A', y='B')
+        self.assertEqual(ax.collections[0].cmap.name, 'BuGn')
+
+        cm = 'cubehelix'
+        ax = df.plot(kind='hexbin', x='A', y='B', colormap=cm)
+        self.assertEqual(ax.collections[0].cmap.name, cm)
+
+    @slow
+    def test_no_color_bar(self):
+        # Passing colorbar=None must leave the collection without one.
+        frame = DataFrame({"A": np.random.uniform(size=20),
+                           "B": np.random.uniform(size=20),
+                           "C": np.arange(20) + np.random.uniform(size=20)})
+        axes = frame.plot(kind='hexbin', x='A', y='B', colorbar=None)
+        self.assertIs(axes.collections[0].colorbar, None)
+
+    @slow
+    def test_allow_cmap(self):
+        # ``cmap`` is accepted, but not together with ``colormap``.
+        df = DataFrame({"A": np.random.uniform(size=20),
+                        "B": np.random.uniform(size=20),
+                        "C": np.arange(20) + np.random.uniform(size=20)})
+        ax = df.plot(kind='hexbin', x='A', y='B', cmap='YlGn')
+        self.assertEqual(ax.collections[0].cmap.name, 'YlGn')
+
+        with tm.assertRaises(TypeError):
+            df.plot(kind='hexbin', x='A', y='B', cmap='YlGn',
+                    colormap='BuGn')
+
 
 @tm.mplskip
 class TestDataFrameGroupByPlots(tm.TestCase):

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
@@ -835,7 +835,14 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True,
             secondary_y = [secondary_y]
         self.secondary_y = secondary_y
 
-        self.colormap = colormap
+        # ugly TypeError if user passes matplotlib's `cmap` name.
+        # Probably better to accept either.
+        if 'cmap' in kwds and colormap:
+            raise TypeError("Only specify one of `cmap` and `colormap`.")
+        elif 'cmap' in kwds:
+            self.colormap = kwds.pop('cmap')
+        else:
+            self.colormap = colormap
 
         self.kwds = kwds
 
@@ -1263,6 +1270,52 @@ def _post_plot_logic(self):
         ax.set_xlabel(com.pprint_thing(x))
 
 
+class HexBinPlot(MPLPlot):
+    """Hexagonal-bin plot of two columns, drawn with ``Axes.hexbin``."""
+
+    def __init__(self, data, x, y, C=None, **kwargs):
+        """Store the ``x``/``y`` (and optional ``C``) column selectors.
+
+        Integer selectors are treated as positions unless the column index
+        itself holds integers. Raises ValueError if ``x`` or ``y`` is None.
+        """
+        MPLPlot.__init__(self, data, **kwargs)
+
+        if x is None or y is None:
+            raise ValueError('hexbin requires an x and y column')
+
+        def _resolve(col):
+            # Positional lookup only when labels cannot themselves be ints.
+            if com.is_integer(col) and not self.data.columns.holds_integer():
+                return self.data.columns[col]
+            return col
+
+        self.x, self.y, self.C = _resolve(x), _resolve(y), _resolve(C)
+
+    def _make_plot(self):
+        """Draw the hexbin on the first axes, with an optional colorbar."""
+        import matplotlib.pyplot as plt
+
+        ax = self.axes[0]
+        # pandas uses colormap, matplotlib uses cmap.
+        cmap = plt.cm.get_cmap(self.colormap or 'BuGn')
+        cb = self.kwds.pop('colorbar', True)
+        c_values = None if self.C is None else self.data[self.C].values
+
+        # Keep the returned collection: ``ax.collections[0]`` would be the
+        # wrong artist when a pre-populated ``ax`` is passed in.
+        img = ax.hexbin(self.data[self.x].values, self.data[self.y].values,
+                        C=c_values, cmap=cmap, **self.kwds)
+        if cb:
+            self.fig.colorbar(img, ax=ax)
+
+    def _post_plot_logic(self):
+        """Label the axes with the x and y column names."""
+        ax = self.axes[0]
+        ax.set_ylabel(com.pprint_thing(self.y))
+        ax.set_xlabel(com.pprint_thing(self.x))
+
+
 class LinePlot(MPLPlot):
 
     def __init__(self, data, **kwargs):
@@ -1663,11 +1716,12 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
     ax : matplotlib axis object, default None
     style : list or dict
         matplotlib line style per column
-    kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter'}
+    kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter', 'hexbin'}
         bar : vertical bar plot
         barh : horizontal bar plot
         kde/density : Kernel Density Estimation plot
         scatter: scatter plot
+        hexbin: hexbin plot
     logx : boolean, default False
         For line plots, use log scaling on x axis
     logy : boolean, default False
@@ -1695,6 +1749,17 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
     Returns
     -------
     ax_or_axes : matplotlib.AxesSubplot or list of them
+
+    Notes
+    -----
+
+    If `kind`='hexbin', you can control the size of the bins with the
+    `gridsize` argument. By default, a histogram of the counts around each
+    `(x, y)` point is computed. You can specify alternative aggregations
+    by passing values to the `C` and `reduce_C_function` arguments.
+    `C` specifies the value at each `(x, y)` point and `reduce_C_function`
+    is a function of one argument that reduces all the values in a bin to
+    a single number (e.g. `mean`, `max`, `sum`, `std`).
     """
     kind = _get_standard_kind(kind.lower().strip())
     if kind == 'line':
@@ -1705,6 +1770,8 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
         klass = KdePlot
     elif kind == 'scatter':
         klass = ScatterPlot
+    elif kind == 'hexbin':
+        klass = HexBinPlot
     else:
         raise ValueError('Invalid chart type given %s' % kind)
 
@@ -1717,6 +1784,16 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
                          figsize=figsize, logx=logx, logy=logy,
                          sort_columns=sort_columns, secondary_y=secondary_y,
                          **kwds)
+    elif kind == 'hexbin':
+        C = kwds.pop('C', None)  # remove from kwargs so we can set default
+        plot_obj = klass(frame,  x=x, y=y, kind=kind, subplots=subplots,
+                         rot=rot,legend=legend, ax=ax, style=style,
+                         fontsize=fontsize, use_index=use_index, sharex=sharex,
+                         sharey=sharey, xticks=xticks, yticks=yticks,
+                         xlim=xlim, ylim=ylim, title=title, grid=grid,
+                         figsize=figsize, logx=logx, logy=logy,
+                         sort_columns=sort_columns, secondary_y=secondary_y,
+                         C=C, **kwds)
     else:
         if x is not None:
             if com.is_integer(x) and not frame.columns.holds_integer():