Skip to content

VIS/ENH Hexbin plot #5478

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 14, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ pandas 0.14.0
New features
~~~~~~~~~~~~

- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)

API Changes
~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ Enhancements
- ``plot(legend='reverse')`` will now reverse the order of legend labels for
most plot kinds. (:issue:`6014`)
- improve performance of slice indexing on Series with string keys (:issue:`6341`)
- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)

Performance
~~~~~~~~~~~
Expand Down
53 changes: 53 additions & 0 deletions doc/source/visualization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,59 @@ setting `kind='kde'`:
@savefig kde_plot.png
ser.plot(kind='kde')

.. _visualization.hexbin:

Hexagonal Bin plot
~~~~~~~~~~~~~~~~~~
*New in 0.14* You can create hexagonal bin plots with ``DataFrame.plot`` and
``kind='hexbin'``.
Hexbin plots can be a useful alternative to scatter plots if your data are
too dense to plot each point individually.

.. ipython:: python
:suppress:

plt.figure();

.. ipython:: python

df = DataFrame(randn(1000, 2), columns=['a', 'b'])
df['b'] = df['b'] + np.arange(1000)

@savefig hexbin_plot.png
df.plot(kind='hexbin', x='a', y='b', gridsize=25)


A useful keyword argument is ``gridsize``; it controls the number of hexagons
in the x-direction, and defaults to 100. A larger ``gridsize`` means more, smaller
bins.

By default, a histogram of the counts around each ``(x, y)`` point is computed.
You can specify alternative aggregations by passing values to the ``C`` and
``reduce_C_function`` arguments. ``C`` specifies the value at each ``(x, y)`` point
and ``reduce_C_function`` is a function of one argument that reduces all the
values in a bin to a single number (e.g. ``mean``, ``max``, ``sum``, ``std``). In this
example the positions are given by columns ``a`` and ``b``, while the value is
given by column ``z``. The bins are aggregated with numpy's ``max`` function.

.. ipython:: python
:suppress:

plt.figure();

.. ipython:: python

df = DataFrame(randn(1000, 2), columns=['a', 'b'])
df['b'] = df['b'] + np.arange(1000)
df['z'] = np.random.uniform(0, 3, 1000)

@savefig hexbin_plot_agg.png
df.plot(kind='hexbin', x='a', y='b', C='z', reduce_C_function=np.max,
gridsize=25)


See the `matplotlib hexbin documentation <http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hexbin>`__ for more.

.. _visualization.andrews_curves:

Andrews Curves
Expand Down
59 changes: 59 additions & 0 deletions pandas/tests/test_graphics.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,65 @@ def test_invalid_kind(self):
with tm.assertRaises(ValueError):
df.plot(kind='aasdf')

@slow
def test_hexbin_basic(self):
    # Smoke test: a hexbin plot of two columns should add exactly one
    # collection (the hexagon PolyCollection) to the returned axes.
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    ax = df.plot(kind='hexbin', x='A', y='B', gridsize=10)
    # TODO: need better way to test. This just does existence.
    # assertEqual (rather than the deprecated ``assert_`` alias) reports
    # the actual number of collections when the check fails.
    self.assertEqual(len(ax.collections), 1)

@slow
def test_hexbin_with_c(self):
    # A hexbin plot aggregated over a value column ``C`` should still
    # produce exactly one collection, both with matplotlib's default
    # reducer and with an explicit ``reduce_C_function``.
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    ax = df.plot(kind='hexbin', x='A', y='B', C='C')
    # assertEqual replaces the deprecated ``assert_`` alias and shows the
    # actual count on failure.
    self.assertEqual(len(ax.collections), 1)

    ax = df.plot(kind='hexbin', x='A', y='B', C='C',
                 reduce_C_function=np.std)
    self.assertEqual(len(ax.collections), 1)

@slow
def test_hexbin_cmap(self):
    # The hexbin collection should use the 'BuGn' colormap when none is
    # given, and honour an explicit ``colormap`` argument otherwise.
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    # Default to BuGn
    ax = df.plot(kind='hexbin', x='A', y='B')
    # assertEqual replaces the deprecated ``assertEquals`` alias.
    self.assertEqual(ax.collections[0].cmap.name, 'BuGn')

    cm = 'cubehelix'
    ax = df.plot(kind='hexbin', x='A', y='B', colormap=cm)
    self.assertEqual(ax.collections[0].cmap.name, cm)

@slow
def test_no_color_bar(self):
    # With ``colorbar=None`` the hexbin collection is expected to have no
    # colorbar attached to it.
    columns = {"A": np.random.uniform(size=20),
               "B": np.random.uniform(size=20),
               "C": np.arange(20) + np.random.uniform(size=20)}
    frame = DataFrame(columns)

    axes = frame.plot(kind='hexbin', x='A', y='B', colorbar=None)
    hexbin_collection = axes.collections[0]
    self.assertIs(hexbin_collection.colorbar, None)

@slow
def test_allow_cmap(self):
    # matplotlib's ``cmap`` keyword is accepted as an alias for pandas'
    # ``colormap``; supplying both at once must raise a TypeError.
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    ax = df.plot(kind='hexbin', x='A', y='B', cmap='YlGn')
    # assertEqual replaces the deprecated ``assertEquals`` alias.
    self.assertEqual(ax.collections[0].cmap.name, 'YlGn')

    with tm.assertRaises(TypeError):
        df.plot(kind='hexbin', x='A', y='B', cmap='YlGn',
                colormap='BuGn')


@tm.mplskip
class TestDataFrameGroupByPlots(tm.TestCase):
Expand Down
81 changes: 79 additions & 2 deletions pandas/tools/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,7 +835,14 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True,
secondary_y = [secondary_y]
self.secondary_y = secondary_y

self.colormap = colormap
# ugly TypeError if user passes matplotlib's `cmap` name.
# Probably better to accept either.
if 'cmap' in kwds and colormap:
raise TypeError("Only specify one of `cmap` and `colormap`.")
elif 'cmap' in kwds:
self.colormap = kwds.pop('cmap')
else:
self.colormap = colormap

self.kwds = kwds

Expand Down Expand Up @@ -1263,6 +1270,52 @@ def _post_plot_logic(self):
ax.set_xlabel(com.pprint_thing(x))


class HexBinPlot(MPLPlot):
    """Hexagonal bin plot of two columns of a DataFrame.

    Draws the plot with matplotlib's ``Axes.hexbin``. Any keyword arguments
    left in ``self.kwds`` (for example ``gridsize`` or
    ``reduce_C_function``) are forwarded to ``hexbin`` unchanged.

    Parameters
    ----------
    data : DataFrame
        Passed through to ``MPLPlot.__init__``.
    x, y : column label or integer position
        Columns giving the point coordinates. An integer is treated as a
        position into ``data.columns`` unless the columns themselves hold
        integers, in which case it is used as a label.
    C : column label or integer position, optional
        Column holding the value at each ``(x, y)`` point. Resolved the
        same way as ``x`` and ``y``. When None, no ``C`` values are passed
        to ``hexbin``.
    **kwargs
        Forwarded to ``MPLPlot.__init__``. A ``colorbar`` entry (default
        True) is consumed by ``_make_plot`` and controls whether a
        colorbar is added to the figure.

    Raises
    ------
    ValueError
        If ``x`` or ``y`` is None.
    """

    def __init__(self, data, x, y, C=None, **kwargs):
        MPLPlot.__init__(self, data, **kwargs)

        if x is None or y is None:
            raise ValueError('hexbin requires an x and y column')
        # Integer selectors are positional unless the column index itself
        # holds integers, in which case they are already valid labels.
        if com.is_integer(x) and not self.data.columns.holds_integer():
            x = self.data.columns[x]
        if com.is_integer(y) and not self.data.columns.holds_integer():
            y = self.data.columns[y]

        if com.is_integer(C) and not self.data.columns.holds_integer():
            C = self.data.columns[C]

        self.x = x
        self.y = y
        self.C = C

    def _make_plot(self):
        """Draw the hexbin on the first axes and optionally add a colorbar."""
        import matplotlib.pyplot as plt

        x, y, data, C = self.x, self.y, self.data, self.C
        ax = self.axes[0]
        # pandas uses colormap, matplotlib uses cmap.
        cmap = plt.cm.get_cmap(self.colormap or 'BuGn')
        # ``colorbar`` is a pandas-level option; remove it so it is not
        # forwarded to matplotlib's hexbin.
        cb = self.kwds.pop('colorbar', True)

        c_values = None if C is None else data[C].values

        # Keep the collection returned by hexbin instead of reading
        # ax.collections[0]: a user-supplied axes may already contain other
        # collections, and the colorbar must describe the hexbin itself.
        hexbin_collection = ax.hexbin(data[x].values, data[y].values,
                                      C=c_values, cmap=cmap, **self.kwds)
        if cb:
            self.fig.colorbar(hexbin_collection, ax=ax)

    def _post_plot_logic(self):
        """Label the axes with the (pretty-printed) x and y column names."""
        ax = self.axes[0]
        x, y = self.x, self.y
        ax.set_ylabel(com.pprint_thing(y))
        ax.set_xlabel(com.pprint_thing(x))


class LinePlot(MPLPlot):

def __init__(self, data, **kwargs):
Expand Down Expand Up @@ -1663,11 +1716,12 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
ax : matplotlib axis object, default None
style : list or dict
matplotlib line style per column
kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter'}
kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter', 'hexbin'}
bar : vertical bar plot
barh : horizontal bar plot
kde/density : Kernel Density Estimation plot
scatter: scatter plot
hexbin: hexbin plot
logx : boolean, default False
For line plots, use log scaling on x axis
logy : boolean, default False
Expand Down Expand Up @@ -1695,6 +1749,17 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
Returns
-------
ax_or_axes : matplotlib.AxesSubplot or list of them

Notes
-----

If `kind`='hexbin', you can control the size of the bins with the
`gridsize` argument. By default, a histogram of the counts around each
`(x, y)` point is computed. You can specify alternative aggregations
by passing values to the `C` and `reduce_C_function` arguments.
`C` specifies the value at each `(x, y)` point and `reduce_C_function`
is a function of one argument that reduces all the values in a bin to
a single number (e.g. `mean`, `max`, `sum`, `std`).
"""
kind = _get_standard_kind(kind.lower().strip())
if kind == 'line':
Expand All @@ -1705,6 +1770,8 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
klass = KdePlot
elif kind == 'scatter':
klass = ScatterPlot
elif kind == 'hexbin':
klass = HexBinPlot
else:
raise ValueError('Invalid chart type given %s' % kind)

Expand All @@ -1717,6 +1784,16 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
figsize=figsize, logx=logx, logy=logy,
sort_columns=sort_columns, secondary_y=secondary_y,
**kwds)
elif kind == 'hexbin':
C = kwds.pop('C', None) # remove from kwargs so we can set default
plot_obj = klass(frame, x=x, y=y, kind=kind, subplots=subplots,
rot=rot,legend=legend, ax=ax, style=style,
fontsize=fontsize, use_index=use_index, sharex=sharex,
sharey=sharey, xticks=xticks, yticks=yticks,
xlim=xlim, ylim=ylim, title=title, grid=grid,
figsize=figsize, logx=logx, logy=logy,
sort_columns=sort_columns, secondary_y=secondary_y,
C=C, **kwds)
else:
if x is not None:
if com.is_integer(x) and not frame.columns.holds_integer():
Expand Down