Skip to content

Commit 5ca7425

Browse files
committed
Merge PR #1566
2 parents 6195a32 + b9cf9dd commit 5ca7425

File tree

4 files changed

+96
-2
lines changed

4 files changed

+96
-2
lines changed

RELEASE.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ pandas 0.8.1
3030
**New features**
3131

3232
- Can pass dict of per-column line styles to DataFrame.plot (#1559)
33-
- Add new ``bootstrap_plot`` function
33+
- Add new ``bootstrap_plot`` plot function
34+
- Add new ``parallel_coordinates`` plot function (#1488)
3435

3536
**Improvements to existing features**
3637

doc/source/visualization.rst

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ Bootstrap Plot
368368

369369
Bootstrap plots are used to visually assess the uncertainty of a statistic, such
370370
as mean, median, midrange, etc. A random subset of a specified size is selected
371-
from a data set, the statistic in question is computed for this subset and the
371+
from a data set, the statistic in question is computed for this subset and the
372372
process is repeated a specified number of times. Resulting plots and histograms
373373
are what constitutes the bootstrap plot.
374374

@@ -380,3 +380,29 @@ are what constitutes the bootstrap plot.
380380
381381
@savefig bootstrap_plot.png width=8in
382382
bootstrap_plot(data, size=50, samples=500, color='grey')
383+
384+
RadViz
385+
~~~~~~
386+
387+
RadViz is a way of visualizing multi-variate data. It is based on a simple
388+
spring tension minimization algorithm. Basically you set up a bunch of points in
389+
a plane. In our case they are equally spaced on a unit circle. Each point
390+
represents a single attribute. You then pretend that each sample in the data set
391+
is attached to each of these points by a spring, the stiffness of which is
392+
proportional to the numerical value of that attribute (they are normalized to
393+
unit interval). The point in the plane where our sample settles (where the
394+
forces acting on our sample are at an equilibrium) is where a dot representing
395+
our sample will be drawn. Depending on which class that sample belongs to, it will
396+
be colored differently.
397+
398+
.. ipython:: python
399+
400+
from pandas import read_csv
401+
from pandas.tools.plotting import radviz
402+
403+
data = read_csv('data/iris.data')
404+
405+
plt.figure()
406+
407+
@savefig radviz.png width=6in
408+
radviz(data, 'Name')

pandas/tests/test_graphics.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,14 @@ def test_parallel_coordinates(self):
327327
df = read_csv(path)
328328
_check_plot_works(parallel_coordinates, df, 'Name')
329329

330+
@slow
def test_radviz(self):
    # Smoke test: load data/iris.csv (resolved relative to curpath()) and
    # hand radviz plus its arguments to _check_plot_works, using the
    # 'Name' column as the class column.
    from pandas.tools.plotting import radviz
    from pandas import read_csv
    iris_path = os.path.join(curpath(), 'data/iris.csv')
    iris = read_csv(iris_path)
    _check_plot_works(radviz, iris, 'Name')
337+
330338
@slow
331339
def test_plot_int_columns(self):
332340
df = DataFrame(np.random.randn(100, 4)).cumsum()

pandas/tools/plotting.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,65 @@ def _get_marker_compat(marker):
147147
return 'o'
148148
return marker
149149

150+
def radviz(frame, class_column, ax=None, **kwds):
    """RadViz - a multivariate data visualization algorithm

    Every column other than ``class_column`` is treated as a numeric
    attribute and is given an anchor on the unit circle (anchors are equally
    spaced). Each attribute is rescaled to the unit interval, and every row
    is drawn at the average of the anchors weighted by its rescaled
    attribute values. Rows are coloured according to ``class_column``.

    Parameters:
    -----------
    frame: DataFrame object
    class_column: Column name that contains information about class membership
    ax: Matplotlib axis object, optional
        When omitted, the current axes are used and their x/y limits are set
        to [-1, 1].
    kwds: Matplotlib scatter method keyword arguments, optional

    Returns:
    --------
    ax: Matplotlib axis object

    Raises:
    -------
    ValueError: if ``frame`` has no column besides ``class_column``
    """
    import matplotlib.patches as patches
    import random

    def random_color(column):
        # A private generator yields the same colour per class label as
        # seeding the module-level generator would, without disturbing the
        # caller's global random state.
        rng = random.Random(column)
        return [rng.random() for _ in range(3)]

    def normalize(series):
        # Rescale to [0, 1]; missing values are ignored when finding the
        # range and stay missing in the result.
        values = np.asarray(series, dtype=float)
        low = np.nanmin(values)
        span = np.nanmax(values) - low
        if span == 0:
            # A constant attribute carries no information; give it zero
            # weight instead of dividing by zero.
            return np.zeros(len(values))
        return (values - low) / span

    column_names = [column_name for column_name in frame.columns
                    if column_name != class_column]
    if not column_names:
        raise ValueError('radviz needs at least one attribute column '
                         'besides the class column')

    if ax is None:
        import matplotlib.pyplot as plt
        # Limits are set explicitly: passing them as keyword arguments to
        # gca() is not supported by newer matplotlib releases.
        ax = plt.gca()
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)

    # One anchor per attribute, equally spaced on the unit circle.
    n = len(column_names)
    angles = [2.0 * np.pi * (i / float(n)) for i in range(n)]
    s = np.array([(np.cos(t), np.sin(t)) for t in angles])

    # weights has one row per sample and one column per attribute.
    weights = np.column_stack([normalize(frame[column_name])
                               for column_name in column_names])
    totals = weights.sum(axis=1)
    # A sample whose attributes are all at their minimum has zero total
    # weight; dividing by 1 instead leaves it at the origin (numerator is 0).
    safe_totals = np.where(totals == 0, 1.0, totals)
    points = weights.dot(s) / safe_totals[:, np.newaxis]

    # Group row positions by class, remembering first-appearance order so the
    # legend is deterministic. Positions (not index labels) are used so a
    # frame with a non-default index works.
    classes = []
    members = {}
    for position, label in enumerate(frame[class_column]):
        if label not in members:
            members[label] = []
            classes.append(label)
        members[label].append(position)

    for class_ in classes:
        rows = members[class_]
        ax.scatter(points[rows, 0], points[rows, 1],
                   color=random_color(class_), label=str(class_), **kwds)

    ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none'))

    for xy, name in zip(s, column_names):
        ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray'))
        # Push each label away from the circle, based on the anchor's quadrant.
        if xy[0] < 0.0:
            x_offset, ha = -0.025, 'right'
        else:
            x_offset, ha = 0.025, 'left'
        if xy[1] < 0.0:
            y_offset, va = -0.025, 'top'
        else:
            y_offset, va = 0.025, 'bottom'
        ax.text(xy[0] + x_offset, xy[1] + y_offset, name,
                ha=ha, va=va, size='small')

    ax.legend(loc='upper right')
    ax.axis('equal')
    return ax
208+
150209
def andrews_curves(data, class_column, ax=None, samples=200):
151210
"""
152211
Parameters:

0 commit comments

Comments
 (0)