Added documentation

orbitfold · wesm · commit 51568f73ea18 · 2012-06-02T14:57:30.000-04:00
diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
@@ -245,4 +245,27 @@ Scatter plot matrix
    scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='kde')
 
    @savefig scatter_matrix_hist.png width=6in
-   scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='hist')
+   scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='hist')
+
+.. _visualization.andrews_curves:
+
+Andrews Curves
+~~~~~~~~~~~~~~
+
+Andrews curves allow one to plot multivariate data as a large number
+of curves that are created using the attributes of samples as coefficients
+of a Fourier series. By coloring these curves differently for each class,
+it is possible to visualize data clustering. Curves belonging to samples
+of the same class will usually be closer together and form larger structures.
+
+.. ipython:: python
+
+   from pandas import read_csv
+   from pandas.tools.plotting import andrews_curves
+
+   data = read_csv('data/iris.data')
+
+   plt.figure()
+
+   @savefig andrews_curves.png width=6in
+   andrews_curves(data, 'Name')
diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py
@@ -240,6 +240,13 @@ def scat2(x, y, by=None, ax=None, figsize=None):
         grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index)
         _check_plot_works(scat2, 0, 1, by=grouper)
 
+    @slow 
+    def test_andrews_curves(self):
+        from pandas import read_csv
+        from pandas.tools.plotting import andrews_curves
+        df = read_csv('data/iris.data')
+        _check_plot_works(andrews_curves, df, 'Name')
+
     @slow
     def test_plot_int_columns(self):
         df = DataFrame(np.random.randn(100, 4)).cumsum()
diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
@@ -129,7 +129,7 @@ def _gcf():
     import matplotlib.pyplot as plt
     return plt.gcf()
 
-def andrews_curves(data, class_column, samples=200):
+def andrews_curves(data, class_column, ax=None, samples=200):
     """
     Parameters:
     data: A DataFrame containing data to be plotted, preferably
@@ -160,6 +160,8 @@ def random_color(column):
     columns = [data[col] for col in data.columns if (col != class_column)]
     x = [-pi + 2.0 * pi * (t / float(samples)) for t in range(samples)]
     used_legends = set([])
+    if ax == None:
+        ax = plt.gca(xlim=(-pi, pi))
     for i in range(n):
         row = [columns[c][i] for c in range(len(columns))]
         f = function(row)
@@ -168,10 +170,10 @@ def random_color(column):
         if class_col[i] not in used_legends:
             label = class_col[i]
             used_legends.add(class_col[i])
-        plt.plot(x, y, color=random_color(class_col[i]), label=label)
-    plt.xlim(xmin=-pi, xmax=pi)
-    plt.legend(loc='upper right')
-    plt.grid()
+        ax.plot(x, y, color=random_color(class_col[i]), label=label)
+    ax.legend(loc='upper right')
+    ax.grid()
+    return ax
 
 def grouped_hist(data, column=None, by=None, ax=None, bins=50, log=False,
                  figsize=None, layout=None, sharex=False, sharey=False,