PERF: added vb_suite test for groupby_transform

jreback · jreback · commit 2d81b64bdad4 · 2013-03-25T16:14:42.000-04:00
added to RELEASE.rst, issue GH2121
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -100,9 +100,6 @@ pandas 0.11.0
     the collections.Mapping ABC.
   - Allow selection semantics via a string with a datelike index to work in both
     Series and DataFrames (GH3070_)
-  - Improved performance across several core functions by taking memory
-    ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
-
 
     .. ipython:: python
 
@@ -116,6 +113,10 @@ pandas 0.11.0
     for plots. Based on https://gist.github.com/huyng/816622 (GH3075_).
 
 
+  - Improved performance across several core functions by taking memory
+    ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
+  - Improved performance of groupby transform method (GH2121_)
+
 **API Changes**
 
   - Do not automatically upcast numeric specified dtypes to ``int64`` or
@@ -234,6 +235,7 @@ pandas 0.11.0
 .. _GH622: https://github.com/pydata/pandas/issues/622
 .. _GH797: https://github.com/pydata/pandas/issues/797
 .. _GH2758: https://github.com/pydata/pandas/issues/2758
+.. _GH2121: https://github.com/pydata/pandas/issues/2121
 .. _GH2809: https://github.com/pydata/pandas/issues/2809
 .. _GH2810: https://github.com/pydata/pandas/issues/2810
 .. _GH2837: https://github.com/pydata/pandas/issues/2837
diff --git a/bench/bench_transform.py b/bench/bench_transform.py
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -273,3 +273,41 @@ def f(g):
 """
 
 groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
+
+#----------------------------------------------------------------------
+# Transform testing: pad-fill NaNs within each security_id group (GH2121)
+
+setup = common_setup + """
+n_dates = 1000
+n_securities = 500
+n_columns = 3
+share_na = 0.1
+
+dates = date_range('1997-12-31', periods=n_dates, freq='B')
+dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
+
+secid_min = int('10000000', 16)
+secid_max = int('F0000000', 16)
+step = (secid_max - secid_min) // (n_securities - 1)
+security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
+
+data_index = MultiIndex(levels=[dates.values, security_ids],
+    labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
+    names=['date', 'security_id'])
+n_data = len(data_index)
+
+columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
+
+data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
+
+step = int(n_data * share_na)
+for column_index in xrange(n_columns):
+    index = column_index
+    while index < n_data:
+        data.set_value(data_index[index], columns[column_index], np.nan)
+        index += step
+
+f_fillna = lambda x: x.fillna(method='pad')
+"""
+
+groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)