From 7898ec24f3f70bcbc09b98457988b13ebc7f0775 Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 22 Mar 2013 19:45:26 -0400
Subject: [PATCH 1/2] PERF: groupby transform

---
 bench/bench_transform.py | 66 ++++++++++++++++++++++++++++++++++++++++
 pandas/core/groupby.py   | 54 ++++++++++++++++++++++++--------
 2 files changed, 107 insertions(+), 13 deletions(-)
 create mode 100644 bench/bench_transform.py

diff --git a/bench/bench_transform.py b/bench/bench_transform.py
new file mode 100644
index 0000000000000..12fd24b66d3b4
--- /dev/null
+++ b/bench/bench_transform.py
@@ -0,0 +1,66 @@
+import numpy as np
+import pandas as pd
+from pandas import Index, MultiIndex, DataFrame
+from pandas.core.groupby import SeriesGroupBy, DataFrameGroupBy
+
+def apply_by_group(grouped, f):
+    """
+    Apply a function to each Series or DataFrame in a GroupBy object,
+    concatenate the results, and return the resulting Series or DataFrame.
+
+    Parameters
+    ----------
+    grouped : SeriesGroupBy or DataFrameGroupBy
+    f : callable
+        Function to apply to each Series or DataFrame in the grouped object.
+
+    Returns
+    -------
+    The Series or DataFrame obtained by applying the function to each group
+    and concatenating the results.
+
+    """
+    assert isinstance(grouped, (SeriesGroupBy, DataFrameGroupBy))
+    assert hasattr(f, '__call__')
+
+    groups = []
+    for key, group in grouped:
+        groups.append(f(group))
+    c = pd.concat(groups)
+    c.sort_index(inplace=True)
+    return c
+
+n_dates = 1000
+n_securities = 2000
+n_columns = 3
+share_na = 0.1
+
+dates = pd.date_range('1997-12-31', periods=n_dates, freq='B')
+dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
+
+secid_min = int('10000000', 16)
+secid_max = int('F0000000', 16)
+step = (secid_max - secid_min) // (n_securities - 1)
+security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
+
+data_index = MultiIndex(levels=[dates.values, security_ids],
+                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
+                        names=['date', 'security_id'])
+n_data = len(data_index)
+
+columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
+
+data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
+
+step = int(n_data * share_na)
+for column_index in xrange(n_columns):
+    index = column_index
+    while index < n_data:
+        data.set_value(data_index[index], columns[column_index], np.nan)
+        index += step
+
+grouped = data.groupby(level='security_id')
+f_fillna = lambda x: x.fillna(method='pad')
+
+#%timeit grouped.transform(f_fillna)
+#%timeit apply_by_group(grouped, f_fillna)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 053deaa550b06..cb0a03d306c53 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -13,7 +13,7 @@
 from pandas.util.compat import OrderedDict
 import pandas.core.algorithms as algos
 import pandas.core.common as com
-from pandas.core.common import _possibly_downcast_to_dtype
+from pandas.core.common import _possibly_downcast_to_dtype, notnull
 import pandas.lib as lib
 import pandas.algos as _algos
 
@@ -75,7 +75,7 @@ def f(self):
 def _first_compat(x, axis=0):
     def _first(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[0]
@@ -89,7 +89,7 @@ def _first(x):
 def _last_compat(x, axis=0):
     def _last(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[-1]
@@ -421,7 +421,7 @@ def ohlc(self):
 
     def nth(self, n):
         def picker(arr):
-            arr = arr[com.notnull(arr)]
+            arr = arr[notnull(arr)]
             if len(arr) >= n + 1:
                 return arr.iget(n)
             else:
@@ -1897,19 +1897,46 @@ def transform(self, func, *args, **kwargs):
         gen = self.grouper.get_iterator(obj, axis=self.axis)
 
         if isinstance(func, basestring):
-            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
+            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
         else:
-            wrapper = lambda x: func(x, *args, **kwargs)
+            fast_path = lambda group: func(group, *args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis)
 
+        path = None
         for name, group in gen:
             object.__setattr__(group, 'name', name)
 
-            try:
-                res = group.apply(wrapper, axis=self.axis)
-            except TypeError:
-                return self._transform_item_by_item(obj, wrapper)
-            except Exception:  # pragma: no cover
-                res = wrapper(group)
+            # on the first group, decide which path to use
+            if path is None:
+
+                path = slow_path
+                try:
+                    res = slow_path(group)
+
+                    # the slow path worked; test whether the fast path is usable
+                    try:
+                        res_fast = fast_path(group)
+
+                        # verify that both paths give the same results,
+                        # comparing only non-NaN positions
+                        if res.shape == res_fast.shape:
+                            res_r = res.values.ravel()
+                            res_fast_r = res_fast.values.ravel()
+                            mask = notnull(res_r)
+                            if (res_r[mask] == res_fast_r[mask]).all():
+                                path = fast_path
+
+                    except Exception:
+                        pass
+                except TypeError:
+                    return self._transform_item_by_item(obj, fast_path)
+                except Exception:  # pragma: no cover
+                    res = fast_path(group)
+                    path = fast_path
+
+            else:
+
+                res = path(group)
 
             # broadcasting
             if isinstance(res, Series):
@@ -1925,7 +1952,8 @@ def transform(self, func, *args, **kwargs):
         concat_index = obj.columns if self.axis == 0 else obj.index
         concatenated = concat(applied, join_axes=[concat_index],
                               axis=self.axis, verify_integrity=False)
-        return concatenated.reindex_like(obj)
+        concatenated.sort_index(inplace=True)
+        return concatenated
 
     def _transform_item_by_item(self, obj, wrapper):
         # iterate through columns

From 2d81b64bdad4b59cc9e79cbcf32dfa945beeaab9 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 23 Mar 2013 19:05:02 -0400
Subject: [PATCH 2/2] PERF: added vb_suite test for groupby_transform

Added a release note to RELEASE.rst (GH2121)
---
 RELEASE.rst              |  8 +++--
 bench/bench_transform.py | 66 ----------------------------------------
 vb_suite/groupby.py      | 38 +++++++++++++++++++++++
 3 files changed, 43 insertions(+), 69 deletions(-)
 delete mode 100644 bench/bench_transform.py

diff --git a/RELEASE.rst b/RELEASE.rst
index 45477610cabb2..4cd47ae384359 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -100,9 +100,6 @@ pandas 0.11.0
     the collections.Mapping ABC.
   - Allow selection semantics via a string with a datelike index to work in both
     Series and DataFrames (GH3070_)
-  - Improved performance across several core functions by taking memory
-    ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
-
 
     .. ipython:: python
 
@@ -116,6 +113,10 @@
     for plots. Based on https://gist.github.com/huyng/816622 (GH3075_).
 
+  - Improved performance across several core functions by taking memory
+    ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
+  - Improved performance of the groupby ``transform`` method (GH2121_)
+
 **API Changes**
 
   - Do not automatically upcast numeric specified dtypes to ``int64`` or
@@ -234,6 +235,7 @@ pandas 0.11.0
 .. _GH622: https://github.com/pydata/pandas/issues/622
 .. _GH797: https://github.com/pydata/pandas/issues/797
 .. _GH2758: https://github.com/pydata/pandas/issues/2758
+.. _GH2121: https://github.com/pydata/pandas/issues/2121
 .. _GH2809: https://github.com/pydata/pandas/issues/2809
 .. _GH2810: https://github.com/pydata/pandas/issues/2810
 .. _GH2837: https://github.com/pydata/pandas/issues/2837
diff --git a/bench/bench_transform.py b/bench/bench_transform.py
deleted file mode 100644
index 12fd24b66d3b4..0000000000000
--- a/bench/bench_transform.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import numpy as np
-import pandas as pd
-from pandas import Index, MultiIndex, DataFrame
-from pandas.core.groupby import SeriesGroupBy, DataFrameGroupBy
-
-def apply_by_group(grouped, f):
-    """
-    Apply a function to each Series or DataFrame in a GroupBy object,
-    concatenate the results, and return the resulting Series or DataFrame.
-
-    Parameters
-    ----------
-    grouped : SeriesGroupBy or DataFrameGroupBy
-    f : callable
-        Function to apply to each Series or DataFrame in the grouped object.
-
-    Returns
-    -------
-    The Series or DataFrame obtained by applying the function to each group
-    and concatenating the results.
-
-    """
-    assert isinstance(grouped, (SeriesGroupBy, DataFrameGroupBy))
-    assert hasattr(f, '__call__')
-
-    groups = []
-    for key, group in grouped:
-        groups.append(f(group))
-    c = pd.concat(groups)
-    c.sort_index(inplace=True)
-    return c
-
-n_dates = 1000
-n_securities = 2000
-n_columns = 3
-share_na = 0.1
-
-dates = pd.date_range('1997-12-31', periods=n_dates, freq='B')
-dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
-
-secid_min = int('10000000', 16)
-secid_max = int('F0000000', 16)
-step = (secid_max - secid_min) // (n_securities - 1)
-security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
-
-data_index = MultiIndex(levels=[dates.values, security_ids],
-                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
-                        names=['date', 'security_id'])
-n_data = len(data_index)
-
-columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
-
-data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
-
-step = int(n_data * share_na)
-for column_index in xrange(n_columns):
-    index = column_index
-    while index < n_data:
-        data.set_value(data_index[index], columns[column_index], np.nan)
-        index += step
-
-grouped = data.groupby(level='security_id')
-f_fillna = lambda x: x.fillna(method='pad')
-
-#%timeit grouped.transform(f_fillna)
-#%timeit apply_by_group(grouped, f_fillna)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index caa09c219a866..f9f221ae752b5 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -273,3 +273,41 @@ def f(g):
 """
 
 groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
+
+#----------------------------------------------------------------------
+# Transform testing
+
+setup = common_setup + """
+n_dates = 1000
+n_securities = 500
+n_columns = 3
+share_na = 0.1
+
+dates = date_range('1997-12-31', periods=n_dates, freq='B')
+dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
+
+secid_min = int('10000000', 16)
+secid_max = int('F0000000', 16)
+step = (secid_max - secid_min) // (n_securities - 1)
+security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
+
+data_index = MultiIndex(levels=[dates.values, security_ids],
+                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
+                        names=['date', 'security_id'])
+n_data = len(data_index)
+
+columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
+
+data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
+
+step = int(n_data * share_na)
+for column_index in xrange(n_columns):
+    index = column_index
+    while index < n_data:
+        data.set_value(data_index[index], columns[column_index], np.nan)
+        index += step
+
+f_fillna = lambda x: x.fillna(method='pad')
+"""
+
+groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
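---

Note: the fast-path/slow-path dispatch that PATCH 1/2 adds to transform() can be
illustrated outside pandas. The sketch below is a simplified standalone model of
that logic, not the pandas source; the helper name choose_path and the toy frame
are hypothetical. The first group pays for both paths once; if the cheap
whole-group call reproduces the per-column apply() result on the non-NaN
positions, every subsequent group takes the fast path directly.

import numpy as np
import pandas as pd

def choose_path(group, func):
    # hypothetical helper: try the cheap whole-object call once and keep it
    # only if it matches the column-by-column apply() result (NaN positions
    # are excluded from the comparison, since NaN != NaN)
    fast_path = lambda g: func(g)
    slow_path = lambda g: g.apply(func)

    res = slow_path(group)
    try:
        res_fast = fast_path(group)
    except Exception:
        return slow_path, res  # fast path not applicable for this func
    if res.shape == res_fast.shape:
        res_r = res.values.ravel()
        res_fast_r = res_fast.values.ravel()
        mask = pd.notnull(res_r)
        if (res_r[mask] == res_fast_r[mask]).all():
            return fast_path, res
    return slow_path, res

df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                   'x': [1.0, np.nan, np.nan, 4.0]})
f_fillna = lambda x: x.fillna(method='pad')

path = None
pieces = []
for name, group in df.groupby('key'):
    if path is None:
        path, res = choose_path(group[['x']], f_fillna)  # first group decides
    else:
        res = path(group[['x']])  # later groups reuse the chosen path
    pieces.append(res)
print(pd.concat(pieces))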