From 7898ec24f3f70bcbc09b98457988b13ebc7f0775 Mon Sep 17 00:00:00 2001
From: jreback
Date: Fri, 22 Mar 2013 19:45:26 -0400
Subject: [PATCH 1/2] PERF: groupby transform

---
 bench/bench_transform.py | 66 ++++++++++++++++++++++++++++++++++++++++
 pandas/core/groupby.py   | 54 ++++++++++++++++++++++++--------
 2 files changed, 107 insertions(+), 13 deletions(-)
 create mode 100644 bench/bench_transform.py

diff --git a/bench/bench_transform.py b/bench/bench_transform.py
new file mode 100644
index 0000000000000..12fd24b66d3b4
--- /dev/null
+++ b/bench/bench_transform.py
@@ -0,0 +1,66 @@
+import numpy as np
+import pandas as pd
+from pandas import Index, MultiIndex, DataFrame
+from pandas.core.groupby import SeriesGroupBy, DataFrameGroupBy
+
+def apply_by_group(grouped, f):
+    """
+    Apply a function to each Series or DataFrame in a GroupBy object,
+    concatenate the results, and return the resulting Series or DataFrame.
+
+    Parameters
+    ----------
+    grouped : SeriesGroupBy or DataFrameGroupBy
+    f : callable
+        Function to apply to each Series or DataFrame in the grouped object.
+
+    Returns
+    -------
+    The Series or DataFrame obtained by applying the function to each group
+    and concatenating the results.
+
+    """
+    assert isinstance(grouped, (SeriesGroupBy, DataFrameGroupBy))
+    assert hasattr(f, '__call__')
+
+    groups = []
+    for key, group in grouped:
+        groups.append(f(group))
+    c = pd.concat(groups)
+    c.sort_index(inplace=True)
+    return c
+
+n_dates = 1000
+n_securities = 2000
+n_columns = 3
+share_na = 0.1
+
+dates = pd.date_range('1997-12-31', periods=n_dates, freq='B')
+dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
+
+secid_min = int('10000000', 16)
+secid_max = int('F0000000', 16)
+step = (secid_max - secid_min) // (n_securities - 1)
+security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
+
+data_index = MultiIndex(levels=[dates.values, security_ids],
+                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
+                        names=['date', 'security_id'])
+n_data = len(data_index)
+
+columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
+
+data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
+
+step = int(n_data * share_na)
+for column_index in xrange(n_columns):
+    index = column_index
+    while index < n_data:
+        data.set_value(data_index[index], columns[column_index], np.nan)
+        index += step
+
+grouped = data.groupby(level='security_id')
+f_fillna = lambda x: x.fillna(method='pad')
+
+#%timeit grouped.transform(f_fillna)
+#%timeit apply_by_group(grouped, f_fillna)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 053deaa550b06..cb0a03d306c53 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -13,7 +13,7 @@
 from pandas.util.compat import OrderedDict
 import pandas.core.algorithms as algos
 import pandas.core.common as com
-from pandas.core.common import _possibly_downcast_to_dtype
+from pandas.core.common import _possibly_downcast_to_dtype, notnull
 import pandas.lib as lib
 import pandas.algos as _algos
 
@@ -75,7 +75,7 @@ def f(self):
 def _first_compat(x, axis=0):
     def _first(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[0]
@@ -89,7 +89,7 @@ def _first(x):
 def _last_compat(x, axis=0):
     def _last(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[-1]
@@ -421,7 +421,7 @@ def ohlc(self):
 
     def nth(self, n):
         def picker(arr):
-            arr = arr[com.notnull(arr)]
+            arr = arr[notnull(arr)]
             if len(arr) >= n + 1:
                 return arr.iget(n)
             else:
@@ -1897,19 +1897,46 @@ def transform(self, func, *args, **kwargs):
         gen = self.grouper.get_iterator(obj, axis=self.axis)
 
         if isinstance(func, basestring):
-            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
+            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
         else:
-            wrapper = lambda x: func(x, *args, **kwargs)
+            fast_path = lambda group: func(group, *args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis)
 
+        path = None
         for name, group in gen:
             object.__setattr__(group, 'name', name)
 
-            try:
-                res = group.apply(wrapper, axis=self.axis)
-            except TypeError:
-                return self._transform_item_by_item(obj, wrapper)
-            except Exception:  # pragma: no cover
-                res = wrapper(group)
+            # on the first group, decide which path to use
+            if path is None:
+
+                path = slow_path
+                try:
+                    res = slow_path(group)
+
+                    # the slow path worked; test whether the fast path is usable
+                    try:
+                        res_fast = fast_path(group)
+
+                        # verify that both paths give the same results,
+                        # comparing only non-NaN positions
+                        if res.shape == res_fast.shape:
+                            res_r = res.values.ravel()
+                            res_fast_r = res_fast.values.ravel()
+                            mask = notnull(res_r)
+                            if (res_r[mask] == res_fast_r[mask]).all():
+                                path = fast_path
+
+                    except Exception:
+                        pass
+                except TypeError:
+                    return self._transform_item_by_item(obj, fast_path)
+                except Exception:  # pragma: no cover
+                    res = fast_path(group)
+                    path = fast_path
+
+            else:
+
+                res = path(group)
 
             # broadcasting
             if isinstance(res, Series):
@@ -1925,7 +1952,8 @@ def transform(self, func, *args, **kwargs):
         concat_index = obj.columns if self.axis == 0 else obj.index
         concatenated = concat(applied, join_axes=[concat_index],
                               axis=self.axis, verify_integrity=False)
-        return concatenated.reindex_like(obj)
+        concatenated.sort_index(inplace=True)
+        return concatenated
 
     def _transform_item_by_item(self, obj, wrapper):
         # iterate through columns

From 2d81b64bdad4b59cc9e79cbcf32dfa945beeaab9 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 23 Mar 2013 19:05:02 -0400
Subject: [PATCH 2/2] PERF: added vb_suite test for groupby_transform

Added a release note to RELEASE.rst (GH2121)
---
 RELEASE.rst              |  8 +++--
 bench/bench_transform.py | 66 ----------------------------------------
 vb_suite/groupby.py      | 38 +++++++++++++++++++++++
 3 files changed, 43 insertions(+), 69 deletions(-)
 delete mode 100644 bench/bench_transform.py

diff --git a/RELEASE.rst b/RELEASE.rst
index 45477610cabb2..4cd47ae384359 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -100,9 +100,6 @@ pandas 0.11.0
     the collections.Mapping ABC.
   - Allow selection semantics via a string with a datelike index to work in both
     Series and DataFrames (GH3070_)
-  - Improved performance across several core functions by taking memory
-    ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
-
 
     .. ipython:: python
 
@@ -116,6 +113,10 @@
     for plots. Based on https://gist.github.com/huyng/816622 (GH3075_).
 
+  - Improved performance across several core functions by taking memory
+    ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
+  - Improved performance of the groupby ``transform`` method (GH2121_)
+
 **API Changes**
 
   - Do not automatically upcast numeric specified dtypes to ``int64`` or
@@ -234,6 +235,7 @@ pandas 0.11.0
 .. _GH622: https://github.com/pydata/pandas/issues/622
 .. _GH797: https://github.com/pydata/pandas/issues/797
 .. _GH2758: https://github.com/pydata/pandas/issues/2758
+.. _GH2121: https://github.com/pydata/pandas/issues/2121
 .. _GH2809: https://github.com/pydata/pandas/issues/2809
 .. _GH2810: https://github.com/pydata/pandas/issues/2810
 .. _GH2837: https://github.com/pydata/pandas/issues/2837
diff --git a/bench/bench_transform.py b/bench/bench_transform.py
deleted file mode 100644
index 12fd24b66d3b4..0000000000000
--- a/bench/bench_transform.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import numpy as np
-import pandas as pd
-from pandas import Index, MultiIndex, DataFrame
-from pandas.core.groupby import SeriesGroupBy, DataFrameGroupBy
-
-def apply_by_group(grouped, f):
-    """
-    Apply a function to each Series or DataFrame in a GroupBy object,
-    concatenate the results, and return the resulting Series or DataFrame.
-
-    Parameters
-    ----------
-    grouped : SeriesGroupBy or DataFrameGroupBy
-    f : callable
-        Function to apply to each Series or DataFrame in the grouped object.
-
-    Returns
-    -------
-    The Series or DataFrame obtained by applying the function to each group
-    and concatenating the results.
-
-    """
-    assert isinstance(grouped, (SeriesGroupBy, DataFrameGroupBy))
-    assert hasattr(f, '__call__')
-
-    groups = []
-    for key, group in grouped:
-        groups.append(f(group))
-    c = pd.concat(groups)
-    c.sort_index(inplace=True)
-    return c
-
-n_dates = 1000
-n_securities = 2000
-n_columns = 3
-share_na = 0.1
-
-dates = pd.date_range('1997-12-31', periods=n_dates, freq='B')
-dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
-
-secid_min = int('10000000', 16)
-secid_max = int('F0000000', 16)
-step = (secid_max - secid_min) // (n_securities - 1)
-security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
-
-data_index = MultiIndex(levels=[dates.values, security_ids],
-                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
-                        names=['date', 'security_id'])
-n_data = len(data_index)
-
-columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
-
-data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
-
-step = int(n_data * share_na)
-for column_index in xrange(n_columns):
-    index = column_index
-    while index < n_data:
-        data.set_value(data_index[index], columns[column_index], np.nan)
-        index += step
-
-grouped = data.groupby(level='security_id')
-f_fillna = lambda x: x.fillna(method='pad')
-
-#%timeit grouped.transform(f_fillna)
-#%timeit apply_by_group(grouped, f_fillna)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index caa09c219a866..f9f221ae752b5 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -273,3 +273,41 @@ def f(g):
 """
 
 groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
+
+#----------------------------------------------------------------------
+# Transform testing
+
+setup = common_setup + """
+n_dates = 1000
+n_securities = 500
+n_columns = 3
+share_na = 0.1
+
+dates = date_range('1997-12-31', periods=n_dates, freq='B')
+dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
+
+secid_min = int('10000000', 16)
+secid_max = int('F0000000', 16)
+step = (secid_max - secid_min) // (n_securities - 1)
+security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
+
+data_index = MultiIndex(levels=[dates.values, security_ids],
+                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
+                        names=['date', 'security_id'])
+n_data = len(data_index)
+
+columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
+
+data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
+
+step = int(n_data * share_na)
+for column_index in xrange(n_columns):
+    index = column_index
+    while index < n_data:
+        data.set_value(data_index[index], columns[column_index], np.nan)
+        index += step
+
+f_fillna = lambda x: x.fillna(method='pad')
+"""
+
+groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
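---

Note: the fast-path/slow-path dispatch that PATCH 1/2 adds to transform() can be
illustrated outside pandas. The sketch below is a simplified standalone model of
that logic, not the pandas source; the helper name choose_path and the toy frame
are hypothetical. The first group pays for both paths once; if the cheap
whole-group call reproduces the per-column apply() result on the non-NaN
positions, every subsequent group takes the fast path directly.

import numpy as np
import pandas as pd

def choose_path(group, func):
    # hypothetical helper: try the cheap whole-object call once and keep it
    # only if it matches the column-by-column apply() result (NaN positions
    # are excluded from the comparison, since NaN != NaN)
    fast_path = lambda g: func(g)
    slow_path = lambda g: g.apply(func)

    res = slow_path(group)
    try:
        res_fast = fast_path(group)
    except Exception:
        return slow_path, res  # fast path not applicable for this func
    if res.shape == res_fast.shape:
        res_r = res.values.ravel()
        res_fast_r = res_fast.values.ravel()
        mask = pd.notnull(res_r)
        if (res_r[mask] == res_fast_r[mask]).all():
            return fast_path, res
    return slow_path, res

df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                   'x': [1.0, np.nan, np.nan, 4.0]})
f_fillna = lambda x: x.fillna(method='pad')

path = None
pieces = []
for name, group in df.groupby('key'):
    if path is None:
        path, res = choose_path(group[['x']], f_fillna)  # first group decides
    else:
        res = path(group[['x']])  # later groups reuse the chosen path
    pieces.append(res)
print(pd.concat(pieces))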