From e8e773576b1922c2b2112936c0093349769d2c1c Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Thu, 14 Nov 2013 16:18:16 -0800 Subject: [PATCH 1/2] PERF faster head, tail and size groupby methods --- pandas/core/groupby.py | 118 ++++++++++++++++++++++++++++++++--- pandas/tests/test_groupby.py | 41 ++++++++++-- 2 files changed, 145 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e763700d08cf4..3a3d985b8d84e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -52,7 +52,6 @@ _apply_whitelist = frozenset(['last', 'first', 'mean', 'sum', 'min', 'max', - 'head', 'tail', 'cumsum', 'cumprod', 'cummin', 'cummax', 'resample', 'describe', @@ -482,8 +481,9 @@ def picker(arr): return np.nan return self.agg(picker) - def cumcount(self): - """Number each item in each group from 0 to the length of that group. + def cumcount(self, **kwargs): + ''' + Number each item in each group from 0 to the length of that group. Essentially this is equivalent to @@ -511,13 +511,101 @@ def cumcount(self): 5 3 dtype: int64 - """ + ''' + ascending = kwargs.pop('ascending', True) + index = self.obj.index - cumcounts = np.zeros(len(index), dtype='int64') - for v in self.indices.values(): - cumcounts[v] = np.arange(len(v), dtype='int64') + rng = np.arange(self.grouper._max_groupsize, dtype='int64') + cumcounts = self._cumcount_array(rng, ascending=ascending) return Series(cumcounts, index) + def head(self, n=5): + ''' + Returns first n rows of each group. 
+ + Essentially equivalent to .apply(lambda x: x.head(n)) + + Example + ------- + + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], + columns=['A', 'B']) + >>> df.groupby('A', as_index=False).head(1) + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(1) + A B + A + 1 0 1 2 + 5 2 5 6 + + ''' + rng = np.arange(self.grouper._max_groupsize, dtype='int64') + in_head = self._cumcount_array(rng) < n + head = self.obj[in_head] + if self.as_index: + head.index = self._index_with_as_index(in_head) + return head + + def tail(self, n=5): + ''' + Returns first n rows of each group + + Essentially equivalent to .apply(lambda x: x.tail(n)) + + Example + ------- + + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], + columns=['A', 'B']) + >>> df.groupby('A', as_index=False).tail(1) + A B + 1 1 4 + 2 5 6 + >>> df.groupby('A').head(1) + A B + A + 1 0 1 2 + 5 2 5 6 + ''' + rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') + in_tail = self._cumcount_array(rng, ascending=False) > -n + tail = self.obj[in_tail] + if self.as_index: + tail.index = self._index_with_as_index(in_tail) + return tail + + def _cumcount_array(self, arr, **kwargs): + ascending = kwargs.pop('ascending', True) + + len_index = len(self.obj.index) + cumcounts = np.zeros(len_index, dtype='int64') + if ascending: + for v in self.indices.values(): + cumcounts[v] = arr[:len(v)] + else: + for v in self.indices.values(): + cumcounts[v] = arr[len(v)-1::-1] + return cumcounts + + def _index_with_as_index(self, b): + ''' + Take boolean mask of index to be returned from apply, if as_index=True + + ''' + # TODO perf, it feels like this should already be somewhere...
+ from itertools import chain + original = self.obj.index + gp = self.grouper + levels = chain((gp.levels[i][gp.labels[i][b]] + for i in range(len(gp.groupings))), + (original.get_level_values(i)[b] + for i in range(original.nlevels))) + new = MultiIndex.from_arrays(list(levels)) + new.names = gp.names + original.names + return new + def _try_cast(self, result, obj): """ try to cast the result to our obj original type, @@ -758,14 +846,28 @@ def names(self): def size(self): """ Compute group sizes + """ # TODO: better impl labels, _, ngroups = self.group_info - bin_counts = Series(labels).value_counts() + bin_counts = algos.value_counts(labels, sort=False) bin_counts = bin_counts.reindex(np.arange(ngroups)) bin_counts.index = self.result_index return bin_counts + @cache_readonly + def _max_groupsize(self): + ''' + Compute size of largest group + + ''' + # For many items in each group this is much faster than + # self.size().max(), in worst case marginally slower + if self.indices: + return max(len(v) for v in self.indices.values()) + else: + return 0 + @cache_readonly def groups(self): if len(self.groupings) == 1: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 9df5541615cee..010a65738caa0 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1203,24 +1203,53 @@ def test_groupby_as_index_apply(self): g_not_as = df.groupby('user_id', as_index=False) res_as = g_as.head(2).index - exp_as = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)]) + exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)]) assert_index_equal(res_as, exp_as) res_not_as = g_not_as.head(2).index - exp_not_as = Index([0, 2, 1, 4]) + exp_not_as = Index([0, 1, 2, 4]) assert_index_equal(res_not_as, exp_not_as) - res_as = g_as.apply(lambda x: x.head(2)).index - assert_index_equal(res_not_as, exp_not_as) + res_as_apply = g_as.apply(lambda x: x.head(2)).index + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index - 
res_not_as = g_not_as.apply(lambda x: x.head(2)).index - assert_index_equal(res_not_as, exp_not_as) + # apply doesn't maintain the original ordering + exp_not_as_apply = Index([0, 2, 1, 4]) + exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)]) + + assert_index_equal(res_as_apply, exp_as_apply) + assert_index_equal(res_not_as_apply, exp_not_as_apply) ind = Index(list('abcde')) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) res = df.groupby(0, as_index=False).apply(lambda x: x).index assert_index_equal(res, ind) + def test_groupby_head_tail(self): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g_as = df.groupby('A', as_index=True) + g_not_as = df.groupby('A', as_index=False) + + # as_index= False much easier + exp_head_not_as = df.loc[[0, 2]] + res_head_not_as = g_not_as.head(1) + assert_frame_equal(exp_head_not_as, res_head_not_as) + exp_tail_not_as = df.loc[[1, 2]] + res_tail_not_as = g_not_as.tail(1) + assert_frame_equal(exp_tail_not_as, res_tail_not_as) + + # as_index=True, yuck + res_head_as = g_as.head(1) + res_tail_as = g_as.tail(1) + + # prepend the A column as an index, in a roundabout way + df.index = df.set_index('A', append=True, drop=False).index.swaplevel(0, 1) + exp_head_as = df.loc[[0, 2]] + exp_tail_as = df.loc[[1, 2]] + + assert_frame_equal(exp_head_as, res_head_as) + assert_frame_equal(exp_tail_as, res_tail_as) + def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, From ef383190ea42d86ec46a25114f48e3875c7d06d2 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Mon, 18 Nov 2013 12:48:05 -0800 Subject: [PATCH 2/2] TST more coverage for groupby head and tail --- pandas/core/groupby.py | 38 ++++++++++++++++++++++----------- pandas/tests/test_groupby.py | 41 +++++++++++++++++++++++------------- 2 files changed, 52 insertions(+), 27 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 3a3d985b8d84e..20f17a7f42472 100644 
--- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -482,13 +482,18 @@ def picker(arr): return self.agg(picker) def cumcount(self, **kwargs): - ''' - Number each item in each group from 0 to the length of that group. + """ + Number each item in each group from 0 to the length of that group - 1. Essentially this is equivalent to >>> self.apply(lambda x: Series(np.arange(len(x)), x.index)) + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + Example ------- @@ -510,8 +515,16 @@ def cumcount(self, **kwargs): 4 1 5 3 dtype: int64 + >>> df.groupby('A').cumcount(ascending=False) + 0 3 + 1 2 + 2 1 + 3 1 + 4 0 + 5 0 + dtype: int64 - ''' + """ ascending = kwargs.pop('ascending', True) index = self.obj.index @@ -520,10 +533,10 @@ def cumcount(self, **kwargs): return Series(cumcounts, index) def head(self, n=5): - ''' + """ Returns first n rows of each group. - Essentially equivalent to .apply(lambda x: x.head(n)) + Essentially equivalent to ``.apply(lambda x: x.head(n))`` Example ------- @@ -540,7 +553,7 @@ def head(self, n=5): 1 0 1 2 5 2 5 6 - ''' + """ rng = np.arange(self.grouper._max_groupsize, dtype='int64') in_head = self._cumcount_array(rng) < n head = self.obj[in_head] @@ -549,10 +562,10 @@ def head(self, n=5): return head def tail(self, n=5): - ''' - Returns first n rows of each group + """ + Returns last n rows of each group - Essentially equivalent to .apply(lambda x: x.tail(n)) + Essentially equivalent to ``.apply(lambda x: x.tail(n))`` Example ------- @@ -568,7 +581,8 @@ def tail(self, n=5): A 1 0 1 2 5 2 5 6 - ''' + + """ rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') in_tail = self._cumcount_array(rng, ascending=False) > -n tail = self.obj[in_tail] @@ -590,10 +604,10 @@ def _cumcount_array(self, arr, **kwargs): return cumcounts def _index_with_as_index(self, b): - ''' + """ Take boolean mask of index to be returned from apply, if as_index=True - ''' + """ # 
TODO perf, it feels like this should already be somewhere... from itertools import chain original = self.obj.index diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 010a65738caa0..9c636168114c7 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1230,25 +1230,36 @@ def test_groupby_head_tail(self): g_as = df.groupby('A', as_index=True) g_not_as = df.groupby('A', as_index=False) - # as_index= False much easier - exp_head_not_as = df.loc[[0, 2]] - res_head_not_as = g_not_as.head(1) - assert_frame_equal(exp_head_not_as, res_head_not_as) - exp_tail_not_as = df.loc[[1, 2]] - res_tail_not_as = g_not_as.tail(1) - assert_frame_equal(exp_tail_not_as, res_tail_not_as) + # as_index= False, much easier + assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) + assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - # as_index=True, yuck - res_head_as = g_as.head(1) - res_tail_as = g_as.tail(1) + empty_not_as = DataFrame(columns=df.columns) + assert_frame_equal(empty_not_as, g_not_as.head(0)) + assert_frame_equal(empty_not_as, g_not_as.tail(0)) + assert_frame_equal(empty_not_as, g_not_as.head(-1)) + assert_frame_equal(empty_not_as, g_not_as.tail(-1)) + + assert_frame_equal(df, g_not_as.head(7)) # contains all + assert_frame_equal(df, g_not_as.tail(7)) + # as_index=True, yuck # prepend the A column as an index, in a roundabout way - df.index = df.set_index('A', append=True, drop=False).index.swaplevel(0, 1) - exp_head_as = df.loc[[0, 2]] - exp_tail_as = df.loc[[1, 2]] + df_as = df.copy() + df_as.index = df.set_index('A', append=True, + drop=False).index.swaplevel(0, 1) + + assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) + assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) + + empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + assert_frame_equal(empty_as, g_as.head(0)) + assert_frame_equal(empty_as, g_as.tail(0)) + assert_frame_equal(empty_as, g_as.head(-1)) + assert_frame_equal(empty_as, g_as.tail(-1)) - 
assert_frame_equal(exp_head_as, res_head_as) - assert_frame_equal(exp_tail_as, res_tail_as) + assert_frame_equal(df_as, g_as.head(7)) # contains all + assert_frame_equal(df_as, g_as.tail(7)) def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame()