From e8e773576b1922c2b2112936c0093349769d2c1c Mon Sep 17 00:00:00 2001
From: Andy Hayden <andyhayden1@gmail.com>
Date: Thu, 14 Nov 2013 16:18:16 -0800
Subject: [PATCH 1/2] PERF faster head, tail and size groupby methods

---
 pandas/core/groupby.py       | 118 ++++++++++++++++++++++++++++++++---
 pandas/tests/test_groupby.py |  41 ++++++++++--
 2 files changed, 145 insertions(+), 14 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index e763700d08cf4..3a3d985b8d84e 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -52,7 +52,6 @@
 
 _apply_whitelist = frozenset(['last', 'first',
                               'mean', 'sum', 'min', 'max',
-                              'head', 'tail',
                               'cumsum', 'cumprod', 'cummin', 'cummax',
                               'resample',
                               'describe',
@@ -482,8 +481,9 @@ def picker(arr):
                 return np.nan
         return self.agg(picker)
 
-    def cumcount(self):
-        """Number each item in each group from 0 to the length of that group.
+    def cumcount(self, **kwargs):
+        '''
+        Number each item in each group from 0 to the length of that group.
 
         Essentially this is equivalent to
 
@@ -511,13 +511,101 @@ def cumcount(self):
         5    3
         dtype: int64
 
-        """
+        '''
+        ascending = kwargs.pop('ascending', True)
+
         index = self.obj.index
-        cumcounts = np.zeros(len(index), dtype='int64')
-        for v in self.indices.values():
-            cumcounts[v] = np.arange(len(v), dtype='int64')
+        rng = np.arange(self.grouper._max_groupsize, dtype='int64')
+        cumcounts = self._cumcount_array(rng, ascending=ascending)
         return Series(cumcounts, index)
 
+    def head(self, n=5):
+        '''
+        Returns first n rows of each group.
+
+        Essentially equivalent to .apply(lambda x: x.head(n))
+
+        Example
+        -------
+
+        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
+                            columns=['A', 'B'])
+        >>> df.groupby('A', as_index=False).head(1) 
+           A  B
+        0  1  2
+        2  5  6
+        >>> df.groupby('A').head(1)
+             A  B
+        A        
+        1 0  1  2
+        5 2  5  6
+
+        '''
+        rng = np.arange(self.grouper._max_groupsize, dtype='int64')
+        in_head = self._cumcount_array(rng) < n
+        head = self.obj[in_head]
+        if self.as_index:
+            head.index = self._index_with_as_index(in_head)
+        return head
+
+    def tail(self, n=5):
+        '''
+        Returns first n rows of each group
+
+        Essentially equivalent to .apply(lambda x: x.tail(n))
+
+        Example
+        -------
+
+        >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
+                            columns=['A', 'B'])
+        >>> df.groupby('A', as_index=False).tail(1) 
+           A  B
+        1  1  4
+        2  5  6
+        >>> df.groupby('A').head(1)
+             A  B
+        A        
+        1 0  1  2
+        5 2  5  6
+        '''
+        rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
+        in_tail = self._cumcount_array(rng, ascending=False) > -n
+        tail = self.obj[in_tail]
+        if self.as_index:
+            tail.index = self._index_with_as_index(in_tail)
+        return tail
+
+    def _cumcount_array(self, arr, **kwargs):
+        ascending = kwargs.pop('ascending', True)
+
+        len_index = len(self.obj.index)
+        cumcounts = np.zeros(len_index, dtype='int64')
+        if ascending:
+            for v in self.indices.values():
+                cumcounts[v] = arr[:len(v)]
+        else:
+            for v in self.indices.values():
+                cumcounts[v] = arr[len(v)-1::-1]
+        return cumcounts
+
+    def _index_with_as_index(self, b):
+        '''
+        Take boolean mask of index to be returned from apply, if as_index=True
+
+        '''
+        # TODO perf, it feels like this should already be somewhere...
+        from itertools import chain
+        original = self.obj.index
+        gp = self.grouper
+        levels = chain((gp.levels[i][gp.labels[i][b]]
+                            for i in range(len(gp.groupings))),
+                        (original.get_level_values(i)[b]
+                            for i in range(original.nlevels)))
+        new = MultiIndex.from_arrays(list(levels))
+        new.names = gp.names + original.names
+        return new
+
     def _try_cast(self, result, obj):
         """
         try to cast the result to our obj original type,
@@ -758,14 +846,28 @@ def names(self):
     def size(self):
         """
         Compute group sizes
+
         """
         # TODO: better impl
         labels, _, ngroups = self.group_info
-        bin_counts = Series(labels).value_counts()
+        bin_counts = algos.value_counts(labels, sort=False)
         bin_counts = bin_counts.reindex(np.arange(ngroups))
         bin_counts.index = self.result_index
         return bin_counts
 
+    @cache_readonly
+    def _max_groupsize(self):
+        '''
+        Compute size of largest group
+
+        '''
+        # For many items in each group this is much faster than
+        # self.size().max(), in worst case marginally slower
+        if self.indices:
+            return max(len(v) for v in self.indices.values())
+        else:
+            return 0
+
     @cache_readonly
     def groups(self):
         if len(self.groupings) == 1:
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 9df5541615cee..010a65738caa0 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -1203,24 +1203,53 @@ def test_groupby_as_index_apply(self):
         g_not_as = df.groupby('user_id', as_index=False)
 
         res_as = g_as.head(2).index
-        exp_as = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
+        exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)])
         assert_index_equal(res_as, exp_as)
 
         res_not_as = g_not_as.head(2).index
-        exp_not_as = Index([0, 2, 1, 4])
+        exp_not_as = Index([0, 1, 2, 4])
         assert_index_equal(res_not_as, exp_not_as)
 
-        res_as = g_as.apply(lambda x: x.head(2)).index
-        assert_index_equal(res_not_as, exp_not_as)
+        res_as_apply = g_as.apply(lambda x: x.head(2)).index
+        res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
 
-        res_not_as = g_not_as.apply(lambda x: x.head(2)).index
-        assert_index_equal(res_not_as, exp_not_as)
+        # apply doesn't maintain the original ordering
+        exp_not_as_apply = Index([0, 2, 1, 4])        
+        exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)])
+
+        assert_index_equal(res_as_apply, exp_as_apply)
+        assert_index_equal(res_not_as_apply, exp_not_as_apply)
 
         ind = Index(list('abcde'))
         df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
         res = df.groupby(0, as_index=False).apply(lambda x: x).index
         assert_index_equal(res, ind)
 
+    def test_groupby_head_tail(self):
+        df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
+        g_as = df.groupby('A', as_index=True)
+        g_not_as = df.groupby('A', as_index=False)
+
+        # as_index= False much easier
+        exp_head_not_as = df.loc[[0, 2]]
+        res_head_not_as = g_not_as.head(1)
+        assert_frame_equal(exp_head_not_as, res_head_not_as)
+        exp_tail_not_as = df.loc[[1, 2]]
+        res_tail_not_as = g_not_as.tail(1)
+        assert_frame_equal(exp_tail_not_as, res_tail_not_as)
+
+        # as_index=True, yuck
+        res_head_as = g_as.head(1)
+        res_tail_as = g_as.tail(1)
+
+        # prepend the A column as an index, in a roundabout way
+        df.index = df.set_index('A', append=True, drop=False).index.swaplevel(0, 1)
+        exp_head_as = df.loc[[0, 2]]
+        exp_tail_as = df.loc[[1, 2]]
+
+        assert_frame_equal(exp_head_as, res_head_as)
+        assert_frame_equal(exp_tail_as, res_tail_as)
+
     def test_groupby_multiple_key(self):
         df = tm.makeTimeDataFrame()
         grouped = df.groupby([lambda x: x.year,

From ef383190ea42d86ec46a25114f48e3875c7d06d2 Mon Sep 17 00:00:00 2001
From: Andy Hayden <andyhayden1@gmail.com>
Date: Mon, 18 Nov 2013 12:48:05 -0800
Subject: [PATCH 2/2] TST more coverage for groupby head and tail

---
 pandas/core/groupby.py       | 38 ++++++++++++++++++++++-----------
 pandas/tests/test_groupby.py | 41 +++++++++++++++++++++++-------------
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 3a3d985b8d84e..20f17a7f42472 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -482,13 +482,18 @@ def picker(arr):
         return self.agg(picker)
 
     def cumcount(self, **kwargs):
-        '''
-        Number each item in each group from 0 to the length of that group.
+        """
+        Number each item in each group from 0 to the length of that group - 1.
 
         Essentially this is equivalent to
 
         >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))
 
+        Parameters
+        ----------
+        ascending : bool, default True
+            If False, number in reverse, from length of group - 1 to 0.
+
         Example
         -------
 
@@ -510,8 +515,16 @@ def cumcount(self, **kwargs):
         4    1
         5    3
         dtype: int64
+        >>> df.groupby('A').cumcount(ascending=False)
+        0    3
+        1    2
+        2    1
+        3    1
+        4    0
+        5    0
+        dtype: int64
 
-        '''
+        """
         ascending = kwargs.pop('ascending', True)
 
         index = self.obj.index
@@ -520,10 +533,10 @@ def cumcount(self, **kwargs):
         return Series(cumcounts, index)
 
     def head(self, n=5):
-        '''
+        """
         Returns first n rows of each group.
 
-        Essentially equivalent to .apply(lambda x: x.head(n))
+        Essentially equivalent to ``.apply(lambda x: x.head(n))``
 
         Example
         -------
@@ -540,7 +553,7 @@ def head(self, n=5):
         1 0  1  2
         5 2  5  6
 
-        '''
+        """
         rng = np.arange(self.grouper._max_groupsize, dtype='int64')
         in_head = self._cumcount_array(rng) < n
         head = self.obj[in_head]
@@ -549,10 +562,10 @@ def head(self, n=5):
         return head
 
     def tail(self, n=5):
-        '''
-        Returns first n rows of each group
+        """
+        Returns last n rows of each group
 
-        Essentially equivalent to .apply(lambda x: x.tail(n))
+        Essentially equivalent to ``.apply(lambda x: x.tail(n))``
 
         Example
         -------
@@ -568,7 +581,8 @@ def tail(self, n=5):
         A        
         1 0  1  2
         5 2  5  6
-        '''
+        
+        """
         rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
         in_tail = self._cumcount_array(rng, ascending=False) > -n
         tail = self.obj[in_tail]
@@ -590,10 +604,10 @@ def _cumcount_array(self, arr, **kwargs):
         return cumcounts
 
     def _index_with_as_index(self, b):
-        '''
+        """
         Take boolean mask of index to be returned from apply, if as_index=True
 
-        '''
+        """
         # TODO perf, it feels like this should already be somewhere...
         from itertools import chain
         original = self.obj.index
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 010a65738caa0..9c636168114c7 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -1230,25 +1230,36 @@ def test_groupby_head_tail(self):
         g_as = df.groupby('A', as_index=True)
         g_not_as = df.groupby('A', as_index=False)
 
-        # as_index= False much easier
-        exp_head_not_as = df.loc[[0, 2]]
-        res_head_not_as = g_not_as.head(1)
-        assert_frame_equal(exp_head_not_as, res_head_not_as)
-        exp_tail_not_as = df.loc[[1, 2]]
-        res_tail_not_as = g_not_as.tail(1)
-        assert_frame_equal(exp_tail_not_as, res_tail_not_as)
+        # as_index=False, much easier
+        assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
+        assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
 
-        # as_index=True, yuck
-        res_head_as = g_as.head(1)
-        res_tail_as = g_as.tail(1)
+        empty_not_as = DataFrame(columns=df.columns)
+        assert_frame_equal(empty_not_as, g_not_as.head(0))
+        assert_frame_equal(empty_not_as, g_not_as.tail(0))
+        assert_frame_equal(empty_not_as, g_not_as.head(-1))
+        assert_frame_equal(empty_not_as, g_not_as.tail(-1))
+
+        assert_frame_equal(df, g_not_as.head(7)) # contains all
+        assert_frame_equal(df, g_not_as.tail(7))
 
+        # as_index=True, yuck
         # prepend the A column as an index, in a roundabout way
-        df.index = df.set_index('A', append=True, drop=False).index.swaplevel(0, 1)
-        exp_head_as = df.loc[[0, 2]]
-        exp_tail_as = df.loc[[1, 2]]
+        df_as = df.copy()
+        df_as.index = df.set_index('A', append=True,
+                                        drop=False).index.swaplevel(0, 1)
+
+        assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
+        assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
+
+        empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
+        assert_frame_equal(empty_as, g_as.head(0))
+        assert_frame_equal(empty_as, g_as.tail(0))
+        assert_frame_equal(empty_as, g_as.head(-1))
+        assert_frame_equal(empty_as, g_as.tail(-1))
 
-        assert_frame_equal(exp_head_as, res_head_as)
-        assert_frame_equal(exp_tail_as, res_tail_as)
+        assert_frame_equal(df_as, g_as.head(7)) # contains all
+        assert_frame_equal(df_as, g_as.tail(7))
 
     def test_groupby_multiple_key(self):
         df = tm.makeTimeDataFrame()