Merge pull request #5510 from hayd/groupby_cumcount

hayd · hayd · commit c70882ab8d1b · 2013-11-14T14:04:00.000-08:00
ENH add cumcount groupby method
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -705,3 +705,16 @@ can be used as group keys. If so, the order of the levels will be preserved:
    factor = qcut(data, [0, .25, .5, .75, 1.])
 
    data.groupby(factor).mean()
+
+Enumerate group items
+~~~~~~~~~~~~~~~~~~~~~
+
+To see the order in which each row appears within its group, use the
+``cumcount`` method:
+
+.. ipython:: python
+
+   df = pd.DataFrame(list('aaabba'), columns=['A'])
+   df
+
+   df.groupby('A').cumcount()
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -64,6 +64,7 @@ New features
   - ``to_csv()`` now outputs datetime objects according to a specified format
     string via the ``date_format`` keyword (:issue:`4313`)
   - Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
+  - Added ``cumcount`` groupby method (:issue:`4646`)
   - Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
   - Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
     statistical mode(s) of a column/series. (:issue:`5367`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -468,6 +468,7 @@ def ohlc(self):
         Compute sum of values, excluding missing values
 
         For multiple groupings, the result index will be a MultiIndex
+
         """
         return self._cython_agg_general('ohlc')
 
@@ -480,9 +481,49 @@ def picker(arr):
                 return np.nan
         return self.agg(picker)
 
+    def cumcount(self):
+        '''
+        Number each item in each group from 0 to the length of that group - 1.
+
+        Essentially this is equivalent to
+
+        >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))
+
+        Examples
+        --------
+
+        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']], columns=['A'])
+        >>> df
+           A
+        0  a
+        1  a
+        2  a
+        3  b
+        4  b
+        5  a
+        >>> df.groupby('A').cumcount()
+        0    0
+        1    1
+        2    2
+        3    0
+        4    1
+        5    3
+        dtype: int64
+
+        '''
+        index = self.obj.index
+        cumcounts = np.zeros(len(index), dtype='int64')
+        for v in self.indices.values():
+            cumcounts[v] = np.arange(len(v), dtype='int64')
+        return Series(cumcounts, index)
+
+
     def _try_cast(self, result, obj):
-        """ try to cast the result to our obj original type,
-        we may have roundtripped thru object in the mean-time """
+        """
+        try to cast the result back to the original type of obj;
+        we may have round-tripped through object in the meantime
+
+        """
         if obj.ndim > 1:
             dtype = obj.values.dtype
         else:
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2560,6 +2560,57 @@ def test_groupby_with_empty(self):
         grouped = series.groupby(grouper)
         assert next(iter(grouped), None) is None
 
+    def test_cumcount(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3])
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_empty(self):
+        ge = DataFrame().groupby()
+        se = Series().groupby()
+
+        e = Series(dtype='int')  # edge case: an empty Series is usually considered float
+
+        assert_series_equal(e, ge.cumcount())
+        assert_series_equal(e, se.cumcount())
+
+    def test_cumcount_dupe_index(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_mi(self):
+        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=mi)
+        g = df.groupby('A')
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=mi)
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())        
+
+    def test_cumcount_groupby_not_col(self):
+        df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5)
+        g = df.groupby([0, 0, 0, 1, 0])
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+        assert_series_equal(expected, g.cumcount())
+        assert_series_equal(expected, sg.cumcount())
+
+
     def test_filter_series(self):
         import pandas as pd
         s = pd.Series([1, 3, 20, 5, 22, 24, 7])
@@ -3180,7 +3231,7 @@ def test_tab_completion(self):
             'min','name','ngroups','nth','ohlc','plot', 'prod',
             'size','std','sum','transform','var', 'count', 'head', 'describe',
             'cummax', 'dtype', 'quantile', 'rank', 'cumprod', 'tail',
-            'resample', 'cummin', 'fillna', 'cumsum'])
+            'resample', 'cummin', 'fillna', 'cumsum', 'cumcount'])
         self.assertEqual(results, expected)
 
 def assert_fp_equal(a, b):