Skip to content

Commit c70882a

Browse files
committed
Merge pull request #5510 from hayd/groupby_cumcount
ENH add cumcount groupby method
2 parents 0c30665 + b564798 commit c70882a

File tree

4 files changed

+109
-3
lines changed

4 files changed

+109
-3
lines changed

doc/source/groupby.rst

+13
Original file line numberDiff line numberDiff line change
@@ -705,3 +705,16 @@ can be used as group keys. If so, the order of the levels will be preserved:
705705
factor = qcut(data, [0, .25, .5, .75, 1.])
706706
707707
data.groupby(factor).mean()
708+
709+
Enumerate group items
710+
~~~~~~~~~~~~~~~~~~~~~
711+
712+
To see the order in which each row appears within its group, use the
713+
``cumcount`` method:
714+
715+
.. ipython:: python
716+
717+
df = pd.DataFrame(list('aaabba'), columns=['A'])
718+
df
719+
720+
df.groupby('A').cumcount()

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ New features
6464
- ``to_csv()`` now outputs datetime objects according to a specified format
6565
string via the ``date_format`` keyword (:issue:`4313`)
6666
- Added ``LastWeekOfMonth`` DateOffset (:issue:`4637`)
67+
- Added ``cumcount`` groupby method (:issue:`4646`)
6768
- Added ``FY5253``, and ``FY5253Quarter`` DateOffsets (:issue:`4511`)
6869
- Added ``mode()`` method to ``Series`` and ``DataFrame`` to get the
6970
statistical mode(s) of a column/series. (:issue:`5367`)

pandas/core/groupby.py

+43-2
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,7 @@ def ohlc(self):
468468
Compute sum of values, excluding missing values
469469
470470
For multiple groupings, the result index will be a MultiIndex
471+
471472
"""
472473
return self._cython_agg_general('ohlc')
473474

@@ -480,9 +481,49 @@ def picker(arr):
480481
return np.nan
481482
return self.agg(picker)
482483

484+
def cumcount(self):
    """
    Number each item in each group from 0 to the length of that group - 1.

    Essentially this is equivalent to

    >>> self.apply(lambda x: Series(np.arange(len(x)), x.index))

    Returns
    -------
    Series
        int64 counters carrying the same index as the grouped object
        (``self.obj.index``); the result has one entry per row.

    Example
    -------

    >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
    ...                   columns=['A'])
    >>> df
       A
    0  a
    1  a
    2  a
    3  b
    4  b
    5  a
    >>> df.groupby('A').cumcount()
    0    0
    1    1
    2    2
    3    0
    4    1
    5    3
    dtype: int64

    """
    index = self.obj.index
    # Rows not listed in any group keep the initial 0.
    cumcounts = np.zeros(len(index), dtype='int64')
    # Each value of ``self.indices`` is used to index positionally into
    # ``cumcounts``, so the k-th listed row of a group receives counter k.
    for positions in self.indices.values():
        cumcounts[positions] = np.arange(len(positions), dtype='int64')
    return Series(cumcounts, index)
519+
520+
483521
def _try_cast(self, result, obj):
484-
""" try to cast the result to our obj original type,
485-
we may have roundtripped thru object in the mean-time """
522+
"""
523+
try to cast the result to our obj original type,
524+
we may have roundtripped thru object in the mean-time
525+
526+
"""
486527
if obj.ndim > 1:
487528
dtype = obj.values.dtype
488529
else:

pandas/tests/test_groupby.py

+52-1
Original file line numberDiff line numberDiff line change
@@ -2560,6 +2560,57 @@ def test_groupby_with_empty(self):
25602560
grouped = series.groupby(grouper)
25612561
assert next(iter(grouped), None) is None
25622562

2563+
def test_cumcount(self):
2564+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
2565+
g = df.groupby('A')
2566+
sg = g.A
2567+
2568+
expected = Series([0, 1, 2, 0, 3])
2569+
2570+
assert_series_equal(expected, g.cumcount())
2571+
assert_series_equal(expected, sg.cumcount())
2572+
2573+
def test_cumcount_empty(self):
2574+
ge = DataFrame().groupby()
2575+
se = Series().groupby()
2576+
2577+
e = Series(dtype='int') # edge case, as this is usually considered float
2578+
2579+
assert_series_equal(e, ge.cumcount())
2580+
assert_series_equal(e, se.cumcount())
2581+
2582+
def test_cumcount_dupe_index(self):
2583+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5)
2584+
g = df.groupby('A')
2585+
sg = g.A
2586+
2587+
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
2588+
2589+
assert_series_equal(expected, g.cumcount())
2590+
assert_series_equal(expected, sg.cumcount())
2591+
2592+
def test_cumcount_mi(self):
2593+
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
2594+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=mi)
2595+
g = df.groupby('A')
2596+
sg = g.A
2597+
2598+
expected = Series([0, 1, 2, 0, 3], index=mi)
2599+
2600+
assert_series_equal(expected, g.cumcount())
2601+
assert_series_equal(expected, sg.cumcount())
2602+
2603+
def test_cumcount_groupby_not_col(self):
2604+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], index=[0] * 5)
2605+
g = df.groupby([0, 0, 0, 1, 0])
2606+
sg = g.A
2607+
2608+
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
2609+
2610+
assert_series_equal(expected, g.cumcount())
2611+
assert_series_equal(expected, sg.cumcount())
2612+
2613+
25632614
def test_filter_series(self):
25642615
import pandas as pd
25652616
s = pd.Series([1, 3, 20, 5, 22, 24, 7])
@@ -3180,7 +3231,7 @@ def test_tab_completion(self):
31803231
'min','name','ngroups','nth','ohlc','plot', 'prod',
31813232
'size','std','sum','transform','var', 'count', 'head', 'describe',
31823233
'cummax', 'dtype', 'quantile', 'rank', 'cumprod', 'tail',
3183-
'resample', 'cummin', 'fillna', 'cumsum'])
3234+
'resample', 'cummin', 'fillna', 'cumsum', 'cumcount'])
31843235
self.assertEqual(results, expected)
31853236

31863237
def assert_fp_equal(a, b):

0 commit comments

Comments
 (0)