Skip to content

Commit 053935b

Browse files
committed
ENH: add .ngroup() method to groupby objects (#14026)
1 parent ef487d9 commit 053935b

File tree

7 files changed

+322
-60
lines changed

7 files changed

+322
-60
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,7 @@ Computations / Descriptive Stats
17051705
GroupBy.mean
17061706
GroupBy.median
17071707
GroupBy.min
1708+
GroupBy.ngroup
17081709
GroupBy.nth
17091710
GroupBy.ohlc
17101711
GroupBy.prod

doc/source/groupby.rst

+46-4
Original file line numberDiff line numberDiff line change
@@ -1122,12 +1122,34 @@ To see the order in which each row appears within its group, use the
11221122

11231123
.. ipython:: python
11241124
1125-
df = pd.DataFrame(list('aaabba'), columns=['A'])
1126-
df
1125+
dfg = pd.DataFrame(list('aaabba'), columns=['A'])
1126+
dfg
1127+
1128+
dfg.groupby('A').cumcount()
1129+
1130+
dfg.groupby('A').cumcount(ascending=False)
1131+
1132+
Enumerate groups
1133+
~~~~~~~~~~~~~~~~
1134+
1135+
.. versionadded:: 0.20.2
1136+
1137+
To see the ordering of the groups (as opposed to the order of rows
1138+
within a group given by ``cumcount``) you can use the ``ngroup``
1139+
method.
1140+
1141+
Note that the numbers given to the groups match the order in which the
1142+
groups would be seen when iterating over the groupby object, not the
1143+
order in which they are first observed.
1144+
1145+
.. ipython:: python
11271146
1128-
df.groupby('A').cumcount()
1147+
dfg = pd.DataFrame(list('aaabba'), columns=['A'])
1148+
dfg
11291149
1130-
df.groupby('A').cumcount(ascending=False) # kwarg only
1150+
dfg.groupby('A').ngroup()
1151+
1152+
dfg.groupby('A').ngroup(ascending=False)
11311153
11321154
Plotting
11331155
~~~~~~~~
@@ -1176,6 +1198,26 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
11761198
df
11771199
df.groupby(df.sum(), axis=1).sum()
11781200
1201+
Multi-column factorization
1202+
~~~~~~~~~~~~~~~~~~~~~~~~~~
1203+
1204+
By using ``.ngroup()``, we can extract information about the groups in a
1205+
way similar to ``pd.factorize()``, but which applies naturally to multiple
1206+
columns of mixed type and different sources. This can be useful as an
1207+
intermediate categorical-like step in processing, when the relationships
1208+
between the group rows are more important than their content, or as input
1209+
to an algorithm which only accepts the integer encoding.
1210+
1211+
.. ipython:: python
1212+
1213+
dfg = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})
1214+
1215+
dfg
1216+
1217+
dfg.groupby(["A", "B"]).ngroup()
1218+
1219+
dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
1220+
11791221
Groupby by Indexer to 'resample' data
11801222
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11811223

doc/source/whatsnew/v0.20.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Enhancements
2121

2222
- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
2323
- ``Series`` provides a ``to_latex`` method (:issue:`16180`)
24+
- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`, parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`, has been added to return the group order (:issue:`11642`).
2425

2526
.. _whatsnew_0202.performance:
2627

pandas/core/groupby.py

+74-1
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@
150150
'last', 'first',
151151
'head', 'tail', 'median',
152152
'mean', 'sum', 'min', 'max',
153-
'cumcount',
153+
'cumcount', 'ngroup',
154154
'resample',
155155
'rank', 'quantile',
156156
'fillna',
@@ -1437,6 +1437,75 @@ def nth(self, n, dropna=None):
14371437

14381438
return result
14391439

1440+
@Substitution(name='groupby')
1441+
@Appender(_doc_template)
1442+
def ngroup(self, ascending=True):
1443+
"""
1444+
Number each group from 0 to the number of groups - 1.
1445+
1446+
This is the enumerative complement of cumcount. Note that the
1447+
numbers given to the groups match the order in which the groups
1448+
would be seen when iterating over the groupby object, not the
1449+
order in which they are first observed.
1450+
1451+
.. versionadded:: 0.20.2
1452+
1453+
Parameters
1454+
----------
1455+
ascending : bool, default True
1456+
If False, number in reverse, from number of groups - 1 to 0.
1457+
1458+
Examples
1459+
--------
1460+
1461+
>>> df = pd.DataFrame({"A": list("aaabba")})
1462+
>>> df
1463+
A
1464+
0 a
1465+
1 a
1466+
2 a
1467+
3 b
1468+
4 b
1469+
5 a
1470+
>>> df.groupby('A').ngroup()
1471+
0 0
1472+
1 0
1473+
2 0
1474+
3 1
1475+
4 1
1476+
5 0
1477+
dtype: int64
1478+
>>> df.groupby('A').ngroup(ascending=False)
1479+
0 1
1480+
1 1
1481+
2 1
1482+
3 0
1483+
4 0
1484+
5 1
1485+
dtype: int64
1486+
>>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
1487+
0 0
1488+
1 0
1489+
2 1
1490+
3 3
1491+
4 2
1492+
5 0
1493+
dtype: int64
1494+
1495+
See also
1496+
--------
1497+
.cumcount : Number the rows in each group.
1498+
1499+
"""
1500+
1501+
self._set_group_selection()
1502+
1503+
index = self._selected_obj.index
1504+
result = Series(self.grouper.group_info[0], index)
1505+
if not ascending:
1506+
result = self.ngroups - 1 - result
1507+
return result
1508+
14401509
@Substitution(name='groupby')
14411510
@Appender(_doc_template)
14421511
def cumcount(self, ascending=True):
@@ -1481,6 +1550,10 @@ def cumcount(self, ascending=True):
14811550
4 0
14821551
5 0
14831552
dtype: int64
1553+
1554+
See also
1555+
--------
1556+
.ngroup : Number the groups themselves.
14841557
"""
14851558

14861559
self._set_group_selection()

pandas/tests/groupby/test_counting.py

+197
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
# -*- coding: utf-8 -*-
2+
from __future__ import print_function
3+
4+
import numpy as np
5+
6+
from pandas import (DataFrame, Series, MultiIndex)
7+
from pandas.util.testing import assert_series_equal
8+
from pandas.compat import (range, product as cart_product)
9+
10+
11+
class TestCounting(object):
12+
13+
def test_cumcount(self):
14+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
15+
g = df.groupby('A')
16+
sg = g.A
17+
18+
expected = Series([0, 1, 2, 0, 3])
19+
20+
assert_series_equal(expected, g.cumcount())
21+
assert_series_equal(expected, sg.cumcount())
22+
23+
def test_cumcount_empty(self):
24+
ge = DataFrame().groupby(level=0)
25+
se = Series().groupby(level=0)
26+
27+
# edge case, as this is usually considered float
28+
e = Series(dtype='int64')
29+
30+
assert_series_equal(e, ge.cumcount())
31+
assert_series_equal(e, se.cumcount())
32+
33+
def test_cumcount_dupe_index(self):
34+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
35+
index=[0] * 5)
36+
g = df.groupby('A')
37+
sg = g.A
38+
39+
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
40+
41+
assert_series_equal(expected, g.cumcount())
42+
assert_series_equal(expected, sg.cumcount())
43+
44+
def test_cumcount_mi(self):
45+
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
46+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
47+
index=mi)
48+
g = df.groupby('A')
49+
sg = g.A
50+
51+
expected = Series([0, 1, 2, 0, 3], index=mi)
52+
53+
assert_series_equal(expected, g.cumcount())
54+
assert_series_equal(expected, sg.cumcount())
55+
56+
def test_cumcount_groupby_not_col(self):
57+
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'],
58+
index=[0] * 5)
59+
g = df.groupby([0, 0, 0, 1, 0])
60+
sg = g.A
61+
62+
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
63+
64+
assert_series_equal(expected, g.cumcount())
65+
assert_series_equal(expected, sg.cumcount())
66+
67+
def test_ngroup(self):
68+
df = DataFrame({'A': list('aaaba')})
69+
g = df.groupby('A')
70+
sg = g.A
71+
72+
expected = Series([0, 0, 0, 1, 0])
73+
74+
assert_series_equal(expected, g.ngroup())
75+
assert_series_equal(expected, sg.ngroup())
76+
77+
def test_ngroup_distinct(self):
78+
df = DataFrame({'A': list('abcde')})
79+
g = df.groupby('A')
80+
sg = g.A
81+
82+
expected = Series(range(5), dtype='int64')
83+
84+
assert_series_equal(expected, g.ngroup())
85+
assert_series_equal(expected, sg.ngroup())
86+
87+
def test_ngroup_one_group(self):
88+
df = DataFrame({'A': [0] * 5})
89+
g = df.groupby('A')
90+
sg = g.A
91+
92+
expected = Series([0] * 5)
93+
94+
assert_series_equal(expected, g.ngroup())
95+
assert_series_equal(expected, sg.ngroup())
96+
97+
def test_ngroup_empty(self):
98+
ge = DataFrame().groupby(level=0)
99+
se = Series().groupby(level=0)
100+
101+
# edge case, as this is usually considered float
102+
e = Series(dtype='int64')
103+
104+
assert_series_equal(e, ge.ngroup())
105+
assert_series_equal(e, se.ngroup())
106+
107+
def test_ngroup_series_matches_frame(self):
108+
df = DataFrame({'A': list('aaaba')})
109+
s = Series(list('aaaba'))
110+
111+
assert_series_equal(df.groupby(s).ngroup(),
112+
s.groupby(s).ngroup())
113+
114+
def test_ngroup_dupe_index(self):
115+
df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
116+
g = df.groupby('A')
117+
sg = g.A
118+
119+
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
120+
121+
assert_series_equal(expected, g.ngroup())
122+
assert_series_equal(expected, sg.ngroup())
123+
124+
def test_ngroup_mi(self):
125+
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
126+
df = DataFrame({'A': list('aaaba')}, index=mi)
127+
g = df.groupby('A')
128+
sg = g.A
129+
expected = Series([0, 0, 0, 1, 0], index=mi)
130+
131+
assert_series_equal(expected, g.ngroup())
132+
assert_series_equal(expected, sg.ngroup())
133+
134+
def test_ngroup_groupby_not_col(self):
135+
df = DataFrame({'A': list('aaaba')}, index=[0] * 5)
136+
g = df.groupby([0, 0, 0, 1, 0])
137+
sg = g.A
138+
139+
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
140+
141+
assert_series_equal(expected, g.ngroup())
142+
assert_series_equal(expected, sg.ngroup())
143+
144+
def test_ngroup_descending(self):
145+
df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A'])
146+
g = df.groupby(['A'])
147+
148+
ascending = Series([0, 0, 1, 0, 1])
149+
descending = Series([1, 1, 0, 1, 0])
150+
151+
assert_series_equal(descending, (g.ngroups - 1) - ascending)
152+
assert_series_equal(ascending, g.ngroup(ascending=True))
153+
assert_series_equal(descending, g.ngroup(ascending=False))
154+
155+
def test_ngroup_matches_cumcount(self):
156+
# verify one manually-worked out case works
157+
df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'],
158+
['a', 'x'], ['b', 'y']], columns=['A', 'X'])
159+
g = df.groupby(['A', 'X'])
160+
g_ngroup = g.ngroup()
161+
g_cumcount = g.cumcount()
162+
expected_ngroup = Series([0, 1, 2, 0, 3])
163+
expected_cumcount = Series([0, 0, 0, 1, 0])
164+
165+
assert_series_equal(g_ngroup, expected_ngroup)
166+
assert_series_equal(g_cumcount, expected_cumcount)
167+
168+
def test_ngroup_cumcount_pair(self):
169+
# brute force comparison for all small series
170+
for p in cart_product(range(3), repeat=4):
171+
df = DataFrame({'a': p})
172+
g = df.groupby(['a'])
173+
174+
order = sorted(set(p))
175+
ngroupd = [order.index(val) for val in p]
176+
cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
177+
178+
assert_series_equal(g.ngroup(), Series(ngroupd))
179+
assert_series_equal(g.cumcount(), Series(cumcounted))
180+
181+
def test_ngroup_respects_groupby_order(self):
182+
np.random.seed(0)
183+
df = DataFrame({'a': np.random.choice(list('abcdef'), 100)})
184+
for sort_flag in (False, True):
185+
g = df.groupby(['a'], sort=sort_flag)
186+
df['group_id'] = -1
187+
df['group_index'] = -1
188+
189+
for i, (_, group) in enumerate(g):
190+
df.loc[group.index, 'group_id'] = i
191+
for j, ind in enumerate(group.index):
192+
df.loc[ind, 'group_index'] = j
193+
194+
assert_series_equal(Series(df['group_id'].values),
195+
g.ngroup())
196+
assert_series_equal(Series(df['group_index'].values),
197+
g.cumcount())

0 commit comments

Comments
 (0)