Skip to content

Commit 6d02616

Browse files
committed
PERF: DataFrame.groupby.nunique
closes #15197
1 parent be3f2ae commit 6d02616

File tree

3 files changed

+33
-11
lines changed

3 files changed

+33
-11
lines changed

doc/source/whatsnew/v0.20.0.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,7 @@ Other enhancements
121121
- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
122122

123123
- ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`).
124-
- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`).
124+
- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`).
125125

126126
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
127127
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)

pandas/core/groupby.py

+21 -2
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,9 @@
3434
from pandas.types.cast import _possibly_downcast_to_dtype
3535
from pandas.types.missing import isnull, notnull, _maybe_fill
3636

37-
from pandas.core.common import _values_from_object, AbstractMethodError
37+
from pandas.core.common import (_values_from_object, AbstractMethodError,
38+
_default_index)
39+
3840
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
3941
DataError, SpecificationError)
4042
from pandas.core.categorical import Categorical
@@ -4042,7 +4044,24 @@ def nunique(self, dropna=True):
40424044
4 ham 5 x
40434045
5 ham 5 y
40444046
"""
4045-
return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna))
4047+
4048+
obj = self._selected_obj
4049+
4050+
def groupby_series(obj, col=None):
4051+
return SeriesGroupBy(obj,
4052+
selection=col,
4053+
grouper=self.grouper).nunique(dropna=dropna)
4054+
4055+
if isinstance(obj, Series):
4056+
results = groupby_series(obj)
4057+
else:
4058+
from pandas.tools.merge import concat
4059+
results = [groupby_series(obj[col], col) for col in obj.columns]
4060+
results = concat(results, axis=1)
4061+
4062+
if not self.as_index:
4063+
results.index = _default_index(len(results))
4064+
return results
40464065

40474066

40484067
from pandas.tools.plotting import boxplot_frame_groupby # noqa

pandas/tests/groupby/test_groupby.py

+11 -8
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from __future__ import print_function
33
import nose
44

5+
from string import ascii_lowercase
56
from datetime import datetime
67
from numpy import nan
78

@@ -1807,22 +1808,22 @@ def test_groupby_as_index_agg(self):
18071808
assert_frame_equal(left, right)
18081809

18091810
def test_series_groupby_nunique(self):
1810-
from itertools import product
1811-
from string import ascii_lowercase
18121811

1813-
def check_nunique(df, keys):
1814-
for sort, dropna in product((False, True), repeat=2):
1815-
gr = df.groupby(keys, sort=sort)
1812+
def check_nunique(df, keys, as_index=True):
1813+
for sort, dropna in cart_product((False, True), repeat=2):
1814+
gr = df.groupby(keys, as_index=as_index, sort=sort)
18161815
left = gr['julie'].nunique(dropna=dropna)
18171816

1818-
gr = df.groupby(keys, sort=sort)
1817+
gr = df.groupby(keys, as_index=as_index, sort=sort)
18191818
right = gr['julie'].apply(Series.nunique, dropna=dropna)
1819+
if not as_index:
1820+
right = right.reset_index(drop=True)
18201821

1821-
assert_series_equal(left, right)
1822+
assert_series_equal(left, right, check_names=False)
18221823

18231824
days = date_range('2015-08-23', periods=10)
18241825

1825-
for n, m in product(10 ** np.arange(2, 6), (10, 100, 1000)):
1826+
for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)):
18261827
frame = DataFrame({
18271828
'jim': np.random.choice(
18281829
list(ascii_lowercase), n),
@@ -1841,6 +1842,8 @@ def check_nunique(df, keys):
18411842

18421843
check_nunique(frame, ['jim'])
18431844
check_nunique(frame, ['jim', 'joe'])
1845+
check_nunique(frame, ['jim'], as_index=False)
1846+
check_nunique(frame, ['jim', 'joe'], as_index=False)
18441847

18451848
def test_series_groupby_value_counts(self):
18461849
from itertools import product

0 commit comments

Comments
 (0)