diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index d30b7875e44b7..26fcbdca28ce7 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -679,6 +679,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`) - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`) +- Performance improvements in ``SeriesGroupBy.nunique`` (:issue:`10820`) - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`) - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d23cb39c15548..220e67c43e4be 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -82,8 +82,7 @@ _series_apply_whitelist = \ (_common_apply_whitelist - set(['boxplot'])) | \ - frozenset(['dtype', 'value_counts', 'unique', 'nunique', - 'nlargest', 'nsmallest']) + frozenset(['dtype', 'value_counts', 'unique', 'nlargest', 'nsmallest']) _dataframe_apply_whitelist = \ _common_apply_whitelist | frozenset(['dtypes', 'corrwith']) @@ -2558,6 +2557,32 @@ def true_and_notnull(x, *args, **kwargs): filtered = self._apply_filter(indices, dropna) return filtered + def nunique(self, dropna=True): + ids, _, _ = self.grouper.group_info + val = self.obj.get_values() + + sorter = np.lexsort((val, ids)) + ids, val = ids[sorter], val[sorter] + + # group boundaries are where group ids change + # unique observations are where sorted values change + idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] + inc = np.r_[1, val[1:] != val[:-1]] + + # 1st item of each group is a new unique observation + mask = isnull(val) + if dropna: + inc[idx] = 1 + inc[mask] = 0 + else: + inc[mask & np.r_[False, mask[:-1]]] = 0 + inc[idx] = 1 + + out = np.add.reduceat(inc, idx) + return Series(out if ids[0] != -1 else out[1:], + index=self.grouper.result_index, 
name=self.name) + def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index fa2e6e911ab5e..afce4e682c0f9 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1617,6 +1617,40 @@ def test_groupby_as_index_agg(self): assert_frame_equal(left, right) + def test_series_groupby_nunique(self): + from itertools import product + from string import ascii_lowercase + + def check_nunique(df, keys): + for sort, dropna in product((False, True), repeat=2): + gr = df.groupby(keys, sort=sort) + left = gr['julie'].nunique(dropna=dropna) + + gr = df.groupby(keys, sort=sort) + right = gr['julie'].apply(Series.nunique, dropna=dropna) + + assert_series_equal(left, right) + + days = date_range('2015-08-23', periods=10) + + for n, m in product(10**np.arange(2, 6), (10, 100, 1000)): + frame = DataFrame({ + 'jim':np.random.choice(list(ascii_lowercase), n), + 'joe':np.random.choice(days, n), + 'julie':np.random.randint(0, m, n)}) + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + + frame.loc[1::17, 'jim'] = None + frame.loc[3::37, 'joe'] = None + frame.loc[7::19, 'julie'] = None + frame.loc[8::19, 'julie'] = None + frame.loc[9::19, 'julie'] = None + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + def test_mulitindex_passthru(self): # GH 7997 @@ -4913,7 +4947,7 @@ def test_groupby_whitelist(self): 'corr', 'cov', 'value_counts', 'diff', - 'unique', 'nunique', + 'unique', 'nlargest', 'nsmallest', ])