pandas-dev · jreback · Aug 24, 2015 · Aug 23, 2015
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -679,6 +679,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
 - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
+- Performance improvements in ``SeriesGroupBy.nunique`` (:issue:`10820`)
 
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -82,8 +82,7 @@
 
 _series_apply_whitelist = \
     (_common_apply_whitelist - set(['boxplot'])) | \
-    frozenset(['dtype', 'value_counts', 'unique', 'nunique',
-               'nlargest', 'nsmallest'])
+    frozenset(['dtype', 'value_counts', 'unique', 'nlargest', 'nsmallest'])
 
 _dataframe_apply_whitelist = \
     _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
@@ -2558,6 +2557,63 @@ def true_and_notnull(x, *args, **kwargs):
         filtered = self._apply_filter(indices, dropna)
         return filtered
 
+    def nunique(self, dropna=True):
+        """
+        Return the number of distinct values within each group.
+
+        Parameters
+        ----------
+        dropna : boolean, default True
+            If True, null values are not counted.  If False, the nulls
+            inside a group together count as one distinct value.
+
+        Returns
+        -------
+        Series
+            Distinct-value counts, indexed by
+            ``self.grouper.result_index`` and named ``self.name``.
+        """
+        ids, _, _ = self.grouper.group_info
+        val = self.obj.get_values()
+
+        if len(ids) == 0:
+            # no rows: ``ids[0]`` further down would raise IndexError
+            # NOTE(review): assumes result_index is empty here too -- confirm
+            return Series(np.array([], dtype=np.int_),
+                          index=self.grouper.result_index,
+                          name=self.name)
+
+        # order rows by group id, then by value within each group
+        sorter = np.lexsort((val, ids))
+        ids, val = ids[sorter], val[sorter]
+
+        # group boundaries are where group ids change
+        # unique observations are where sorted values change
+        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
+        inc = np.r_[1, val[1:] != val[:-1]]
+
+        # 1st item of each group is a new unique observation
+        mask = isnull(val)
+        if dropna:
+            # flag group starts first, then clear every null, so that a
+            # null sitting at a group start is not counted either
+            inc[idx] = 1
+            inc[mask] = 0
+        else:
+            # a null right after another null is not a new observation;
+            # group starts are flagged last so a null run crossing a group
+            # boundary is still counted once in the new group
+            inc[mask & np.r_[False, mask[:-1]]] = 0
+            inc[idx] = 1
+
+        # sum the new-observation flags over each group's slice
+        out = np.add.reduceat(inc, idx)
+        # NOTE(review): a leading id of -1 presumably marks rows with a
+        # null group key, which has no entry in result_index -- confirm
+        return Series(out if ids[0] != -1 else out[1:],
+                      index=self.grouper.result_index,
+                      name=self.name)
+
     def _apply_to_column_groupbys(self, func):
         """ return a pass thru """
         return func(self)

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1617,6 +1617,40 @@ def test_groupby_as_index_agg(self):
 
             assert_frame_equal(left, right)
 
+    def test_series_groupby_nunique(self):
+        # SeriesGroupBy.nunique must match Series.nunique applied per group
+        from string import ascii_lowercase
+
+        def check_nunique(df, keys):
+            # compare both paths for every sort/dropna combination
+            for sort in (False, True):
+                for dropna in (False, True):
+                    col = df.groupby(keys, sort=sort)['julie']
+                    left = col.nunique(dropna=dropna)
+                    col = df.groupby(keys, sort=sort)['julie']
+                    right = col.apply(Series.nunique, dropna=dropna)
+                    assert_series_equal(left, right)
+
+        days = date_range('2015-08-23', periods=10)
+        key_sets = (['jim'], ['jim', 'joe'])
+
+        for n in 10**np.arange(2, 6):
+            for m in (10, 100, 1000):
+                jim = np.random.choice(list(ascii_lowercase), n)
+                joe = np.random.choice(days, n)
+                julie = np.random.randint(0, m, n)
+                frame = DataFrame({'jim': jim, 'joe': joe, 'julie': julie})
+                for keys in key_sets:
+                    check_nunique(frame, keys)
+
+                # null out some keys and values, then compare again
+                frame.loc[1::17, 'jim'] = None
+                frame.loc[3::37, 'joe'] = None
+                for start in (7, 8, 9):
+                    frame.loc[start::19, 'julie'] = None
+                for keys in key_sets:
+                    check_nunique(frame, keys)
+
     def test_mulitindex_passthru(self):
 
         # GH 7997
@@ -4913,7 +4947,7 @@ def test_groupby_whitelist(self):
             'corr', 'cov',
             'value_counts',
             'diff',
-            'unique', 'nunique',
+            'unique',
             'nlargest', 'nsmallest',
         ])