PERF: DataFrame.groupby.nunique now computes per-column SeriesGroupBy.nunique and concatenates, replacing the nested apply

jreback · jreback · commit 6d02616548d8 · 2017-01-23T17:10:18.000-05:00
closes #15197
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -121,7 +121,7 @@ Other enhancements
 - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
 
 - ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`).
-- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`).
+- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`, :issue:`15197`).
 
 - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
 - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -34,7 +34,9 @@
 from pandas.types.cast import _possibly_downcast_to_dtype
 from pandas.types.missing import isnull, notnull, _maybe_fill
 
-from pandas.core.common import _values_from_object, AbstractMethodError
+from pandas.core.common import (_values_from_object, AbstractMethodError,
+                                _default_index)
+
 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
                               DataError, SpecificationError)
 from pandas.core.categorical import Categorical
@@ -4042,7 +4044,24 @@ def nunique(self, dropna=True):
         4   ham       5      x
         5   ham       5      y
         """
-        return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna))
+
+        obj = self._selected_obj
+
+        def groupby_series(obj, col=None):
+            return SeriesGroupBy(obj,
+                                 selection=col,
+                                 grouper=self.grouper).nunique(dropna=dropna)
+
+        if isinstance(obj, Series):
+            results = groupby_series(obj)
+        else:
+            from pandas.tools.merge import concat
+            results = [groupby_series(obj[col], col) for col in obj.columns]
+            results = concat(results, axis=1)
+
+        if not self.as_index:
+            results.index = _default_index(len(results))
+        return results
 
 
 from pandas.tools.plotting import boxplot_frame_groupby  # noqa
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2,6 +2,7 @@
 from __future__ import print_function
 import nose
 
+from string import ascii_lowercase
 from datetime import datetime
 from numpy import nan
 
@@ -1807,22 +1808,22 @@ def test_groupby_as_index_agg(self):
             assert_frame_equal(left, right)
 
     def test_series_groupby_nunique(self):
-        from itertools import product
-        from string import ascii_lowercase
 
-        def check_nunique(df, keys):
-            for sort, dropna in product((False, True), repeat=2):
-                gr = df.groupby(keys, sort=sort)
+        def check_nunique(df, keys, as_index=True):
+            for sort, dropna in cart_product((False, True), repeat=2):
+                gr = df.groupby(keys, as_index=as_index, sort=sort)
                 left = gr['julie'].nunique(dropna=dropna)
 
-                gr = df.groupby(keys, sort=sort)
+                gr = df.groupby(keys, as_index=as_index, sort=sort)
                 right = gr['julie'].apply(Series.nunique, dropna=dropna)
+                if not as_index:
+                    right = right.reset_index(drop=True)
 
-                assert_series_equal(left, right)
+                assert_series_equal(left, right, check_names=False)
 
         days = date_range('2015-08-23', periods=10)
 
-        for n, m in product(10 ** np.arange(2, 6), (10, 100, 1000)):
+        for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)):
             frame = DataFrame({
                 'jim': np.random.choice(
                     list(ascii_lowercase), n),
@@ -1841,6 +1842,8 @@ def check_nunique(df, keys):
 
             check_nunique(frame, ['jim'])
             check_nunique(frame, ['jim', 'joe'])
+            check_nunique(frame, ['jim'], as_index=False)
+            check_nunique(frame, ['jim', 'joe'], as_index=False)
 
     def test_series_groupby_value_counts(self):
         from itertools import product