API: add DataFrame.nunique() and DataFrameGroupBy.nunique()

xflr6 · jreback · commit a1b6587153eb · 2017-01-23T08:40:38.000-05:00
closes pandas-dev#14336

Author: Sebastian Bank <sebastian.bank@uni-leipzig.de>

Closes pandas-dev#14376 from xflr6/nunique and squashes the following commits:

a0558e7 [Sebastian Bank] use apply()-kwargs instead of partial, more tests, better examples
c8d3ac4 [Sebastian Bank] extend docs and tests
fd0f22d [Sebastian Bank] add simple benchmarks
5c4b325 [Sebastian Bank] API: add DataFrame.nunique() and DataFrameGroupBy.nunique()
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -433,6 +433,20 @@ def time_frame_from_records_generator_nrows(self):
 
 
 
+#-----------------------------------------------------------------------------
+# nunique
+
+class frame_nunique(object):
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+
+    def time_frame_nunique(self):
+        self.df.nunique()
+
+
+
 #-----------------------------------------------------------------------------
 # duplicated
 
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -251,6 +251,22 @@ def time_groupby_int_count(self):
         self.df.groupby(['key1', 'key2']).count()
 
 
+#----------------------------------------------------------------------
+# nunique() speed
+
+class groupby_nunique(object):
+
+    def setup(self):
+        self.n = 10000
+        self.df = DataFrame({'key1': randint(0, 500, size=self.n),
+                             'key2': randint(0, 100, size=self.n),
+                             'ints': randint(0, 1000, size=self.n),
+                             'ints2': randint(0, 1000, size=self.n), })
+
+    def time_groupby_nunique(self):
+        self.df.groupby(['key1', 'key2']).nunique()
+
+
 #----------------------------------------------------------------------
 # group with different functions per column
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -119,6 +119,9 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
 
+- ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`).
+- The object returned by ``DataFrame.groupby()`` has gained a ``nunique()`` method to count the distinct values in each column within each group (:issue:`14336`).
+
 - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
 - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4976,6 +4976,37 @@ def f(x):
 
         return Series(result, index=labels)
 
+    def nunique(self, axis=0, dropna=True):
+        """
+        Return Series with number of distinct observations over requested
+        axis.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+        dropna : boolean, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        nunique : Series
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
+        >>> df.nunique()
+        A    3
+        B    1
+
+        >>> df.nunique(axis=1)
+        0    1
+        1    2
+        2    2
+        """
+        return self.apply(Series.nunique, axis=axis, dropna=dropna)
+
     def idxmin(self, axis=0, skipna=True):
         """
         Return index of first occurrence of minimum over requested axis.
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -3996,6 +3996,54 @@ def count(self):
 
         return self._wrap_agged_blocks(data.items, list(blk))
 
+    def nunique(self, dropna=True):
+        """
+        Return DataFrame with number of distinct observations per group for
+        each column.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        dropna : boolean, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        nunique : DataFrame
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
+        ...                           'ham', 'ham'],
+        ...                    'value1': [1, 5, 5, 2, 5, 5],
+        ...                    'value2': list('abbaxy')})
+        >>> df
+             id  value1 value2
+        0  spam       1      a
+        1   egg       5      b
+        2   egg       5      b
+        3  spam       2      a
+        4   ham       5      x
+        5   ham       5      y
+
+        >>> df.groupby('id').nunique()
+              id  value1  value2
+        id
+        egg    1       1       1
+        ham    1       1       2
+        spam   1       2       1
+
+        # check for rows with the same id but conflicting values
+        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
+             id  value1 value2
+        0  spam       1      a
+        3  spam       2      a
+        4   ham       5      x
+        5   ham       5      y
+        """
+        return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna))
+
 
 from pandas.tools.plotting import boxplot_frame_groupby  # noqa
 DataFrameGroupBy.boxplot = boxplot_frame_groupby
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -16,6 +16,7 @@
                     MultiIndex, date_range, Timestamp)
 import pandas as pd
 import pandas.core.nanops as nanops
+import pandas.core.algorithms as algorithms
 import pandas.formats.printing as printing
 
 import pandas.util.testing as tm
@@ -411,6 +412,21 @@ def test_count(self):
         expected = Series(0, index=[])
         tm.assert_series_equal(result, expected)
 
+    def test_nunique(self):
+        f = lambda s: len(algorithms.unique1d(s.dropna()))
+        self._check_stat_op('nunique', f, has_skipna=False,
+                            check_dtype=False, check_dates=True)
+
+        df = DataFrame({'A': [1, 1, 1],
+                        'B': [1, 2, 3],
+                        'C': [1, np.nan, 3]})
+        tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2}))
+        tm.assert_series_equal(df.nunique(dropna=False),
+                               Series({'A': 1, 'B': 3, 'C': 3}))
+        tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
+        tm.assert_series_equal(df.nunique(axis=1, dropna=False),
+                               Series({0: 1, 1: 3, 2: 2}))
+
     def test_sum(self):
         self._check_stat_op('sum', np.sum, has_numeric_only=True)
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2938,6 +2938,34 @@ def test_count_cross_type(self):  # GH8169
             result = df.groupby(['c', 'd']).count()
             tm.assert_frame_equal(result, expected)
 
+    def test_nunique(self):
+        df = DataFrame({
+            'A': list('abbacc'),
+            'B': list('abxacc'),
+            'C': list('abbacx'),
+        })
+
+        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
+        result = df.groupby('A', as_index=False).nunique()
+        tm.assert_frame_equal(result, expected)
+
+        # as_index
+        expected.index = list('abc')
+        expected.index.name = 'A'
+        result = df.groupby('A').nunique()
+        tm.assert_frame_equal(result, expected)
+
+        # with na
+        result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
+        tm.assert_frame_equal(result, expected)
+
+        # dropna
+        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
+                             index=list('abc'))
+        expected.index.name = 'A'
+        result = df.replace({'x': None}).groupby('A').nunique()
+        tm.assert_frame_equal(result, expected)
+
     def test_non_cython_api(self):
 
         # GH5610
@@ -5281,11 +5309,11 @@ def test_tab_completion(self):
              'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
              'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',
              'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
-             'head', 'irow', 'describe', 'cummax', 'quantile', 'rank',
-             'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum',
-             'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take',
-             'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov',
-             'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
+             'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile',
+             'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
+             'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill',
+             'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
+             'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
              'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
         self.assertEqual(results, expected)