Skip to content

Commit 5c4b325

Browse files
committed
API: add DataFrame.nunique() and DataFrameGroupBy.nunique()
1 parent 3e3434b commit 5c4b325

File tree

5 files changed

+59
-5
lines changed

5 files changed

+59
-5
lines changed

doc/source/whatsnew/v0.20.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ Other enhancements
9797
^^^^^^^^^^^^^^^^^^
9898
- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
9999

100+
- ``DataFrame`` has gained a ``nunique()`` method as a short-cut for ``.apply(lambda x: x.nunique())`` (counting the distinct values over an axis) (:issue:`14336`).
101+
- ``DataFrame.groupby()`` has gained a ``nunique()`` method as a short-cut for ``.apply(lambda g: g.apply(lambda x: x.nunique()))`` (counting the distinct values for all columns within each group) (:issue:`14336`).
102+
100103
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
101104
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
102105

pandas/core/frame.py

+21
Original file line numberDiff line numberDiff line change
@@ -4969,6 +4969,27 @@ def f(x):
49694969

49704970
return Series(result, index=labels)
49714971

4972+
def nunique(self, axis=0, dropna=True):
    """
    Return Series with number of distinct observations over requested
    axis.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        0 or 'index' counts the distinct values within each column;
        1 or 'columns' counts the distinct values within each row.
    dropna : boolean, default True
        Don't include NaN in the counts.

    Returns
    -------
    nunique : Series

    Examples
    --------
    >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
    >>> df.nunique()
    A    3
    B    1
    dtype: int64
    >>> df.nunique(axis=1)
    0    1
    1    2
    2    2
    dtype: int64
    """
    # ``apply`` forwards extra keyword arguments to the function, so
    # ``dropna`` reaches ``Series.nunique`` without a functools.partial.
    return self.apply(Series.nunique, axis=axis, dropna=dropna)
4992+
49724993
def idxmin(self, axis=0, skipna=True):
49734994
"""
49744995
Return index of first occurrence of minimum over requested axis.

pandas/core/groupby.py

+15
Original file line numberDiff line numberDiff line change
@@ -3899,6 +3899,21 @@ def count(self):
38993899

39003900
return self._wrap_agged_blocks(data.items, list(blk))
39013901

3902+
def nunique(self, dropna=True):
    """
    Return DataFrame with number of distinct observations per group for
    each column.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    dropna : boolean, default True
        Don't include NaN in the counts.

    Returns
    -------
    nunique : DataFrame
    """
    # For every group's sub-frame, count distinct values column by column.
    # DataFrame.apply forwards ``dropna`` on to Series.nunique, so no
    # functools.partial (and no function-local import) is needed.
    return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna))
3916+
39023917

39033918
from pandas.tools.plotting import boxplot_frame_groupby # noqa
39043919
DataFrameGroupBy.boxplot = boxplot_frame_groupby

pandas/tests/frame/test_analytics.py

+5
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,11 @@ def test_count(self):
410410
expected = Series(0, index=[])
411411
tm.assert_series_equal(result, expected)
412412

413+
def test_nunique(self):
    # Reference implementation the stat-op checker compares against:
    # the number of distinct values left after dropping missing entries.
    def count_distinct(values):
        return len(nanops.unique1d(values.dropna()))

    self._check_stat_op('nunique', count_distinct, has_skipna=False,
                        check_dtype=False, check_dates=True)
417+
413418
def test_sum(self):
414419
self._check_stat_op('sum', np.sum, has_numeric_only=True)
415420

pandas/tests/groupby/test_groupby.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -2800,6 +2800,16 @@ def test_count_cross_type(self): # GH8169
28002800
result = df.groupby(['c', 'd']).count()
28012801
tm.assert_frame_equal(result, expected)
28022802

2803+
def test_nunique(self):
    # Group on 'A' and count the distinct values of every column
    # inside each group; as_index=False keeps a default integer index.
    frame = DataFrame({
        'A': list('abbacc'),
        'B': list('abxacc'),
        'C': list('abbacx'),
    })
    grouped = frame.groupby('A', as_index=False)
    result = grouped.nunique()
    expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
    tm.assert_frame_equal(result, expected)
2812+
28032813
def test_non_cython_api(self):
28042814

28052815
# GH5610
@@ -5150,11 +5160,11 @@ def test_tab_completion(self):
51505160
'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
51515161
'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',
51525162
'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
5153-
'head', 'irow', 'describe', 'cummax', 'quantile', 'rank',
5154-
'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum',
5155-
'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take',
5156-
'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov',
5157-
'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
5163+
'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile',
5164+
'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
5165+
'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill',
5166+
'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
5167+
'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
51585168
'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
51595169
self.assertEqual(results, expected)
51605170

0 commit comments

Comments
 (0)