Skip to content

Commit a1b6587

Browse files
xflr6 authored and jreback committed
API: add DataFrame.nunique() and DataFrameGroupBy.nunique()
closes pandas-dev#14336

Author: Sebastian Bank <[email protected]>

Closes pandas-dev#14376 from xflr6/nunique and squashes the following commits:

a0558e7 [Sebastian Bank] use apply()-kwargs instead of partial, more tests, better examples
c8d3ac4 [Sebastian Bank] extend docs and tests
fd0f22d [Sebastian Bank] add simple benchmarks
5c4b325 [Sebastian Bank] API: add DataFrame.nunique() and DataFrameGroupBy.nunique()
1 parent a62fdf8 commit a1b6587

File tree

7 files changed

+161
-5
lines changed

7 files changed

+161
-5
lines changed

asv_bench/benchmarks/frame_methods.py

+14
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,20 @@ def time_frame_from_records_generator_nrows(self):
433433

434434

435435

436+
#-----------------------------------------------------------------------------
437+
# nunique
438+
439+
class frame_nunique(object):
    """Benchmark ``DataFrame.nunique()`` on a 10000 x 1000 frame of random
    normal values."""

    def setup(self):
        # Keep the raw array on the instance as well as the frame built
        # from it.
        n_rows, n_cols = 10000, 1000
        self.data = np.random.randn(n_rows, n_cols)
        self.df = DataFrame(self.data)

    def time_frame_nunique(self):
        frame = self.df
        frame.nunique()
447+
448+
449+
436450
#-----------------------------------------------------------------------------
437451
# duplicated
438452

asv_bench/benchmarks/groupby.py

+16
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,22 @@ def time_groupby_int_count(self):
251251
self.df.groupby(['key1', 'key2']).count()
252252

253253

254+
#----------------------------------------------------------------------
255+
# nunique() speed
256+
257+
class groupby_nunique(object):
    """Benchmark ``DataFrameGroupBy.nunique()`` when grouping 10000 rows of
    random integers by two key columns."""

    def setup(self):
        self.n = 10000
        # (column name, exclusive upper bound) pairs; the columns are drawn
        # in this order.
        column_bounds = [('key1', 500), ('key2', 100),
                         ('ints', 1000), ('ints2', 1000)]
        self.df = DataFrame({name: randint(0, high, size=self.n)
                             for name, high in column_bounds})

    def time_groupby_nunique(self):
        grouped = self.df.groupby(['key1', 'key2'])
        grouped.nunique()
268+
269+
254270
#----------------------------------------------------------------------
255271
# group with different functions per column
256272

doc/source/whatsnew/v0.20.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ Other enhancements
119119
^^^^^^^^^^^^^^^^^^
120120
- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
121121

122+
- ``DataFrame`` has gained a ``nunique()`` method to count the distinct values over an axis (:issue:`14336`).
123+
- ``DataFrame.groupby()`` has gained a ``.nunique()`` method to count the distinct values for all columns within each group (:issue:`14336`).
124+
122125
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
123126
- Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
124127

pandas/core/frame.py

+31
Original file line numberDiff line numberDiff line change
@@ -4976,6 +4976,37 @@ def f(x):
49764976

49774977
return Series(result, index=labels)
49784978

4979+
def nunique(self, axis=0, dropna=True):
    """
    Count the distinct observations along the requested axis and return
    the counts as a Series.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis handed to ``apply``: 0 gives one count per column, 1 gives
        one count per row.
    dropna : boolean, default True
        Don't include NaN in the counts.

    Returns
    -------
    nunique : Series

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
    >>> df.nunique()
    A    3
    B    1
    dtype: int64

    >>> df.nunique(axis=1)
    0    1
    1    2
    2    2
    dtype: int64
    """
    # apply() forwards the extra keyword to Series.nunique for every
    # column (axis=0) or row (axis=1).
    count_kwargs = dict(dropna=dropna)
    return self.apply(Series.nunique, axis=axis, **count_kwargs)
5009+
49795010
def idxmin(self, axis=0, skipna=True):
49805011
"""
49815012
Return index of first occurrence of minimum over requested axis.

pandas/core/groupby.py

+48
Original file line numberDiff line numberDiff line change
@@ -3996,6 +3996,54 @@ def count(self):
39963996

39973997
return self._wrap_agged_blocks(data.items, list(blk))
39983998

3999+
def nunique(self, dropna=True):
    """
    Count the distinct observations of every column within each group and
    return the counts as a DataFrame.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    dropna : boolean, default True
        Don't include NaN in the counts.

    Returns
    -------
    nunique : DataFrame

    Examples
    --------
    >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
    ...                           'ham', 'ham'],
    ...                    'value1': [1, 5, 5, 2, 5, 5],
    ...                    'value2': list('abbaxy')})
    >>> df
         id  value1 value2
    0  spam       1      a
    1   egg       5      b
    2   egg       5      b
    3  spam       2      a
    4   ham       5      x
    5   ham       5      y

    >>> df.groupby('id').nunique()
          id  value1  value2
    id
    egg    1       1       1
    ham    1       1       2
    spam   1       2       1

    # check for rows with the same id but conflicting values
    >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
         id  value1 value2
    0  spam       1      a
    3  spam       2      a
    4   ham       5      x
    5   ham       5      y
    """
    def _count_per_column(group):
        # Each group arrives as a frame; count distinct values per column.
        return group.apply(Series.nunique, dropna=dropna)

    return self.apply(_count_per_column)
4046+
39994047

40004048
from pandas.tools.plotting import boxplot_frame_groupby # noqa
40014049
DataFrameGroupBy.boxplot = boxplot_frame_groupby

pandas/tests/frame/test_analytics.py

+16
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
MultiIndex, date_range, Timestamp)
1717
import pandas as pd
1818
import pandas.core.nanops as nanops
19+
import pandas.core.algorithms as algorithms
1920
import pandas.formats.printing as printing
2021

2122
import pandas.util.testing as tm
@@ -411,6 +412,21 @@ def test_count(self):
411412
expected = Series(0, index=[])
412413
tm.assert_series_equal(result, expected)
413414

415+
def test_nunique(self):
    """Check DataFrame.nunique against a unique1d-based reference and
    against hand-computed counts for both axes and both dropna settings."""

    def f(s):
        # Reference implementation: number of unique non-null values.
        return len(algorithms.unique1d(s.dropna()))

    self._check_stat_op('nunique', f, has_skipna=False,
                        check_dtype=False, check_dates=True)

    df = DataFrame({'A': [1, 1, 1],
                    'B': [1, 2, 3],
                    'C': [1, np.nan, 3]})

    # (keyword arguments for nunique, expected counts), in checking order.
    cases = [
        ({}, {'A': 1, 'B': 3, 'C': 2}),
        ({'dropna': False}, {'A': 1, 'B': 3, 'C': 3}),
        ({'axis': 1}, {0: 1, 1: 2, 2: 2}),
        ({'axis': 1, 'dropna': False}, {0: 1, 1: 3, 2: 2}),
    ]
    for kwargs, counts in cases:
        tm.assert_series_equal(df.nunique(**kwargs), Series(counts))
429+
414430
def test_sum(self):
415431
self._check_stat_op('sum', np.sum, has_numeric_only=True)
416432

pandas/tests/groupby/test_groupby.py

+33-5
Original file line numberDiff line numberDiff line change
@@ -2938,6 +2938,34 @@ def test_count_cross_type(self): # GH8169
29382938
result = df.groupby(['c', 'd']).count()
29392939
tm.assert_frame_equal(result, expected)
29402940

2941+
def test_nunique(self):
    """Check DataFrameGroupBy.nunique with and without ``as_index``, and
    with missing values under both ``dropna`` settings."""
    frame = DataFrame({
        'A': list('abbacc'),
        'B': list('abxacc'),
        'C': list('abbacx'),
    })

    # as_index=False: the result keeps a default integer index.
    counts = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
    tm.assert_frame_equal(
        frame.groupby('A', as_index=False).nunique(), counts)

    # as_index (default): the group keys become the index, named 'A'.
    counts.index = list('abc')
    counts.index.name = 'A'
    tm.assert_frame_equal(frame.groupby('A').nunique(), counts)

    # with na: 'x' is replaced by None; dropna=False is expected to give
    # the same counts as before.
    with_missing = frame.replace({'x': None})
    tm.assert_frame_equal(
        with_missing.groupby('A').nunique(dropna=False), counts)

    # dropna: with the default dropna=True every count is expected to be 1.
    all_ones = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                         index=list('abc'))
    all_ones.index.name = 'A'
    tm.assert_frame_equal(with_missing.groupby('A').nunique(), all_ones)
2968+
29412969
def test_non_cython_api(self):
29422970

29432971
# GH5610
@@ -5281,11 +5309,11 @@ def test_tab_completion(self):
52815309
'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
52825310
'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',
52835311
'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
5284-
'head', 'irow', 'describe', 'cummax', 'quantile', 'rank',
5285-
'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum',
5286-
'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take',
5287-
'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov',
5288-
'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
5312+
'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile',
5313+
'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
5314+
'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill',
5315+
'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
5316+
'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
52895317
'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
52905318
self.assertEqual(results, expected)
52915319

0 commit comments

Comments
 (0)