Skip to content

Commit 07042a9

Browse files
committed
Merge pull request #10894 from behzadnouri/grby-nunique
PERF: improves SeriesGroupBy.nunique performance
2 parents a77956d + 091c73d commit 07042a9

File tree

3 files changed

+63
-3
lines changed

3 files changed

+63
-3
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,7 @@ Performance Improvements
721721
~~~~~~~~~~~~~~~~~~~~~~~~
722722
- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
723723
- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
724+
- Performance improvements in ``SeriesGroupBy.nunique`` (:issue:`10820`)
724725

725726
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
726727
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)

pandas/core/groupby.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,7 @@
8282

8383
_series_apply_whitelist = \
8484
(_common_apply_whitelist - set(['boxplot'])) | \
85-
frozenset(['dtype', 'value_counts', 'unique', 'nunique',
86-
'nlargest', 'nsmallest'])
85+
frozenset(['dtype', 'value_counts', 'unique', 'nlargest', 'nsmallest'])
8786

8887
_dataframe_apply_whitelist = \
8988
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
@@ -2558,6 +2557,32 @@ def true_and_notnull(x, *args, **kwargs):
25582557
filtered = self._apply_filter(indices, dropna)
25592558
return filtered
25602559

2560+
def nunique(self, dropna=True):
    """Return the number of unique observations per group as a Series.

    Parameters
    ----------
    dropna : bool, default True
        If True, null values are excluded from the per-group counts;
        if False, nulls count as (at most) one unique value per group.

    Returns
    -------
    Series
        Indexed by ``self.grouper.result_index``; rows belonging to the
        null group (code ``-1``) are dropped from the output.
    """
    codes, _, _ = self.grouper.group_info
    values = self.obj.get_values()

    # Sort primarily by group code, secondarily by value, so that each
    # group is a contiguous run of sorted observations.
    order = np.lexsort((values, codes))
    codes = codes[order]
    values = values[order]

    # Group boundaries are where the group codes change; within a group,
    # a new unique observation is wherever the sorted values change.
    starts = np.concatenate([[0], np.flatnonzero(codes[1:] != codes[:-1]) + 1])
    is_new = np.concatenate([[1], values[1:] != values[:-1]])

    null_mask = isnull(values)
    if dropna:
        # First item of each group counts, then nulls are discarded.
        is_new[starts] = 1
        is_new[null_mask] = 0
    else:
        # NaN != NaN, so runs of nulls would each count as new: collapse
        # consecutive nulls, then force the first item of every group to 1.
        is_new[null_mask & np.concatenate([[False], null_mask[:-1]])] = 0
        is_new[starts] = 1

    # Sum the "new unique" flags over each contiguous group segment.
    counts = np.add.reduceat(is_new, starts)
    if codes[0] == -1:
        # -1 codes (null keys) sort first; drop that group from the result.
        counts = counts[1:]
    return Series(counts,
                  index=self.grouper.result_index,
                  name=self.name)
2585+
25612586
def _apply_to_column_groupbys(self, func):
25622587
""" return a pass thru """
25632588
return func(self)

pandas/tests/test_groupby.py

+35-1
Original file line numberDiff line numberDiff line change
@@ -1616,6 +1616,40 @@ def test_groupby_as_index_agg(self):
16161616

16171617
assert_frame_equal(left, right)
16181618

1619+
def test_series_groupby_nunique(self):
    # SeriesGroupBy.nunique must agree with applying Series.nunique
    # group-by-group, for every sort/dropna combination, with and
    # without nulls in both the keys and the values.
    from string import ascii_lowercase

    def check_nunique(df, keys):
        for sort in (False, True):
            for dropna in (False, True):
                fast = df.groupby(keys, sort=sort)['julie'] \
                    .nunique(dropna=dropna)
                slow = df.groupby(keys, sort=sort)['julie'] \
                    .apply(Series.nunique, dropna=dropna)
                assert_series_equal(fast, slow)

    days = date_range('2015-08-23', periods=10)

    for n in (100, 1000, 10000, 100000):
        for m in (10, 100, 1000):
            frame = DataFrame({
                'jim': np.random.choice(list(ascii_lowercase), n),
                'joe': np.random.choice(days, n),
                'julie': np.random.randint(0, m, n)})

            check_nunique(frame, ['jim'])
            check_nunique(frame, ['jim', 'joe'])

            # punch holes into every column so the null paths are hit
            frame.loc[1::17, 'jim'] = None
            frame.loc[3::37, 'joe'] = None
            for start in (7, 8, 9):
                frame.loc[start::19, 'julie'] = None

            check_nunique(frame, ['jim'])
            check_nunique(frame, ['jim', 'joe'])
16191653
def test_mulitindex_passthru(self):
16201654

16211655
# GH 7997
@@ -4912,7 +4946,7 @@ def test_groupby_whitelist(self):
49124946
'corr', 'cov',
49134947
'value_counts',
49144948
'diff',
4915-
'unique', 'nunique',
4949+
'unique',
49164950
'nlargest', 'nsmallest',
49174951
])
49184952

0 commit comments

Comments (0)