Skip to content

Commit dee302d

Browse files
committed
PERF: improves performance in SeriesGroupBy.value_counts
1 parent a3c4b59 commit dee302d

File tree

3 files changed

+155
-3
lines changed

3 files changed

+155
-3
lines changed

doc/source/whatsnew/v0.17.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ Performance Improvements
721721
~~~~~~~~~~~~~~~~~~~~~~~~
722722
- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
723723
- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
724-
- Performance improvements in ``SeriesGroupBy.nunique`` (:issue:`10820`)
724+
- Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`)
725725

726726
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
727727
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)

pandas/core/groupby.py

+103-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282

8383
_series_apply_whitelist = \
8484
(_common_apply_whitelist - set(['boxplot'])) | \
85-
frozenset(['dtype', 'value_counts', 'unique', 'nlargest', 'nsmallest'])
85+
frozenset(['dtype', 'unique', 'nlargest', 'nsmallest'])
8686

8787
_dataframe_apply_whitelist = \
8888
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
@@ -2583,6 +2583,108 @@ def nunique(self, dropna=True):
25832583
index=self.grouper.result_index,
25842584
name=self.name)
25852585

2586+
def value_counts(self, normalize=False, sort=True, ascending=False,
2587+
bins=None, dropna=True):
2588+
2589+
from functools import partial
2590+
from pandas.tools.tile import cut
2591+
from pandas.tools.merge import _get_join_indexers
2592+
2593+
if bins is not None and not np.iterable(bins):
2594+
# scalar bins cannot be done at top level
2595+
# in a backward compatible way
2596+
return self.apply(Series.value_counts,
2597+
normalize=normalize,
2598+
sort=sort,
2599+
ascending=ascending,
2600+
bins=bins)
2601+
2602+
ids, _, _ = self.grouper.group_info
2603+
val = self.obj.get_values()
2604+
2605+
# groupby removes null keys from groupings
2606+
mask = ids != -1
2607+
ids, val = ids[mask], val[mask]
2608+
2609+
if bins is None:
2610+
lab, lev = algos.factorize(val, sort=True)
2611+
else:
2612+
cat, bins = cut(val, bins, retbins=True)
2613+
# bins[:-1] for backward compat;
2614+
# o.w. cat.categories could be better
2615+
lab, lev, dropna = cat.codes, bins[:-1], False
2616+
2617+
sorter = np.lexsort((lab, ids))
2618+
ids, lab = ids[sorter], lab[sorter]
2619+
2620+
# group boundaries are where group ids change
2621+
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
2622+
2623+
# new values are where sorted labels change
2624+
inc = np.r_[True, lab[1:] != lab[:-1]]
2625+
inc[idx] = True  # group boundaries are also new values
2626+
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
2627+
2628+
# num. of times each group should be repeated
2629+
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
2630+
2631+
# multi-index components
2632+
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
2633+
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
2634+
names = self.grouper.names + [self.name]
2635+
2636+
if dropna:
2637+
mask = labels[-1] != -1
2638+
if mask.all():
2639+
dropna = False
2640+
else:
2641+
out, labels = out[mask], [label[mask] for label in labels]
2642+
2643+
if normalize:
2644+
out = out.astype('float')
2645+
acc = rep(np.diff(np.r_[idx, len(ids)]))
2646+
out /= acc[mask] if dropna else acc
2647+
2648+
if sort and bins is None:
2649+
cat = ids[inc][mask] if dropna else ids[inc]
2650+
sorter = np.lexsort((out if ascending else -out, cat))
2651+
out, labels[-1] = out[sorter], labels[-1][sorter]
2652+
2653+
if bins is None:
2654+
mi = MultiIndex(levels=levels, labels=labels, names=names,
2655+
verify_integrity=False)
2656+
2657+
return Series(out, index=mi)
2658+
2659+
# for compat. with algos.value_counts need to ensure every
2660+
# bin is present at every index level, null filled with zeros
2661+
diff = np.zeros(len(out), dtype='bool')
2662+
for lab in labels[:-1]:
2663+
diff |= np.r_[True, lab[1:] != lab[:-1]]
2664+
2665+
ncat, nbin = diff.sum(), len(levels[-1])
2666+
2667+
left = [np.repeat(np.arange(ncat), nbin),
2668+
np.tile(np.arange(nbin), ncat)]
2669+
2670+
right = [diff.cumsum() - 1, labels[-1]]
2671+
2672+
_, idx = _get_join_indexers(left, right, sort=False, how='left')
2673+
out = np.where(idx != -1, out[idx], 0)
2674+
2675+
if sort:
2676+
sorter = np.lexsort((out if ascending else -out, left[0]))
2677+
out, left[-1] = out[sorter], left[-1][sorter]
2678+
2679+
# build the multi-index w/ full levels
2680+
labels = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
2681+
labels.append(left[-1])
2682+
2683+
mi = MultiIndex(levels=levels, labels=labels, names=names,
2684+
verify_integrity=False)
2685+
2686+
return Series(out, index=mi)
2687+
25862688
def _apply_to_column_groupbys(self, func):
25872689
""" return a pass thru """
25882690
return func(self)

pandas/tests/test_groupby.py

+51-1
Original file line numberDiff line numberDiff line change
@@ -1650,6 +1650,57 @@ def check_nunique(df, keys):
16501650
check_nunique(frame, ['jim'])
16511651
check_nunique(frame, ['jim', 'joe'])
16521652

1653+
def test_series_groupby_value_counts(self):
1654+
from itertools import product
1655+
1656+
def rebuild_index(df):
1657+
arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
1658+
df.index = MultiIndex.from_arrays(arr, names=df.index.names)
1659+
return df
1660+
1661+
def check_value_counts(df, keys, bins):
1662+
for isort, normalize, sort, ascending, dropna \
1663+
in product((False, True), repeat=5):
1664+
1665+
kwargs = dict(normalize=normalize, sort=sort,
1666+
ascending=ascending, dropna=dropna,
1667+
bins=bins)
1668+
1669+
gr = df.groupby(keys, sort=isort)
1670+
left = gr['3rd'].value_counts(**kwargs)
1671+
1672+
gr = df.groupby(keys, sort=isort)
1673+
right = gr['3rd'].apply(Series.value_counts, **kwargs)
1674+
right.index.names = right.index.names[:-1] + ['3rd']
1675+
1676+
# have to sort on index because of unstable sort on values
1677+
left, right = map(rebuild_index, (left, right)) # xref GH9212
1678+
assert_series_equal(left.sort_index(), right.sort_index())
1679+
1680+
def loop(df):
1681+
bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2)
1682+
keys = '1st', '2nd', ('1st', '2nd')
1683+
for k, b in product(keys, bins):
1684+
check_value_counts(df, k, b)
1685+
1686+
days = date_range('2015-08-24', periods=10)
1687+
1688+
for n, m in product((100, 10000), (5, 20)):
1689+
frame = DataFrame({
1690+
'1st':np.random.choice(list('abcd'), n),
1691+
'2nd':np.random.choice(days, n),
1692+
'3rd':np.random.randint(1, m + 1, n)})
1693+
1694+
loop(frame)
1695+
1696+
frame.loc[1::11, '1st'] = nan
1697+
frame.loc[3::17, '2nd'] = nan
1698+
frame.loc[7::19, '3rd'] = nan
1699+
frame.loc[8::19, '3rd'] = nan
1700+
frame.loc[9::19, '3rd'] = nan
1701+
1702+
loop(frame)
1703+
16531704
def test_mulitindex_passthru(self):
16541705

16551706
# GH 7997
@@ -4944,7 +4995,6 @@ def test_groupby_whitelist(self):
49444995
'plot', 'hist',
49454996
'median', 'dtype',
49464997
'corr', 'cov',
4947-
'value_counts',
49484998
'diff',
49494999
'unique',
49505000
'nlargest', 'nsmallest',

0 commit comments

Comments
 (0)