@@ -82,7 +82,7 @@
 
 _series_apply_whitelist = \
     (_common_apply_whitelist - set(['boxplot'])) | \
-    frozenset(['dtype', 'value_counts', 'unique', 'nlargest', 'nsmallest'])
+    frozenset(['dtype', 'unique', 'nlargest', 'nsmallest'])
 
 _dataframe_apply_whitelist = \
     _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
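The hunk above removes 'value_counts' from _series_apply_whitelist because the hunk below adds a dedicated SeriesGroupBy.value_counts, so the call no longer needs to be dispatched through the generic apply machinery. As a rough usage sketch of what the new method returns (the frame, column names, and printed layout are illustrative assumptions, not part of the patch):

    import pandas as pd

    # hypothetical example data
    df = pd.DataFrame({'key': ['a', 'a', 'a', 'b', 'b'],
                       'val': [1, 1, 2, 2, 2]})

    # per-group counts of each value, as a Series indexed by (key, val)
    print(df.groupby('key')['val'].value_counts())
    # key  val
    # a    1      2
    #      2      1
    # b    2      2
    # (exact display varies across pandas versions)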
@@ -2583,6 +2583,108 @@ def nunique(self, dropna=True):
                       index=self.grouper.result_index,
                       name=self.name)
 
+    def value_counts(self, normalize=False, sort=True, ascending=False,
+                     bins=None, dropna=True):
+
+        from functools import partial
+        from pandas.tools.tile import cut
+        from pandas.tools.merge import _get_join_indexers
+
+        if bins is not None and not np.iterable(bins):
+            # scalar bins cannot be done at top level
+            # in a backward compatible way
+            return self.apply(Series.value_counts,
+                              normalize=normalize,
+                              sort=sort,
+                              ascending=ascending,
+                              bins=bins)
+
+        ids, _, _ = self.grouper.group_info
+        val = self.obj.get_values()
+
+        # groupby removes null keys from groupings
+        mask = ids != -1
+        ids, val = ids[mask], val[mask]
+
+        if bins is None:
+            lab, lev = algos.factorize(val, sort=True)
+        else:
+            cat, bins = cut(val, bins, retbins=True)
+            # bins[:-1] for backward compat;
+            # otherwise cat.categories could be better
+            lab, lev, dropna = cat.codes, bins[:-1], False
+
+        sorter = np.lexsort((lab, ids))
+        ids, lab = ids[sorter], lab[sorter]
+
+        # group boundaries are where group ids change
+        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
+
+        # new values are where sorted labels change
+        inc = np.r_[True, lab[1:] != lab[:-1]]
+        inc[idx] = True  # group boundaries are also new values
+        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
+
+        # num. of times each group should be repeated
+        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
+
+        # multi-index components
+        labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
+        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
+        names = self.grouper.names + [self.name]
+
+        if dropna:
+            mask = labels[-1] != -1
+            if mask.all():
+                dropna = False
+            else:
+                out, labels = out[mask], [label[mask] for label in labels]
+
+        if normalize:
+            out = out.astype('float')
+            acc = rep(np.diff(np.r_[idx, len(ids)]))
+            out /= acc[mask] if dropna else acc
+
+        if sort and bins is None:
+            cat = ids[inc][mask] if dropna else ids[inc]
+            sorter = np.lexsort((out if ascending else -out, cat))
+            out, labels[-1] = out[sorter], labels[-1][sorter]
+
+        if bins is None:
+            mi = MultiIndex(levels=levels, labels=labels, names=names,
+                            verify_integrity=False)
+
+            return Series(out, index=mi)
+
+        # for compat. with algos.value_counts we need to ensure every
+        # bin is present at every index level, null filled with zeros
+        diff = np.zeros(len(out), dtype='bool')
+        for lab in labels[:-1]:
+            diff |= np.r_[True, lab[1:] != lab[:-1]]
+
+        ncat, nbin = diff.sum(), len(levels[-1])
+
+        left = [np.repeat(np.arange(ncat), nbin),
+                np.tile(np.arange(nbin), ncat)]
+
+        right = [diff.cumsum() - 1, labels[-1]]
+
+        _, idx = _get_join_indexers(left, right, sort=False, how='left')
+        out = np.where(idx != -1, out[idx], 0)
+
+        if sort:
+            sorter = np.lexsort((out if ascending else -out, left[0]))
+            out, left[-1] = out[sorter], left[-1][sorter]
+
+        # build the multi-index w/ full levels
+        labels = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
+        labels.append(left[-1])
+
+        mi = MultiIndex(levels=levels, labels=labels, names=names,
+                        verify_integrity=False)
+
+        return Series(out, index=mi)
+
     def _apply_to_column_groupbys(self, func):
         """ return a pass thru """
         return func(self)
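At its core, the non-binned path in the added method is a vectorised run-length count: group ids and factorized value labels are lexsorted by (id, label), group boundaries are marked where the id changes, a run starts wherever either the id or the label changes, and np.diff over the run-start positions yields the per-(group, value) counts. Below is a standalone sketch of that trick on made-up arrays (the data and variable names are illustrative, mirroring but not taken verbatim from the patch):

    import numpy as np

    # made-up, already-encoded inputs: group ids and factorized value labels
    ids = np.array([0, 0, 0, 1, 1, 1, 1])
    lab = np.array([2, 0, 0, 1, 1, 2, 1])

    # sort by group id first, then by value label within each group
    sorter = np.lexsort((lab, ids))
    ids, lab = ids[sorter], lab[sorter]

    # group boundaries are where the group id changes
    idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]

    # a new run starts where the sorted label changes ...
    inc = np.r_[True, lab[1:] != lab[:-1]]
    inc[idx] = True  # ... and at every group boundary

    # distances between consecutive run starts are the counts
    out = np.diff(np.nonzero(np.r_[inc, True])[0])

    print(out)  # [2 1 3 1] -> group 0: {0: 2, 2: 1}, group 1: {1: 3, 2: 1}

The same boundary mask does double duty in the patch: np.add.reduceat(inc, idx) sums the run starts within each group, giving the number of distinct values per group, which is what rep uses to repeat the group-key labels when assembling the MultiIndex.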