PERF: uses bincount instead of hash table in categorical value counts

behzadnouri · behzadnouri · commit c5a47e3a174c · 2015-08-22T12:15:09.000-04:00
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -8,4 +8,20 @@ def setup(self):
         self.s = pd.Series((list('aabbcd') * 1000000)).astype('category')
 
     def time_concat_categorical(self):
-        concat([self.s, self.s])
+        concat([self.s, self.s])
+
+
+class categorical_value_counts(object):  # times value_counts on a 'category' Series
+    goal_time = 1  # NOTE(review): presumably asv's per-benchmark time target -- confirm
+
+    def setup(self):
+        n = 500000  # row count; labels are formatted from ints in [0, n // 10)
+        np.random.seed(2718281)  # fixed seed so every run builds the same data
+        arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
+        self.ts = Series(arr).astype('category')  # list of str labels -> category
+
+    def time_value_counts(self):
+        self.ts.value_counts(dropna=False)  # timed call with dropna=False
+
+    def time_value_counts_dropna(self):
+        self.ts.value_counts(dropna=True)  # timed call with dropna=True
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -660,6 +660,7 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
+- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
 
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1025,25 +1025,28 @@ def value_counts(self, dropna=True):
         -------
         counts : Series
         """
-        import pandas.hashtable as htable
+        from numpy import bincount
+        from pandas.core.common import isnull
         from pandas.core.series import Series
         from pandas.core.index import CategoricalIndex
 
-        cat = self.dropna() if dropna else self
-        keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
-        result = Series(counts, index=keys)
+        obj = self.remove_categories([np.nan]) \
+                if dropna and isnull(self.categories).any() else self
+
+        code, cat = obj._codes, obj.categories
+        ncat, mask = len(cat), 0 <= code
+        ix, clean = np.arange(ncat), mask.all()
 
-        ix = np.arange(len(cat.categories), dtype='int64')
-        if not dropna and -1 in keys:
+        if dropna or clean:
+            count = bincount(code if clean else code[mask], minlength=ncat)
+        else:
+            count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)
-        result = result.reindex(ix, fill_value=0)
-        index = (np.append(cat.categories, np.nan)
-            if not dropna and -1 in keys
-            else cat.categories)
 
-        result.index = CategoricalIndex(index, self.categories, self.ordered)
+        ix = Categorical(ix, categories=cat,
+                ordered=obj.ordered, fastpath=True)
 
-        return result
+        return Series(count, index=CategoricalIndex(ix))
 
     def get_values(self):
         """ Return the values.