diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 34caef221a340..80b277336df7a 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -8,4 +8,20 @@ def setup(self):
         self.s = pd.Series((list('aabbcd') * 1000000)).astype('category')
 
     def time_concat_categorical(self):
-        concat([self.s, self.s])
\ No newline at end of file
+        concat([self.s, self.s])
+
+
+class categorical_value_counts(object):
+    goal_time = 1
+
+    def setup(self):
+        n = 500000
+        np.random.seed(2718281)
+        arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
+        self.ts = Series(arr).astype('category')
+
+    def time_value_counts(self):
+        self.ts.value_counts(dropna=False)
+
+    def time_value_counts_dropna(self):
+        self.ts.value_counts(dropna=True)
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 6b4bde588469e..5df49b1457a7e 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -660,6 +660,7 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
+- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index ba2c9314322c3..78c9d264c43a5 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1025,25 +1025,28 @@ def value_counts(self, dropna=True):
         -------
         counts : Series
         """
-        import pandas.hashtable as htable
+        from numpy import bincount
+        from pandas.core.common import isnull
         from pandas.core.series import Series
         from pandas.core.index import CategoricalIndex
 
-        cat = self.dropna() if dropna else self
-        keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
-        result = Series(counts, index=keys)
+        obj = self.remove_categories([np.nan]) \
+            if dropna and isnull(self.categories).any() else self
+
+        code, cat = obj._codes, obj.categories
+        ncat, mask = len(cat), 0 <= code
+        ix, clean = np.arange(ncat), mask.all()
 
-        ix = np.arange(len(cat.categories), dtype='int64')
-        if not dropna and -1 in keys:
+        if dropna or clean:
+            count = bincount(code if clean else code[mask], minlength=ncat)
+        else:
+            count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)
-        result = result.reindex(ix, fill_value=0)
-        index = (np.append(cat.categories, np.nan)
-            if not dropna and -1 in keys
-            else cat.categories)
 
-        result.index = CategoricalIndex(index, self.categories, self.ordered)
+        ix = Categorical(ix, categories=cat,
+                         ordered=obj.ordered, fastpath=True)
 
-        return result
+        return Series(count, index=CategoricalIndex(ix))
 
     def get_values(self):
         """ Return the values.