Skip to content

Commit c5a47e3

Browse files
committed
PERF: uses bincount instead of hash table in categorical value counts
1 parent e581e1e commit c5a47e3

File tree

3 files changed: 33 additions (+33) and 13 deletions (-13)

3 files changed

+33
-13
lines changed

asv_bench/benchmarks/categoricals.py

+17 -1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,20 @@ def setup(self):
88
self.s = pd.Series((list('aabbcd') * 1000000)).astype('category')
99

1010
def time_concat_categorical(self):
11-
concat([self.s, self.s])
11+
concat([self.s, self.s])
12+
13+
14+
class categorical_value_counts(object):
15+
goal_time = 1
16+
17+
def setup(self):
18+
n = 500000
19+
np.random.seed(2718281)
20+
arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)]
21+
self.ts = Series(arr).astype('category')
22+
23+
def time_value_counts(self):
24+
self.ts.value_counts(dropna=False)
25+
26+
def time_value_counts_dropna(self):
27+
self.ts.value_counts(dropna=True)

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,7 @@ Removal of prior version deprecations/changes
660660
Performance Improvements
661661
~~~~~~~~~~~~~~~~~~~~~~~~
662662
- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
663+
- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
663664

664665
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
665666
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)

pandas/core/categorical.py

+15 -12
Original file line numberDiff line numberDiff line change
@@ -1025,25 +1025,28 @@ def value_counts(self, dropna=True):
10251025
-------
10261026
counts : Series
10271027
"""
1028-
import pandas.hashtable as htable
1028+
from numpy import bincount
1029+
from pandas.core.common import isnull
10291030
from pandas.core.series import Series
10301031
from pandas.core.index import CategoricalIndex
10311032

1032-
cat = self.dropna() if dropna else self
1033-
keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
1034-
result = Series(counts, index=keys)
1033+
obj = self.remove_categories([np.nan]) \
1034+
if dropna and isnull(self.categories).any() else self
1035+
1036+
code, cat = obj._codes, obj.categories
1037+
ncat, mask = len(cat), 0 <= code
1038+
ix, clean = np.arange(ncat), mask.all()
10351039

1036-
ix = np.arange(len(cat.categories), dtype='int64')
1037-
if not dropna and -1 in keys:
1040+
if dropna or clean:
1041+
count = bincount(code if clean else code[mask], minlength=ncat)
1042+
else:
1043+
count = bincount(np.where(mask, code, ncat))
10381044
ix = np.append(ix, -1)
1039-
result = result.reindex(ix, fill_value=0)
1040-
index = (np.append(cat.categories, np.nan)
1041-
if not dropna and -1 in keys
1042-
else cat.categories)
10431045

1044-
result.index = CategoricalIndex(index, self.categories, self.ordered)
1046+
ix = Categorical(ix, categories=cat,
1047+
ordered=obj.ordered, fastpath=True)
10451048

1046-
return result
1049+
return Series(count, index=CategoricalIndex(ix))
10471050

10481051
def get_values(self):
10491052
""" Return the values.

0 commit comments

Comments
 (0)