Skip to content

Commit 191557d

Browse files
authored
PERF: Performance improvement value_counts for masked arrays (#48338)
1 parent 4aaef2d commit 191557d

File tree

5 files changed

+34 -22 lines changed

asv_bench/benchmarks/array.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ def time_from_float_array(self):
 
 class IntegerArray:
     def setup(self):
-        self.values_integer = np.array([1, 0, 1, 0])
-        self.data = np.array([1, 2, 3, 4], dtype="int64")
-        self.mask = np.array([False, False, True, False])
+        N = 250_000
+        self.values_integer = np.array([1, 0, 1, 0] * N)
+        self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
+        self.mask = np.array([False, False, True, False] * N)
 
     def time_constructor(self):
         pd.arrays.IntegerArray(self.data, self.mask)

asv_bench/benchmarks/series_methods.py

+14
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Index,
     NaT,
     Series,
@@ -166,6 +167,19 @@ def time_value_counts(self, N, dtype):
         self.s.value_counts()
 
 
+class ValueCountsEA:
+
+    params = [[10**3, 10**4, 10**5], [True, False]]
+    param_names = ["N", "dropna"]
+
+    def setup(self, N, dropna):
+        self.s = Series(np.random.randint(0, N, size=10 * N), dtype="Int64")
+        self.s.loc[1] = NA
+
+    def time_value_counts(self, N, dropna):
+        self.s.value_counts(dropna=dropna)
+
+
 class ValueCountsObjectDropNAFalse:
 
     params = [10**3, 10**4, 10**5]

doc/source/whatsnew/v1.6.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ Performance improvements
 - Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
+- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
+- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 -
 
pandas/core/arrays/masked.py

+9-18
Original file line numberDiff line numberDiff line change
@@ -949,31 +949,22 @@ def value_counts(self, dropna: bool = True) -> Series:
         )
         from pandas.arrays import IntegerArray
 
+        keys, value_counts = algos.value_counts_arraylike(
+            self._data, dropna=True, mask=self._mask
+        )
+
         if dropna:
-            keys, counts = algos.value_counts_arraylike(
-                self._data, dropna=True, mask=self._mask
-            )
-            res = Series(counts, index=keys)
+            res = Series(value_counts, index=keys)
             res.index = res.index.astype(self.dtype)
             res = res.astype("Int64")
             return res
 
-        # compute counts on the data with no nans
-        data = self._data[~self._mask]
-        value_counts = Index(data).value_counts()
-
-        index = value_counts.index
-
         # if we want nans, count the mask
-        if dropna:
-            counts = value_counts._values
-        else:
-            counts = np.empty(len(value_counts) + 1, dtype="int64")
-            counts[:-1] = value_counts
-            counts[-1] = self._mask.sum()
-
-            index = index.insert(len(index), self.dtype.na_value)
+        counts = np.empty(len(value_counts) + 1, dtype="int64")
+        counts[:-1] = value_counts
+        counts[-1] = self._mask.sum()
 
+        index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
         index = index.astype(self.dtype)
 
         mask = np.zeros(len(counts), dtype="bool")

pandas/core/arrays/numeric.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,11 @@ def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype
         raise TypeError("values must be a 1D list-like")
 
     if mask is None:
-        mask = libmissing.is_numeric_na(values)
+        if is_integer_dtype(values):
+            # fastpath
+            mask = np.zeros(len(values), dtype=np.bool_)
+        else:
+            mask = libmissing.is_numeric_na(values)
     else:
         assert len(mask) == len(values)
 
0 commit comments

Comments
 (0)