Skip to content

Commit 1969079

Browse files
authored
Implement masked algorithm for mode (#55340)
1 parent 47a596e commit 1969079

File tree

6 files changed

+49
-9
lines changed

6 files changed

+49
-9
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,7 @@ Performance improvements
321321
- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`)
322322
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
323323
- Performance improvement in :meth:`Series.str` methods (:issue:`55736`)
324+
- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`)
324325
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
325326
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
326327
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)

pandas/_libs/hashtable_func_helper.pxi.in

+7-3
Original file line number | Diff line number | Diff line change
@@ -404,12 +404,13 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
404404
cdef:
405405
ndarray[htfunc_t] keys
406406
ndarray[htfunc_t] modes
407+
ndarray[uint8_t] res_mask = None
407408

408409
int64_t[::1] counts
409410
int64_t count, _, max_count = -1
410-
Py_ssize_t nkeys, k, j = 0
411+
Py_ssize_t nkeys, k, na_counter, j = 0
411412

412-
keys, counts, _ = value_count(values, dropna, mask=mask)
413+
keys, counts, na_counter = value_count(values, dropna, mask=mask)
413414
nkeys = len(keys)
414415

415416
modes = np.empty(nkeys, dtype=values.dtype)
@@ -440,7 +441,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
440441

441442
modes[j] = keys[k]
442443

443-
return modes[:j + 1]
444+
if na_counter > 0:
445+
res_mask = np.zeros(j+1, dtype=np.bool_)
446+
res_mask[j] = True
447+
return modes[:j + 1], res_mask
444448

445449

446450
{{py:

pandas/core/algorithms.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -1034,7 +1034,10 @@ def mode(
10341034

10351035
values = _ensure_data(values)
10361036

1037-
npresult = htable.mode(values, dropna=dropna, mask=mask)
1037+
npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
1038+
if res_mask is not None:
1039+
return npresult, res_mask # type: ignore[return-value]
1040+
10381041
try:
10391042
npresult = np.sort(npresult)
10401043
except TypeError as err:

pandas/core/arrays/masked.py

+10
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@
6969
from pandas.core.algorithms import (
7070
factorize_array,
7171
isin,
72+
mode,
7273
take,
7374
)
7475
from pandas.core.array_algos import (
@@ -1069,6 +1070,15 @@ def value_counts(self, dropna: bool = True) -> Series:
10691070
)
10701071
return Series(arr, index=index, name="count", copy=False)
10711072

1073+
def _mode(self, dropna: bool = True) -> Self:
    """
    Return the mode(s) of this masked array as a new, sorted masked array.

    Parameters
    ----------
    dropna : bool, default True
        Forwarded to ``mode`` together with this array's mask.

    Returns
    -------
    Self
        A new array of the same type built from the modal values and a
        matching boolean mask, reordered by ``argsort``.
    """
    result = mode(self._data, dropna=dropna, mask=self._mask)
    if isinstance(result, tuple):
        # ``mode`` returns ``(values, mask)`` only when the hashtable
        # routine produced a result mask.
        result, res_mask = result
    else:
        # No result mask was produced (e.g. ``dropna=False`` on data with
        # no masked entries), so unconditionally unpacking two values here
        # would fail. Every returned value is valid: use an all-False mask.
        res_mask = np.zeros(result.shape, dtype=np.bool_)
    result = type(self)(result, res_mask)  # type: ignore[arg-type]
    return result[result.argsort()]
1081+
10721082
@doc(ExtensionArray.equals)
10731083
def equals(self, other) -> bool:
10741084
if type(self) != type(other):

pandas/tests/libs/test_hashtable.py

+5-5
Original file line number | Diff line number | Diff line change
@@ -644,21 +644,21 @@ def test_mode(self, dtype, writable):
644644
values = np.repeat(np.arange(N).astype(dtype), 5)
645645
values[0] = 42
646646
values.flags.writeable = writable
647-
result = ht.mode(values, False)
647+
result = ht.mode(values, False)[0]
648648
assert result == 42
649649

650650
def test_mode_stable(self, dtype, writable):
651651
values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
652652
values.flags.writeable = writable
653-
keys = ht.mode(values, False)
653+
keys = ht.mode(values, False)[0]
654654
tm.assert_numpy_array_equal(keys, values)
655655

656656

657657
def test_modes_with_nans():
658658
# GH42688, nans aren't mangled
659659
nulls = [pd.NA, np.nan, pd.NaT, None]
660660
values = np.array([True] + nulls * 2, dtype=np.object_)
661-
modes = ht.mode(values, False)
661+
modes = ht.mode(values, False)[0]
662662
assert modes.size == len(nulls)
663663

664664

@@ -724,8 +724,8 @@ def test_ismember_no(self, dtype):
724724

725725
def test_mode(self, dtype):
726726
values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
727-
assert ht.mode(values, True) == 42
728-
assert np.isnan(ht.mode(values, False))
727+
assert ht.mode(values, True)[0] == 42
728+
assert np.isnan(ht.mode(values, False)[0])
729729

730730

731731
def test_ismember_tuple_with_nans():

pandas/tests/series/test_reductions.py

+22
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,28 @@ def test_mode_extension_dtype(as_period):
2929
tm.assert_series_equal(res, ser)
3030

3131

32+
def test_mode_nullable_dtype(any_numeric_ea_dtype):
    """Check ``Series.mode`` on nullable numeric dtypes for both ``dropna`` settings."""
    # GH#55340
    dtype = any_numeric_ea_dtype
    ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=dtype)

    # 2, 3 and NA each appear twice, so NA is among the modes unless dropped.
    with_na = ser.mode(dropna=False)
    tm.assert_series_equal(with_na, Series([2, 3, pd.NA], dtype=dtype))

    without_na = ser.mode(dropna=True)
    tm.assert_series_equal(without_na, Series([2, 3], dtype=dtype))

    # After this assignment NA is expected to be the sole mode when kept.
    ser[-1] = pd.NA

    without_na = ser.mode(dropna=True)
    tm.assert_series_equal(without_na, Series([2, 3], dtype=dtype))

    with_na = ser.mode(dropna=False)
    tm.assert_series_equal(with_na, Series([pd.NA], dtype=dtype))
52+
53+
3254
def test_reductions_td64_with_nat():
3355
# GH#8617
3456
ser = Series([0, pd.NaT], dtype="m8[ns]")

0 commit comments

Comments
 (0)