Skip to content

Commit d377cc9

Browse files
authored
PERF: groupby.nunique (#56061)
* PERF: groupby.nunique
* Remove fastpath
* Remove fastpath
* int32 fixup
* fixup
1 parent 44b6cb9 commit d377cc9

File tree

2 files changed

+25
-38
lines changed

2 files changed

+25
-38
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -428,6 +428,7 @@ Performance improvements
428428
- Performance improvement in :meth:`Series.str.get_dummies` when dtype is ``"string[pyarrow]"`` or ``"string[pyarrow_numpy]"`` (:issue:`56110`)
429429
- Performance improvement in :meth:`Series.str` methods (:issue:`55736`)
430430
- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`)
431+
- Performance improvement in :meth:`DataFrameGroupBy.nunique` and :meth:`SeriesGroupBy.nunique` (:issue:`55972`)
431432
- Performance improvement in :meth:`SeriesGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin` (:issue:`54234`)
432433
- Performance improvement when indexing into a non-unique index (:issue:`55816`)
433434
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)

pandas/core/groupby/generic.py

+24-38
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
Interval,
2929
lib,
3030
)
31+
from pandas._libs.hashtable import duplicated
3132
from pandas.errors import SpecificationError
3233
from pandas.util._decorators import (
3334
Appender,
@@ -84,6 +85,7 @@
8485
default_index,
8586
)
8687
from pandas.core.series import Series
88+
from pandas.core.sorting import get_group_index
8789
from pandas.core.util.numba_ import maybe_use_numba
8890

8991
from pandas.plotting import boxplot_frame_groupby
@@ -672,49 +674,33 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
672674
2023-02-01 1
673675
Freq: MS, dtype: int64
674676
"""
675-
ids, _, _ = self.grouper.group_info
676-
677+
ids, _, ngroups = self.grouper.group_info
677678
val = self.obj._values
679+
codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)
680+
681+
if self.grouper.has_dropped_na:
682+
mask = ids >= 0
683+
ids = ids[mask]
684+
codes = codes[mask]
685+
686+
group_index = get_group_index(
687+
labels=[ids, codes],
688+
shape=(ngroups, len(uniques)),
689+
sort=False,
690+
xnull=dropna,
691+
)
678692

679-
codes, _ = algorithms.factorize(val, sort=False)
680-
sorter = np.lexsort((codes, ids))
681-
codes = codes[sorter]
682-
ids = ids[sorter]
683-
684-
# group boundaries are where group ids change
685-
# unique observations are where sorted values change
686-
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
687-
inc = np.r_[1, codes[1:] != codes[:-1]]
688-
689-
# 1st item of each group is a new unique observation
690-
mask = codes == -1
691693
if dropna:
692-
inc[idx] = 1
693-
inc[mask] = 0
694-
else:
695-
inc[mask & np.r_[False, mask[:-1]]] = 0
696-
inc[idx] = 1
697-
698-
out = np.add.reduceat(inc, idx).astype("int64", copy=False)
699-
if len(ids):
700-
# NaN/NaT group exists if the head of ids is -1,
701-
# so remove it from res and exclude its index from idx
702-
if ids[0] == -1:
703-
res = out[1:]
704-
idx = idx[np.flatnonzero(idx)]
705-
else:
706-
res = out
707-
else:
708-
res = out[1:]
709-
ri = self.grouper.result_index
694+
mask = group_index >= 0
695+
if (~mask).any():
696+
ids = ids[mask]
697+
group_index = group_index[mask]
710698

711-
# we might have duplications among the bins
712-
if len(res) != len(ri):
713-
res, out = np.zeros(len(ri), dtype=out.dtype), res
714-
if len(ids) > 0:
715-
# GH#21334s
716-
res[ids[idx]] = out
699+
mask = duplicated(group_index, "first")
700+
res = np.bincount(ids[~mask], minlength=ngroups)
701+
res = ensure_int64(res)
717702

703+
ri = self.grouper.result_index
718704
result: Series | DataFrame = self.obj._constructor(
719705
res, index=ri, name=self.obj.name
720706
)

0 commit comments

Comments
 (0)