@@ -28,6 +28,7 @@
     Interval,
     lib,
 )
+from pandas._libs.hashtable import duplicated
 from pandas.errors import SpecificationError
 from pandas.util._decorators import (
     Appender,
@@ -84,6 +85,7 @@
     default_index,
 )
 from pandas.core.series import Series
+from pandas.core.sorting import get_group_index
 from pandas.core.util.numba_ import maybe_use_numba
 
 from pandas.plotting import boxplot_frame_groupby
@@ -672,49 +674,33 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
         2023-02-01    1
         Freq: MS, dtype: int64
         """
-        ids, _, _ = self.grouper.group_info
-
+        ids, _, ngroups = self.grouper.group_info
         val = self.obj._values
+        codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)
+
+        if self.grouper.has_dropped_na:
+            mask = ids >= 0
+            ids = ids[mask]
+            codes = codes[mask]
+
+        group_index = get_group_index(
+            labels=[ids, codes],
+            shape=(ngroups, len(uniques)),
+            sort=False,
+            xnull=dropna,
+        )
 
-        codes, _ = algorithms.factorize(val, sort=False)
-        sorter = np.lexsort((codes, ids))
-        codes = codes[sorter]
-        ids = ids[sorter]
-
-        # group boundaries are where group ids change
-        # unique observations are where sorted values change
-        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
-        inc = np.r_[1, codes[1:] != codes[:-1]]
-
-        # 1st item of each group is a new unique observation
-        mask = codes == -1
         if dropna:
-            inc[idx] = 1
-            inc[mask] = 0
-        else:
-            inc[mask & np.r_[False, mask[:-1]]] = 0
-            inc[idx] = 1
-
-        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
-        if len(ids):
-            # NaN/NaT group exists if the head of ids is -1,
-            # so remove it from res and exclude its index from idx
-            if ids[0] == -1:
-                res = out[1:]
-                idx = idx[np.flatnonzero(idx)]
-            else:
-                res = out
-        else:
-            res = out[1:]
-        ri = self.grouper.result_index
+            mask = group_index >= 0
+            if (~mask).any():
+                ids = ids[mask]
+                group_index = group_index[mask]
 
-        # we might have duplications among the bins
-        if len(res) != len(ri):
-            res, out = np.zeros(len(ri), dtype=out.dtype), res
-            if len(ids) > 0:
-                # GH#21334s
-                res[ids[idx]] = out
+        mask = duplicated(group_index, "first")
+        res = np.bincount(ids[~mask], minlength=ngroups)
+        res = ensure_int64(res)
 
+        ri = self.grouper.result_index
         result: Series | DataFrame = self.obj._constructor(
             res, index=ri, name=self.obj.name
        )