Skip to content

Commit e40ff65

Browse files
Backport PR pandas-dev#32175: BUG: groupby nunique changing values (pandas-dev#32201)
Co-authored-by: Marco Gorelli <[email protected]>
1 parent 791c7fc commit e40ff65

File tree

3 files changed

+9
-18
lines changed

3 files changed

+9
-18
lines changed

doc/source/whatsnew/v1.0.2.rst

+1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`)
2020
- Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`)
2121
- Fixed regression in :meth:`rolling(..).corr() <pandas.core.window.Rolling.corr>` when using a time offset (:issue:`31789`)
22+
- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
2223
- Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`).
2324
- Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`)
2425
- Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`)

pandas/core/groupby/generic.py

+6-18
Original file line number | Diff line number | Diff line change
@@ -588,30 +588,18 @@ def nunique(self, dropna: bool = True) -> Series:
588588

589589
val = self.obj._internal_get_values()
590590

591-
# GH 27951
592-
# temporary fix while we wait for NumPy bug 12629 to be fixed
593-
val[isna(val)] = np.datetime64("NaT")
594-
595-
try:
596-
sorter = np.lexsort((val, ids))
597-
except TypeError: # catches object dtypes
598-
msg = f"val.dtype must be object, got {val.dtype}"
599-
assert val.dtype == object, msg
600-
val, _ = algorithms.factorize(val, sort=False)
601-
sorter = np.lexsort((val, ids))
602-
_isna = lambda a: a == -1
603-
else:
604-
_isna = isna
605-
606-
ids, val = ids[sorter], val[sorter]
591+
codes, _ = algorithms.factorize(val, sort=False)
592+
sorter = np.lexsort((codes, ids))
593+
codes = codes[sorter]
594+
ids = ids[sorter]
607595

608596
# group boundaries are where group ids change
609597
# unique observations are where sorted values change
610598
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
611-
inc = np.r_[1, val[1:] != val[:-1]]
599+
inc = np.r_[1, codes[1:] != codes[:-1]]
612600

613601
# 1st item of each group is a new unique observation
614-
mask = _isna(val)
602+
mask = codes == -1
615603
if dropna:
616604
inc[idx] = 1
617605
inc[mask] = 0

pandas/tests/groupby/test_function.py

+2
Original file line number | Diff line number | Diff line change
@@ -972,6 +972,7 @@ def test_frame_describe_unstacked_format():
972972
@pytest.mark.parametrize("dropna", [False, True])
973973
def test_series_groupby_nunique(n, m, sort, dropna):
974974
def check_nunique(df, keys, as_index=True):
975+
original_df = df.copy()
975976
gr = df.groupby(keys, as_index=as_index, sort=sort)
976977
left = gr["julie"].nunique(dropna=dropna)
977978

@@ -981,6 +982,7 @@ def check_nunique(df, keys, as_index=True):
981982
right = right.reset_index(drop=True)
982983

983984
tm.assert_series_equal(left, right, check_names=False)
985+
tm.assert_frame_equal(df, original_df)
984986

985987
days = date_range("2015-08-23", periods=10)
986988

0 commit comments

Comments
 (0)