diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 07afe60c9c22a..5f06188def4f2 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) +- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) - Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`). - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`) - Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 37b6429167646..1bb512aee39e2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -591,30 +591,18 @@ def nunique(self, dropna: bool = True) -> Series: val = self.obj._internal_get_values() - # GH 27951 - # temporary fix while we wait for NumPy bug 12629 to be fixed - val[isna(val)] = np.datetime64("NaT") - - try: - sorter = np.lexsort((val, ids)) - except TypeError: # catches object dtypes - msg = f"val.dtype must be object, got {val.dtype}" - assert val.dtype == object, msg - val, _ = algorithms.factorize(val, sort=False) - sorter = np.lexsort((val, ids)) - _isna = lambda a: a == -1 - else: - _isna = isna - - ids, val = ids[sorter], val[sorter] + codes, _ = algorithms.factorize(val, sort=False) + sorter = np.lexsort((codes, ids)) + codes = codes[sorter] + ids = ids[sorter] # group boundaries are where group ids change # unique observations are where sorted values change idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - inc = np.r_[1, val[1:] != val[:-1]] + inc = np.r_[1, codes[1:] != codes[:-1]] # 1st item of each group is a new unique observation - mask = _isna(val) + mask = codes == -1 if dropna: inc[idx] = 1 inc[mask] = 0 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 6205dfb87bbd0..c402ca194648f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1017,6 +1017,7 @@ def test_frame_describe_unstacked_format(): @pytest.mark.parametrize("dropna", [False, True]) def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): + original_df = df.copy() gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr["julie"].nunique(dropna=dropna) @@ -1026,6 +1027,7 @@ def check_nunique(df, keys, as_index=True): right = right.reset_index(drop=True) tm.assert_series_equal(left, right, check_names=False) + tm.assert_frame_equal(df, original_df) days = date_range("2015-08-23", periods=10)