diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
index 0216007ea5ba8..77a8e7dc6e39d 100644
--- a/doc/source/whatsnew/v1.0.2.rst
+++ b/doc/source/whatsnew/v1.0.2.rst
@@ -19,7 +19,7 @@ Fixed regressions
 - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`)
 - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`)
 - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`)
--
+- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 37b6429167646..ed3856bd58ed5 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -591,20 +591,24 @@ def nunique(self, dropna: bool = True) -> Series:
 
         val = self.obj._internal_get_values()
 
-        # GH 27951
-        # temporary fix while we wait for NumPy bug 12629 to be fixed
-        val[isna(val)] = np.datetime64("NaT")
-
-        try:
-            sorter = np.lexsort((val, ids))
-        except TypeError:  # catches object dtypes
-            msg = f"val.dtype must be object, got {val.dtype}"
-            assert val.dtype == object, msg
+        def _object_sorter(val, ids):
             val, _ = algorithms.factorize(val, sort=False)
             sorter = np.lexsort((val, ids))
             _isna = lambda a: a == -1
+            return val, sorter, _isna
+
+        if isna(val).any() and val.dtype == object:
+            # Deal with pandas.NaT
+            val, sorter, _isna = _object_sorter(val, ids)
         else:
-            _isna = isna
+            try:
+                sorter = np.lexsort((val, ids))
+            except TypeError:  # catches object dtypes
+                msg = f"val.dtype must be object, got {val.dtype}"
+                assert val.dtype == object, msg
+                val, sorter, _isna = _object_sorter(val, ids)
+            else:
+                _isna = isna
 
         ids, val = ids[sorter], val[sorter]
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 73e36cb5e6c84..245ed5bf9900b 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -966,6 +966,7 @@ def test_frame_describe_unstacked_format():
 @pytest.mark.parametrize("dropna", [False, True])
 def test_series_groupby_nunique(n, m, sort, dropna):
     def check_nunique(df, keys, as_index=True):
+        original_df = df.copy()
         gr = df.groupby(keys, as_index=as_index, sort=sort)
         left = gr["julie"].nunique(dropna=dropna)
 
@@ -975,6 +976,7 @@ def check_nunique(df, keys, as_index=True):
             right = right.reset_index(drop=True)
 
         tm.assert_series_equal(left, right, check_names=False)
+        tm.assert_frame_equal(df, original_df)
 
     days = date_range("2015-08-23", periods=10)
 