diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 7395f9d2dcb9e..d5f85682dba61 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -274,6 +274,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
 - Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
 - Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)
+- Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`)
 - Bug in :meth:`pandas.DataFrame.ewm`, where non-float64 dtypes were silently failing (:issue:`42452`)
 
 Reshaping
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 59c57cf4a1ea0..a1c2d57a117c0 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -870,6 +870,24 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
 
         return (filled / shifted) - 1
 
+    @doc(Series.nlargest)
+    def nlargest(self, n: int = 5, keep: str = "first"):
+        f = partial(Series.nlargest, n=n, keep=keep)
+        data = self._obj_with_exclusions
+        # Always pass not_indexed_same=True so the result index does not depend on
+        # the data, e.g. when the input is already ordered and n >= all group sizes.
+        result = self._python_apply_general(f, data, not_indexed_same=True)
+        return result
+
+    @doc(Series.nsmallest)
+    def nsmallest(self, n: int = 5, keep: str = "first"):
+        f = partial(Series.nsmallest, n=n, keep=keep)
+        data = self._obj_with_exclusions
+        # Always pass not_indexed_same=True so the result index does not depend on
+        # the data, e.g. when the input is already ordered and n >= all group sizes.
+        result = self._python_apply_general(f, data, not_indexed_same=True)
+        return result
+
 
 @pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist)
 class DataFrameGroupBy(GroupBy[DataFrame]):
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 5f9b1dec062f8..61f6e669aca53 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1275,7 +1275,7 @@ def f(g):
 
     @final
     def _python_apply_general(
-        self, f: F, data: DataFrame | Series
+        self, f: F, data: DataFrame | Series, not_indexed_same: bool | None = None
     ) -> DataFrame | Series:
         """
         Apply function f in python space
@@ -1286,6 +1286,10 @@ def _python_apply_general(
             Function to apply
         data : Series or DataFrame
             Data to apply f to
+        not_indexed_same : bool, optional
+            When specified, overrides the inferred ``mutated or self.mutated``. Apply
+            behaves differently when the result index is equal to the input index,
+            but this can be coincidental, leading to value-dependent behavior.
 
         Returns
         -------
@@ -1294,8 +1298,11 @@ def _python_apply_general(
         """
         keys, values, mutated = self.grouper.apply(f, data, self.axis)
 
+        if not_indexed_same is None:
+            not_indexed_same = mutated or self.mutated
+
         return self._wrap_applied_output(
-            data, keys, values, not_indexed_same=mutated or self.mutated
+            data, keys, values, not_indexed_same=not_indexed_same
         )
 
     @final
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 77e5e9ba133f5..84ec43b5d38d7 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -680,6 +680,23 @@ def test_nsmallest():
     tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
 
 
+@pytest.mark.parametrize(
+    "data, groups",
+    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
+)
+@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
+def test_nlargest_and_smallest_noop(data, groups, method):
+    # GH 15272, GH 16345, GH 29129
+    # Test nlargest/nsmallest when it results in a noop,
+    # i.e. input is sorted and group size <= n
+    if method == "nlargest":
+        data = list(reversed(data))
+    ser = Series(data, name="a")
+    result = getattr(ser.groupby(groups), method)(n=2)
+    expected = Series(data, index=MultiIndex.from_arrays([groups, ser.index]), name="a")
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize("func", ["cumprod", "cumsum"])
 def test_numpy_compat(func):
     # see gh-12811