From 0fb5c3c41ad8b8abbafbad6289801e49ab2fa884 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 18 Jul 2021 14:38:38 -0400 Subject: [PATCH 1/2] BUG: SeriesGroupBy.nlargest/smallest inconsistent shape --- pandas/core/groupby/generic.py | 18 ++++++++++++++++++ pandas/core/groupby/groupby.py | 11 +++++++++-- pandas/tests/groupby/test_function.py | 17 +++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 88d1baae86467..ef254f9e7bdb5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -870,6 +870,24 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): return (filled / shifted) - 1 + @doc(Series.nlargest) + def nlargest(self, n: int = 5, keep: str = "first"): + f = partial(Series.nlargest, n=n, keep=keep) + data = self._obj_with_exclusions + # Don't change behavior if result index happens to be the same, i.e. + # already ordered and n >= all group sizes. + result = self._python_apply_general(f, data, not_indexed_same=True) + return result + + @doc(Series.nsmallest) + def nsmallest(self, n: int = 5, keep: str = "first"): + f = partial(Series.nsmallest, n=n, keep=keep) + data = self._obj_with_exclusions + # Don't change behavior if result index happens to be the same, i.e. + # already ordered and n >= all group sizes. + result = self._python_apply_general(f, data, not_indexed_same=True) + return result + @pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist) class DataFrameGroupBy(GroupBy[DataFrame]): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d3a86fa5950ed..7fda181cf2e67 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1265,7 +1265,7 @@ def f(g): @final def _python_apply_general( - self, f: F, data: DataFrame | Series + self, f: F, data: DataFrame | Series, not_indexed_same: bool | None = None ) -> DataFrame | Series: """ Apply function f in python space @@ -1276,6 +1276,10 @@ def _python_apply_general( Function to apply data : Series or DataFrame Data to apply f to + not_indexed_same: bool, optional + When specified, overrides the value of not_indexed_same. Apply behaves + differently when the result index is equal to the input index, but + this can be coincidental leading to value-dependent behavior. Returns ------- @@ -1284,8 +1288,11 @@ def _python_apply_general( """ keys, values, mutated = self.grouper.apply(f, data, self.axis) + if not_indexed_same is None: + not_indexed_same = mutated or self.mutated + return self._wrap_applied_output( - data, keys, values, not_indexed_same=mutated or self.mutated + data, keys, values, not_indexed_same=not_indexed_same ) @final diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5434fc49e2174..bb8c8f62d9917 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -680,6 +680,23 @@ def test_nsmallest(): tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) +@pytest.mark.parametrize( + "data, groups", + [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], +) +@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) +def test_nlargest_and_smallest_noop(data, groups, method): + # GH 15272, GH 16345, GH 29129 + # Test nlargest/smallest when it results in a noop, + # i.e. input is sorted and group size <= n + if method == "nlargest": + data = list(reversed(data)) + ser = Series(data, name="a") + result = getattr(ser.groupby(groups), method)(n=2) + expected = Series(data, index=MultiIndex.from_arrays([groups, ser.index]), name="a") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["cumprod", "cumsum"]) def test_numpy_compat(func): # see gh-12811 From 6b2510cb9ed67bd885bf6429c16b652514254244 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 18 Jul 2021 14:44:05 -0400 Subject: [PATCH 2/2] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 8608dffd58090..e90752fc9dcfc 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -252,6 +252,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`) +- Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`) - Reshaping