diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 92124a536fe26..57d2f0ce78f67 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -441,6 +441,8 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) - Bug in :meth:`GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`) - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`) +- Bug in :meth:`SeriesGroupBy.nth` and :meth:`DataFrameGroupBy.nth` after performing column selection when using ``dropna="any"`` or ``dropna="all"`` would not subset columns (:issue:`53518`) +- Bug in :meth:`SeriesGroupBy.nth` and :meth:`DataFrameGroupBy.nth` raising an error after performing column selection when using ``dropna="any"`` or ``dropna="all"`` if this resulted in rows being dropped (:issue:`53518`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bdab641719ded..d60141bc5a96c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3202,11 +3202,14 @@ def _nth( # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf n = cast(int, n) - dropped = self.obj.dropna(how=dropna, axis=self.axis) + dropped = self._selected_obj.dropna(how=dropna, axis=self.axis) # get a new grouper for our dropped obj grouper: np.ndarray | Index | ops.BaseGrouper - if self.keys is None and self.level is None: + if len(dropped) == len(self._selected_obj): + # Nothing was dropped, can use the same grouper + grouper = self.grouper + else: # we don't have the grouper info available # (e.g. 
we have selected out # a column that is not in the current object) @@ -3220,17 +3223,6 @@ def _nth( values = np.where(nulls, NA, grouper) # type: ignore[call-overload] grouper = Index(values, dtype="Int64") - else: - # create a grouper with the original parameters, but on dropped - # object - grouper, _, _ = get_grouper( - dropped, - key=self.keys, - axis=self.axis, - level=self.level, - sort=self.sort, - ) - if self.axis == 1: grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort) else: diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index f744c5b741368..f0ca42c2e2719 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -852,3 +852,24 @@ def test_head_tail_dropna_false(): result = df.groupby(["X", "Y"], dropna=False).nth(n=0) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"])) +@pytest.mark.parametrize("dropna", ["any", "all", None]) +def test_nth_after_selection(selection, dropna): + # GH#11038, GH#53518 + df = DataFrame( + { + "a": [1, 1, 2], + "b": [np.nan, 3, 4], + "c": [5, 6, 7], + } + ) + gb = df.groupby("a")[selection] + result = gb.nth(0, dropna=dropna) + if dropna == "any" or (dropna == "all" and selection != ["b", "c"]): + locs = [1, 2] + else: + locs = [0, 2] + expected = df.loc[locs, selection] + tm.assert_equal(result, expected)