From be9c46dbf4c74fd7344e4a4811e3c73680086a38 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 17:11:33 -0800 Subject: [PATCH] REF: remove group_selection_context --- pandas/core/groupby/groupby.py | 370 +++++++----------- pandas/core/groupby/indexing.py | 1 - .../tests/groupby/transform/test_transform.py | 5 - 3 files changed, 152 insertions(+), 224 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 696f924e31179..fd9a06a06cfa7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -8,7 +8,6 @@ class providing the base-class of operations. """ from __future__ import annotations -from contextlib import contextmanager import datetime from functools import ( partial, @@ -19,7 +18,6 @@ class providing the base-class of operations. from typing import ( TYPE_CHECKING, Callable, - Generator, Hashable, Iterable, Iterator, @@ -611,7 +609,6 @@ def f(self): class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): - _group_selection: IndexLabel | None = None _hidden_attrs = PandasObject._hidden_attrs | { "as_index", "axis", @@ -738,8 +735,6 @@ def _selected_obj(self): # that and avoid making a copy. return self._obj_with_exclusions - if self._group_selection is not None: - return self._obj_with_exclusions return self.obj @final @@ -961,12 +956,8 @@ def __getattr__(self, attr: str): def _op_via_apply(self, name: str, *args, **kwargs): """Compute the result of an operation by using GroupBy's apply.""" f = getattr(type(self._obj_with_exclusions), name) - with self._group_selection_context(): - # need to setup the selection - # as are not passed directly but in the grouper - f = getattr(type(self._obj_with_exclusions), name) - if not callable(f): - return self.apply(lambda self: getattr(self, name)) + if not callable(f): + return self.apply(lambda self: getattr(self, name)) sig = inspect.signature(f) @@ -1008,56 +999,6 @@ def curried(x): # ----------------------------------------------------------------- # Selection - @final - def _set_group_selection(self) -> None: - """ - Create group based selection. - - Used when selection is not passed directly but instead via a grouper. - - NOTE: this should be paired with a call to _reset_group_selection - """ - # This is a no-op for SeriesGroupBy - grp = self.grouper - if ( - grp.groupings is None - or self.obj.ndim == 1 - or self._group_selection is not None - ): - return - - groupers = self.exclusions - - if len(groupers): - # GH12839 clear selected obj cache when group selection changes - ax = self.obj._info_axis - self._group_selection = ax.difference(Index(groupers), sort=False).tolist() - self._reset_cache("_selected_obj") - - @final - def _reset_group_selection(self) -> None: - """ - Clear group based selection. - - Used for methods needing to return info on each group regardless of - whether a group selection was previously set. - """ - if self._group_selection is not None: - # GH12839 clear cached selection too when changing group selection - self._group_selection = None - self._reset_cache("_selected_obj") - - @contextmanager - def _group_selection_context(self) -> Generator[GroupBy, None, None]: - """ - Set / reset the _group_selection_context. 
- """ - self._set_group_selection() - try: - yield self - finally: - self._reset_group_selection() - def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) @@ -1327,8 +1268,7 @@ def _numba_agg_general( if self.axis == 1: raise NotImplementedError("axis=1 is not supported.") - with self._group_selection_context(): - data = self._selected_obj + data = self._obj_with_exclusions df = data if data.ndim == 2 else data.to_frame() starts, ends, sorted_index, sorted_data = self._numba_prep(df) aggregator = executor.generate_shared_aggregator( @@ -1455,8 +1395,7 @@ def f(g): # fails on *some* columns, e.g. a numeric operation # on a string grouper column - with self._group_selection_context(): - return self._python_apply_general(f, self._selected_obj) + return self._python_apply_general(f, self._obj_with_exclusions) return result @@ -2215,117 +2154,115 @@ def _value_counts( ) name = "proportion" if normalize else "count" - with self._group_selection_context(): - df = self.obj + df = self.obj + obj = self._obj_with_exclusions - in_axis_names = { - grouping.name for grouping in self.grouper.groupings if grouping.in_axis - } - if isinstance(self._selected_obj, Series): - _name = self._selected_obj.name - keys = [] if _name in in_axis_names else [self._selected_obj] + in_axis_names = { + grouping.name for grouping in self.grouper.groupings if grouping.in_axis + } + if isinstance(obj, Series): + _name = obj.name + keys = [] if _name in in_axis_names else [obj] + else: + unique_cols = set(obj.columns) + if subset is not None: + subsetted = set(subset) + clashing = subsetted & set(in_axis_names) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys." + ) + doesnt_exist = subsetted - unique_cols + if doesnt_exist: + raise ValueError( + f"Keys {doesnt_exist} in subset do not " + f"exist in the DataFrame." + ) else: - unique_cols = set(self._selected_obj.columns) - if subset is not None: - subsetted = set(subset) - clashing = subsetted & set(in_axis_names) - if clashing: - raise ValueError( - f"Keys {clashing} in subset cannot be in " - "the groupby column keys." - ) - doesnt_exist = subsetted - unique_cols - if doesnt_exist: - raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." 
- ) - else: - subsetted = unique_cols - - keys = [ - # Can't use .values because the column label needs to be preserved - self._selected_obj.iloc[:, idx] - for idx, _name in enumerate(self._selected_obj.columns) - if _name not in in_axis_names and _name in subsetted - ] - - groupings = list(self.grouper.groupings) - for key in keys: - grouper, _, _ = get_grouper( - df, - key=key, - axis=self.axis, - sort=self.sort, - observed=False, - dropna=dropna, - ) - groupings += list(grouper.groupings) + subsetted = unique_cols + + keys = [ + # Can't use .values because the column label needs to be preserved + obj.iloc[:, idx] + for idx, _name in enumerate(obj.columns) + if _name not in in_axis_names and _name in subsetted + ] - # Take the size of the overall columns - gb = df.groupby( - groupings, + groupings = list(self.grouper.groupings) + for key in keys: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + sort=self.sort, + observed=False, + dropna=dropna, + ) + groupings += list(grouper.groupings) + + # Take the size of the overall columns + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ) + result_series = cast(Series, gb.size()) + result_series.name = name + + # GH-46357 Include non-observed categories + # of non-grouping columns regardless of `observed` + if any( + isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) + and not grouping._observed + for grouping in groupings + ): + levels_list = [ping.result_index for ping in groupings] + multi_index, _ = MultiIndex.from_product( + levels_list, names=[ping.name for ping in groupings] + ).sortlevel() + result_series = result_series.reindex(multi_index, fill_value=0) + + if normalize: + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. + levels = list( + range(len(self.grouper.groupings), result_series.index.nlevels) + ) + indexed_group_size = result_series.groupby( + result_series.index.droplevel(levels), sort=self.sort, - observed=self.observed, dropna=self.dropna, + ).transform("sum") + result_series /= indexed_group_size + + # Handle groups of non-observed categories + result_series = result_series.fillna(0.0) + + if sort: + # Sort the values and then resort by the main grouping + index_level = range(len(self.grouper.groupings)) + result_series = result_series.sort_values(ascending=ascending).sort_index( + level=index_level, sort_remaining=False ) - result_series = cast(Series, gb.size()) - result_series.name = name - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ).sortlevel() - result_series = result_series.reindex(multi_index, fill_value=0) - - if normalize: - # Normalize the results by dividing by the original group sizes. - # We are guaranteed to have the first N levels be the - # user-requested grouping. 
- levels = list( - range(len(self.grouper.groupings), result_series.index.nlevels) - ) - indexed_group_size = result_series.groupby( - result_series.index.droplevel(levels), - sort=self.sort, - dropna=self.dropna, - ).transform("sum") - result_series /= indexed_group_size - - # Handle groups of non-observed categories - result_series = result_series.fillna(0.0) - - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values( - ascending=ascending - ).sort_index(level=index_level, sort_remaining=False) - - result: Series | DataFrame - if self.as_index: - result = result_series - else: - # Convert to frame - index = result_series.index - columns = com.fill_missing_names(index.names) - if name in columns: - raise ValueError( - f"Column label '{name}' is duplicate of result column" - ) - result_series.name = name - result_series.index = index.set_names(range(len(columns))) - result_frame = result_series.reset_index() - result_frame.columns = columns + [name] - result = result_frame - return result.__finalize__(self.obj, method="value_counts") + result: Series | DataFrame + if self.as_index: + result = result_series + else: + # Convert to frame + index = result_series.index + columns = com.fill_missing_names(index.names) + if name in columns: + raise ValueError(f"Column label '{name}' is duplicate of result column") + result_series.name = name + result_series.index = index.set_names(range(len(columns))) + result_frame = result_series.reset_index() + result_frame.columns = columns + [name] + result = result_frame + return result.__finalize__(self.obj, method="value_counts") @final def sem(self, ddof: int = 1, numeric_only: bool = False): @@ -2651,36 +2588,36 @@ def describe( include=None, exclude=None, ) -> NDFrameT: - with self._group_selection_context(): - selected_obj = self._selected_obj - if len(selected_obj) == 0: - described = selected_obj.describe( + obj = self._obj_with_exclusions + + if len(obj) == 0: + described = obj.describe( + percentiles=percentiles, include=include, exclude=exclude + ) + if obj.ndim == 1: + result = described + else: + result = described.unstack() + return result.to_frame().T.iloc[:0] + + with com.temp_setattr(self, "as_index", True): + result = self._python_apply_general( + lambda x: x.describe( percentiles=percentiles, include=include, exclude=exclude - ) - if selected_obj.ndim == 1: - result = described - else: - result = described.unstack() - return result.to_frame().T.iloc[:0] - - with com.temp_setattr(self, "as_index", True): - result = self._python_apply_general( - lambda x: x.describe( - percentiles=percentiles, include=include, exclude=exclude - ), - selected_obj, - not_indexed_same=True, - ) - if self.axis == 1: - return result.T + ), + obj, + not_indexed_same=True, + ) + if self.axis == 1: + return result.T - # GH#49256 - properly handle the grouping column(s) - result = result.unstack() - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) + # GH#49256 - properly handle the grouping column(s) + result = result.unstack() + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) - return result + return result @final def resample(self, rule, *args, **kwargs): @@ -3399,25 +3336,25 @@ def ngroup(self, ascending: bool = True): 5 1 dtype: int64 """ - with self._group_selection_context(): - index = self._selected_obj._get_axis(self.axis) - 
comp_ids = self.grouper.group_info[0] + obj = self._obj_with_exclusions + index = obj._get_axis(self.axis) + comp_ids = self.grouper.group_info[0] - dtype: type - if self.grouper.has_dropped_na: - comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) - dtype = np.float64 - else: - dtype = np.int64 + dtype: type + if self.grouper.has_dropped_na: + comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) + dtype = np.float64 + else: + dtype = np.int64 - if any(ping._passed_categorical for ping in self.grouper.groupings): - # comp_ids reflect non-observed groups, we need only observed - comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 + if any(ping._passed_categorical for ping in self.grouper.groupings): + # comp_ids reflect non-observed groups, we need only observed + comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 - result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) - if not ascending: - result = self.ngroups - 1 - result - return result + result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) + if not ascending: + result = self.ngroups - 1 - result + return result @final @Substitution(name="groupby") @@ -3474,10 +3411,9 @@ def cumcount(self, ascending: bool = True): 5 0 dtype: int64 """ - with self._group_selection_context(): - index = self._selected_obj._get_axis(self.axis) - cumcounts = self._cumcount_array(ascending=ascending) - return self._obj_1d_constructor(cumcounts, index) + index = self._obj_with_exclusions._get_axis(self.axis) + cumcounts = self._cumcount_array(ascending=ascending) + return self._obj_1d_constructor(cumcounts, index) @final @Substitution(name="groupby") @@ -3957,7 +3893,6 @@ def head(self, n: int = 5) -> NDFrameT: A B 0 1 2 """ - self._reset_group_selection() mask = self._make_mask_from_positional_indexer(slice(None, n)) return self._mask_selected_obj(mask) @@ -3997,7 +3932,6 @@ def tail(self, n: int = 5) -> NDFrameT: 1 a 2 3 b 2 """ - self._reset_group_selection() if n: mask = self._make_mask_from_positional_indexer(slice(-n, None)) else: diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 40eaeea66b25c..911ee0e8e4725 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -280,7 +280,6 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows, if n is a list of ints. """ - self.groupby_object._reset_group_selection() mask = self.groupby_object._make_mask_from_positional_indexer(arg) return self.groupby_object._mask_selected_obj(mask) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 81dca42b1d74f..1313c39bb67b2 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -710,11 +710,6 @@ def test_cython_transform_frame(op, args, targop): # {"by": ['int','string']}]: gb = df.groupby(group_keys=False, **gb_target) - # allowlisted methods set the selection before applying - # bit a of hack to make sure the cythonized shift - # is equivalent to pre 0.17.1 behavior - if op == "shift": - gb._set_group_selection() if op != "shift" and "int" not in gb_target: # numeric apply fastpath promotes dtype so have
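--
Note (not part of the patch): a minimal sketch of the user-visible behavior this
refactor must preserve, using only public pandas API; the names df/gb are
illustrative, not from the patch. Grouping keys passed by label are excluded
from the object that per-group methods operate on, which is exactly what
self._obj_with_exclusions returns, so each removed
`with self._group_selection_context(): ... self._selected_obj` pair can become
a direct read of `self._obj_with_exclusions`.

    # Sketch only: demonstrates the invariant the refactor relies on.
    import pandas as pd

    df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
    gb = df.groupby("A")

    # Per-group methods operate on the non-grouping columns; the grouping
    # key "A" is excluded, matching what _obj_with_exclusions returns.
    print(gb.cumcount().tolist())  # [0, 1, 0]
    print(gb.ngroup().tolist())    # [0, 0, 1]

    # Reductions likewise never include the in-axis grouping key:
    print(gb.sum())  # a single "B" column, indexed by group labels "x", "y"

Because that exclusion is already precomputed by the `_obj_with_exclusions`
property, the stateful set/reset dance (`_set_group_selection`,
`_reset_group_selection`, and the context manager pairing them) carries no
information of its own, which is why the patch can delete it mechanically.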