From be9c46dbf4c74fd7344e4a4811e3c73680086a38 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 1 Feb 2023 17:11:33 -0800 Subject: [PATCH] REF: remove group_selection_context --- pandas/core/groupby/groupby.py | 370 +++++++----------- pandas/core/groupby/indexing.py | 1 - .../tests/groupby/transform/test_transform.py | 5 - 3 files changed, 152 insertions(+), 224 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 696f924e31179..fd9a06a06cfa7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -8,7 +8,6 @@ class providing the base-class of operations. """ from __future__ import annotations -from contextlib import contextmanager import datetime from functools import ( partial, @@ -19,7 +18,6 @@ class providing the base-class of operations. from typing import ( TYPE_CHECKING, Callable, - Generator, Hashable, Iterable, Iterator, @@ -611,7 +609,6 @@ def f(self): class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): - _group_selection: IndexLabel | None = None _hidden_attrs = PandasObject._hidden_attrs | { "as_index", "axis", @@ -738,8 +735,6 @@ def _selected_obj(self): # that and avoid making a copy. return self._obj_with_exclusions - if self._group_selection is not None: - return self._obj_with_exclusions return self.obj @final @@ -961,12 +956,8 @@ def __getattr__(self, attr: str): def _op_via_apply(self, name: str, *args, **kwargs): """Compute the result of an operation by using GroupBy's apply.""" f = getattr(type(self._obj_with_exclusions), name) - with self._group_selection_context(): - # need to setup the selection - # as are not passed directly but in the grouper - f = getattr(type(self._obj_with_exclusions), name) - if not callable(f): - return self.apply(lambda self: getattr(self, name)) + if not callable(f): + return self.apply(lambda self: getattr(self, name)) sig = inspect.signature(f) @@ -1008,56 +999,6 @@ def curried(x): # ----------------------------------------------------------------- # Selection - @final - def _set_group_selection(self) -> None: - """ - Create group based selection. - - Used when selection is not passed directly but instead via a grouper. - - NOTE: this should be paired with a call to _reset_group_selection - """ - # This is a no-op for SeriesGroupBy - grp = self.grouper - if ( - grp.groupings is None - or self.obj.ndim == 1 - or self._group_selection is not None - ): - return - - groupers = self.exclusions - - if len(groupers): - # GH12839 clear selected obj cache when group selection changes - ax = self.obj._info_axis - self._group_selection = ax.difference(Index(groupers), sort=False).tolist() - self._reset_cache("_selected_obj") - - @final - def _reset_group_selection(self) -> None: - """ - Clear group based selection. - - Used for methods needing to return info on each group regardless of - whether a group selection was previously set. - """ - if self._group_selection is not None: - # GH12839 clear cached selection too when changing group selection - self._group_selection = None - self._reset_cache("_selected_obj") - - @contextmanager - def _group_selection_context(self) -> Generator[GroupBy, None, None]: - """ - Set / reset the _group_selection_context. 
- """ - self._set_group_selection() - try: - yield self - finally: - self._reset_group_selection() - def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) @@ -1327,8 +1268,7 @@ def _numba_agg_general( if self.axis == 1: raise NotImplementedError("axis=1 is not supported.") - with self._group_selection_context(): - data = self._selected_obj + data = self._obj_with_exclusions df = data if data.ndim == 2 else data.to_frame() starts, ends, sorted_index, sorted_data = self._numba_prep(df) aggregator = executor.generate_shared_aggregator( @@ -1455,8 +1395,7 @@ def f(g): # fails on *some* columns, e.g. a numeric operation # on a string grouper column - with self._group_selection_context(): - return self._python_apply_general(f, self._selected_obj) + return self._python_apply_general(f, self._obj_with_exclusions) return result @@ -2215,117 +2154,115 @@ def _value_counts( ) name = "proportion" if normalize else "count" - with self._group_selection_context(): - df = self.obj + df = self.obj + obj = self._obj_with_exclusions - in_axis_names = { - grouping.name for grouping in self.grouper.groupings if grouping.in_axis - } - if isinstance(self._selected_obj, Series): - _name = self._selected_obj.name - keys = [] if _name in in_axis_names else [self._selected_obj] + in_axis_names = { + grouping.name for grouping in self.grouper.groupings if grouping.in_axis + } + if isinstance(obj, Series): + _name = obj.name + keys = [] if _name in in_axis_names else [obj] + else: + unique_cols = set(obj.columns) + if subset is not None: + subsetted = set(subset) + clashing = subsetted & set(in_axis_names) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys." + ) + doesnt_exist = subsetted - unique_cols + if doesnt_exist: + raise ValueError( + f"Keys {doesnt_exist} in subset do not " + f"exist in the DataFrame." + ) else: - unique_cols = set(self._selected_obj.columns) - if subset is not None: - subsetted = set(subset) - clashing = subsetted & set(in_axis_names) - if clashing: - raise ValueError( - f"Keys {clashing} in subset cannot be in " - "the groupby column keys." - ) - doesnt_exist = subsetted - unique_cols - if doesnt_exist: - raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." 
- ) - else: - subsetted = unique_cols - - keys = [ - # Can't use .values because the column label needs to be preserved - self._selected_obj.iloc[:, idx] - for idx, _name in enumerate(self._selected_obj.columns) - if _name not in in_axis_names and _name in subsetted - ] - - groupings = list(self.grouper.groupings) - for key in keys: - grouper, _, _ = get_grouper( - df, - key=key, - axis=self.axis, - sort=self.sort, - observed=False, - dropna=dropna, - ) - groupings += list(grouper.groupings) + subsetted = unique_cols + + keys = [ + # Can't use .values because the column label needs to be preserved + obj.iloc[:, idx] + for idx, _name in enumerate(obj.columns) + if _name not in in_axis_names and _name in subsetted + ] - # Take the size of the overall columns - gb = df.groupby( - groupings, + groupings = list(self.grouper.groupings) + for key in keys: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + sort=self.sort, + observed=False, + dropna=dropna, + ) + groupings += list(grouper.groupings) + + # Take the size of the overall columns + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ) + result_series = cast(Series, gb.size()) + result_series.name = name + + # GH-46357 Include non-observed categories + # of non-grouping columns regardless of `observed` + if any( + isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) + and not grouping._observed + for grouping in groupings + ): + levels_list = [ping.result_index for ping in groupings] + multi_index, _ = MultiIndex.from_product( + levels_list, names=[ping.name for ping in groupings] + ).sortlevel() + result_series = result_series.reindex(multi_index, fill_value=0) + + if normalize: + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. + levels = list( + range(len(self.grouper.groupings), result_series.index.nlevels) + ) + indexed_group_size = result_series.groupby( + result_series.index.droplevel(levels), sort=self.sort, - observed=self.observed, dropna=self.dropna, + ).transform("sum") + result_series /= indexed_group_size + + # Handle groups of non-observed categories + result_series = result_series.fillna(0.0) + + if sort: + # Sort the values and then resort by the main grouping + index_level = range(len(self.grouper.groupings)) + result_series = result_series.sort_values(ascending=ascending).sort_index( + level=index_level, sort_remaining=False ) - result_series = cast(Series, gb.size()) - result_series.name = name - # GH-46357 Include non-observed categories - # of non-grouping columns regardless of `observed` - if any( - isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex)) - and not grouping._observed - for grouping in groupings - ): - levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( - levels_list, names=[ping.name for ping in groupings] - ).sortlevel() - result_series = result_series.reindex(multi_index, fill_value=0) - - if normalize: - # Normalize the results by dividing by the original group sizes. - # We are guaranteed to have the first N levels be the - # user-requested grouping. 
- levels = list( - range(len(self.grouper.groupings), result_series.index.nlevels) - ) - indexed_group_size = result_series.groupby( - result_series.index.droplevel(levels), - sort=self.sort, - dropna=self.dropna, - ).transform("sum") - result_series /= indexed_group_size - - # Handle groups of non-observed categories - result_series = result_series.fillna(0.0) - - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values( - ascending=ascending - ).sort_index(level=index_level, sort_remaining=False) - - result: Series | DataFrame - if self.as_index: - result = result_series - else: - # Convert to frame - index = result_series.index - columns = com.fill_missing_names(index.names) - if name in columns: - raise ValueError( - f"Column label '{name}' is duplicate of result column" - ) - result_series.name = name - result_series.index = index.set_names(range(len(columns))) - result_frame = result_series.reset_index() - result_frame.columns = columns + [name] - result = result_frame - return result.__finalize__(self.obj, method="value_counts") + result: Series | DataFrame + if self.as_index: + result = result_series + else: + # Convert to frame + index = result_series.index + columns = com.fill_missing_names(index.names) + if name in columns: + raise ValueError(f"Column label '{name}' is duplicate of result column") + result_series.name = name + result_series.index = index.set_names(range(len(columns))) + result_frame = result_series.reset_index() + result_frame.columns = columns + [name] + result = result_frame + return result.__finalize__(self.obj, method="value_counts") @final def sem(self, ddof: int = 1, numeric_only: bool = False): @@ -2651,36 +2588,36 @@ def describe( include=None, exclude=None, ) -> NDFrameT: - with self._group_selection_context(): - selected_obj = self._selected_obj - if len(selected_obj) == 0: - described = selected_obj.describe( + obj = self._obj_with_exclusions + + if len(obj) == 0: + described = obj.describe( + percentiles=percentiles, include=include, exclude=exclude + ) + if obj.ndim == 1: + result = described + else: + result = described.unstack() + return result.to_frame().T.iloc[:0] + + with com.temp_setattr(self, "as_index", True): + result = self._python_apply_general( + lambda x: x.describe( percentiles=percentiles, include=include, exclude=exclude - ) - if selected_obj.ndim == 1: - result = described - else: - result = described.unstack() - return result.to_frame().T.iloc[:0] - - with com.temp_setattr(self, "as_index", True): - result = self._python_apply_general( - lambda x: x.describe( - percentiles=percentiles, include=include, exclude=exclude - ), - selected_obj, - not_indexed_same=True, - ) - if self.axis == 1: - return result.T + ), + obj, + not_indexed_same=True, + ) + if self.axis == 1: + return result.T - # GH#49256 - properly handle the grouping column(s) - result = result.unstack() - if not self.as_index: - result = self._insert_inaxis_grouper(result) - result.index = default_index(len(result)) + # GH#49256 - properly handle the grouping column(s) + result = result.unstack() + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) - return result + return result @final def resample(self, rule, *args, **kwargs): @@ -3399,25 +3336,25 @@ def ngroup(self, ascending: bool = True): 5 1 dtype: int64 """ - with self._group_selection_context(): - index = self._selected_obj._get_axis(self.axis) - 
comp_ids = self.grouper.group_info[0] + obj = self._obj_with_exclusions + index = obj._get_axis(self.axis) + comp_ids = self.grouper.group_info[0] - dtype: type - if self.grouper.has_dropped_na: - comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) - dtype = np.float64 - else: - dtype = np.int64 + dtype: type + if self.grouper.has_dropped_na: + comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) + dtype = np.float64 + else: + dtype = np.int64 - if any(ping._passed_categorical for ping in self.grouper.groupings): - # comp_ids reflect non-observed groups, we need only observed - comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 + if any(ping._passed_categorical for ping in self.grouper.groupings): + # comp_ids reflect non-observed groups, we need only observed + comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 - result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) - if not ascending: - result = self.ngroups - 1 - result - return result + result = self._obj_1d_constructor(comp_ids, index, dtype=dtype) + if not ascending: + result = self.ngroups - 1 - result + return result @final @Substitution(name="groupby") @@ -3474,10 +3411,9 @@ def cumcount(self, ascending: bool = True): 5 0 dtype: int64 """ - with self._group_selection_context(): - index = self._selected_obj._get_axis(self.axis) - cumcounts = self._cumcount_array(ascending=ascending) - return self._obj_1d_constructor(cumcounts, index) + index = self._obj_with_exclusions._get_axis(self.axis) + cumcounts = self._cumcount_array(ascending=ascending) + return self._obj_1d_constructor(cumcounts, index) @final @Substitution(name="groupby") @@ -3957,7 +3893,6 @@ def head(self, n: int = 5) -> NDFrameT: A B 0 1 2 """ - self._reset_group_selection() mask = self._make_mask_from_positional_indexer(slice(None, n)) return self._mask_selected_obj(mask) @@ -3997,7 +3932,6 @@ def tail(self, n: int = 5) -> NDFrameT: 1 a 2 3 b 2 """ - self._reset_group_selection() if n: mask = self._make_mask_from_positional_indexer(slice(-n, None)) else: diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 40eaeea66b25c..911ee0e8e4725 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -280,7 +280,6 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows, if n is a list of ints. """ - self.groupby_object._reset_group_selection() mask = self.groupby_object._make_mask_from_positional_indexer(arg) return self.groupby_object._mask_selected_obj(mask) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 81dca42b1d74f..1313c39bb67b2 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -710,11 +710,6 @@ def test_cython_transform_frame(op, args, targop): # {"by": ['int','string']}]: gb = df.groupby(group_keys=False, **gb_target) - # allowlisted methods set the selection before applying - # bit a of hack to make sure the cythonized shift - # is equivalent to pre 0.17.1 behavior - if op == "shift": - gb._set_group_selection() if op != "shift" and "int" not in gb_target: # numeric apply fastpath promotes dtype so have
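--
Note (not part of the patch): a minimal sketch of the user-visible behavior this
refactor must preserve, using only public pandas API; the names df/gb are
illustrative, not from the patch. Grouping keys passed by label are excluded
from the object that per-group methods operate on, which is exactly what
self._obj_with_exclusions returns, so each removed
`with self._group_selection_context(): ... self._selected_obj` pair can become
a direct read of `self._obj_with_exclusions`.

    # Sketch only: demonstrates the invariant the refactor relies on.
    import pandas as pd

    df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
    gb = df.groupby("A")

    # Per-group methods operate on the non-grouping columns; the grouping
    # key "A" is excluded, matching what _obj_with_exclusions returns.
    print(gb.cumcount().tolist())  # [0, 1, 0]
    print(gb.ngroup().tolist())    # [0, 0, 1]

    # Reductions likewise never include the in-axis grouping key:
    print(gb.sum())  # a single "B" column, indexed by group labels "x", "y"

Because that exclusion is already precomputed by the `_obj_with_exclusions`
property, the stateful set/reset dance (`_set_group_selection`,
`_reset_group_selection`, and the context manager pairing them) carries no
information of its own, which is why the patch can delete it mechanically.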