From aa9c9e1a43ad843eeb3cd0366a4c119e5b9d073f Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 28 Dec 2022 13:59:14 -0500 Subject: [PATCH 1/3] REF: groupby Series selection with as_index=False --- pandas/core/apply.py | 78 +++++++++++++++++--------- pandas/core/base.py | 13 ++--- pandas/core/groupby/generic.py | 84 ++++++++++++++++------------ pandas/core/groupby/groupby.py | 63 ++++++++++++++------- pandas/core/groupby/ops.py | 2 +- pandas/core/series.py | 2 + pandas/tests/groupby/test_groupby.py | 2 + 7 files changed, 154 insertions(+), 90 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 02a9444dd4f97..d6de62676028d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,6 +2,7 @@ import abc from collections import defaultdict +from contextlib import nullcontext from functools import partial import inspect from typing import ( @@ -292,6 +293,10 @@ def agg_list_like(self) -> DataFrame | Series: ------- Result of aggregation. """ + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) from pandas.core.reshape.concat import concat obj = self.obj @@ -312,26 +317,35 @@ def agg_list_like(self) -> DataFrame | Series: results = [] keys = [] - # degenerate case - if selected_obj.ndim == 1: - for a in arg: - colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) - new_res = colg.aggregate(a) - results.append(new_res) + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + if is_groupby: + # When as_index=False, we combine all results using indices + # and adjust index after + context_manager = com.temp_setattr(obj, "as_index", True) + else: + context_manager = nullcontext() + with context_manager: + # degenerate case + if selected_obj.ndim == 1: - # make sure we find a good name - name = com.get_callable_name(a) or a - keys.append(name) + for a in arg: + colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) + new_res = colg.aggregate(a) + results.append(new_res) - # multiples - else: - indices = [] - for index, col in enumerate(selected_obj): - colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) - new_res = colg.aggregate(arg) - results.append(new_res) - indices.append(index) - keys = selected_obj.columns.take(indices) + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) + + # multiples + else: + indices = [] + for index, col in enumerate(selected_obj): + colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) + new_res = colg.aggregate(arg) + results.append(new_res) + indices.append(index) + keys = selected_obj.columns.take(indices) try: concatenated = concat(results, keys=keys, axis=1, sort=False) @@ -366,6 +380,10 @@ def agg_dict_like(self) -> DataFrame | Series: Result of aggregation. """ from pandas import Index + from pandas.core.groupby.generic import ( + DataFrameGroupBy, + SeriesGroupBy, + ) from pandas.core.reshape.concat import concat obj = self.obj @@ -384,15 +402,23 @@ def agg_dict_like(self) -> DataFrame | Series: arg = self.normalize_dictlike_arg("agg", selected_obj, arg) - if selected_obj.ndim == 1: - # key only used for output - colg = obj._gotitem(selection, ndim=1) - results = {key: colg.agg(how) for key, how in arg.items()} + is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + if is_groupby: + # When as_index=False, we combine all results using indices + # and adjust index after + context_manager = com.temp_setattr(obj, "as_index", True) else: - # key used for column selection and output - results = { - key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() - } + context_manager = nullcontext() + with context_manager: + if selected_obj.ndim == 1: + # key only used for output + colg = obj._gotitem(selection, ndim=1) + results = {key: colg.agg(how) for key, how in arg.items()} + else: + # key used for column selection and output + results = { + key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() + } # set the final keys keys = list(arg.keys()) diff --git a/pandas/core/base.py b/pandas/core/base.py index 826583fd26f5d..8559640c1858d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -216,6 +216,9 @@ def _obj_with_exclusions(self): if self._selection is not None and isinstance(self.obj, ABCDataFrame): return self.obj[self._selection_list] + if isinstance(self.obj, ABCSeries): + return self.obj + if len(self.exclusions) > 0: # equivalent to `self.obj.drop(self.exclusions, axis=1) # but this avoids consolidating and making a copy @@ -235,17 +238,11 @@ def __getitem__(self, key): raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) - elif not getattr(self, "as_index", False): - if key not in self.obj.columns: - raise KeyError(f"Column not found: {key}") - return self._gotitem(key, ndim=2) - else: if key not in self.obj: raise KeyError(f"Column not found: {key}") - subset = self.obj[key] - ndim = subset.ndim - return self._gotitem(key, ndim=ndim, subset=subset) + ndim = self.obj[key].ndim + return self._gotitem(key, ndim=ndim) def _gotitem(self, key, ndim: int, subset=None): """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 905c1193713cc..09648e0d3e040 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -248,7 +248,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs ) index = self.grouper.result_index - return self.obj._constructor(result.ravel(), index=index, name=data.name) + result = self.obj._constructor(result.ravel(), index=index, name=data.name) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result relabeling = func is None columns = None @@ -268,6 +272,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # columns is not narrowed by mypy from relabeling flag assert columns is not None # for mypy ret.columns = columns + if not self.as_index: + ret = self._insert_inaxis_grouper(ret) + ret.index = default_index(len(ret)) return ret else: @@ -287,23 +294,24 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # result is a dict whose keys are the elements of result_index index = self.grouper.result_index - return Series(result, index=index) + result = Series(result, index=index) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) + return result agg = aggregate def _aggregate_multiple_funcs(self, arg) -> DataFrame: if isinstance(arg, dict): - - # show the deprecation, but only if we - # have not shown a higher level one - # GH 15931 - raise SpecificationError("nested renamer is not supported") - - if any(isinstance(x, (tuple, list)) for x in arg): + if self.as_index: + # GH 15931 + raise SpecificationError("nested renamer is not supported") + else: + # GH#50684 - This accidentally worked in 1.x + arg = list(arg.items()) + elif any(isinstance(x, (tuple, list)) for x in arg): arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] - - # indicated column order - columns = next(zip(*arg)) else: # list of functions / function names columns = [] @@ -313,10 +321,13 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: arg = zip(columns, arg) results: dict[base.OutputKey, DataFrame | Series] = {} - for idx, (name, func) in enumerate(arg): + with com.temp_setattr(self, "as_index", True): + # Combine results using the index, need to adjust index after + # if as_index=False (GH#50724) + for idx, (name, func) in enumerate(arg): - key = base.OutputKey(label=name, position=idx) - results[key] = self.aggregate(func) + key = base.OutputKey(label=name, position=idx) + results[key] = self.aggregate(func) if any(isinstance(x, DataFrame) for x in results.values()): from pandas import concat @@ -396,12 +407,18 @@ def _wrap_applied_output( ) if isinstance(result, Series): result.name = self.obj.name + if not self.as_index and not_indexed_same: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return result else: # GH #6265 #24880 result = self.obj._constructor( data=values, index=self.grouper.result_index, name=self.obj.name ) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -630,6 +647,9 @@ def nunique(self, dropna: bool = True) -> Series: res[ids[idx]] = out result = self.obj._constructor(res, index=ri, name=self.obj.name) + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return self._reindex_output(result, fill_value=0) @doc(Series.describe) @@ -643,12 +663,11 @@ def value_counts( ascending: bool = False, bins=None, dropna: bool = True, - ) -> Series: + ) -> Series | DataFrame: if bins is None: result = self._value_counts( normalize=normalize, sort=sort, ascending=ascending, dropna=dropna ) - assert isinstance(result, Series) return result from pandas.core.reshape.merge import get_join_indexers @@ -786,7 +805,11 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: if is_integer_dtype(out.dtype): out = ensure_int64(out) - return self.obj._constructor(out, index=mi, name=self.obj.name) + result = self.obj._constructor(out, index=mi, name=self.obj.name) + if not self.as_index: + result.name = "proportion" if normalize else "count" + result = result.reset_index() + return result def fillna( self, @@ -1274,7 +1297,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = result.columns.droplevel(-1) if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) return result @@ -1386,7 +1409,7 @@ def _wrap_applied_output( return self.obj._constructor_sliced(values, index=key_index) else: result = self.obj._constructor(values, columns=[self._selection]) - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) return result else: # values are Series @@ -1443,7 +1466,7 @@ def _wrap_applied_output_series( result = self.obj._constructor(stacked_values, index=index, columns=columns) if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) return self._reindex_output(result) @@ -1774,7 +1797,9 @@ def _gotitem(self, key, ndim: int, subset=None): subset, level=self.level, grouper=self.grouper, + exclusions=self.exclusions, selection=key, + as_index=self.as_index, sort=self.sort, group_keys=self.group_keys, observed=self.observed, @@ -1790,19 +1815,6 @@ def _get_data_to_aggregate(self) -> Manager2D: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: - # zip in reverse so we can always insert at loc 0 - columns = result.columns - for name, lev, in_axis in zip( - reversed(self.grouper.names), - reversed(self.grouper.get_group_levels()), - reversed([grp.in_axis for grp in self.grouper.groupings]), - ): - # GH #28549 - # When using .apply(-), name will be in columns already - if in_axis and name not in columns: - result.insert(0, name, lev) - def _indexed_output_to_ndframe( self, output: Mapping[base.OutputKey, ArrayLike] ) -> DataFrame: @@ -1825,7 +1837,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: mgr.set_axis(1, index) result = self.obj._constructor(mgr) - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) result = result._consolidate() else: index = self.grouper.result_index @@ -1918,7 +1930,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: if not self.as_index: results.index = default_index(len(results)) - self._insert_inaxis_grouper_inplace(results) + results = self._insert_inaxis_grouper(results) return results diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 431b23023b094..a7e3b4215625b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -123,6 +123,7 @@ class providing the base-class of operations. Index, MultiIndex, RangeIndex, + default_index, ) from pandas.core.internals.blocks import ensure_block_shape from pandas.core.series import Series @@ -910,8 +911,6 @@ def __init__( self.level = level if not as_index: - if not isinstance(obj, DataFrame): - raise TypeError("as_index=False only valid with DataFrame") if axis != 0: raise ValueError("as_index=False only valid for axis=0") @@ -1157,6 +1156,24 @@ def _set_result_index_ordered( return result + def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: + if isinstance(result, Series): + result = result.to_frame() + + # zip in reverse so we can always insert at loc 0 + columns = result.columns + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), + ): + # GH #28549 + # When using .apply(-), name will be in columns already + if in_axis and name not in columns: + result.insert(0, name, lev) + + return result + def _indexed_output_to_ndframe( self, result: Mapping[base.OutputKey, ArrayLike] ) -> Series | DataFrame: @@ -1193,7 +1210,7 @@ def _wrap_aggregated_output( if not self.as_index: # `not self.as_index` is only relevant for DataFrameGroupBy, # enforced in __init__ - self._insert_inaxis_grouper_inplace(result) + result = self._insert_inaxis_grouper(result) result = result._consolidate() index = Index(range(self.grouper.ngroups)) @@ -1613,7 +1630,10 @@ def array_func(values: ArrayLike) -> ArrayLike: res = self._wrap_agged_manager(new_mgr) if is_ser: - res.index = self.grouper.result_index + if self.as_index: + res.index = self.grouper.result_index + else: + res = self._insert_inaxis_grouper(res) return self._reindex_output(res) else: return res @@ -1887,7 +1907,10 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: result = self._wrap_agged_manager(new_mgr) if result.ndim == 1: - result.index = self.grouper.result_index + if self.as_index: + result.index = self.grouper.result_index + else: + result = self._insert_inaxis_grouper(result) return self._reindex_output(result, fill_value=0) @@ -2622,31 +2645,33 @@ def describe( exclude=None, ) -> NDFrameT: with self._group_selection_context(): - if len(self._selected_obj) == 0: - described = self._selected_obj.describe( + selected_obj = self._selected_obj + if len(selected_obj) == 0: + described = selected_obj.describe( percentiles=percentiles, include=include, exclude=exclude ) - if self._selected_obj.ndim == 1: + if selected_obj.ndim == 1: result = described else: result = described.unstack() return result.to_frame().T.iloc[:0] - result = self._python_apply_general( - lambda x: x.describe( - percentiles=percentiles, include=include, exclude=exclude - ), - self._selected_obj, - not_indexed_same=True, - ) + with com.temp_setattr(self, "as_index", True): + result = self._python_apply_general( + lambda x: x.describe( + percentiles=percentiles, include=include, exclude=exclude + ), + selected_obj, + not_indexed_same=True, + ) if self.axis == 1: return result.T # GH#49256 - properly handle the grouping column(s) - if self._selected_obj.ndim != 1 or self.as_index: - result = result.unstack() - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + result = result.unstack() + if not self.as_index: + result = self._insert_inaxis_grouper(result) + result.index = default_index(len(result)) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ea902800cf7e0..f88236b2464c1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -946,7 +946,7 @@ def result_index(self) -> Index: @final def get_group_levels(self) -> list[ArrayLike]: - # Note: only called from _insert_inaxis_grouper_inplace, which + # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: return [self.groupings[0].group_arraylike] diff --git a/pandas/core/series.py b/pandas/core/series.py index 6b82d48f82ce7..ea6725fde5908 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1977,6 +1977,8 @@ def groupby( if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") + if not as_index: + raise TypeError("as_index=False only valid with DataFrame") axis = self._get_axis_number(axis) return SeriesGroupBy( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3baf2d86010f7..c3ce3a1cc84c7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -652,6 +652,8 @@ def test_groupby_as_index_select_column_sum_empty_df(): left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False) expected = DataFrame(columns=df.columns[:2], index=range(0)) + # GH#?? - Columns after selection shouldn't retain names + expected.columns.names = [None] tm.assert_frame_equal(left, expected) From 7d00d07bf36468e97a1da910362885ccfb42710b Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 14 Jan 2023 10:46:26 -0500 Subject: [PATCH 2/3] GH# --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c3ce3a1cc84c7..9b293f0f1669c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -652,7 +652,7 @@ def test_groupby_as_index_select_column_sum_empty_df(): left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False) expected = DataFrame(columns=df.columns[:2], index=range(0)) - # GH#?? - Columns after selection shouldn't retain names + # GH#50744 - Columns after selection shouldn't retain names expected.columns.names = [None] tm.assert_frame_equal(left, expected) From 41399ad544fbcf3ab281f9264b34b62ecd74141a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 16 Jan 2023 17:08:22 -0500 Subject: [PATCH 3/3] type-hinting fixes --- pandas/core/apply.py | 3 +++ pandas/core/groupby/generic.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d6de62676028d..c28da1bc758cd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -9,6 +9,7 @@ TYPE_CHECKING, Any, Callable, + ContextManager, DefaultDict, Dict, Hashable, @@ -318,6 +319,7 @@ def agg_list_like(self) -> DataFrame | Series: keys = [] is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + context_manager: ContextManager if is_groupby: # When as_index=False, we combine all results using indices # and adjust index after @@ -403,6 +405,7 @@ def agg_dict_like(self) -> DataFrame | Series: arg = self.normalize_dictlike_arg("agg", selected_obj, arg) is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy)) + context_manager: ContextManager if is_groupby: # When as_index=False, we combine all results using indices # and adjust index after diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 09648e0d3e040..2340c36d14301 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -594,7 +594,7 @@ def true_and_notna(x) -> bool: filtered = self._apply_filter(indices, dropna) return filtered - def nunique(self, dropna: bool = True) -> Series: + def nunique(self, dropna: bool = True) -> Series | DataFrame: """ Return number of unique elements in the group. @@ -646,7 +646,9 @@ def nunique(self, dropna: bool = True) -> Series: # GH#21334s res[ids[idx]] = out - result = self.obj._constructor(res, index=ri, name=self.obj.name) + result: Series | DataFrame = self.obj._constructor( + res, index=ri, name=self.obj.name + ) if not self.as_index: result = self._insert_inaxis_grouper(result) result.index = default_index(len(result))