diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 48c1173a372a7..68beeea2849bb 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -376,6 +376,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) +- Bug in :meth:`DataFrameGroupBy` causing unexpected mutations of the groupby object (:issue:`28523`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) Reshaping diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f622480cfe4b7..34e78ff82acee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -594,63 +594,63 @@ def pipe(self, func, *args, **kwargs): def _make_wrapper(self, name): assert name in self._apply_whitelist - self._set_group_selection() - - # need to setup the selection - # as are not passed directly but in the grouper - f = getattr(self._selected_obj, name) - if not isinstance(f, types.MethodType): - return self.apply(lambda self: getattr(self, name)) - - f = getattr(type(self._selected_obj), name) - sig = inspect.signature(f) - - def wrapper(*args, **kwargs): - # a little trickery for aggregation functions that need an axis - # argument - if "axis" in sig.parameters: - if kwargs.get("axis", None) is None: - kwargs["axis"] = self.axis - - def curried(x): - return f(x, *args, **kwargs) + with _group_selection_context(self): + # need to setup the selection + # as are not passed directly but in the grouper + f = getattr(self._selected_obj, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) + + f = 
getattr(type(self._selected_obj), name) + sig = inspect.signature(f) + + def wrapper(*args, **kwargs): + # a little trickery for aggregation functions that need an axis + # argument + if "axis" in sig.parameters: + if kwargs.get("axis", None) is None: + kwargs["axis"] = self.axis + + def curried(x): + return f(x, *args, **kwargs) + + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = name + + # special case otherwise extra plots are created when catching the + # exception below + if name in base.plotting_methods: + return self.apply(curried) - # preserve the name so we can detect it when calling plot methods, - # to avoid duplicates - curried.__name__ = name + try: + return self.apply(curried) + except TypeError as err: + if not re.search( + "reduction operation '.*' not allowed for this dtype", str(err) + ): + # We don't have a cython implementation + # TODO: is the above comment accurate? + raise - # special case otherwise extra plots are created when catching the - # exception below - if name in base.plotting_methods: - return self.apply(curried) + # related to : GH3688 + # try item-by-item + # this can be called recursively, so need to raise + # ValueError + # if we don't have this method to indicated to aggregate to + # mark this column as an error + try: + return self._aggregate_item_by_item(name, *args, **kwargs) + except AttributeError: + # e.g. SparseArray has no flags attr + # FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item' + # occurs in idxmax() case + # in tests.groupby.test_function.test_non_cython_api + raise ValueError - try: - return self.apply(curried) - except TypeError as err: - if not re.search( - "reduction operation '.*' not allowed for this dtype", str(err) - ): - # We don't have a cython implementation - # TODO: is the above comment accurate? 
- raise - - # related to : GH3688 - # try item-by-item - # this can be called recursively, so need to raise - # ValueError - # if we don't have this method to indicated to aggregate to - # mark this column as an error - try: - return self._aggregate_item_by_item(name, *args, **kwargs) - except AttributeError: - # e.g. SparseArray has no flags attr - # FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item' - # occurs in idxmax() case - # in tests.groupby.test_function.test_non_cython_api - raise ValueError + wrapper.__name__ = name - wrapper.__name__ = name - return wrapper + return wrapper def get_group(self, name, obj=None): """ @@ -721,7 +721,9 @@ def f(g): f = func # ignore SettingWithCopy here in case the user mutates - with option_context("mode.chained_assignment", None): + with option_context( + "mode.chained_assignment", None + ) as _, _group_selection_context(self) as _: try: result = self._python_apply_general(f) except TypeError: @@ -732,9 +734,7 @@ def f(g): # except if the udf is trying an operation that # fails on *some* columns, e.g. 
a numeric operation # on a string grouper column - - with _group_selection_context(self): - return self._python_apply_general(f) + return self._python_apply_general(f) return result @@ -1335,35 +1335,34 @@ def f(self, **kwargs): if "min_count" not in kwargs: kwargs["min_count"] = min_count - self._set_group_selection() - - # try a cython aggregation if we can - try: - return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except DataError: - pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes - pass - elif "decimal does not support skipna=True" in str(err): - # FIXME: kludge for test_decimal:test_in_numeric_groupby + with _group_selection_context(self): + # try a cython aggregation if we can + try: + return self._cython_agg_general(alias, alt=npfunc, **kwargs) + except DataError: pass + except NotImplementedError as err: + if "function is not implemented for this dtype" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for test_decimal:test_in_numeric_groupby + pass + else: + raise + + # apply a non-cython aggregation + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + + # coerce the resulting columns if we can + if isinstance(result, DataFrame): + for col in result.columns: + result[col] = self._try_cast(result[col], self.obj[col]) else: - raise - - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - - # coerce the resulting columns if we can - if isinstance(result, DataFrame): - for col in result.columns: - result[col] = self._try_cast(result[col], self.obj[col]) - else: - result = self._try_cast(result, self.obj) + result = self._try_cast(result, self.obj) - return result + 
return result set_function_name(f, name, cls) @@ -1736,28 +1735,30 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - self._set_group_selection() - mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) - mask = mask_left | mask_right + with _group_selection_context(self): + mask_left = np.in1d(self._cumcount_array(), nth_array) + mask_right = np.in1d( + self._cumcount_array(ascending=False) + 1, -nth_array + ) + mask = mask_left | mask_right - ids, _, _ = self.grouper.group_info + ids, _, _ = self.grouper.group_info - # Drop NA values in grouping - mask = mask & (ids != -1) + # Drop NA values in grouping + mask = mask & (ids != -1) - out = self._selected_obj[mask] - if not self.as_index: - return out + out = self._selected_obj[mask] + if not self.as_index: + return out - result_index = self.grouper.result_index - out.index = result_index[ids[mask]] + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] - if not self.observed and isinstance(result_index, CategoricalIndex): - out = out.reindex(result_index) + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) - return out.sort_index() if self.sort else out + return out.sort_index() if self.sort else out # dropna is truthy if isinstance(n, valid_containers): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 5dad868c8c3aa..e89d5cfeb10cd 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -482,13 +482,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] + assert ts == grouped.apply(lambda x: 
x.iloc[0]).drop("A", 1).iloc[0, 0] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] + assert ts == grouped.apply(lambda x: x.iloc[-1]).drop("A", 1).iloc[0, 0] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index dff5baa9b5984..831d053f603ec 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -433,6 +433,23 @@ def test_frame_groupby_columns(tsframe): assert len(v.columns) == 2 +def test_frame_groupby_avoids_mutate(reduction_func): + # GH28523 + func = reduction_func + df = pd.DataFrame({"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]}) + grouped = df.groupby("A") + + expected = grouped.apply(lambda x: x) + + args = {"nth": [0], "quantile": [0.5]}.get(func, []) + fn = getattr(grouped, func) + fn(*args) + + result = grouped.apply(lambda x: x) + + tm.assert_frame_equal(expected, result) + + def test_frame_set_name_single(df): grouped = df.groupby("A") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 403f5f11ee768..0adc1cda6c499 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -172,7 +172,7 @@ def test_grouper_creation_bug(self): result = g.sum() assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + result = g.apply(lambda x: x.sum()).drop("A", 1) assert_frame_equal(result, expected) g = df.groupby(pd.Grouper(key="A", axis=0))