diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 73892da2cbf71..15adb74e27ba7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -817,7 +817,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) - +- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d9b65f92ac0e1..b630aed69be10 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -649,11 +649,11 @@ def _set_group_selection(self): ): return - ax = self.obj._info_axis groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis] if len(groupers): # GH12839 clear selected obj cache when group selection changes + ax = self.obj._info_axis self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") @@ -1368,8 +1368,18 @@ def std(self, ddof: int = 1): Series or DataFrame Standard deviation of values within each group. """ - # TODO: implement at Cython level? 
- return np.sqrt(self.var(ddof=ddof)) + result = self.var(ddof=ddof) + if result.ndim == 1: + result = np.sqrt(result) + else: + cols = result.columns.get_indexer_for( + result.columns.difference(self.exclusions).unique() + ) + # TODO(GH-22046) - setting with iloc broken if labels are not unique + # .values to remove labels + result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values + + return result @Substitution(name="groupby") @Appender(_common_see_also) @@ -1416,7 +1426,19 @@ def sem(self, ddof: int = 1): Series or DataFrame Standard error of the mean of values within each group. """ - return self.std(ddof=ddof) / np.sqrt(self.count()) + result = self.std(ddof=ddof) + if result.ndim == 1: + result /= np.sqrt(self.count()) + else: + cols = result.columns.get_indexer_for( + result.columns.difference(self.exclusions).unique() + ) + # TODO(GH-22046) - setting with iloc broken if labels are not unique + # .values to remove labels + result.iloc[:, cols] = ( + result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values + ) + return result @Substitution(name="groupby") @Appender(_common_see_also) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 68c8b86250e06..9d7bc749d6e89 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -573,6 +573,28 @@ def test_ops_general(op, targop): tm.assert_frame_equal(result, expected) +def test_ops_not_as_index(reduction_func): + # GH 10355 + # Using as_index=False should not modify grouped column + + if reduction_func in ("nth", "ngroup", "size",): + pytest.skip("Skip until behavior is determined (GH #5755)") + + if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",): + pytest.xfail( + "_GroupBy._python_apply_general incorrectly modifies grouping columns" + ) + + df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) + expected = getattr(df.groupby("a"), reduction_func)().reset_index() 
+ + result = getattr(df.groupby("a", as_index=False), reduction_func)() + tm.assert_frame_equal(result, expected) + + result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)() + tm.assert_frame_equal(result, expected) + + def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log