From 76cd6548d3e6bb513cdce56d193f0739c9614809 Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 18 Apr 2020 07:53:05 -0400 Subject: [PATCH] BUG: DataFrameGroupby std/sem modify grouped column when as_index=False --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/groupby.py | 30 +++++++++++++++++++++++---- pandas/tests/groupby/test_function.py | 22 ++++++++++++++++++++ 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a723983590650..241a90b7f8fa2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -803,6 +803,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) +- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` where the grouped-by columns would be modified when ``as_index=False`` (:issue:`10355`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c71085cd4918a..1694df8f067ff 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -649,11 +649,11 @@ def _set_group_selection(self): ): return - ax = self.obj._info_axis groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis] if len(groupers): # GH12839 clear selected obj cache when group selection changes + ax = self.obj._info_axis self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") @@ -1360,8 +1360,18 @@ def std(self, ddof: int = 1): Series or DataFrame Standard deviation of values within each group. 
""" - # TODO: implement at Cython level? - return np.sqrt(self.var(ddof=ddof)) + result = self.var(ddof=ddof) + if result.ndim == 1: + result = np.sqrt(result) + else: + cols = result.columns.get_indexer_for( + result.columns.difference(self.exclusions).unique() + ) + # TODO(GH-22046) - setting with iloc broken if labels are not unique + # .values to remove labels + result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values + + return result @Substitution(name="groupby") @Appender(_common_see_also) @@ -1408,7 +1418,19 @@ def sem(self, ddof: int = 1): Series or DataFrame Standard error of the mean of values within each group. """ - return self.std(ddof=ddof) / np.sqrt(self.count()) + result = self.std(ddof=ddof) + if result.ndim == 1: + result /= np.sqrt(self.count()) + else: + cols = result.columns.get_indexer_for( + result.columns.difference(self.exclusions).unique() + ) + # TODO(GH-22046) - setting with iloc broken if labels are not unique + # .values to remove labels + result.iloc[:, cols] = ( + result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values + ) + return result @Substitution(name="groupby") @Appender(_common_see_also) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 68c8b86250e06..9d7bc749d6e89 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -573,6 +573,28 @@ def test_ops_general(op, targop): tm.assert_frame_equal(result, expected) +def test_ops_not_as_index(reduction_func): + # GH 10355 + # Using as_index=False should not modify grouped column + + if reduction_func in ("nth", "ngroup", "size",): + pytest.skip("Skip until behavior is determined (GH #5755)") + + if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",): + pytest.xfail( + "_GroupBy._python_apply_general incorrectly modifies grouping columns" + ) + + df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) + expected = 
getattr(df.groupby("a"), reduction_func)().reset_index() + + result = getattr(df.groupby("a", as_index=False), reduction_func)() + tm.assert_frame_equal(result, expected) + + result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)() + tm.assert_frame_equal(result, expected) + + def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log