Skip to content

Commit a0cd28c

Browse files
committed
BUG: DataFrameGroupby std/sem modify grouped column when as_index=False
1 parent b630cdb commit a0cd28c

File tree

3 files changed

+45
-4
lines changed

3 files changed

+45
-4
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ Groupby/resample/rolling
603603
- Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`)
604604
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
605605
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
606+
- Bug in :meth:`DataFrameGroupBy.std` and :meth:`DataFrameGroupBy.sem` where the grouped-by columns would be modified when ``as_index=False`` (:issue:`10355`)
606607

607608
Reshaping
608609
^^^^^^^^^

pandas/core/groupby/groupby.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -577,11 +577,11 @@ def _set_group_selection(self):
577577
):
578578
return
579579

580-
ax = self.obj._info_axis
581580
groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]
582581

583582
if len(groupers):
584583
# GH12839 clear selected obj cache when group selection changes
584+
ax = self.obj._info_axis
585585
self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
586586
self._reset_cache("_selected_obj")
587587

@@ -1275,8 +1275,14 @@ def std(self, ddof: int = 1):
12751275
Series or DataFrame
12761276
Standard deviation of values within each group.
12771277
"""
1278-
# TODO: implement at Cython level?
1279-
return np.sqrt(self.var(ddof=ddof))
1278+
result = self.var(ddof=ddof)
1279+
if self.ndim == 1:
1280+
result = np.sqrt(result)
1281+
else:
1282+
for col in result:
1283+
if col not in self.exclusions:
1284+
result[col] = np.sqrt(result[col])
1285+
return result
12801286

12811287
@Substitution(name="groupby")
12821288
@Appender(_common_see_also)
@@ -1301,6 +1307,7 @@ def var(self, ddof: int = 1):
13011307
"var", alt=lambda x, axis: Series(x).var(ddof=ddof)
13021308
)
13031309
else:
1310+
# TODO: implement at Cython level?
13041311
func = lambda x: x.var(ddof=ddof)
13051312
with _group_selection_context(self):
13061313
return self._python_agg_general(func)
@@ -1323,7 +1330,15 @@ def sem(self, ddof: int = 1):
13231330
Series or DataFrame
13241331
Standard error of the mean of values within each group.
13251332
"""
1326-
return self.std(ddof=ddof) / np.sqrt(self.count())
1333+
result = self.std(ddof=ddof)
1334+
denom = np.sqrt(self.count())
1335+
if self.ndim == 1:
1336+
result /= denom
1337+
else:
1338+
for col in result:
1339+
if col not in self.exclusions:
1340+
result[col] /= denom[col]
1341+
return result
13271342

13281343
@Substitution(name="groupby")
13291344
@Appender(_common_see_also)

pandas/tests/groupby/test_function.py

+25
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,31 @@ def test_ops_general(op, targop):
585585
tm.assert_frame_equal(result, expected)
586586

587587

588+
@pytest.mark.parametrize(
589+
"op",
590+
[
591+
"mean",
592+
"median",
593+
"std",
594+
"var",
595+
"sum",
596+
"prod",
597+
"min",
598+
"max",
599+
"first",
600+
"last",
601+
"count",
602+
"sem",
603+
],
604+
)
605+
def test_ops_not_as_index(op):
606+
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
607+
608+
result = getattr(df.groupby("a", as_index=False), op)()
609+
expected = getattr(df.groupby("a"), op)().reset_index()
610+
tm.assert_frame_equal(result, expected)
611+
612+
588613
def test_max_nan_bug():
589614
raw = """,Date,app,File
590615
-04-23,2013-04-23 00:00:00,,log080001.log

0 commit comments

Comments
 (0)