Skip to content

Commit c1c6e40

Browse files
committed
BUG: DataFrame.groupby with as_index=False shouldn't modify grouping columns
1 parent ebb727e commit c1c6e40

File tree

4 files changed

+60
-17
lines changed

4 files changed

+60
-17
lines changed

pandas/core/groupby/generic.py

+17-4
Original file line numberDiff line numberDiff line change
@@ -1257,7 +1257,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
12571257

12581258
v = values[0]
12591259

1260-
if isinstance(v, (np.ndarray, Index, Series)):
1260+
if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
12611261
if isinstance(v, Series):
12621262
applied_index = self._selected_obj._get_axis(self.axis)
12631263
all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1333,6 +1333,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13331333
result = DataFrame(
13341334
stacked_values.T, index=v.index, columns=key_index
13351335
)
1336+
elif not self.as_index:
1337+
# We add grouping column below, so create a frame here
1338+
result = DataFrame(
1339+
values, index=key_index, columns=[self._selection]
1340+
)
13361341
else:
13371342
# GH#1738: values is list of arrays of unequal lengths
13381343
# fall through to the outer else clause
@@ -1348,6 +1353,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13481353
else:
13491354
result = result._convert(datetime=True)
13501355

1356+
if not self.as_index:
1357+
self._insert_inaxis_grouper_inplace(result)
1358+
13511359
return self._reindex_output(result)
13521360

13531361
# values are not series or array-like but scalars
@@ -1684,9 +1692,11 @@ def _insert_inaxis_grouper_inplace(self, result):
16841692
),
16851693
)
16861694
)
1687-
1695+
columns = result.columns
16881696
for name, lev, in_axis in izip:
1689-
if in_axis:
1697+
# GH #28549
1698+
# When using .apply(...), name will already be in columns
1699+
if in_axis and name not in columns:
16901700
result.insert(0, name, lev)
16911701

16921702
def _wrap_aggregated_output(
@@ -1851,7 +1861,7 @@ def nunique(self, dropna: bool = True):
18511861
4 ham 5 x
18521862
5 ham 5 y
18531863
"""
1854-
obj = self._selected_obj
1864+
obj = self._obj_with_exclusions
18551865

18561866
def groupby_series(obj, col=None):
18571867
return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
@@ -1882,6 +1892,9 @@ def groupby_series(obj, col=None):
18821892

18831893
if not self.as_index:
18841894
results.index = ibase.default_index(len(results))
1895+
if results.ndim == 1:
1896+
results = results.to_frame()
1897+
self._insert_inaxis_grouper_inplace(results)
18851898
return results
18861899

18871900
boxplot = boxplot_frame_groupby

pandas/core/groupby/groupby.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -715,11 +715,11 @@ def _make_wrapper(self, name):
715715

716716
# need to setup the selection
717717
# as they are not passed directly but in the grouper
718-
f = getattr(self._selected_obj, name)
718+
f = getattr(self._obj_with_exclusions, name)
719719
if not isinstance(f, types.MethodType):
720720
return self.apply(lambda self: getattr(self, name))
721721

722-
f = getattr(type(self._selected_obj), name)
722+
f = getattr(type(self._obj_with_exclusions), name)
723723
sig = inspect.signature(f)
724724

725725
def wrapper(*args, **kwargs):
@@ -742,7 +742,7 @@ def curried(x):
742742
return self.apply(curried)
743743

744744
try:
745-
return self.apply(curried)
745+
return self._python_apply_general(curried, self._obj_with_exclusions)
746746
except TypeError as err:
747747
if not re.search(
748748
"reduction operation '.*' not allowed for this dtype", str(err)
@@ -833,7 +833,7 @@ def f(g):
833833
# ignore SettingWithCopy here in case the user mutates
834834
with option_context("mode.chained_assignment", None):
835835
try:
836-
result = self._python_apply_general(f)
836+
result = self._python_apply_general(f, self._selected_obj)
837837
except TypeError:
838838
# gh-20949
839839
# try again, with .apply acting as a filtering
@@ -844,12 +844,12 @@ def f(g):
844844
# on a string grouper column
845845

846846
with _group_selection_context(self):
847-
return self._python_apply_general(f)
847+
return self._python_apply_general(f, self._selected_obj)
848848

849849
return result
850850

851-
def _python_apply_general(self, f):
852-
keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
851+
def _python_apply_general(self, f, obj):
852+
keys, values, mutated = self.grouper.apply(f, obj, self.axis)
853853

854854
return self._wrap_applied_output(
855855
keys, values, not_indexed_same=mutated or self.mutated
@@ -1016,7 +1016,7 @@ def _python_agg_general(
10161016
output[key] = maybe_cast_result(result, obj, numeric_only=True)
10171017

10181018
if len(output) == 0:
1019-
return self._python_apply_general(f)
1019+
return self._python_apply_general(f, self._selected_obj)
10201020

10211021
if self.grouper._filter_empty_groups:
10221022

pandas/tests/groupby/test_function.py

+34-4
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ def test_non_cython_api():
291291
result = g.mad()
292292
tm.assert_frame_equal(result, expected)
293293

294-
expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
294+
expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
295295
result = gni.mad()
296296
tm.assert_frame_equal(result, expected)
297297

@@ -584,6 +584,31 @@ def test_ops_general(op, targop):
584584
tm.assert_frame_equal(result, expected)
585585

586586

587+
def test_ops_not_as_index(reduction_func):
588+
# Using as_index=False should not modify grouped column
589+
590+
if reduction_func in ("corrwith",):
591+
pytest.skip("Test not applicable")
592+
593+
if reduction_func in ("nth", "ngroup", "size",):
594+
pytest.skip("Skip until behavior is determined (GH #5755)")
595+
596+
if reduction_func in ("sem", "std"):
597+
pytest.skip("Function incorrectly modifies keys (GH #10355)")
598+
599+
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
600+
expected = getattr(df.groupby("a"), reduction_func)().reset_index()
601+
602+
result = getattr(df.groupby("a", as_index=False), reduction_func)()
603+
tm.assert_frame_equal(result, expected)
604+
605+
result = df.groupby("a", as_index=False).agg(reduction_func)
606+
tm.assert_frame_equal(result, expected)
607+
608+
result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
609+
tm.assert_frame_equal(result, expected)
610+
611+
587612
def test_max_nan_bug():
588613
raw = """,Date,app,File
589614
-04-23,2013-04-23 00:00:00,,log080001.log
@@ -1004,7 +1029,11 @@ def check_nunique(df, keys, as_index=True):
10041029
if not as_index:
10051030
right = right.reset_index(drop=True)
10061031

1007-
tm.assert_series_equal(left, right, check_names=False)
1032+
if not as_index:
1033+
# keys make the result a frame
1034+
tm.assert_frame_equal(left, right, check_names=False)
1035+
else:
1036+
tm.assert_series_equal(left, right, check_names=False)
10081037
tm.assert_frame_equal(df, original_df)
10091038

10101039
days = date_range("2015-08-23", periods=10)
@@ -1035,13 +1064,14 @@ def check_nunique(df, keys, as_index=True):
10351064
def test_nunique():
10361065
df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
10371066

1038-
expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
1067+
expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 1], "C": [1, 1, 2]})
10391068
result = df.groupby("A", as_index=False).nunique()
10401069
tm.assert_frame_equal(result, expected)
10411070

10421071
# as_index
10431072
expected.index = list("abc")
10441073
expected.index.name = "A"
1074+
expected = expected.drop(columns="A")
10451075
result = df.groupby("A").nunique()
10461076
tm.assert_frame_equal(result, expected)
10471077

@@ -1050,7 +1080,7 @@ def test_nunique():
10501080
tm.assert_frame_equal(result, expected)
10511081

10521082
# dropna
1053-
expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
1083+
expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
10541084
expected.index.name = "A"
10551085
result = df.replace({"x": None}).groupby("A").nunique()
10561086
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_whitelist.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
406406
if new_names:
407407
msg = f"""
408408
There are uncatgeorized methods defined on the Grouper class:
409-
{names}.
409+
{new_names}.
410410
411411
Was a new method recently added?
412412

0 commit comments

Comments
 (0)