Skip to content

Commit 5eb636d

Browse files
committed
BUG: DataFrame.groupby with as_index=False shouldn't modify grouping columns
1 parent 6c0be4b commit 5eb636d

File tree

7 files changed

+112
-21
lines changed

7 files changed

+112
-21
lines changed

doc/source/whatsnew/v1.1.0.rst

+28
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,34 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns
559559
df[['a', 'c']] = 1
560560
df
561561
562+
.. _whatsnew_110.api_breaking.groupby_as_index_false:
563+
564+
Using groupby with ``as_index=False``
565+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
566+
567+
Previously, using :meth:`DataFrame.groupby` with ``as_index=False`` and one of the functions ``idxmax``, ``idxmin``, ``mad``, ``nunique``, or ``skew`` would modify the grouping column. Now, the grouping column remains unchanged. (:issue:`21090`)
568+
569+
.. ipython:: python
570+
571+
df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
572+
df
573+
574+
*Previous behavior*:
575+
576+
.. code-block:: ipython
577+
578+
In [3]: df.groupby("a", as_index=False).nunique()
579+
Out[3]:
580+
a b
581+
0 1 1
582+
1 1 2
583+
584+
*New behavior*:
585+
586+
.. ipython:: python
587+
588+
df.groupby("a", as_index=False).nunique()
589+
562590
.. _whatsnew_110.deprecations:
563591

564592
Deprecations

pandas/core/groupby/generic.py

+21-8
Original file line numberDiff line numberDiff line change
@@ -1265,7 +1265,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
12651265

12661266
v = values[0]
12671267

1268-
if isinstance(v, (np.ndarray, Index, Series)):
1268+
if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
12691269
if isinstance(v, Series):
12701270
applied_index = self._selected_obj._get_axis(self.axis)
12711271
all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1341,6 +1341,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13411341
result = self.obj._constructor(
13421342
stacked_values.T, index=v.index, columns=key_index
13431343
)
1344+
elif not self.as_index:
1345+
# We add grouping column below, so create a frame here
1346+
result = DataFrame(
1347+
values, index=key_index, columns=[self._selection]
1348+
)
13441349
else:
13451350
# GH#1738: values is list of arrays of unequal lengths
13461351
# fall through to the outer else clause
@@ -1358,6 +1363,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13581363
else:
13591364
result = result._convert(datetime=True)
13601365

1366+
if not self.as_index:
1367+
self._insert_inaxis_grouper_inplace(result)
1368+
13611369
return self._reindex_output(result)
13621370

13631371
# values are not series or array-like but scalars
@@ -1700,9 +1708,11 @@ def _insert_inaxis_grouper_inplace(self, result):
17001708
),
17011709
)
17021710
)
1703-
1711+
columns = result.columns
17041712
for name, lev, in_axis in izip:
1705-
if in_axis:
1713+
# GH #28549
1714+
# When using .apply(...), name will already be in columns
1715+
if in_axis and name not in columns:
17061716
result.insert(0, name, lev)
17071717

17081718
def _wrap_aggregated_output(
@@ -1852,11 +1862,11 @@ def nunique(self, dropna: bool = True):
18521862
5 ham 5 y
18531863
18541864
>>> df.groupby('id').nunique()
1855-
id value1 value2
1865+
value1 value2
18561866
id
1857-
egg 1 1 1
1858-
ham 1 1 2
1859-
spam 1 2 1
1867+
egg 1 1
1868+
ham 1 2
1869+
spam 2 1
18601870
18611871
Check for rows with the same id but conflicting values:
18621872
@@ -1867,7 +1877,7 @@ def nunique(self, dropna: bool = True):
18671877
4 ham 5 x
18681878
5 ham 5 y
18691879
"""
1870-
obj = self._selected_obj
1880+
obj = self._obj_with_exclusions
18711881

18721882
def groupby_series(obj, col=None):
18731883
return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
@@ -1898,6 +1908,9 @@ def groupby_series(obj, col=None):
18981908

18991909
if not self.as_index:
19001910
results.index = ibase.default_index(len(results))
1911+
if results.ndim == 1:
1912+
results = results.to_frame()
1913+
self._insert_inaxis_grouper_inplace(results)
19011914
return results
19021915

19031916
boxplot = boxplot_frame_groupby

pandas/core/groupby/groupby.py

+23-8
Original file line numberDiff line numberDiff line change
@@ -718,11 +718,11 @@ def _make_wrapper(self, name):
718718

719719
# need to setup the selection
720720
# as they are not passed directly but in the grouper
721-
f = getattr(self._selected_obj, name)
721+
f = getattr(self._obj_with_exclusions, name)
722722
if not isinstance(f, types.MethodType):
723723
return self.apply(lambda self: getattr(self, name))
724724

725-
f = getattr(type(self._selected_obj), name)
725+
f = getattr(type(self._obj_with_exclusions), name)
726726
sig = inspect.signature(f)
727727

728728
def wrapper(*args, **kwargs):
@@ -745,7 +745,7 @@ def curried(x):
745745
return self.apply(curried)
746746

747747
try:
748-
return self.apply(curried)
748+
return self._python_apply_general(curried, self._obj_with_exclusions)
749749
except TypeError as err:
750750
if not re.search(
751751
"reduction operation '.*' not allowed for this dtype", str(err)
@@ -836,7 +836,7 @@ def f(g):
836836
# ignore SettingWithCopy here in case the user mutates
837837
with option_context("mode.chained_assignment", None):
838838
try:
839-
result = self._python_apply_general(f)
839+
result = self._python_apply_general(f, self._selected_obj)
840840
except TypeError:
841841
# gh-20949
842842
# try again, with .apply acting as a filtering
@@ -847,12 +847,27 @@ def f(g):
847847
# on a string grouper column
848848

849849
with _group_selection_context(self):
850-
return self._python_apply_general(f)
850+
return self._python_apply_general(f, self._selected_obj)
851851

852852
return result
853853

854-
def _python_apply_general(self, f):
855-
keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
854+
def _python_apply_general(self, f, data):
855+
"""
856+
Apply function f in python space
857+
858+
Parameters
859+
----------
860+
f : callable
861+
Function to apply
862+
data : Series or DataFrame
863+
Data to apply f to
864+
865+
Returns
866+
-------
867+
Series or DataFrame
868+
data after applying f
869+
"""
870+
keys, values, mutated = self.grouper.apply(f, data, self.axis)
856871

857872
return self._wrap_applied_output(
858873
keys, values, not_indexed_same=mutated or self.mutated
@@ -1019,7 +1034,7 @@ def _python_agg_general(
10191034
output[key] = maybe_cast_result(result, obj, numeric_only=True)
10201035

10211036
if len(output) == 0:
1022-
return self._python_apply_general(f)
1037+
return self._python_apply_general(f, self._selected_obj)
10231038

10241039
if self.grouper._filter_empty_groups:
10251040

pandas/tests/groupby/test_function.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def test_non_cython_api():
280280
result = g.mad()
281281
tm.assert_frame_equal(result, expected)
282282

283-
expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
283+
expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
284284
result = gni.mad()
285285
tm.assert_frame_equal(result, expected)
286286

pandas/tests/groupby/test_groupby.py

+31
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,37 @@ def test_groupby_as_index_agg(df):
658658
tm.assert_frame_equal(left, right)
659659

660660

661+
def test_ops_not_as_index(reduction_func):
662+
# GH 21090
663+
# Using as_index=False should not modify grouped column
664+
665+
if reduction_func in ("corrwith",):
666+
pytest.skip("Test not applicable")
667+
668+
if reduction_func in ("nth", "ngroup", "size",):
669+
pytest.skip("Skip until behavior is determined (GH #5755)")
670+
671+
if reduction_func in ("sem", "std"):
672+
pytest.skip("Function incorrectly modifies keys (GH #10355)")
673+
674+
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
675+
expected = getattr(df.groupby("a"), reduction_func)().reset_index()
676+
677+
g = df.groupby("a", as_index=False)
678+
679+
result = getattr(g, reduction_func)()
680+
tm.assert_frame_equal(result, expected)
681+
682+
result = g.agg(reduction_func)
683+
tm.assert_frame_equal(result, expected)
684+
685+
result = getattr(g["b"], reduction_func)()
686+
tm.assert_frame_equal(result, expected)
687+
688+
result = g["b"].agg(reduction_func)
689+
tm.assert_frame_equal(result, expected)
690+
691+
661692
def test_as_index_series_return_frame(df):
662693
grouped = df.groupby("A", as_index=False)
663694
grouped2 = df.groupby(["A", "B"], as_index=False)

pandas/tests/groupby/test_nunique.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ def check_nunique(df, keys, as_index=True):
2525
if not as_index:
2626
right = right.reset_index(drop=True)
2727

28-
tm.assert_series_equal(left, right, check_names=False)
28+
if as_index:
29+
tm.assert_series_equal(left, right, check_names=False)
30+
else:
31+
tm.assert_frame_equal(left, right, check_names=False)
2932
tm.assert_frame_equal(df, original_df)
3033

3134
days = date_range("2015-08-23", periods=10)
@@ -56,13 +59,14 @@ def check_nunique(df, keys, as_index=True):
5659
def test_nunique():
5760
df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
5861

59-
expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
62+
expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
6063
result = df.groupby("A", as_index=False).nunique()
6164
tm.assert_frame_equal(result, expected)
6265

6366
# as_index
6467
expected.index = list("abc")
6568
expected.index.name = "A"
69+
expected = expected.drop(columns="A")
6670
result = df.groupby("A").nunique()
6771
tm.assert_frame_equal(result, expected)
6872

@@ -71,7 +75,7 @@ def test_nunique():
7175
tm.assert_frame_equal(result, expected)
7276

7377
# dropna
74-
expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
78+
expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
7579
expected.index.name = "A"
7680
result = df.replace({"x": None}).groupby("A").nunique()
7781
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_whitelist.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
406406
if new_names:
407407
msg = f"""
408408
There are uncatgeorized methods defined on the Grouper class:
409-
{names}.
409+
{new_names}.
410410
411411
Was a new method recently added?
412412

0 commit comments

Comments
 (0)