Skip to content

Commit 333db4b

Browse files
authored
BUG: groupby with as_index=False shouldn't modify grouping columns (#34012)
1 parent 98e495a commit 333db4b

File tree

7 files changed

+149
-66
lines changed

7 files changed

+149
-66
lines changed

doc/source/whatsnew/v1.1.0.rst

+47-1
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,53 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns
583583
df[['a', 'c']] = 1
584584
df
585585
586+
.. _whatsnew_110.api_breaking.groupby_consistency:
587+
588+
Consistency across groupby reductions
589+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
590+
591+
Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. Now the grouping column(s) only appear in the index, consistent with other reductions. (:issue:`32579`)
592+
593+
.. ipython:: python
594+
595+
df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
596+
df
597+
598+
*Previous behavior*:
599+
600+
.. code-block:: ipython
601+
602+
In [3]: df.groupby("a", as_index=True).nunique()
603+
Out[3]:
604+
a b
605+
a
606+
x 1 1
607+
y 1 2
608+
609+
*New behavior*:
610+
611+
.. ipython:: python
612+
613+
df.groupby("a", as_index=True).nunique()
614+
615+
Using :meth:`DataFrame.groupby` with ``as_index=False`` and any of the functions ``idxmax``, ``idxmin``, ``mad``, ``nunique``, ``sem``, ``skew``, or ``std`` would modify the grouping column. Now the grouping column remains unchanged, consistent with other reductions. (:issue:`21090`, :issue:`10355`)
616+
617+
*Previous behavior*:
618+
619+
.. code-block:: ipython
620+
621+
In [3]: df.groupby("a", as_index=False).nunique()
622+
Out[3]:
623+
a b
624+
0 1 1
625+
1 1 2
626+
627+
*New behavior*:
628+
629+
.. ipython:: python
630+
631+
df.groupby("a", as_index=False).nunique()
632+
586633
.. _whatsnew_110.deprecations:
587634

588635
Deprecations
@@ -855,7 +902,6 @@ Groupby/resample/rolling
855902
- Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`)
856903
- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
857904
- Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`)
858-
- Bug in :meth:`DataFrameGroupby.std` and :meth:`DataFrameGroupby.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`)
859905

860906
Reshaping
861907
^^^^^^^^^

pandas/core/groupby/generic.py

+39-29
Original file line numberDiff line numberDiff line change
@@ -1265,7 +1265,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
12651265

12661266
v = values[0]
12671267

1268-
if isinstance(v, (np.ndarray, Index, Series)):
1268+
if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
12691269
if isinstance(v, Series):
12701270
applied_index = self._selected_obj._get_axis(self.axis)
12711271
all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1341,6 +1341,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13411341
result = self.obj._constructor(
13421342
stacked_values.T, index=v.index, columns=key_index
13431343
)
1344+
elif not self.as_index:
1345+
# We add grouping column below, so create a frame here
1346+
result = DataFrame(
1347+
values, index=key_index, columns=[self._selection]
1348+
)
13441349
else:
13451350
# GH#1738: values is list of arrays of unequal lengths
13461351
# fall through to the outer else clause
@@ -1358,6 +1363,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13581363
else:
13591364
result = result._convert(datetime=True)
13601365

1366+
if not self.as_index:
1367+
self._insert_inaxis_grouper_inplace(result)
1368+
13611369
return self._reindex_output(result)
13621370

13631371
# values are not series or array-like but scalars
@@ -1700,9 +1708,11 @@ def _insert_inaxis_grouper_inplace(self, result):
17001708
),
17011709
)
17021710
)
1703-
1711+
columns = result.columns
17041712
for name, lev, in_axis in izip:
1705-
if in_axis:
1713+
# GH #28549
1714+
# When using .apply(-), name will be in columns already
1715+
if in_axis and name not in columns:
17061716
result.insert(0, name, lev)
17071717

17081718
def _wrap_aggregated_output(
@@ -1852,11 +1862,11 @@ def nunique(self, dropna: bool = True):
18521862
5 ham 5 y
18531863
18541864
>>> df.groupby('id').nunique()
1855-
id value1 value2
1865+
value1 value2
18561866
id
1857-
egg 1 1 1
1858-
ham 1 1 2
1859-
spam 1 2 1
1867+
egg 1 1
1868+
ham 1 2
1869+
spam 2 1
18601870
18611871
Check for rows with the same id but conflicting values:
18621872
@@ -1867,37 +1877,37 @@ def nunique(self, dropna: bool = True):
18671877
4 ham 5 x
18681878
5 ham 5 y
18691879
"""
1870-
obj = self._selected_obj
1880+
from pandas.core.reshape.concat import concat
18711881

1872-
def groupby_series(obj, col=None):
1873-
return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
1874-
dropna=dropna
1875-
)
1882+
# TODO: this is duplicative of how GroupBy naturally works
1883+
# Try to consolidate with normal wrapping functions
18761884

1877-
if isinstance(obj, Series):
1878-
results = groupby_series(obj)
1885+
obj = self._obj_with_exclusions
1886+
axis_number = obj._get_axis_number(self.axis)
1887+
other_axis = int(not axis_number)
1888+
if axis_number == 0:
1889+
iter_func = obj.items
18791890
else:
1880-
# TODO: this is duplicative of how GroupBy naturally works
1881-
# Try to consolidate with normal wrapping functions
1882-
from pandas.core.reshape.concat import concat
1883-
1884-
axis_number = obj._get_axis_number(self.axis)
1885-
other_axis = int(not axis_number)
1886-
if axis_number == 0:
1887-
iter_func = obj.items
1888-
else:
1889-
iter_func = obj.iterrows
1891+
iter_func = obj.iterrows
18901892

1891-
results = [groupby_series(content, label) for label, content in iter_func()]
1892-
results = concat(results, axis=1)
1893+
results = concat(
1894+
[
1895+
SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique(
1896+
dropna
1897+
)
1898+
for label, content in iter_func()
1899+
],
1900+
axis=1,
1901+
)
18931902

1894-
if axis_number == 1:
1895-
results = results.T
1903+
if axis_number == 1:
1904+
results = results.T
18961905

1897-
results._get_axis(other_axis).names = obj._get_axis(other_axis).names
1906+
results._get_axis(other_axis).names = obj._get_axis(other_axis).names
18981907

18991908
if not self.as_index:
19001909
results.index = ibase.default_index(len(results))
1910+
self._insert_inaxis_grouper_inplace(results)
19011911
return results
19021912

19031913
boxplot = boxplot_frame_groupby

pandas/core/groupby/groupby.py

+26-9
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class providing the base-class of operations.
3535

3636
from pandas._libs import Timestamp
3737
import pandas._libs.groupby as libgroupby
38-
from pandas._typing import FrameOrSeries, Scalar
38+
from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar
3939
from pandas.compat.numpy import function as nv
4040
from pandas.errors import AbstractMethodError
4141
from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
@@ -735,11 +735,11 @@ def _make_wrapper(self, name):
735735

736736
# need to setup the selection
737737
# as are not passed directly but in the grouper
738-
f = getattr(self._selected_obj, name)
738+
f = getattr(self._obj_with_exclusions, name)
739739
if not isinstance(f, types.MethodType):
740740
return self.apply(lambda self: getattr(self, name))
741741

742-
f = getattr(type(self._selected_obj), name)
742+
f = getattr(type(self._obj_with_exclusions), name)
743743
sig = inspect.signature(f)
744744

745745
def wrapper(*args, **kwargs):
@@ -762,7 +762,7 @@ def curried(x):
762762
return self.apply(curried)
763763

764764
try:
765-
return self.apply(curried)
765+
return self._python_apply_general(curried, self._obj_with_exclusions)
766766
except TypeError as err:
767767
if not re.search(
768768
"reduction operation '.*' not allowed for this dtype", str(err)
@@ -853,7 +853,7 @@ def f(g):
853853
# ignore SettingWithCopy here in case the user mutates
854854
with option_context("mode.chained_assignment", None):
855855
try:
856-
result = self._python_apply_general(f)
856+
result = self._python_apply_general(f, self._selected_obj)
857857
except TypeError:
858858
# gh-20949
859859
# try again, with .apply acting as a filtering
@@ -864,12 +864,29 @@ def f(g):
864864
# on a string grouper column
865865

866866
with _group_selection_context(self):
867-
return self._python_apply_general(f)
867+
return self._python_apply_general(f, self._selected_obj)
868868

869869
return result
870870

871-
def _python_apply_general(self, f):
872-
keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
871+
def _python_apply_general(
872+
self, f: F, data: FrameOrSeriesUnion
873+
) -> FrameOrSeriesUnion:
874+
"""
875+
Apply function f in python space
876+
877+
Parameters
878+
----------
879+
f : callable
880+
Function to apply
881+
data : Series or DataFrame
882+
Data to apply f to
883+
884+
Returns
885+
-------
886+
Series or DataFrame
887+
data after applying f
888+
"""
889+
keys, values, mutated = self.grouper.apply(f, data, self.axis)
873890

874891
return self._wrap_applied_output(
875892
keys, values, not_indexed_same=mutated or self.mutated
@@ -1067,7 +1084,7 @@ def _python_agg_general(
10671084
output[key] = maybe_cast_result(result, obj, numeric_only=True)
10681085

10691086
if len(output) == 0:
1070-
return self._python_apply_general(f)
1087+
return self._python_apply_general(f, self._selected_obj)
10711088

10721089
if self.grouper._filter_empty_groups:
10731090

pandas/tests/groupby/test_function.py

+1-23
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def test_non_cython_api():
280280
result = g.mad()
281281
tm.assert_frame_equal(result, expected)
282282

283-
expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
283+
expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
284284
result = gni.mad()
285285
tm.assert_frame_equal(result, expected)
286286

@@ -573,28 +573,6 @@ def test_ops_general(op, targop):
573573
tm.assert_frame_equal(result, expected)
574574

575575

576-
def test_ops_not_as_index(reduction_func):
577-
# GH 10355
578-
# Using as_index=False should not modify grouped column
579-
580-
if reduction_func in ("nth", "ngroup", "size",):
581-
pytest.skip("Skip until behavior is determined (GH #5755)")
582-
583-
if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",):
584-
pytest.xfail(
585-
"_GroupBy._python_apply_general incorrectly modifies grouping columns"
586-
)
587-
588-
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
589-
expected = getattr(df.groupby("a"), reduction_func)().reset_index()
590-
591-
result = getattr(df.groupby("a", as_index=False), reduction_func)()
592-
tm.assert_frame_equal(result, expected)
593-
594-
result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
595-
tm.assert_frame_equal(result, expected)
596-
597-
598576
def test_max_nan_bug():
599577
raw = """,Date,app,File
600578
-04-23,2013-04-23 00:00:00,,log080001.log

pandas/tests/groupby/test_groupby.py

+28
Original file line numberDiff line numberDiff line change
@@ -661,6 +661,34 @@ def test_groupby_as_index_agg(df):
661661
tm.assert_frame_equal(left, right)
662662

663663

664+
def test_ops_not_as_index(reduction_func):
665+
# GH 10355, 21090
666+
# Using as_index=False should not modify grouped column
667+
668+
if reduction_func in ("corrwith",):
669+
pytest.skip("Test not applicable")
670+
671+
if reduction_func in ("nth", "ngroup", "size",):
672+
pytest.skip("Skip until behavior is determined (GH #5755)")
673+
674+
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
675+
expected = getattr(df.groupby("a"), reduction_func)().reset_index()
676+
677+
g = df.groupby("a", as_index=False)
678+
679+
result = getattr(g, reduction_func)()
680+
tm.assert_frame_equal(result, expected)
681+
682+
result = g.agg(reduction_func)
683+
tm.assert_frame_equal(result, expected)
684+
685+
result = getattr(g["b"], reduction_func)()
686+
tm.assert_frame_equal(result, expected)
687+
688+
result = g["b"].agg(reduction_func)
689+
tm.assert_frame_equal(result, expected)
690+
691+
664692
def test_as_index_series_return_frame(df):
665693
grouped = df.groupby("A", as_index=False)
666694
grouped2 = df.groupby(["A", "B"], as_index=False)

pandas/tests/groupby/test_nunique.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ def check_nunique(df, keys, as_index=True):
2525
if not as_index:
2626
right = right.reset_index(drop=True)
2727

28-
tm.assert_series_equal(left, right, check_names=False)
28+
if as_index:
29+
tm.assert_series_equal(left, right, check_names=False)
30+
else:
31+
tm.assert_frame_equal(left, right, check_names=False)
2932
tm.assert_frame_equal(df, original_df)
3033

3134
days = date_range("2015-08-23", periods=10)
@@ -56,13 +59,14 @@ def check_nunique(df, keys, as_index=True):
5659
def test_nunique():
5760
df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
5861

59-
expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
62+
expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
6063
result = df.groupby("A", as_index=False).nunique()
6164
tm.assert_frame_equal(result, expected)
6265

6366
# as_index
6467
expected.index = list("abc")
6568
expected.index.name = "A"
69+
expected = expected.drop(columns="A")
6670
result = df.groupby("A").nunique()
6771
tm.assert_frame_equal(result, expected)
6872

@@ -71,7 +75,7 @@ def test_nunique():
7175
tm.assert_frame_equal(result, expected)
7276

7377
# dropna
74-
expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
78+
expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
7579
expected.index.name = "A"
7680
result = df.replace({"x": None}).groupby("A").nunique()
7781
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_whitelist.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
406406
if new_names:
407407
msg = f"""
408408
There are uncatgeorized methods defined on the Grouper class:
409-
{names}.
409+
{new_names}.
410410
411411
Was a new method recently added?
412412

0 commit comments

Comments
 (0)