BUG: groupby with as_index=False shouldn't modify grouping columns #34012

Merged
merged 5 commits on May 27, 2020
Changes from 3 commits
57 changes: 57 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -561,6 +561,63 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns
df[['a', 'c']] = 1
df

.. _whatsnew_110.api_breaking.groupby_nunique:

Using groupby with ``nunique`` and ``as_index=True``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. The grouping column(s) now only appear in the index, consistent with other aggregation functions (:issue:`32579`).

.. ipython:: python

df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
df

*Previous behavior*:

.. code-block:: ipython

In [3]: df.groupby("a", as_index=True).nunique()
Out[3]:
a b
a
x 1 1
y 1 2

*New behavior*:

.. ipython:: python

df.groupby("a", as_index=True).nunique()

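For reference, the new output of the directive above should look roughly like this (a sketch; the ``ipython`` directive generates the actual output at documentation build time):

In [3]: df.groupby("a", as_index=True).nunique()
Out[3]:
   b
a
x  1
y  2
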
.. _whatsnew_110.api_breaking.groupby_as_index_false:

Using groupby with ``as_index=False``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Using :meth:`DataFrame.groupby` with ``as_index=False`` and the functions ``idxmax``, ``idxmin``, ``mad``, ``nunique``, or ``skew`` would modify the grouping column(s). The grouping columns now remain unchanged (:issue:`21090`).

.. ipython:: python
Review comment (Contributor): this also changes nunique for as_index=True, can you put that example here as well (first)?

Review comment (Contributor): do this in the same note as it's very confusing to read otherwise.

df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
df

*Previous behavior*:

.. code-block:: ipython

In [3]: df.groupby("a", as_index=False).nunique()
Out[3]:
a b
0 1 1
1 1 2

*New behavior*:

.. ipython:: python

df.groupby("a", as_index=False).nunique()

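Similarly, a sketch of the expected new output for the ``as_index=False`` case above (generated at build time in the rendered docs); the grouping column ``a`` keeps its original values instead of being overwritten:

In [3]: df.groupby("a", as_index=False).nunique()
Out[3]:
   a  b
0  x  1
1  y  2
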
.. _whatsnew_110.deprecations:

Deprecations
68 changes: 39 additions & 29 deletions pandas/core/groupby/generic.py
@@ -1265,7 +1265,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

v = values[0]

if isinstance(v, (np.ndarray, Index, Series)):
if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
if isinstance(v, Series):
applied_index = self._selected_obj._get_axis(self.axis)
all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1341,6 +1341,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
result = self.obj._constructor(
stacked_values.T, index=v.index, columns=key_index
)
elif not self.as_index:
# We add grouping column below, so create a frame here
result = DataFrame(
values, index=key_index, columns=[self._selection]
)
else:
# GH#1738: values is list of arrays of unequal lengths
# fall through to the outer else clause
@@ -1358,6 +1363,9 @@
else:
result = result._convert(datetime=True)

if not self.as_index:
self._insert_inaxis_grouper_inplace(result)

return self._reindex_output(result)

# values are not series or array-like but scalars
@@ -1700,9 +1708,11 @@ def _insert_inaxis_grouper_inplace(self, result):
),
)
)

columns = result.columns
for name, lev, in_axis in izip:
if in_axis:
# GH #28549
# When using .apply(-), name will be in columns already
if in_axis and name not in columns:
result.insert(0, name, lev)

def _wrap_aggregated_output(
@@ -1852,11 +1862,11 @@ def nunique(self, dropna: bool = True):
5 ham 5 y

>>> df.groupby('id').nunique()
id value1 value2
value1 value2
id
egg 1 1 1
ham 1 1 2
spam 1 2 1
egg 1 1
ham 1 2
spam 2 1

Check for rows with the same id but conflicting values:

@@ -1867,37 +1877,37 @@
4 ham 5 x
5 ham 5 y
"""
obj = self._selected_obj
from pandas.core.reshape.concat import concat

def groupby_series(obj, col=None):
return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
dropna=dropna
)
# TODO: this is duplicative of how GroupBy naturally works
# Try to consolidate with normal wrapping functions

if isinstance(obj, Series):
results = groupby_series(obj)
obj = self._obj_with_exclusions
axis_number = obj._get_axis_number(self.axis)
other_axis = int(not axis_number)
if axis_number == 0:
iter_func = obj.items
else:
# TODO: this is duplicative of how GroupBy naturally works
# Try to consolidate with normal wrapping functions
from pandas.core.reshape.concat import concat

axis_number = obj._get_axis_number(self.axis)
other_axis = int(not axis_number)
if axis_number == 0:
iter_func = obj.items
else:
iter_func = obj.iterrows
iter_func = obj.iterrows

results = [groupby_series(content, label) for label, content in iter_func()]
results = concat(results, axis=1)
results = concat(
[
SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique(
dropna
)
for label, content in iter_func()
],
axis=1,
)

if axis_number == 1:
results = results.T
if axis_number == 1:
results = results.T

results._get_axis(other_axis).names = obj._get_axis(other_axis).names
results._get_axis(other_axis).names = obj._get_axis(other_axis).names

if not self.as_index:
results.index = ibase.default_index(len(results))
self._insert_inaxis_grouper_inplace(results)
return results

boxplot = boxplot_frame_groupby
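A minimal sketch of the user-visible effect of the ``_wrap_applied_output`` / ``_insert_inaxis_grouper_inplace`` / ``nunique`` changes above, assuming a pandas build with this PR applied (and a version where ``DataFrameGroupBy.mad`` still exists; it was deprecated later):

import pandas as pd

df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 5, 2, 3]})

# Functions routed through the apply path (idxmax, idxmin, mad, nunique, skew)
# used to overwrite the grouping column when as_index=False. With this change
# the grouping column "a" is re-inserted unchanged and the reduction is only
# applied to the remaining columns.
result = df.groupby("a", as_index=False).mad()
print(result)
#    a    b
# 0  x  2.0
# 1  y  0.5
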
33 changes: 24 additions & 9 deletions pandas/core/groupby/groupby.py
@@ -35,7 +35,7 @@ class providing the base-class of operations.

from pandas._libs import Timestamp
import pandas._libs.groupby as libgroupby
from pandas._typing import FrameOrSeries, Scalar
from pandas._typing import F, FrameOrSeries, Scalar
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
@@ -735,11 +735,11 @@ def _make_wrapper(self, name):

# need to setup the selection
# as are not passed directly but in the grouper
f = getattr(self._selected_obj, name)
f = getattr(self._obj_with_exclusions, name)
if not isinstance(f, types.MethodType):
return self.apply(lambda self: getattr(self, name))

f = getattr(type(self._selected_obj), name)
f = getattr(type(self._obj_with_exclusions), name)
sig = inspect.signature(f)

def wrapper(*args, **kwargs):
@@ -762,7 +762,7 @@ def curried(x):
return self.apply(curried)

try:
return self.apply(curried)
return self._python_apply_general(curried, self._obj_with_exclusions)
except TypeError as err:
if not re.search(
"reduction operation '.*' not allowed for this dtype", str(err)
@@ -853,7 +853,7 @@ def f(g):
# ignore SettingWithCopy here in case the user mutates
with option_context("mode.chained_assignment", None):
try:
result = self._python_apply_general(f)
result = self._python_apply_general(f, self._selected_obj)
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
@@ -864,12 +864,27 @@
# on a string grouper column

with _group_selection_context(self):
return self._python_apply_general(f)
return self._python_apply_general(f, self._selected_obj)

return result

def _python_apply_general(self, f):
keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
def _python_apply_general(self, f: F, data: Union[Series, DataFrame]):
"""
Apply function f in python space

Parameters
----------
f : callable
Function to apply
data : Series or DataFrame
Data to apply f to

Returns
-------
Series or DataFrame
data after applying f
"""
keys, values, mutated = self.grouper.apply(f, data, self.axis)

return self._wrap_applied_output(
keys, values, not_indexed_same=mutated or self.mutated
@@ -1067,7 +1082,7 @@ def _python_agg_general(
output[key] = maybe_cast_result(result, obj, numeric_only=True)

if len(output) == 0:
return self._python_apply_general(f)
return self._python_apply_general(f, self._selected_obj)

if self.grouper._filter_empty_groups:

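For context on the ``_selected_obj`` to ``_obj_with_exclusions`` switch above: ``_obj_with_exclusions`` drops in-axis grouping keys, so reductions dispatched through ``_python_apply_general`` can no longer touch the grouping columns. A rough illustration using these private attributes (internal API, shown only to explain the change):

import pandas as pd

df = pd.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})
g = df.groupby("a")

# _selected_obj still contains the grouping column "a";
# _obj_with_exclusions excludes it, leaving only the columns to aggregate.
print(list(g._selected_obj.columns))         # ['a', 'b']
print(list(g._obj_with_exclusions.columns))  # ['b']
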
24 changes: 1 addition & 23 deletions pandas/tests/groupby/test_function.py
@@ -280,7 +280,7 @@ def test_non_cython_api():
result = g.mad()
tm.assert_frame_equal(result, expected)

expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
result = gni.mad()
tm.assert_frame_equal(result, expected)

@@ -573,28 +573,6 @@ def test_ops_general(op, targop):
tm.assert_frame_equal(result, expected)


def test_ops_not_as_index(reduction_func):
# GH 10355
# Using as_index=False should not modify grouped column

if reduction_func in ("nth", "ngroup", "size",):
pytest.skip("Skip until behavior is determined (GH #5755)")

if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",):
pytest.xfail(
"_GroupBy._python_apply_general incorrectly modifies grouping columns"
)

df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
expected = getattr(df.groupby("a"), reduction_func)().reset_index()

result = getattr(df.groupby("a", as_index=False), reduction_func)()
tm.assert_frame_equal(result, expected)

result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
tm.assert_frame_equal(result, expected)


def test_max_nan_bug():
raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
28 changes: 28 additions & 0 deletions pandas/tests/groupby/test_groupby.py
@@ -658,6 +658,34 @@ def test_groupby_as_index_agg(df):
tm.assert_frame_equal(left, right)


def test_ops_not_as_index(reduction_func):
# GH 10355, 21090
# Using as_index=False should not modify grouped column

if reduction_func in ("corrwith",):
pytest.skip("Test not applicable")

if reduction_func in ("nth", "ngroup", "size",):
pytest.skip("Skip until behavior is determined (GH #5755)")

df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
expected = getattr(df.groupby("a"), reduction_func)().reset_index()

g = df.groupby("a", as_index=False)

result = getattr(g, reduction_func)()
tm.assert_frame_equal(result, expected)

result = g.agg(reduction_func)
tm.assert_frame_equal(result, expected)

result = getattr(g["b"], reduction_func)()
tm.assert_frame_equal(result, expected)

result = g["b"].agg(reduction_func)
tm.assert_frame_equal(result, expected)


def test_as_index_series_return_frame(df):
grouped = df.groupby("A", as_index=False)
grouped2 = df.groupby(["A", "B"], as_index=False)
10 changes: 7 additions & 3 deletions pandas/tests/groupby/test_nunique.py
@@ -25,7 +25,10 @@ def check_nunique(df, keys, as_index=True):
if not as_index:
right = right.reset_index(drop=True)

tm.assert_series_equal(left, right, check_names=False)
if as_index:
tm.assert_series_equal(left, right, check_names=False)
else:
tm.assert_frame_equal(left, right, check_names=False)
tm.assert_frame_equal(df, original_df)

days = date_range("2015-08-23", periods=10)
@@ -56,13 +59,14 @@
def test_nunique():
df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})

expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
result = df.groupby("A", as_index=False).nunique()
tm.assert_frame_equal(result, expected)

# as_index
expected.index = list("abc")
expected.index.name = "A"
expected = expected.drop(columns="A")
result = df.groupby("A").nunique()
tm.assert_frame_equal(result, expected)

@@ -71,7 +75,7 @@ def test_nunique():
tm.assert_frame_equal(result, expected)

# dropna
expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
expected.index.name = "A"
result = df.replace({"x": None}).groupby("A").nunique()
tm.assert_frame_equal(result, expected)
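As a usage note for the ``dropna`` case exercised at the end of this test, a small sketch of how the ``dropna`` argument affects ``DataFrameGroupBy.nunique`` (output shown for a post-fix pandas, where the grouping column no longer appears among the result columns):

import pandas as pd

df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, None, 2]})

# dropna=True (the default) ignores missing values when counting distinct values.
print(df.groupby("A").nunique())
#    B
# A
# a  1
# b  1

# dropna=False counts NaN as a distinct value of its own.
print(df.groupby("A").nunique(dropna=False))
#    B
# A
# a  2
# b  1
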
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_whitelist.py
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
if new_names:
msg = f"""
There are uncatgeorized methods defined on the Grouper class:
{names}.
{new_names}.
Was a new method recently added?