diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4605c14643fa2..b69eda2f5e2b3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -561,6 +561,53 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns df[['a', 'c']] = 1 df +.. _whatsnew_110.api_breaking.groupby_consistency: + +Consistency across groupby reductions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. Now the grouping column(s) only appear in the index, consistent with other reductions. (:issue:`32579`) + +.. ipython:: python + + df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=True).nunique() + Out[3]: + a b + a + x 1 1 + y 1 2 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=True).nunique() + +Using :meth:`DataFrame.groupby` with ``as_index=False`` and the functions ``idxmax``, ``idxmin``, ``mad``, ``nunique``, ``sem``, ``skew``, or ``std`` would modify the grouping column. Now the grouping column remains unchanged, consistent with other reductions. (:issue:`21090`, :issue:`10355`) + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=False).nunique() + Out[3]: + a b + 0 1 1 + 1 1 2 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=False).nunique() + .. 
_whatsnew_110.deprecations: Deprecations @@ -819,7 +866,6 @@ Groupby/resample/rolling - Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) -- Bug in :meth:`DataFrameGroupby.std` and :meth:`DataFrameGroupby.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 69b143febeea2..ea4b6f4e65341 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1265,7 +1265,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): v = values[0] - if isinstance(v, (np.ndarray, Index, Series)): + if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) all_indexed_same = all_indexes_same([x.index for x in values]) @@ -1341,6 +1341,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): result = self.obj._constructor( stacked_values.T, index=v.index, columns=key_index ) + elif not self.as_index: + # We add grouping column below, so create a frame here + result = DataFrame( + values, index=key_index, columns=[self._selection] + ) else: # GH#1738: values is list of arrays of unequal lengths # fall through to the outer else clause @@ -1358,6 +1363,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): else: result = result._convert(datetime=True) + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + return self._reindex_output(result) # values are not series or array-like but scalars @@ -1700,9 +1708,11 @@ def _insert_inaxis_grouper_inplace(self, result): ), ) ) - + columns = 
result.columns for name, lev, in_axis in izip: - if in_axis: + # GH #28549 + # When using .apply(-), name will be in columns already + if in_axis and name not in columns: result.insert(0, name, lev) def _wrap_aggregated_output( @@ -1852,11 +1862,11 @@ def nunique(self, dropna: bool = True): 5 ham 5 y >>> df.groupby('id').nunique() - id value1 value2 + value1 value2 id - egg 1 1 1 - ham 1 1 2 - spam 1 2 1 + egg 1 1 + ham 1 2 + spam 2 1 Check for rows with the same id but conflicting values: @@ -1867,37 +1877,37 @@ def nunique(self, dropna: bool = True): 4 ham 5 x 5 ham 5 y """ - obj = self._selected_obj + from pandas.core.reshape.concat import concat - def groupby_series(obj, col=None): - return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique( - dropna=dropna - ) + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions - if isinstance(obj, Series): - results = groupby_series(obj) + obj = self._obj_with_exclusions + axis_number = obj._get_axis_number(self.axis) + other_axis = int(not axis_number) + if axis_number == 0: + iter_func = obj.items else: - # TODO: this is duplicative of how GroupBy naturally works - # Try to consolidate with normal wrapping functions - from pandas.core.reshape.concat import concat - - axis_number = obj._get_axis_number(self.axis) - other_axis = int(not axis_number) - if axis_number == 0: - iter_func = obj.items - else: - iter_func = obj.iterrows + iter_func = obj.iterrows - results = [groupby_series(content, label) for label, content in iter_func()] - results = concat(results, axis=1) + results = concat( + [ + SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique( + dropna + ) + for label, content in iter_func() + ], + axis=1, + ) - if axis_number == 1: - results = results.T + if axis_number == 1: + results = results.T - results._get_axis(other_axis).names = obj._get_axis(other_axis).names + results._get_axis(other_axis).names = 
obj._get_axis(other_axis).names if not self.as_index: results.index = ibase.default_index(len(results)) + self._insert_inaxis_grouper_inplace(results) return results boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 37f2376d68d55..9838cff9b34f9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -35,7 +35,7 @@ class providing the base-class of operations. from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby -from pandas._typing import FrameOrSeries, Scalar +from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -735,11 +735,11 @@ def _make_wrapper(self, name): # need to setup the selection # as are not passed directly but in the grouper - f = getattr(self._selected_obj, name) + f = getattr(self._obj_with_exclusions, name) if not isinstance(f, types.MethodType): return self.apply(lambda self: getattr(self, name)) - f = getattr(type(self._selected_obj), name) + f = getattr(type(self._obj_with_exclusions), name) sig = inspect.signature(f) def wrapper(*args, **kwargs): @@ -762,7 +762,7 @@ def curried(x): return self.apply(curried) try: - return self.apply(curried) + return self._python_apply_general(curried, self._obj_with_exclusions) except TypeError as err: if not re.search( "reduction operation '.*' not allowed for this dtype", str(err) @@ -853,7 +853,7 @@ def f(g): # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): try: - result = self._python_apply_general(f) + result = self._python_apply_general(f, self._selected_obj) except TypeError: # gh-20949 # try again, with .apply acting as a filtering @@ -864,12 +864,29 @@ def f(g): # on a string grouper column with _group_selection_context(self): - return 
self._python_apply_general(f) + return self._python_apply_general(f, self._selected_obj) return result - def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) + def _python_apply_general( + self, f: F, data: FrameOrSeriesUnion + ) -> FrameOrSeriesUnion: + """ + Apply function f in python space + + Parameters + ---------- + f : callable + Function to apply + data : Series or DataFrame + Data to apply f to + + Returns + ------- + Series or DataFrame + data after applying f + """ + keys, values, mutated = self.grouper.apply(f, data, self.axis) return self._wrap_applied_output( keys, values, not_indexed_same=mutated or self.mutated @@ -1067,7 +1084,7 @@ def _python_agg_general( output[key] = maybe_cast_result(result, obj, numeric_only=True) if len(output) == 0: - return self._python_apply_general(f) + return self._python_apply_general(f, self._selected_obj) if self.grouper._filter_empty_groups: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 9d7bc749d6e89..9303a084f1e71 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -280,7 +280,7 @@ def test_non_cython_api(): result = g.mad() tm.assert_frame_equal(result, expected) - expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1]) + expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1]) result = gni.mad() tm.assert_frame_equal(result, expected) @@ -573,28 +573,6 @@ def test_ops_general(op, targop): tm.assert_frame_equal(result, expected) -def test_ops_not_as_index(reduction_func): - # GH 10355 - # Using as_index=False should not modify grouped column - - if reduction_func in ("nth", "ngroup", "size",): - pytest.skip("Skip until behavior is determined (GH #5755)") - - if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",): - pytest.xfail( - "_GroupBy._python_apply_general incorrectly modifies 
grouping columns" - ) - - df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) - expected = getattr(df.groupby("a"), reduction_func)().reset_index() - - result = getattr(df.groupby("a", as_index=False), reduction_func)() - tm.assert_frame_equal(result, expected) - - result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)() - tm.assert_frame_equal(result, expected) - - def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c88d16e34eab8..90c0d6bd183f2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -658,6 +658,34 @@ def test_groupby_as_index_agg(df): tm.assert_frame_equal(left, right) +def test_ops_not_as_index(reduction_func): + # GH 10355, 21090 + # Using as_index=False should not modify grouped column + + if reduction_func in ("corrwith",): + pytest.skip("Test not applicable") + + if reduction_func in ("nth", "ngroup", "size",): + pytest.skip("Skip until behavior is determined (GH #5755)") + + df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) + expected = getattr(df.groupby("a"), reduction_func)().reset_index() + + g = df.groupby("a", as_index=False) + + result = getattr(g, reduction_func)() + tm.assert_frame_equal(result, expected) + + result = g.agg(reduction_func) + tm.assert_frame_equal(result, expected) + + result = getattr(g["b"], reduction_func)() + tm.assert_frame_equal(result, expected) + + result = g["b"].agg(reduction_func) + tm.assert_frame_equal(result, expected) + + def test_as_index_series_return_frame(df): grouped = df.groupby("A", as_index=False) grouped2 = df.groupby(["A", "B"], as_index=False) diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py index 952443e0ad23b..1475b1ce2907c 100644 --- a/pandas/tests/groupby/test_nunique.py +++ b/pandas/tests/groupby/test_nunique.py @@ 
-25,7 +25,10 @@ def check_nunique(df, keys, as_index=True): if not as_index: right = right.reset_index(drop=True) - tm.assert_series_equal(left, right, check_names=False) + if as_index: + tm.assert_series_equal(left, right, check_names=False) + else: + tm.assert_frame_equal(left, right, check_names=False) tm.assert_frame_equal(df, original_df) days = date_range("2015-08-23", periods=10) @@ -56,13 +59,14 @@ def check_nunique(df, keys, as_index=True): def test_nunique(): df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) - expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]}) + expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) result = df.groupby("A", as_index=False).nunique() tm.assert_frame_equal(result, expected) # as_index expected.index = list("abc") expected.index.name = "A" + expected = expected.drop(columns="A") result = df.groupby("A").nunique() tm.assert_frame_equal(result, expected) @@ -71,7 +75,7 @@ def test_nunique(): tm.assert_frame_equal(result, expected) # dropna - expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) expected.index.name = "A" result = df.replace({"x": None}).groupby("A").nunique() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 8e387e9202ef6..6b33049a664de 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe): if new_names: msg = f""" There are uncatgeorized methods defined on the Grouper class: -{names}. +{new_names}. Was a new method recently added?