diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 6597b764581a4..80b45fce4cd2c 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -398,6 +398,99 @@ keywords.
 
     df.rename(index={0: 1}, columns={0: 2})
 
+
+.. _whatsnew_1000.api_breaking.GroupBy.apply:
+
+``GroupBy.apply`` behaves consistently with ``as_index``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Previously, the result of :meth:`GroupBy.apply` sometimes contained the grouper column(s)
+  both in the index and in the ``DataFrame`` itself. :meth:`GroupBy.apply` now respects
+  the ``as_index`` parameter and only includes the grouper column(s) in the result when
+  ``as_index`` is set to ``False`` (see the sketch after the examples below). Other methods
+  such as :meth:`GroupBy.resample` exhibited similar behavior and now also respect ``as_index``.
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
+
+    In [2]: df.groupby("a").apply(lambda x: x.sum())
+    Out[2]:
+       a   b
+    a
+    1  2   3
+    2  4   7
+    3  6  11
+
+    In [3]: df.groupby("a").apply(lambda x: x.iloc[0])
+    Out[3]:
+       a  b
+    a
+    1  1  1
+    2  2  3
+    3  3  5
+
+    In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T')
+
+    In [5]: df = pd.DataFrame(data=4 * [range(2)],
+       ...:                   index=idx,
+       ...:                   columns=['a', 'b'])
+
+    In [6]: df.iloc[2, 0] = 5
+
+    In [7]: df.groupby('a').resample('M').sum()
+    Out[7]:
+                  a  b
+    a
+    0 2000-01-31  0  3
+    5 2000-01-31  5  1
+
+*Current behavior*:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
+    df.groupby("a").apply(lambda x: x.sum())
+    df.groupby("a").apply(lambda x: x.iloc[0])
+    idx = pd.date_range('1/1/2000', periods=4, freq='T')
+    df = pd.DataFrame(data=4 * [range(2)],
+                      index=idx,
+                      columns=['a', 'b'])
+    df.iloc[2, 0] = 5
+    df.groupby('a').resample('M').sum()
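+
+With ``as_index=False`` the grouper column is expected to be retained in the
+result. A minimal sketch, re-using the first frame above (output is rendered
+at doc build time):
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
+    df.groupby("a", as_index=False).apply(lambda x: x.sum())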
+
+
+All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following methods now also correctly output values for unobserved categories
+when called through ``groupby(..., observed=False)``, as sketched below (:issue:`17605`):
+
+- :meth:`SeriesGroupBy.count`
+- :meth:`SeriesGroupBy.size`
+- :meth:`SeriesGroupBy.nunique`
+- :meth:`SeriesGroupBy.nth`
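+
+A minimal sketch, assuming a categorical grouper with an unobserved category
+``"c"`` (output is rendered at doc build time):
+
+.. ipython:: python
+
+    cat = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
+    ser = pd.Series([1, 2, 3])
+    ser.groupby(cat, observed=False).count()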
+ """ + return [True, False] + + @pytest.fixture(params=sorted(transformation_kernels)) def transformation_func(request): """yields the string names of all groupby transformation functions.""" diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 9c2b045079622..19fc58a275a73 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -363,12 +363,33 @@ def f(group): tm.assert_frame_equal(result.loc[key], f(group)) -def test_apply_chunk_view(): +@pytest.mark.parametrize( + "as_index, group_keys, expected_index", + [ + ( + True, + True, + MultiIndex.from_tuples( + [(1, 0), (1, 1), (2, 2), (2, 3)], names=["key", None] + ), + ), + (True, False, [0, 1, 2, 3]), + (False, True, MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2), (1, 3)])), + (False, False, [0, 1, 2, 3]), + ], +) +def test_apply_chunk_view(as_index, group_keys, expected_index): # Low level tinkering could be unsafe, make sure not - df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) + df = DataFrame({"key": [1, 1, 2, 2], "value": range(4)}) + + result = df.groupby("key", as_index=as_index, group_keys=group_keys).apply( + lambda x: x[:2] + ) + if as_index: + df.pop("key") - result = df.groupby("key", group_keys=False).apply(lambda x: x[:2]) - expected = df.take([0, 1, 3, 4, 6, 7]) + expected = df.copy() + expected.index = expected_index tm.assert_frame_equal(result, expected) @@ -386,7 +407,7 @@ def test_apply_no_name_column_conflict(): grouped.apply(lambda x: x.sort_values("value", inplace=True)) -def test_apply_typecast_fail(): +def test_apply_typecast_fail(as_index): df = DataFrame( { "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], @@ -400,7 +421,12 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d").apply(f) + result = df.groupby("d", as_index=as_index).apply(f) + + # GH 28549 + # key no longer included in reduction output + if as_index: + df.pop("d") expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -426,6 +452,10 @@ def f(group): result = df.groupby("d").apply(f) + # GH 28549 + # key no longer included in reduction output + df.pop("d") + expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -482,7 +512,7 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y").apply(lambda x: x) + result = df.groupby("Y", as_index=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -661,24 +691,36 @@ def test_func(x): tm.assert_frame_equal(result, expected) -def test_groupby_apply_none_first(): +@pytest.mark.parametrize( + "groups, vars_, expected_vars, expected_groups", + [ + ([1, 1, 1, 2], [0, 1, 2, 3], [0, 2], [1, 1]), + ([1, 2, 2, 2], [0, 1, 2, 3], [1, 3], [2, 2]), + ], +) +def test_groupby_apply_none_first( + groups, vars_, expected_vars, expected_groups, as_index +): # GH 12824. Tests if apply returns None first. 
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
index ebac36c5f8c78..0c9e2d87dfd0c 100644
--- a/pandas/tests/groupby/conftest.py
+++ b/pandas/tests/groupby/conftest.py
@@ -112,6 +112,18 @@ def reduction_func(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def as_index(request):
+    """yields all possible options for the `as_index` parameter, one at a time."""
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def group_keys(request):
+    """yields all possible options for the `group_keys` parameter, one at a time."""
+    return request.param
+
+
 @pytest.fixture(params=sorted(transformation_kernels))
 def transformation_func(request):
     """yields the string names of all groupby transformation functions."""
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 9c2b045079622..19fc58a275a73 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -363,12 +363,33 @@ def f(group):
         tm.assert_frame_equal(result.loc[key], f(group))
 
 
-def test_apply_chunk_view():
+@pytest.mark.parametrize(
+    "as_index, group_keys, expected_index",
+    [
+        (
+            True,
+            True,
+            MultiIndex.from_tuples(
+                [(1, 0), (1, 1), (2, 2), (2, 3)], names=["key", None]
+            ),
+        ),
+        (True, False, [0, 1, 2, 3]),
+        (False, True, MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2), (1, 3)])),
+        (False, False, [0, 1, 2, 3]),
+    ],
+)
+def test_apply_chunk_view(as_index, group_keys, expected_index):
     # Low level tinkering could be unsafe, make sure not
-    df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
+    df = DataFrame({"key": [1, 1, 2, 2], "value": range(4)})
+
+    result = df.groupby("key", as_index=as_index, group_keys=group_keys).apply(
+        lambda x: x[:2]
+    )
+    if as_index:
+        df.pop("key")
 
-    result = df.groupby("key", group_keys=False).apply(lambda x: x[:2])
-    expected = df.take([0, 1, 3, 4, 6, 7])
+    expected = df.copy()
+    expected.index = expected_index
     tm.assert_frame_equal(result, expected)
 
@@ -386,7 +407,7 @@ def test_apply_no_name_column_conflict():
         grouped.apply(lambda x: x.sort_values("value", inplace=True))
 
 
-def test_apply_typecast_fail():
+def test_apply_typecast_fail(as_index):
     df = DataFrame(
         {
             "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
@@ -400,7 +421,12 @@ def f(group):
         group["v2"] = (v - v.min()) / (v.max() - v.min())
         return group
 
-    result = df.groupby("d").apply(f)
+    result = df.groupby("d", as_index=as_index).apply(f)
+
+    # GH 28549
+    # key no longer included in reduction output
+    if as_index:
+        df.pop("d")
 
     expected = df.copy()
     expected["v2"] = np.tile([0.0, 0.5, 1], 2)
@@ -426,6 +452,10 @@ def f(group):
 
     result = df.groupby("d").apply(f)
 
+    # GH 28549
+    # key no longer included in reduction output
+    df.pop("d")
+
     expected = df.copy()
     expected["v2"] = np.tile([0.0, 0.5, 1], 2)
 
@@ -482,7 +512,7 @@ def test_apply_with_duplicated_non_sorted_axis(test_series):
         expected = ser.sort_index()
         tm.assert_series_equal(result, expected)
     else:
-        result = df.groupby("Y").apply(lambda x: x)
+        result = df.groupby("Y", as_index=False).apply(lambda x: x)
 
         # not expecting the order to remain the same for duplicated axis
         result = result.sort_values("Y")
@@ -661,24 +691,36 @@ def test_func(x):
     tm.assert_frame_equal(result, expected)
 
 
-def test_groupby_apply_none_first():
+@pytest.mark.parametrize(
+    "groups, vars_, expected_vars, expected_groups",
+    [
+        ([1, 1, 1, 2], [0, 1, 2, 3], [0, 2], [1, 1]),
+        ([1, 2, 2, 2], [0, 1, 2, 3], [1, 3], [2, 2]),
+    ],
+)
+def test_groupby_apply_none_first(
+    groups, vars_, expected_vars, expected_groups, as_index
+):
     # GH 12824. Tests if apply returns None first.
-    test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
-    test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
+    test_df = DataFrame({"groups": groups, "vars": vars_})
 
     def test_func(x):
         if x.shape[0] < 2:
             return None
         return x.iloc[[0, -1]]
 
-    result1 = test_df1.groupby("groups").apply(test_func)
-    result2 = test_df2.groupby("groups").apply(test_func)
-    index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
-    index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
-    expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
-    expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
-    tm.assert_frame_equal(result1, expected1)
-    tm.assert_frame_equal(result2, expected2)
+    result = test_df.groupby("groups", as_index=as_index).apply(test_func)
+
+    # GH 28549: "groups" should not be in the output of apply
+    # unless as_index=False
+    if not as_index:
+        expected = DataFrame(
+            {"groups": expected_groups, "vars": expected_vars}, index=result.index
+        )
+    else:
+        expected = DataFrame({"vars": expected_vars}, index=result.index)
+
+    tm.assert_frame_equal(result, expected)
 
@@ -805,7 +847,7 @@ def test_groupby_apply_datetime_result_dtypes():
         ],
         columns=["observation", "color", "mood", "intensity", "score"],
     )
-    result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
+    result = data.groupby("color", as_index=False).apply(lambda g: g.iloc[0]).dtypes
     expected = Series(
         [np.dtype("datetime64[ns]"), np.object, np.object, np.int64, np.object],
         index=["observation", "color", "mood", "intensity", "score"],
@@ -825,7 +867,7 @@ def test_apply_index_has_complex_internals(index):
     # GH 31248
     df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
-    result = df.groupby("group").apply(lambda x: x)
+    result = df.groupby("group", as_index=False).apply(lambda x: x)
     tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 1c2de8c8c223f..2c6bf391b6180 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -93,9 +93,15 @@ def f(x):
         return x.drop_duplicates("person_name").iloc[0]
 
     result = g.apply(f)
-    expected = x.iloc[[0, 1]].copy()
+
+    # GH 28549
+    # grouper key should not be present after apply
+    # with as_index=True.
+    # TODO: split this into multiple tests
+    dropped = x.drop("person_id", axis=1)
+
+    expected = dropped.iloc[[0, 1]].copy()
     expected.index = Index([1, 2], name="person_id")
-    expected["person_name"] = expected["person_name"].astype("object")
 
     tm.assert_frame_equal(result, expected)
 
     # GH 9921
@@ -1246,6 +1253,16 @@ def test_get_nonexistent_category():
     # Accessing a Category that is not in the dataframe
     df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
     with pytest.raises(KeyError, match="'vau'"):
+        df.groupby("var").apply(
+            lambda rows: pd.DataFrame({"val": [rows.iloc[-1]["vau"]]})
+        )
+
+
+def test_category_as_grouper_keys(as_index):
+    # Accessing a key that is not in the dataframe
+    df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
+    bad_key = "'var'" if as_index else "'vau'"
+    with pytest.raises(KeyError, match=bad_key):
-        df.groupby("var").apply(
+        df.groupby("var", as_index=as_index).apply(
             lambda rows: pd.DataFrame(
                 {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 73e36cb5e6c84..79ffbada8be0d 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -88,10 +88,6 @@ def test_intercept_builtin_sum():
     tm.assert_series_equal(result2, expected)
 
 
-# @pytest.mark.parametrize("f", [max, min, sum])
-# def test_builtins_apply(f):
-
-
 @pytest.mark.parametrize("f", [max, min, sum])
 @pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]])  # Single key  # Multi-key
 def test_builtins_apply(keys, f):
@@ -103,8 +99,17 @@ def test_builtins_apply(keys, f):
     result = df.groupby(keys).apply(f)
     ngroups = len(df.drop_duplicates(subset=keys))
 
-    assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
-    assert result.shape == (ngroups, 3), assert_msg
+    # GH 28549
+    # grouping keys should not be included in the output
+    if isinstance(keys, list):
+        expected_ncols = len(df.columns) - len(keys)
+    else:
+        expected_ncols = len(df.columns) - 1
+
+    assert_msg = (
+        f"invalid frame shape: {result.shape} (expected ({ngroups}, {expected_ncols}))"
+    )
+    assert result.shape == (ngroups, expected_ncols), assert_msg
 
     tm.assert_frame_equal(
         result,  # numpy's equivalent function
     )
 
     if f != sum:
-        expected = df.groupby(keys).agg(fname).reset_index()
-        expected.set_index(keys, inplace=True, drop=False)
+        # GH 28549
+        # No longer need to reset/set the index here
+        expected = df.groupby(keys).agg(fname)
         tm.assert_frame_equal(result, expected, check_dtype=False)
 
+    # GH 28549
+    # grouping keys should not be in the output
+    df = df.drop(keys, axis=1)
+
     tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)())
 
@@ -340,10 +350,10 @@ def test_cython_api2():
     tm.assert_frame_equal(result, expected)
 
     # GH 13994
-    result = df.groupby("A").cumsum(axis=1)
+    result = df.groupby("A", as_index=False).cumsum(axis=1)
     expected = df.cumsum(axis=1)
     tm.assert_frame_equal(result, expected)
-    result = df.groupby("A").cumprod(axis=1)
+    result = df.groupby("A", as_index=False).cumprod(axis=1)
     expected = df.cumprod(axis=1)
     tm.assert_frame_equal(result, expected)
 
@@ -1151,7 +1161,7 @@ def test_count():
 
     for key in ["1st", "2nd", ["1st", "2nd"]]:
         left = df.groupby(key).count()
-        right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
+        right = df.groupby(key).apply(DataFrame.count)
         tm.assert_frame_equal(left, right)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index b7d7124a3a5e5..ca3c6a5a24a3a 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -87,9 +87,10 @@ def max_value(group):
 
     applied = df.groupby("A").apply(max_value)
     result = applied.dtypes
+
     expected = Series(
-        [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
-        index=["A", "B", "C", "D", "value"],
+        [np.dtype("object")] + [np.dtype("float64")] * 2 + [np.dtype("int64")],
+        index=["B", "C", "D", "value"],
     )
     tm.assert_series_equal(result, expected)
 
@@ -955,14 +956,15 @@ def f_no_copy(x):
 
 
 def test_no_mutate_but_looks_like():
-
     # GH 8467
     # first shows mutation indicator
     # second does not, but should yield the same results
     df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
 
-    result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
-    result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
+    result1 = df.groupby("key", group_keys=True, as_index=False).apply(
+        lambda x: x[:].key
+    )
+    result2 = df.groupby("key", group_keys=True, as_index=False).apply(lambda x: x.key)
 
     tm.assert_series_equal(result1, result2)
 
@@ -1086,7 +1088,7 @@ def test_consistency_name():
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_name_propagation(df):
+def test_groupby_name_propagation(df, as_index):
     # GH 6124
     def summarize(df, name=None):
         return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)
@@ -1097,12 +1099,14 @@ def summarize_random_name(df):
         # inconsistent.
         return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])
 
-    metrics = df.groupby("A").apply(summarize)
+    metrics = df.groupby("A", as_index=as_index).apply(summarize)
     assert metrics.columns.name is None
-    metrics = df.groupby("A").apply(summarize, "metrics")
+    metrics = df.groupby("A", as_index=as_index).apply(summarize, "metrics")
     assert metrics.columns.name == "metrics"
-    metrics = df.groupby("A").apply(summarize_random_name)
-    assert metrics.columns.name is None
+
+    if not as_index:
+        metrics = df.groupby("A", as_index=as_index).apply(summarize_random_name)
+        assert metrics.columns.name is None
 
@@ -1351,12 +1355,16 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     _check_groupby(df, result, ["a", "b"], "d")
 
 
-def test_dont_clobber_name_column():
+def test_dont_clobber_name_column(as_index):
     df = DataFrame(
         {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
     )
 
-    result = df.groupby("key").apply(lambda x: x)
+    result = df.groupby("key", as_index=as_index).apply(lambda x: x)
+
+    if as_index:
+        df.pop("key")
+
     tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
index 8967ef06f50fb..793a81dd33cc2 100644
--- a/pandas/tests/groupby/test_transform.py
+++ b/pandas/tests/groupby/test_transform.py
@@ -480,7 +480,7 @@ def test_groupby_transform_with_nan_group():
     tm.assert_series_equal(result, expected)
 
 
-def test_transform_mixed_type():
+def test_transform_mixed_type(as_index):
    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
     df = DataFrame(
         {
@@ -495,16 +495,22 @@ def f(group):
         group["g"] = group["d"] * 2
         return group[:1]
 
-    grouped = df.groupby("c")
+    grouped = df.groupby("c", as_index=as_index)
     result = grouped.apply(f)
 
     assert result["d"].dtype == np.float64
 
     # this is by definition a mutating operation!
with pd.option_context("mode.chained_assignment", None): - for key, group in grouped: + for index, (key, group) in enumerate(grouped): res = f(group) - tm.assert_frame_equal(res, result.loc[key]) + if as_index: + res = res.drop("c", 1) + k = key + else: + k = index + + tm.assert_frame_equal(res, result.loc[k]) def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 03c1445e099a0..90418492af0b7 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -260,9 +260,9 @@ def test_resample_groupby_with_label(): ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) - expected = DataFrame( - data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex - ) + # GH 28549 col0 should not be included in the output since + # it is a grouper key + expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex) tm.assert_frame_equal(result, expected)