From bf63b6029da284e9341f88d5a886ee8ec153346b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 31 Mar 2023 16:54:06 -0400 Subject: [PATCH 1/3] DEPR: groupby with as_index=False not including out-of-axis groupings --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/groupby/groupby.py | 17 +++++++++++-- pandas/tests/groupby/test_categorical.py | 8 ++++-- pandas/tests/groupby/test_groupby.py | 27 +++++++++++++++------ pandas/tests/groupby/test_groupby_dropna.py | 8 +++++- pandas/tests/groupby/test_grouping.py | 4 ++- 6 files changed, 52 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 3a749708fb526..cf762bf8c679c 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -122,6 +122,7 @@ Deprecations - Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) - Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) - Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) +- Deprecated :class:`.DataFrameGroupBy` with ``as_index=False`` not including groupings in the result when they are not columns of the DataFrame (:issue:`49519`) - Deprecated logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e591298e2a58e..688f2fe5905ed 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1151,8 +1151,21 @@ def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: ): # GH #28549 # When using .apply(-), name will be in columns already - if in_axis and name not in columns: - result.insert(0, name, lev) + if name not in columns: + if in_axis: + result.insert(0, name, lev) + else: + msg = ( + "A grouping was used that is not in the columns of the " + "DataFrame and so was excluded from the result. This grouping " + "will be included in a future version of pandas. Add the " + "grouping as a column of the DataFrame to silence this warning." + ) + warnings.warn( + message=msg, + category=FutureWarning, + stacklevel=find_stack_level(), + ) return result diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5ecb765e5861e..43b2a8639fe86 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -767,7 +767,9 @@ def test_as_index(): # function grouper f = lambda r: df.loc[r, "A"] - result = df.groupby(["cat", f], as_index=False, observed=True).sum() + msg = "grouping is currently excluded from the result" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["cat", f], as_index=False, observed=True).sum() expected = DataFrame( { "cat": Categorical([1, 2], categories=df.cat.cat.categories), @@ -780,7 +782,9 @@ def test_as_index(): # another not in-axis grouper (conflicting names in index) s = Series(["a", "b", "b"], name="cat") - result = df.groupby(["cat", s], as_index=False, observed=True).sum() + msg = "grouping is currently excluded from the result" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(["cat", s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) # is original index dropped? diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c4c7bee2970d0..cab6da4284ca2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -232,18 +232,28 @@ def f(x, q=None, axis=0): # DataFrame for as_index in [True, False]: df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) - agg_result = df_grouped.agg(np.percentile, 80, axis=0) - apply_result = df_grouped.apply(DataFrame.quantile, 0.8) - expected = df_grouped.quantile(0.8) + warn = None if as_index else FutureWarning + msg = "grouping is currently excluded from the result" + with tm.assert_produces_warning(warn, match=msg): + agg_result = df_grouped.agg(np.percentile, 80, axis=0) + warn = None if as_index else FutureWarning + msg = "grouping is currently excluded from the result" + with tm.assert_produces_warning(warn, match=msg): + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + with tm.assert_produces_warning(warn, match=msg): + expected = df_grouped.quantile(0.8) tm.assert_frame_equal(apply_result, expected, check_names=False) tm.assert_frame_equal(agg_result, expected) apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8]) - expected_seq = df_grouped.quantile([0.4, 0.8]) + with tm.assert_produces_warning(warn, match=msg): + expected_seq = df_grouped.quantile([0.4, 0.8]) tm.assert_frame_equal(apply_result, expected_seq, check_names=False) - agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) + with tm.assert_produces_warning(warn, match=msg): + agg_result = df_grouped.agg(f, q=80) + with tm.assert_produces_warning(warn, match=msg): + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) tm.assert_frame_equal(agg_result, expected) tm.assert_frame_equal(apply_result, expected, check_names=False) @@ -254,7 +264,10 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): tsframe.columns = ["A", "B", "A", "C"] gb = tsframe.groupby(lambda x: x.month, as_index=as_index) - res = gb.agg(np.percentile, 80, axis=0) + warn = None if as_index else FutureWarning + msg = "grouping is currently excluded from the result" + with tm.assert_produces_warning(warn, match=msg): + res = gb.agg(np.percentile, 80, axis=0) ex_data = { 1: tsframe[tsframe.index.month == 1].quantile(0.8), diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index a051b30307a28..bcf365bef94d3 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -574,7 +574,13 @@ def test_categorical_reducers( gb_keepna = df.groupby( keys, dropna=False, observed=observed, sort=sort, as_index=as_index ) - result = getattr(gb_keepna, reduction_func)(*args) + if as_index or index_kind == "range" or reduction_func == "size": + warn = None + else: + warn = FutureWarning + msg = "grouping is currently excluded from the result" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb_keepna, reduction_func)(*args) # size will return a Series, others are DataFrame tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 8e84a48eb7374..8820698484259 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1058,7 +1058,9 @@ def test_grouping_by_key_is_in_axis(): # Currently only in-axis groupings are including in the result when as_index=False; # This is likely to change in the future. - result = gb.sum() + msg = "grouping is currently excluded from the result" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.sum() expected = DataFrame({"b": [1, 2], "c": [7, 5]}) tm.assert_frame_equal(result, expected) From 463ce44ce31cbe23d599097ecae5b1a61f471b84 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 2 Apr 2023 09:58:16 -0400 Subject: [PATCH 2/3] Fix warning msg in tests --- pandas/tests/groupby/test_categorical.py | 4 ++-- pandas/tests/groupby/test_groupby.py | 6 ++---- pandas/tests/groupby/test_groupby_dropna.py | 2 +- pandas/tests/groupby/test_grouping.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 43b2a8639fe86..fa9f1800b227b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -767,7 +767,7 @@ def test_as_index(): # function grouper f = lambda r: df.loc[r, "A"] - msg = "grouping is currently excluded from the result" + msg = "A grouping .* was excluded from the result" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["cat", f], as_index=False, observed=True).sum() expected = DataFrame( @@ -782,7 +782,7 @@ def test_as_index(): # another not in-axis grouper (conflicting names in index) s = Series(["a", "b", "b"], name="cat") - msg = "grouping is currently excluded from the result" + msg = "A grouping .* was excluded from the result" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(["cat", s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 947fb6264b24e..223bb3f507463 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -233,11 +233,9 @@ def f(x, q=None, axis=0): for as_index in [True, False]: df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index) warn = None if as_index else FutureWarning - msg = "grouping is currently excluded from the result" + msg = "A grouping .* was excluded from the result" with tm.assert_produces_warning(warn, match=msg): agg_result = df_grouped.agg(np.percentile, 80, axis=0) - warn = None if as_index else FutureWarning - msg = "grouping is currently excluded from the result" with tm.assert_produces_warning(warn, match=msg): apply_result = df_grouped.apply(DataFrame.quantile, 0.8) with tm.assert_produces_warning(warn, match=msg): @@ -265,7 +263,7 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): gb = tsframe.groupby(lambda x: x.month, as_index=as_index) warn = None if as_index else FutureWarning - msg = "grouping is currently excluded from the result" + msg = "A grouping .* was excluded from the result" with tm.assert_produces_warning(warn, match=msg): res = gb.agg(np.percentile, 80, axis=0) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index bcf365bef94d3..ec61ae105f383 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -578,7 +578,7 @@ def test_categorical_reducers( warn = None else: warn = FutureWarning - msg = "grouping is currently excluded from the result" + msg = "A grouping .* was excluded from the result" with tm.assert_produces_warning(warn, match=msg): result = getattr(gb_keepna, reduction_func)(*args) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e271d2103650f..d4c39fb82e005 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1060,7 +1060,7 @@ def test_grouping_by_key_is_in_axis(): # Currently only in-axis groupings are including in the result when as_index=False; # This is likely to change in the future. - msg = "grouping is currently excluded from the result" + msg = "A grouping .* was excluded from the result" with tm.assert_produces_warning(FutureWarning, match=msg): result = gb.sum() expected = DataFrame({"b": [1, 2], "c": [7, 5]}) From c91078a2974f994e8685b3ffca88023284dd787d Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 11 Apr 2023 22:00:04 -0400 Subject: [PATCH 3/3] fixup docs --- doc/source/whatsnew/v0.15.1.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index a1d4f9d14a905..07139ebad8737 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -70,9 +70,14 @@ API changes current behavior: - .. ipython:: python + .. code-block:: ipython - df.groupby(ts, as_index=False).max() + In [4]: df.groupby(ts, as_index=False).max() + Out[4]: + jim joe + 0 72 83 + 1 77 84 + 2 96 65 - ``groupby`` will not erroneously exclude columns if the column name conflicts with the grouper name (:issue:`8112`):