Skip to content

Commit aa84c19

Browse files
jbrockmendel and JulianWgs
authored and committed
DEPR: dropping nuisance columns in DataFrameGroupby apply, agg, transform (pandas-dev#41475)
1 parent f007bd9 commit aa84c19

File tree

10 files changed

+172
-23
lines changed

10 files changed

+172
-23
lines changed

doc/source/user_guide/groupby.rst

+2
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda
10001000
functions:
10011001

10021002
.. ipython:: python
1003+
:okwarning:
10031004
10041005
grouped = df.groupby("A")
10051006
grouped.agg(lambda x: x.std())
@@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the
10091010
ability to "dispatch" method calls to the groups:
10101011

10111012
.. ipython:: python
1013+
:okwarning:
10121014
10131015
grouped.std()
10141016

doc/source/whatsnew/v1.3.0.rst

+38
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,44 @@ For example:
726726
A 24
727727
dtype: int64
728728
729+
730+
Similarly, when applying a function to :class:`DataFrameGroupBy`, columns on which
731+
the function raises ``TypeError`` are currently silently ignored and dropped
732+
from the result.
733+
734+
This behavior is deprecated. In a future version, the ``TypeError``
735+
will be raised, and users will need to select only valid columns before calling
736+
the function.
737+
738+
For example:
739+
740+
.. ipython:: python
741+
742+
df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)})
743+
gb = df.groupby([1, 1, 2, 2])
744+
745+
*Old behavior*:
746+
747+
.. code-block:: ipython
748+
749+
In [4]: gb.prod(numeric_only=False)
750+
Out[4]:
751+
A
752+
1 2
753+
2 12
754+
755+
*New behavior*:

.. code-block:: ipython
756+
757+
In [5]: gb.prod(numeric_only=False)
758+
...
759+
TypeError: datetime64 type does not support prod operations
760+
761+
In [6]: gb[["A"]].prod(numeric_only=False)
762+
Out[6]:
763+
A
764+
1 2
765+
2 12
766+
729767
.. ---------------------------------------------------------------------------
730768
731769

pandas/core/groupby/generic.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,15 @@ def array_func(values: ArrayLike) -> ArrayLike:
10871087
if not len(new_mgr) and len(orig):
10881088
# If the original Manager was already empty, no need to raise
10891089
raise DataError("No numeric types to aggregate")
1090+
if len(new_mgr) < len(data):
1091+
warnings.warn(
1092+
f"Dropping invalid columns in {type(self).__name__}.{how} "
1093+
"is deprecated. In a future version, a TypeError will be raised. "
1094+
f"Before calling .{how}, select only columns which should be "
1095+
"valid for the function.",
1096+
FutureWarning,
1097+
stacklevel=4,
1098+
)
10901099

10911100
return self._wrap_agged_manager(new_mgr)
10921101

@@ -1283,6 +1292,16 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
12831292
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
12841293
res_mgr.set_axis(1, mgr.axes[1])
12851294

1295+
if len(res_mgr) < len(mgr):
1296+
warnings.warn(
1297+
f"Dropping invalid columns in {type(self).__name__}.{how} "
1298+
"is deprecated. In a future version, a TypeError will be raised. "
1299+
f"Before calling .{how}, select only columns which should be "
1300+
"valid for the transforming function.",
1301+
FutureWarning,
1302+
stacklevel=4,
1303+
)
1304+
12861305
res_df = self.obj._constructor(res_mgr)
12871306
if self.axis == 1:
12881307
res_df = res_df.T
@@ -1420,7 +1439,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
14201439
output[i] = sgb.transform(wrapper)
14211440
except TypeError:
14221441
# e.g. trying to call nanmean with string values
1423-
pass
1442+
warnings.warn(
1443+
f"Dropping invalid columns in {type(self).__name__}.transform "
1444+
"is deprecated. In a future version, a TypeError will be raised. "
1445+
"Before calling .transform, select only columns which should be "
1446+
"valid for the transforming function.",
1447+
FutureWarning,
1448+
stacklevel=5,
1449+
)
14241450
else:
14251451
inds.append(i)
14261452

pandas/core/groupby/groupby.py

+19
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class providing the base-class of operations.
3030
Union,
3131
cast,
3232
)
33+
import warnings
3334

3435
import numpy as np
3536

@@ -1285,6 +1286,14 @@ def _python_agg_general(self, func, *args, **kwargs):
12851286
# if this function is invalid for this dtype, we will ignore it.
12861287
result = self.grouper.agg_series(obj, f)
12871288
except TypeError:
1289+
warnings.warn(
1290+
f"Dropping invalid columns in {type(self).__name__}.agg "
1291+
"is deprecated. In a future version, a TypeError will be raised. "
1292+
"Before calling .agg, select only columns which should be "
1293+
"valid for the aggregating function.",
1294+
FutureWarning,
1295+
stacklevel=3,
1296+
)
12881297
continue
12891298

12901299
key = base.OutputKey(label=name, position=idx)
@@ -2844,6 +2853,16 @@ def _get_cythonized_result(
28442853
vals, inferences = pre_processing(vals)
28452854
except TypeError as err:
28462855
error_msg = str(err)
2856+
howstr = how.replace("group_", "")
2857+
warnings.warn(
2858+
"Dropping invalid columns in "
2859+
f"{type(self).__name__}.{howstr} is deprecated. "
2860+
"In a future version, a TypeError will be raised. "
2861+
f"Before calling .{howstr}, select only columns which "
2862+
"should be valid for the function.",
2863+
FutureWarning,
2864+
stacklevel=3,
2865+
)
28472866
continue
28482867
vals = vals.astype(cython_dtype, copy=False)
28492868
if needs_2d:

pandas/tests/groupby/aggregate/test_aggregate.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,8 @@ def func(ser):
257257
else:
258258
return ser.sum()
259259

260-
result = grouped.aggregate(func)
260+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
261+
result = grouped.aggregate(func)
261262
exp_grouped = three_group.loc[:, three_group.columns != "C"]
262263
expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
263264
tm.assert_frame_equal(result, expected)
@@ -1020,6 +1021,7 @@ def test_mangle_series_groupby(self):
10201021
tm.assert_frame_equal(result, expected)
10211022

10221023
@pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
1024+
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
10231025
def test_with_kwargs(self):
10241026
f1 = lambda x, y, b=1: x.sum() + y + b
10251027
f2 = lambda x, y, b=2: x.sum() + y * b

pandas/tests/groupby/aggregate/test_other.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,16 @@ def test_agg_api():
4444
def peak_to_peak(arr):
4545
return arr.max() - arr.min()
4646

47-
expected = grouped.agg([peak_to_peak])
47+
with tm.assert_produces_warning(
48+
FutureWarning, match="Dropping invalid", check_stacklevel=False
49+
):
50+
expected = grouped.agg([peak_to_peak])
4851
expected.columns = ["data1", "data2"]
49-
result = grouped.agg(peak_to_peak)
52+
53+
with tm.assert_produces_warning(
54+
FutureWarning, match="Dropping invalid", check_stacklevel=False
55+
):
56+
result = grouped.agg(peak_to_peak)
5057
tm.assert_frame_equal(result, expected)
5158

5259

@@ -294,7 +301,8 @@ def raiseException(df):
294301
raise TypeError("test")
295302

296303
with pytest.raises(TypeError, match="test"):
297-
df.groupby(0).agg(raiseException)
304+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
305+
df.groupby(0).agg(raiseException)
298306

299307

300308
def test_series_agg_multikey():

pandas/tests/groupby/test_function.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,15 @@ def test_max_min_object_multiple_columns(using_array_manager):
8787

8888
gb = df.groupby("A")
8989

90-
result = gb.max(numeric_only=False)
90+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
91+
result = gb.max(numeric_only=False)
9192
# "max" is valid for column "C" but not for "B"
9293
ei = Index([1, 2, 3], name="A")
9394
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
9495
tm.assert_frame_equal(result, expected)
9596

96-
result = gb.min(numeric_only=False)
97+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
98+
result = gb.min(numeric_only=False)
9799
# "min" is valid for column "C" but not for "B"
98100
ei = Index([1, 2, 3], name="A")
99101
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
@@ -221,7 +223,10 @@ def test_averages(self, df, method):
221223
],
222224
)
223225

224-
result = getattr(gb, method)(numeric_only=False)
226+
with tm.assert_produces_warning(
227+
FutureWarning, match="Dropping invalid", check_stacklevel=False
228+
):
229+
result = getattr(gb, method)(numeric_only=False)
225230
tm.assert_frame_equal(result.reindex_like(expected), expected)
226231

227232
expected_columns = expected.columns
@@ -303,10 +308,27 @@ def test_cummin_cummax(self, df, method):
303308
def _check(self, df, method, expected_columns, expected_columns_numeric):
304309
gb = df.groupby("group")
305310

306-
result = getattr(gb, method)()
311+
# cummin, cummax dont have numeric_only kwarg, always use False
312+
warn = None
313+
if method in ["cummin", "cummax"]:
314+
# these dont have numeric_only kwarg, always use False
315+
warn = FutureWarning
316+
elif method in ["min", "max"]:
317+
# these have numeric_only kwarg, but default to False
318+
warn = FutureWarning
319+
320+
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
321+
result = getattr(gb, method)()
322+
307323
tm.assert_index_equal(result.columns, expected_columns_numeric)
308324

309-
result = getattr(gb, method)(numeric_only=False)
325+
# GH#41475 deprecated silently ignoring nuisance columns
326+
warn = None
327+
if len(expected_columns) < len(gb._obj_with_exclusions.columns):
328+
warn = FutureWarning
329+
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
330+
result = getattr(gb, method)(numeric_only=False)
331+
310332
tm.assert_index_equal(result.columns, expected_columns)
311333

312334

pandas/tests/groupby/test_groupby.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -923,7 +923,8 @@ def aggfun(ser):
923923
else:
924924
return ser.sum()
925925

926-
agged2 = df.groupby(keys).aggregate(aggfun)
926+
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
927+
agged2 = df.groupby(keys).aggregate(aggfun)
927928
assert len(agged2.columns) + 1 == len(df.columns)
928929

929930

@@ -1757,6 +1758,7 @@ def test_pivot_table_values_key_error():
17571758
@pytest.mark.parametrize(
17581759
"op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"]
17591760
)
1761+
@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning")
17601762
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
17611763
def test_empty_groupby(columns, keys, values, method, op, request):
17621764
# GH8093 & GH26411

pandas/tests/groupby/test_quantile.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ def test_quantile_raises():
155155
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
156156

157157
with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
158-
df.groupby("key").quantile()
158+
with tm.assert_produces_warning(
159+
FutureWarning, match="Dropping invalid columns"
160+
):
161+
df.groupby("key").quantile()
159162

160163

161164
def test_quantile_out_of_bounds_q_raises():
@@ -236,7 +239,11 @@ def test_groupby_quantile_nullable_array(values, q):
236239
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
237240
def test_groupby_quantile_skips_invalid_dtype(q):
238241
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
239-
result = df.groupby("a").quantile(q)
242+
243+
warn = None if isinstance(q, list) else FutureWarning
244+
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
245+
result = df.groupby("a").quantile(q)
246+
240247
expected = df.groupby("a")[["b"]].quantile(q)
241248
tm.assert_frame_equal(result, expected)
242249

0 commit comments

Comments
 (0)