Skip to content

Commit 0a58c03

Browse files
authored
DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample (#49665)
* DEPR: Enforce deprecation of dropping columns when numeric_only=False in groupby / resample
* Change to TypeError
* Better error message
1 parent 2517199 commit 0a58c03

File tree

7 files changed

+110
-87
lines changed

7 files changed

+110
-87
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,7 @@ Removal of prior version deprecations/changes
560560
- Enforced deprecation of ``numeric_only=None`` (the default) in DataFrame reductions that would silently drop columns that raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`)
561561
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
562562
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
563+
- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
563564
-
564565

565566
.. ---------------------------------------------------------------------------

pandas/core/groupby/generic.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1357,7 +1357,13 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
13571357

13581358
# We could use `mgr.apply` here and not have to set_axis, but
13591359
# we would have to do shape gymnastics for ArrayManager compat
1360-
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
1360+
try:
1361+
res_mgr = mgr.grouped_reduce(
1362+
arr_func, ignore_failures=numeric_only is lib.no_default
1363+
)
1364+
except NotImplementedError as err:
1365+
# For NotImplementedError, args[0] is the error message
1366+
raise TypeError(err.args[0]) from err
13611367
res_mgr.set_axis(1, mgr.axes[1])
13621368

13631369
if len(res_mgr) < orig_mgr_len:

pandas/core/groupby/groupby.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -1655,6 +1655,7 @@ def _agg_general(
16551655
alt=npfunc,
16561656
numeric_only=numeric_only,
16571657
min_count=min_count,
1658+
ignore_failures=numeric_only is lib.no_default,
16581659
)
16591660
return result.__finalize__(self.obj, method="groupby")
16601661

@@ -2123,6 +2124,7 @@ def mean(
21232124
"mean",
21242125
alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
21252126
numeric_only=numeric_only,
2127+
ignore_failures=numeric_only is lib.no_default,
21262128
)
21272129
return result.__finalize__(self.obj, method="groupby")
21282130

@@ -2152,6 +2154,7 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
21522154
"median",
21532155
alt=lambda x: Series(x).median(numeric_only=numeric_only_bool),
21542156
numeric_only=numeric_only,
2157+
ignore_failures=numeric_only is lib.no_default,
21552158
)
21562159
return result.__finalize__(self.obj, method="groupby")
21572160

@@ -3756,7 +3759,9 @@ def blk_func(values: ArrayLike) -> ArrayLike:
37563759
if numeric_only_bool:
37573760
mgr = mgr.get_numeric_data()
37583761

3759-
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
3762+
res_mgr = mgr.grouped_reduce(
3763+
blk_func, ignore_failures=numeric_only is lib.no_default
3764+
)
37603765

37613766
if not is_ser and len(res_mgr.items) != orig_mgr_len:
37623767
howstr = how.replace("group_", "")

pandas/tests/groupby/test_function.py

+74-69
Original file line numberDiff line numberDiff line change
@@ -163,14 +163,12 @@ def test_averages(self, df, method):
163163
"int",
164164
"float",
165165
"category_int",
166-
"datetime",
167-
"datetimetz",
168-
"timedelta",
169166
],
170167
)
171168

172-
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
173-
result = getattr(gb, method)(numeric_only=False)
169+
with pytest.raises(TypeError, match="[Cc]ould not convert"):
170+
getattr(gb, method)(numeric_only=False)
171+
result = getattr(gb, method)()
174172
tm.assert_frame_equal(result.reindex_like(expected), expected)
175173

176174
expected_columns = expected.columns
@@ -252,30 +250,35 @@ def test_cummin_cummax(self, df, method):
252250
def _check(self, df, method, expected_columns, expected_columns_numeric):
253251
gb = df.groupby("group")
254252

255-
# cummin, cummax dont have numeric_only kwarg, always use False
256-
warn = None
257-
if method in ["cummin", "cummax"]:
258-
# these dont have numeric_only kwarg, always use False
259-
warn = FutureWarning
260-
elif method in ["min", "max"]:
261-
# these have numeric_only kwarg, but default to False
262-
warn = FutureWarning
263-
264-
with tm.assert_produces_warning(
265-
warn, match="Dropping invalid columns", raise_on_extra_warnings=False
266-
):
253+
if method in ("min", "max", "cummin", "cummax"):
254+
# The methods default to numeric_only=False and raise TypeError
255+
msg = "|".join(
256+
[
257+
"Categorical is not ordered",
258+
"function is not implemented for this dtype",
259+
]
260+
)
261+
with pytest.raises(TypeError, match=msg):
262+
getattr(gb, method)()
263+
else:
267264
result = getattr(gb, method)()
268-
269-
tm.assert_index_equal(result.columns, expected_columns_numeric)
270-
271-
# GH#41475 deprecated silently ignoring nuisance columns
272-
warn = None
273-
if len(expected_columns) < len(gb._obj_with_exclusions.columns):
274-
warn = FutureWarning
275-
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
265+
tm.assert_index_equal(result.columns, expected_columns_numeric)
266+
267+
if method not in ("first", "last"):
268+
msg = "|".join(
269+
[
270+
"[Cc]ould not convert",
271+
"Categorical is not ordered",
272+
"category type does not support",
273+
"can't multiply sequence",
274+
"function is not implemented for this dtype",
275+
]
276+
)
277+
with pytest.raises(TypeError, match=msg):
278+
getattr(gb, method)(numeric_only=False)
279+
else:
276280
result = getattr(gb, method)(numeric_only=False)
277-
278-
tm.assert_index_equal(result.columns, expected_columns)
281+
tm.assert_index_equal(result.columns, expected_columns)
279282

280283

281284
class TestGroupByNonCythonPaths:
@@ -1323,45 +1326,45 @@ def test_groupby_sum_timedelta_with_nat():
13231326

13241327

13251328
@pytest.mark.parametrize(
1326-
"kernel, numeric_only_default, drops_nuisance, has_arg",
1329+
"kernel, numeric_only_default, has_arg",
13271330
[
1328-
("all", False, False, False),
1329-
("any", False, False, False),
1330-
("bfill", False, False, False),
1331-
("corr", True, False, True),
1332-
("corrwith", True, False, True),
1333-
("cov", True, False, True),
1334-
("cummax", False, True, True),
1335-
("cummin", False, True, True),
1336-
("cumprod", True, True, True),
1337-
("cumsum", True, True, True),
1338-
("diff", False, False, False),
1339-
("ffill", False, False, False),
1340-
("fillna", False, False, False),
1341-
("first", False, False, True),
1342-
("idxmax", True, False, True),
1343-
("idxmin", True, False, True),
1344-
("last", False, False, True),
1345-
("max", False, True, True),
1346-
("mean", True, True, True),
1347-
("median", True, True, True),
1348-
("min", False, True, True),
1349-
("nth", False, False, False),
1350-
("nunique", False, False, False),
1351-
("pct_change", False, False, False),
1352-
("prod", True, True, True),
1353-
("quantile", True, False, True),
1354-
("sem", True, True, True),
1355-
("skew", True, False, True),
1356-
("std", True, True, True),
1357-
("sum", True, True, True),
1358-
("var", True, False, True),
1331+
("all", False, False),
1332+
("any", False, False),
1333+
("bfill", False, False),
1334+
("corr", True, True),
1335+
("corrwith", True, True),
1336+
("cov", True, True),
1337+
("cummax", False, True),
1338+
("cummin", False, True),
1339+
("cumprod", True, True),
1340+
("cumsum", True, True),
1341+
("diff", False, False),
1342+
("ffill", False, False),
1343+
("fillna", False, False),
1344+
("first", False, True),
1345+
("idxmax", True, True),
1346+
("idxmin", True, True),
1347+
("last", False, True),
1348+
("max", False, True),
1349+
("mean", True, True),
1350+
("median", True, True),
1351+
("min", False, True),
1352+
("nth", False, False),
1353+
("nunique", False, False),
1354+
("pct_change", False, False),
1355+
("prod", True, True),
1356+
("quantile", True, True),
1357+
("sem", True, True),
1358+
("skew", True, True),
1359+
("std", True, True),
1360+
("sum", True, True),
1361+
("var", True, True),
13591362
],
13601363
)
13611364
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
13621365
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
13631366
def test_deprecate_numeric_only(
1364-
kernel, numeric_only_default, drops_nuisance, has_arg, numeric_only, keys
1367+
kernel, numeric_only_default, has_arg, numeric_only, keys
13651368
):
13661369
# GH#46072
13671370
# GH#41475: nuisance columns are no longer dropped when numeric_only=False; the op raises instead
@@ -1380,10 +1383,9 @@ def test_deprecate_numeric_only(
13801383
# Cases where b does not appear in the result
13811384
numeric_only is True
13821385
or (numeric_only is lib.no_default and numeric_only_default)
1383-
or drops_nuisance
13841386
)
13851387
):
1386-
if numeric_only is True or (not numeric_only_default and not drops_nuisance):
1388+
if numeric_only is True or not numeric_only_default:
13871389
warn = None
13881390
else:
13891391
warn = FutureWarning
@@ -1408,14 +1410,17 @@ def test_deprecate_numeric_only(
14081410
assert "b" in result.columns
14091411
elif has_arg or kernel in ("idxmax", "idxmin"):
14101412
assert numeric_only is not True
1411-
assert not drops_nuisance
14121413
# kernels that are successful on any dtype were above; this will fail
1413-
msg = (
1414-
"(not allowed for this dtype"
1415-
"|must be a string or a number"
1416-
"|cannot be performed against 'object' dtypes"
1417-
"|must be a string or a real number"
1418-
"|unsupported operand type)"
1414+
msg = "|".join(
1415+
[
1416+
"not allowed for this dtype",
1417+
"must be a string or a number",
1418+
"cannot be performed against 'object' dtypes",
1419+
"must be a string or a real number",
1420+
"unsupported operand type",
1421+
"not supported between instances of",
1422+
"function is not implemented for this dtype",
1423+
]
14191424
)
14201425
with pytest.raises(TypeError, match=msg):
14211426
method(*args, **kwargs)

pandas/tests/groupby/test_groupby.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -915,11 +915,13 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
915915

916916
grouped = df.groupby("A")
917917

918-
if agg_function in ("var", "std", "sem") and numeric_only is False:
918+
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
919+
if agg_function in no_drop_nuisance and numeric_only is False:
919920
# Added numeric_only as part of GH#46560; these do not drop nuisance
920921
# columns when numeric_only is False
921-
klass = TypeError if agg_function == "var" else ValueError
922-
with pytest.raises(klass, match="could not convert string to float"):
922+
klass = ValueError if agg_function in ("std", "sem") else TypeError
923+
msg = "|".join(["[Cc]ould not convert", "can't multiply sequence"])
924+
with pytest.raises(klass, match=msg):
923925
getattr(grouped, agg_function)(numeric_only=numeric_only)
924926
else:
925927
if numeric_only is lib.no_default:
@@ -2049,10 +2051,13 @@ def get_result():
20492051
and isinstance(values, Categorical)
20502052
and len(keys) == 1
20512053
):
2054+
if op in ("min", "max"):
2055+
with pytest.raises(TypeError, match="Categorical is not ordered"):
2056+
get_result()
2057+
return
20522058
# Categorical doesn't implement, so with numeric_only=True
20532059
# these are dropped and we get an empty DataFrame back
20542060
result = get_result()
2055-
expected = df.set_index(keys)[[]]
20562061

20572062
# with numeric_only=True, these are dropped, and we get
20582063
# an empty DataFrame back

pandas/tests/groupby/test_min_max.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,17 @@ def test_max_min_object_multiple_columns(using_array_manager):
4848

4949
gb = df.groupby("A")
5050

51-
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
52-
result = gb.max(numeric_only=False)
51+
with pytest.raises(TypeError, match="not supported between instances"):
52+
gb.max(numeric_only=False)
53+
result = gb[["C"]].max()
5354
# "max" is valid for column "C" but not for "B"
5455
ei = Index([1, 2, 3], name="A")
5556
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
5657
tm.assert_frame_equal(result, expected)
5758

58-
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"):
59-
result = gb.min(numeric_only=False)
59+
with pytest.raises(TypeError, match="not supported between instances"):
60+
gb.min(numeric_only=False)
61+
result = gb[["C"]].min()
6062
# "min" is valid for column "C" but not for "B"
6163
ei = Index([1, 2, 3], name="A")
6264
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)

pandas/tests/resample/test_resample_api.py

+7-8
Original file line numberDiff line numberDiff line change
@@ -821,8 +821,8 @@ def test_end_and_end_day_origin(
821821
("sum", False, {"cat": ["cat_1cat_2"], "num": [25]}),
822822
("sum", lib.no_default, {"num": [25]}),
823823
("prod", True, {"num": [100]}),
824-
("prod", False, {"num": [100]}),
825-
("prod", lib.no_default, {"num": [100]}),
824+
("prod", False, "can't multiply sequence"),
825+
("prod", lib.no_default, "can't multiply sequence"),
826826
("min", True, {"num": [5]}),
827827
("min", False, {"cat": ["cat_1"], "num": [5]}),
828828
("min", lib.no_default, {"cat": ["cat_1"], "num": [5]}),
@@ -836,10 +836,10 @@ def test_end_and_end_day_origin(
836836
("last", False, {"cat": ["cat_2"], "num": [20]}),
837837
("last", lib.no_default, {"cat": ["cat_2"], "num": [20]}),
838838
("mean", True, {"num": [12.5]}),
839-
("mean", False, {"num": [12.5]}),
839+
("mean", False, "Could not convert"),
840840
("mean", lib.no_default, {"num": [12.5]}),
841841
("median", True, {"num": [12.5]}),
842-
("median", False, {"num": [12.5]}),
842+
("median", False, "could not convert"),
843843
("median", lib.no_default, {"num": [12.5]}),
844844
("std", True, {"num": [10.606601717798213]}),
845845
("std", False, "could not convert string to float"),
@@ -876,15 +876,14 @@ def test_frame_downsample_method(method, numeric_only, expected_data):
876876
msg = (
877877
f"default value of numeric_only in DataFrameGroupBy.{method} is deprecated"
878878
)
879-
elif method in ("prod", "mean", "median") and numeric_only is not True:
880-
warn = FutureWarning
881-
msg = f"Dropping invalid columns in DataFrameGroupBy.{method} is deprecated"
882879
else:
883880
warn = None
884881
msg = ""
885882
with tm.assert_produces_warning(warn, match=msg):
886883
if isinstance(expected_data, str):
887-
klass = TypeError if method == "var" else ValueError
884+
klass = (
885+
TypeError if method in ("var", "mean", "median", "prod") else ValueError
886+
)
888887
with pytest.raises(klass, match=expected_data):
889888
_ = func(**kwargs)
890889
else:

0 commit comments

Comments
 (0)