Skip to content

Commit dd13032

Browse files
authored
DEPR: Change numeric_only to False in various groupby ops (#49892)
* DEPR: Change numeric_only to False in various groupby ops * Remove FIXME
1 parent ad98c2b commit dd13032

File tree

7 files changed

+73
-115
lines changed

7 files changed

+73
-115
lines changed

doc/source/whatsnew/v2.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ Removal of prior version deprecations/changes
572572
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
573573
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
574574
- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
575-
- Changed default of ``numeric_only`` to ``False`` in :meth:`.DataFrameGroupBy.sum` and :meth:`.DataFrameGroupBy.mean` (:issue:`46072`)
575+
- Changed default of ``numeric_only`` to ``False`` in various :class:`.DataFrameGroupBy` methods (:issue:`46072`)
576576
- Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`)
577577
-
578578

pandas/core/groupby/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2220,7 +2220,7 @@ def skew(
22202220
self,
22212221
axis: Axis | None | lib.NoDefault = lib.no_default,
22222222
skipna: bool = True,
2223-
numeric_only: bool | lib.NoDefault = lib.no_default,
2223+
numeric_only: bool = False,
22242224
**kwargs,
22252225
) -> DataFrame:
22262226
result = self._op_via_apply(

pandas/core/groupby/groupby.py

+22-18
Original file line numberDiff line numberDiff line change
@@ -2136,7 +2136,7 @@ def mean(
21362136
@final
21372137
@Substitution(name="groupby")
21382138
@Appender(_common_see_also)
2139-
def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
2139+
def median(self, numeric_only: bool = False):
21402140
"""
21412141
Compute median of groups, excluding missing values.
21422142
@@ -2173,7 +2173,7 @@ def std(
21732173
ddof: int = 1,
21742174
engine: str | None = None,
21752175
engine_kwargs: dict[str, bool] | None = None,
2176-
numeric_only: bool | lib.NoDefault = lib.no_default,
2176+
numeric_only: bool = False,
21772177
):
21782178
"""
21792179
Compute standard deviation of groups, excluding missing values.
@@ -2202,11 +2202,15 @@ def std(
22022202
22032203
.. versionadded:: 1.4.0
22042204
2205-
numeric_only : bool, default True
2205+
numeric_only : bool, default False
22062206
Include only `float`, `int` or `boolean` data.
22072207
22082208
.. versionadded:: 1.5.0
22092209
2210+
.. versionchanged:: 2.0.0
2211+
2212+
numeric_only now defaults to ``False``.
2213+
22102214
Returns
22112215
-------
22122216
Series or DataFrame
@@ -2236,7 +2240,6 @@ def std(
22362240
post_processing=lambda vals, inference: np.sqrt(vals),
22372241
ddof=ddof,
22382242
)
2239-
self._maybe_warn_numeric_only_depr("std", result, numeric_only)
22402243
return result
22412244

22422245
@final
@@ -2247,7 +2250,7 @@ def var(
22472250
ddof: int = 1,
22482251
engine: str | None = None,
22492252
engine_kwargs: dict[str, bool] | None = None,
2250-
numeric_only: bool | lib.NoDefault = lib.no_default,
2253+
numeric_only: bool = False,
22512254
):
22522255
"""
22532256
Compute variance of groups, excluding missing values.
@@ -2276,11 +2279,15 @@ def var(
22762279
22772280
.. versionadded:: 1.4.0
22782281
2279-
numeric_only : bool, default True
2282+
numeric_only : bool, default False
22802283
Include only `float`, `int` or `boolean` data.
22812284
22822285
.. versionadded:: 1.5.0
22832286
2287+
.. versionchanged:: 2.0.0
2288+
2289+
numeric_only now defaults to ``False``.
2290+
22842291
Returns
22852292
-------
22862293
Series or DataFrame
@@ -2301,7 +2308,7 @@ def var(
23012308
@final
23022309
@Substitution(name="groupby")
23032310
@Appender(_common_see_also)
2304-
def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
2311+
def sem(self, ddof: int = 1, numeric_only: bool = False):
23052312
"""
23062313
Compute standard error of the mean of groups, excluding missing values.
23072314
@@ -2317,23 +2324,22 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default
23172324
23182325
.. versionadded:: 1.5.0
23192326
2327+
.. versionchanged:: 2.0.0
2328+
2329+
numeric_only now defaults to ``False``.
2330+
23202331
Returns
23212332
-------
23222333
Series or DataFrame
23232334
Standard error of the mean of values within each group.
23242335
"""
23252336
# Resolve numeric_only so that std doesn't warn
2326-
numeric_only_bool = self._resolve_numeric_only("sem", numeric_only, axis=0)
2327-
if (
2328-
numeric_only_bool
2329-
and self.obj.ndim == 1
2330-
and not is_numeric_dtype(self.obj.dtype)
2331-
):
2337+
if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
23322338
raise TypeError(
23332339
f"{type(self).__name__}.sem called with "
23342340
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
23352341
)
2336-
result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
2342+
result = self.std(ddof=ddof, numeric_only=numeric_only)
23372343
self._maybe_warn_numeric_only_depr("sem", result, numeric_only)
23382344

23392345
if result.ndim == 1:
@@ -2411,10 +2417,8 @@ def sum(
24112417
return self._reindex_output(result, fill_value=0)
24122418

24132419
@final
2414-
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
2415-
def prod(
2416-
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
2417-
):
2420+
@doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
2421+
def prod(self, numeric_only: bool = False, min_count: int = 0):
24182422
return self._agg_general(
24192423
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
24202424
)

pandas/tests/groupby/aggregate/test_aggregate.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -239,10 +239,7 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype
239239
[[1, 2, 3, 4, 5, 6]] * 3,
240240
columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
241241
).astype({("a", "j"): dtype, ("b", "j"): dtype})
242-
warn = FutureWarning if func == "std" else None
243-
msg = "The default value of numeric_only"
244-
with tm.assert_produces_warning(warn, match=msg):
245-
result = df.groupby(level=1, axis=1).agg(func)
242+
result = df.groupby(level=1, axis=1).agg(func)
246243
expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
247244
result_dtype_dict
248245
)
@@ -266,10 +263,7 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
266263
columns=Index([10, 20, 10, 20], name="x"),
267264
dtype="int64",
268265
).astype({10: "Int64"})
269-
warn = FutureWarning if func == "std" else None
270-
msg = "The default value of numeric_only"
271-
with tm.assert_produces_warning(warn, match=msg):
272-
result = df.groupby("x", axis=1).agg(func)
266+
result = df.groupby("x", axis=1).agg(func)
273267
expected = DataFrame(
274268
data=expected_data,
275269
index=Index([0, 1, 0], name="y"),

pandas/tests/groupby/test_function.py

+17-14
Original file line numberDiff line numberDiff line change
@@ -169,12 +169,9 @@ def test_averages(self, df, method):
169169
],
170170
)
171171

172-
if method == "mean":
173-
with pytest.raises(TypeError, match="[Cc]ould not convert"):
174-
getattr(gb, method)()
175-
result = getattr(gb, method)(numeric_only=True)
176-
else:
177-
result = getattr(gb, method)()
172+
with pytest.raises(TypeError, match="[Cc]ould not convert"):
173+
getattr(gb, method)()
174+
result = getattr(gb, method)(numeric_only=True)
178175
tm.assert_frame_equal(result.reindex_like(expected), expected)
179176

180177
expected_columns = expected.columns
@@ -276,11 +273,12 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
276273
)
277274
with pytest.raises(exception, match=msg):
278275
getattr(gb, method)()
279-
elif method in ("sum", "mean"):
276+
elif method in ("sum", "mean", "median", "prod"):
280277
msg = "|".join(
281278
[
282279
"category type does not support sum operations",
283-
"Could not convert",
280+
"[Cc]ould not convert",
281+
"can't multiply sequence by non-int of type 'str'",
284282
]
285283
)
286284
with pytest.raises(exception, match=msg):
@@ -1397,18 +1395,18 @@ def test_groupby_sum_timedelta_with_nat():
13971395
("last", False, True),
13981396
("max", False, True),
13991397
("mean", False, True),
1400-
("median", True, True),
1398+
("median", False, True),
14011399
("min", False, True),
14021400
("nth", False, False),
14031401
("nunique", False, False),
14041402
("pct_change", False, False),
1405-
("prod", True, True),
1403+
("prod", False, True),
14061404
("quantile", True, True),
1407-
("sem", True, True),
1408-
("skew", True, True),
1409-
("std", True, True),
1405+
("sem", False, True),
1406+
("skew", False, True),
1407+
("std", False, True),
14101408
("sum", False, True),
1411-
("var", True, True),
1409+
("var", False, True),
14121410
],
14131411
)
14141412
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@@ -1592,6 +1590,11 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
15921590
warn_msg = ""
15931591
err_category = TypeError
15941592
err_msg = "Series.skew does not allow numeric_only=True with non-numeric"
1593+
elif groupby_func == "sem":
1594+
warn_category = None
1595+
warn_msg = ""
1596+
err_category = TypeError
1597+
err_msg = "called with numeric_only=True and dtype object"
15951598
else:
15961599
warn_category = FutureWarning
15971600
warn_msg = "This will raise a TypeError"

pandas/tests/groupby/test_groupby.py

+29-69
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import numpy as np
55
import pytest
66

7-
from pandas._libs import lib
87
from pandas.compat import IS64
98
from pandas.errors import (
109
PerformanceWarning,
@@ -909,64 +908,37 @@ def test_keep_nuisance_agg(df, agg_function):
909908
"agg_function",
910909
["sum", "mean", "prod", "std", "var", "sem", "median"],
911910
)
912-
@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False])
911+
@pytest.mark.parametrize("numeric_only", [True, False])
913912
def test_omit_nuisance_agg(df, agg_function, numeric_only):
914913
# GH 38774, GH 38815
915-
if numeric_only is lib.no_default or (not numeric_only and agg_function != "sum"):
916-
# sum doesn't drop strings
917-
warn = FutureWarning
918-
else:
919-
warn = None
920-
921914
grouped = df.groupby("A")
922915

923916
no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
924-
if agg_function in no_drop_nuisance and numeric_only is False:
917+
if agg_function in no_drop_nuisance and not numeric_only:
925918
# Added numeric_only as part of GH#46560; these do not drop nuisance
926919
# columns when numeric_only is False
927920
klass = ValueError if agg_function in ("std", "sem") else TypeError
928921
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
929922
with pytest.raises(klass, match=msg):
930923
getattr(grouped, agg_function)(numeric_only=numeric_only)
931924
else:
932-
if numeric_only is lib.no_default:
933-
msg = (
934-
f"The default value of numeric_only in DataFrameGroupBy.{agg_function}"
935-
)
936-
else:
937-
msg = "Dropping invalid columns"
938-
with tm.assert_produces_warning(warn, match=msg):
939-
result = getattr(grouped, agg_function)(numeric_only=numeric_only)
940-
if (
941-
(numeric_only is lib.no_default or not numeric_only)
942-
# These methods drop non-numeric columns even when numeric_only is False
943-
and agg_function not in ("mean", "prod", "median")
944-
):
925+
result = getattr(grouped, agg_function)(numeric_only=numeric_only)
926+
if not numeric_only and agg_function == "sum":
927+
# sum is successful on column B
945928
columns = ["A", "B", "C", "D"]
946929
else:
947930
columns = ["A", "C", "D"]
948-
if agg_function == "sum" and numeric_only is False:
949-
# sum doesn't drop nuisance string columns
950-
warn = None
951-
elif agg_function in ("sum", "std", "var", "sem") and numeric_only is not True:
952-
warn = FutureWarning
953-
else:
954-
warn = None
955-
msg = "The default value of numeric_only"
956-
with tm.assert_produces_warning(warn, match=msg):
957-
expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
958-
numeric_only=numeric_only
959-
)
931+
expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
932+
numeric_only=numeric_only
933+
)
960934
tm.assert_frame_equal(result, expected)
961935

962936

963-
def test_omit_nuisance_warnings(df):
937+
def test_raise_on_nuisance_python_single(df):
964938
# GH 38815
965-
with tm.assert_produces_warning(FutureWarning, filter_level="always"):
966-
grouped = df.groupby("A")
967-
result = grouped.skew()
968-
expected = df.loc[:, ["A", "C", "D"]].groupby("A").skew()
969-
tm.assert_frame_equal(result, expected)
939+
grouped = df.groupby("A")
940+
with pytest.raises(TypeError, match="could not convert"):
941+
grouped.skew()
970942

971943

972944
def test_raise_on_nuisance_python_multiple(three_group):
@@ -2012,14 +1984,9 @@ def get_result(**kwargs):
20121984
if df.dtypes[0].kind == "M":
20131985
# GH#41291
20141986
# datetime64 -> prod and sum are invalid
2015-
if op == "sum":
2016-
with pytest.raises(
2017-
TypeError, match="datetime64 type does not support"
2018-
):
2019-
get_result()
2020-
result = get_result(numeric_only=True)
2021-
else:
2022-
result = get_result()
1987+
with pytest.raises(TypeError, match="datetime64 type does not support"):
1988+
get_result()
1989+
result = get_result(numeric_only=True)
20231990

20241991
# with numeric_only=True, these are dropped, and we get
20251992
# an empty DataFrame back
@@ -2030,14 +1997,9 @@ def get_result(**kwargs):
20301997
elif isinstance(values, Categorical):
20311998
# GH#41291
20321999
# Categorical doesn't implement sum or prod
2033-
if op == "sum":
2034-
with pytest.raises(
2035-
TypeError, match="category type does not support"
2036-
):
2037-
get_result()
2038-
result = get_result(numeric_only=True)
2039-
else:
2040-
result = get_result()
2000+
with pytest.raises(TypeError, match="category type does not support"):
2001+
get_result()
2002+
result = get_result(numeric_only=True)
20412003

20422004
# with numeric_only=True, these are dropped, and we get
20432005
# an empty DataFrame back
@@ -2053,24 +2015,22 @@ def get_result(**kwargs):
20532015
return
20542016

20552017
elif df.dtypes[0] == object:
2056-
# FIXME: the test is actually wrong here, xref #41341
20572018
result = get_result()
2058-
# In this case we have list-of-list, will raise TypeError,
2059-
# and subsequently be dropped as nuisance columns
2060-
if op == "sum":
2061-
expected = df.set_index(keys)[["C"]]
2062-
else:
2063-
expected = df.set_index(keys)[[]]
2019+
expected = df.set_index(keys)[["C"]]
20642020
tm.assert_equal(result, expected)
20652021
return
20662022

2067-
if (
2068-
op in ["min", "max", "skew"]
2069-
and isinstance(values, Categorical)
2070-
and len(keys) == 1
2023+
if (op in ["min", "max", "skew"] and isinstance(values, Categorical)) or (
2024+
op == "skew" and df.dtypes[0].kind == "M"
20712025
):
2072-
if op in ("min", "max"):
2073-
with pytest.raises(TypeError, match="Categorical is not ordered"):
2026+
if op == "skew" or len(keys) == 1:
2027+
msg = "|".join(
2028+
[
2029+
"Categorical is not ordered",
2030+
"does not support reduction",
2031+
]
2032+
)
2033+
with pytest.raises(TypeError, match=msg):
20742034
get_result()
20752035
return
20762036
# Categorical doesn't implement, so with numeric_only=True

0 commit comments

Comments
 (0)