Skip to content

DEPR: Enforce numeric_only=False in groupby sum/mean #49829

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def time_pivot_table_categorical_observed(self):
)

def time_pivot_table_margins_only_column(self):
self.df.pivot_table(columns=["key2", "key3"], margins=True)
self.df.pivot_table(columns=["key1", "key2", "key3"], margins=True)


class Crosstab:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@ Removal of prior version deprecations/changes
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`.DataFrameGroupBy.sum` and :meth:`.DataFrameGroupBy.mean` (:issue:`46072`)
-

.. ---------------------------------------------------------------------------
Expand Down
13 changes: 6 additions & 7 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2046,7 +2046,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
@Substitution(see_also=_common_see_also)
def mean(
self,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
engine: str = "cython",
engine_kwargs: dict[str, bool] | None = None,
):
Expand All @@ -2055,12 +2055,12 @@ def mean(

Parameters
----------
numeric_only : bool, default True
numeric_only : bool, default False
Include only float, int, boolean columns.

.. versionchanged:: 2.0.0

numeric_only no longer accepts ``None``.
numeric_only no longer accepts ``None`` and defaults to ``False``.

engine : str, default None
* ``'cython'`` : Runs the operation through C-extensions from cython.
Expand Down Expand Up @@ -2117,7 +2117,6 @@ def mean(
2 4.0
Name: B, dtype: float64
"""
numeric_only_bool = self._resolve_numeric_only("mean", numeric_only, axis=0)

if maybe_use_numba(engine):
from pandas.core._numba.kernels import sliding_mean
Expand All @@ -2126,7 +2125,7 @@ def mean(
else:
result = self._cython_agg_general(
"mean",
alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool),
alt=lambda x: Series(x).mean(numeric_only=numeric_only),
numeric_only=numeric_only,
)
return result.__finalize__(self.obj, method="groupby")
Expand Down Expand Up @@ -2379,10 +2378,10 @@ def size(self) -> DataFrame | Series:
return self._reindex_output(result, fill_value=0)

@final
@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
@doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
def sum(
self,
numeric_only: bool | lib.NoDefault = lib.no_default,
numeric_only: bool = False,
min_count: int = 0,
engine: str | None = None,
engine_kwargs: dict[str, bool] | None = None,
Expand Down
30 changes: 19 additions & 11 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import pytest

from pandas.core.dtypes.common import is_numeric_dtype
from pandas.core.dtypes.common import (
is_bool_dtype,
is_numeric_dtype,
is_object_dtype,
is_period_dtype,
is_string_dtype,
)

import pandas as pd
import pandas._testing as tm
Expand Down Expand Up @@ -100,17 +106,19 @@ def test_in_numeric_groupby(self, data_for_grouping):
)

dtype = data_for_grouping.dtype
if is_numeric_dtype(dtype) or dtype.name == "decimal":
warn = None
else:
warn = FutureWarning
msg = "The default value of numeric_only"
with tm.assert_produces_warning(warn, match=msg):
result = df.groupby("A").sum().columns

if data_for_grouping.dtype._is_numeric:
if (
is_numeric_dtype(dtype)
or is_bool_dtype(dtype)
or dtype.name == "decimal"
or is_string_dtype(dtype)
or is_period_dtype(dtype)
or is_object_dtype(dtype)
):
expected = pd.Index(["B", "C"])
result = df.groupby("A").sum().columns
else:
expected = pd.Index(["C"])

with pytest.raises(TypeError, match="does not support"):
df.groupby("A").sum().columns
result = df.groupby("A").sum(numeric_only=True).columns
tm.assert_index_equal(result, expected)
4 changes: 1 addition & 3 deletions pandas/tests/generic/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,7 @@ def test_metadata_propagation_indiv_groupby(self):
"D": np.random.randn(8),
}
)
msg = "The default value of numeric_only"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby("A").sum()
result = df.groupby("A").sum()
tm.assert_metadata_equivalent(df, result)

def test_metadata_propagation_indiv_resample(self):
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,10 @@ def test_basic(): # TODO: split this test
gb = df.groupby("A", observed=False)
exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
result = gb.sum()
msg = "category type does not support sum operations"
with pytest.raises(TypeError, match=msg):
gb.sum()
result = gb.sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

# GH 8623
Expand Down Expand Up @@ -338,7 +341,9 @@ def test_observed(observed):

gb = df.groupby(["A", "B"], observed=observed)
exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
expected = DataFrame(
{"values": [1, 2, 3, 4], "C": ["foo", "bar", "foo", "bar"]}, index=exp_index
)
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
Expand Down
22 changes: 17 additions & 5 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,12 @@ def test_averages(self, df, method):
],
)

with pytest.raises(TypeError, match="[Cc]ould not convert"):
getattr(gb, method)(numeric_only=False)
result = getattr(gb, method)()
if method == "mean":
with pytest.raises(TypeError, match="[Cc]ould not convert"):
getattr(gb, method)()
result = getattr(gb, method)(numeric_only=True)
else:
result = getattr(gb, method)()
tm.assert_frame_equal(result.reindex_like(expected), expected)

expected_columns = expected.columns
Expand Down Expand Up @@ -264,6 +267,15 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
)
with pytest.raises(exception, match=msg):
getattr(gb, method)()
elif method in ("sum", "mean"):
msg = "|".join(
[
"category type does not support sum operations",
"Could not convert",
]
)
with pytest.raises(exception, match=msg):
getattr(gb, method)()
else:
result = getattr(gb, method)()
tm.assert_index_equal(result.columns, expected_columns_numeric)
Expand Down Expand Up @@ -1375,7 +1387,7 @@ def test_groupby_sum_timedelta_with_nat():
("idxmin", True, True),
("last", False, True),
("max", False, True),
("mean", True, True),
("mean", False, True),
("median", True, True),
("min", False, True),
("nth", False, False),
Expand All @@ -1386,7 +1398,7 @@ def test_groupby_sum_timedelta_with_nat():
("sem", True, True),
("skew", True, True),
("std", True, True),
("sum", True, True),
("sum", False, True),
("var", True, True),
],
)
Expand Down
Loading