Skip to content

BUG: Fix issues with numeric_only deprecation #47481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 28 additions & 22 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1610,17 +1610,20 @@ def idxmax(
numeric_only_arg = numeric_only

def func(df):
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
with warnings.catch_warnings():
# Suppress numeric_only warnings here, will warn below
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmax")
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmax"
result = self._python_apply_general(func, self._obj_with_exclusions)
Expand All @@ -1646,17 +1649,20 @@ def idxmin(
numeric_only_arg = numeric_only

def func(df):
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)
with warnings.catch_warnings():
# Suppress numeric_only warnings here, will warn below
warnings.filterwarnings("ignore", ".*numeric_only in DataFrame.argmin")
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmin"
result = self._python_apply_general(func, self._obj_with_exclusions)
Expand Down
23 changes: 20 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,8 +1716,9 @@ def _cython_agg_general(
kwd_name = "numeric_only"
if how in ["any", "all"]:
kwd_name = "bool_only"
kernel = "sum" if how == "add" else how
raise NotImplementedError(
f"{type(self).__name__}.{how} does not implement {kwd_name}."
f"{type(self).__name__}.{kernel} does not implement {kwd_name}."
)
elif not is_ser:
data = data.get_numeric_data(copy=False)
Expand Down Expand Up @@ -2194,10 +2195,16 @@ def std(

return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
else:
# Resolve numeric_only so that var doesn't warn
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
if numeric_only_bool and self.obj.ndim == 1:
raise NotImplementedError(
f"{type(self).__name__}.std does not implement numeric_only."
)
result = self._get_cythonized_result(
libgroupby.group_var,
cython_dtype=np.dtype(np.float64),
numeric_only=numeric_only,
numeric_only=numeric_only_bool,
needs_counts=True,
post_processing=lambda vals, inference: np.sqrt(vals),
ddof=ddof,
Expand Down Expand Up @@ -2296,7 +2303,13 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default
Series or DataFrame
Standard error of the mean of values within each group.
"""
result = self.std(ddof=ddof, numeric_only=numeric_only)
# Reolve numeric_only so that std doesn't warn
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
if numeric_only_bool and self.obj.ndim == 1:
raise NotImplementedError(
f"{type(self).__name__}.sem does not implement numeric_only."
)
result = self.std(ddof=ddof, numeric_only=numeric_only_bool)
self._maybe_warn_numeric_only_depr("sem", result, numeric_only)

if result.ndim == 1:
Expand Down Expand Up @@ -3167,6 +3180,10 @@ def quantile(
b 3.0
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0)
if numeric_only_bool and self.obj.ndim == 1:
raise NotImplementedError(
f"{type(self).__name__}.quantile does not implement numeric_only"
)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
if is_object_dtype(vals):
Expand Down
75 changes: 73 additions & 2 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def test_idxmax(self, gb):
# non-cython calls should not include the grouper
expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
expected.index.name = "A"
msg = "The default value of numeric_only"
msg = "The default value of numeric_only in DataFrameGroupBy.idxmax"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.idxmax()
tm.assert_frame_equal(result, expected)
Expand All @@ -317,7 +317,7 @@ def test_idxmin(self, gb):
# non-cython calls should not include the grouper
expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
expected.index.name = "A"
msg = "The default value of numeric_only"
msg = "The default value of numeric_only in DataFrameGroupBy.idxmin"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.idxmin()
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1356,6 +1356,77 @@ def test_deprecate_numeric_only(
method(*args, **kwargs)


def test_deprecate_numeric_only_series(groupby_func, request):
# GH#46560
if groupby_func in ("backfill", "mad", "pad", "tshift"):
pytest.skip("method is deprecated")
elif groupby_func == "corrwith":
msg = "corrwith is not implemented on SeriesGroupBy"
request.node.add_marker(pytest.mark.xfail(reason=msg))

ser = Series(list("xyz"))
gb = ser.groupby([0, 0, 1])

if groupby_func == "corrwith":
args = (ser,)
elif groupby_func == "corr":
args = (ser,)
elif groupby_func == "cov":
args = (ser,)
elif groupby_func == "nth":
args = (0,)
elif groupby_func == "fillna":
args = (True,)
elif groupby_func == "take":
args = ([0],)
elif groupby_func == "quantile":
args = (0.5,)
else:
args = ()
method = getattr(gb, groupby_func)

try:
_ = method(*args)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm, all groupby_func not expected to fail so that's why try/except is used here and makes it harder to use pytest.raises?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i.e. can this be tested like

if groupby_func in methods_ok:
    method(*args)
elif groupby_func in methods_raise_typeerror:
    with pytest.raises(TypeError, ...) as err:
        ...
elif groupby_func in methods_raise_valueerror:
    with pytest.raises(ValueError, ...) as err:
        ...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, none are successful. Some would be if the dtype was not object, but I plan to deprecate that in a follow up assuming no objection. Changed over to using pytest.raises.

except (TypeError, ValueError) as err:
# ops that only work on numeric dtypes
assert groupby_func in (
"corr",
"cov",
"cummax",
"cummin",
"cumprod",
"cumsum",
"diff",
"idxmax",
"idxmin",
"mean",
"median",
"pct_change",
"prod",
"quantile",
"sem",
"skew",
"std",
"var",
)
assert (
"could not convert" in str(err).lower()
or "unsupported operand type" in str(err)
or "not allowed for this dtype" in str(err)
or "can't multiply sequence by non-int" in str(err)
or "cannot be performed against 'object' dtypes" in str(err)
or "is not supported for object dtype" in str(err)
), str(err)

msgs = (
"got an unexpected keyword argument 'numeric_only'",
f"{groupby_func} does not implement numeric_only",
f"{groupby_func} is not supported for object dtype",
)
with pytest.raises((NotImplementedError, TypeError), match=f"({'|'.join(msgs)})"):
_ = method(*args, numeric_only=True)


@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
"kwargs",
Expand Down