Skip to content

DEPR: groupby numeric_only default #47025

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 18, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,8 @@ retained by specifying ``group_keys=False``.
``numeric_only`` default value
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Across the DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default
+Across the DataFrame and DataFrameGroupBy operations such as
+``min``, ``sum``, and ``idxmax``, the default
value of the ``numeric_only`` argument, if it exists at all, was inconsistent.
Furthermore, operations with the default value ``None`` can lead to surprising
results. (:issue:`46560`)
Expand Down Expand Up @@ -523,6 +524,8 @@ gained the ``numeric_only`` argument.
- :meth:`DataFrame.cov`
- :meth:`DataFrame.idxmin`
- :meth:`DataFrame.idxmax`
- :meth:`.DataFrameGroupBy.cummin`
- :meth:`.DataFrameGroupBy.cummax`
- :meth:`.DataFrameGroupBy.idxmin`
- :meth:`.DataFrameGroupBy.idxmax`
- :meth:`.GroupBy.var`
Expand Down
77 changes: 60 additions & 17 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from pandas._libs import (
Interval,
lib,
reduction as libreduction,
)
from pandas._typing import (
Expand Down Expand Up @@ -1128,18 +1129,24 @@ def _wrap_applied_output_series(
return self._reindex_output(result)

def _cython_transform(
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
self,
how: str,
numeric_only: bool | lib.NoDefault = lib.no_default,
axis: int = 0,
**kwargs,
) -> DataFrame:
assert axis == 0 # handled by caller
# TODO: no tests with self.ndim == 1 for DataFrameGroupBy
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis)

# With self.axis == 0, we have multi-block tests
# e.g. test_rank_min_int, test_cython_transform_frame
# test_transform_numeric_ret
# With self.axis == 1, _get_data_to_aggregate does a transpose
# so we always have a single block.
mgr: Manager2D = self._get_data_to_aggregate()
if numeric_only:
orig_mgr_len = len(mgr)
if numeric_only_bool:
mgr = mgr.get_numeric_data(copy=False)

def arr_func(bvalues: ArrayLike) -> ArrayLike:
Expand All @@ -1152,8 +1159,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
res_mgr.set_axis(1, mgr.axes[1])

if len(res_mgr) < len(mgr):
warn_dropping_nuisance_columns_deprecated(type(self), how)
if len(res_mgr) < orig_mgr_len:
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)

res_df = self.obj._constructor(res_mgr)
if self.axis == 1:
Expand Down Expand Up @@ -1269,7 +1276,9 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
output[i] = sgb.transform(wrapper)
except TypeError:
# e.g. trying to call nanmean with string values
warn_dropping_nuisance_columns_deprecated(type(self), "transform")
warn_dropping_nuisance_columns_deprecated(
type(self), "transform", numeric_only=False
)
else:
inds.append(i)

Expand Down Expand Up @@ -1559,53 +1568,87 @@ def nunique(self, dropna: bool = True) -> DataFrame:
_shared_docs["idxmax"],
numeric_only_default="True for axis=0, False for axis=1",
)
def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
def idxmax(
self,
axis=0,
skipna: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
):
axis = DataFrame._get_axis_number(axis)
if numeric_only is None:
numeric_only = None if axis == 0 else False
if numeric_only is lib.no_default:
# Cannot use self._resolve_numeric_only; we must pass None to
# DataFrame.idxmax for backwards compatibility
numeric_only_arg = None if axis == 0 else False
else:
numeric_only_arg = cast(bool, numeric_only)

def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
res = df._reduce(
nanops.nanargmax,
"argmax",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmax"
return self._python_apply_general(func, self._obj_with_exclusions)
result = self._python_apply_general(func, self._obj_with_exclusions)
if (
self._obj_with_exclusions.ndim > 1
and result.ndim > 1
and len(result.columns) < len(self._obj_with_exclusions.columns)
):
warn_dropping_nuisance_columns_deprecated(
type(self), "idxmax", numeric_only
)
return result

@doc(
_shared_docs["idxmin"],
numeric_only_default="True for axis=0, False for axis=1",
)
def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
def idxmin(
self,
axis=0,
skipna: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
):
axis = DataFrame._get_axis_number(axis)
if numeric_only is None:
numeric_only = None if axis == 0 else False
if numeric_only is lib.no_default:
# Cannot use self._resolve_numeric_only; we must pass None to
# DataFrame.idxmin for backwards compatibility
numeric_only_arg = None if axis == 0 else False
else:
numeric_only_arg = cast(bool, numeric_only)

def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#46560
res = df._reduce(
nanops.nanargmin,
"argmin",
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
numeric_only=numeric_only_arg,
)
indices = res._values
index = df._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return df._constructor_sliced(result, index=res.index)

func.__name__ = "idxmin"
return self._python_apply_general(func, self._obj_with_exclusions)
result = self._python_apply_general(func, self._obj_with_exclusions)
if (
self._obj_with_exclusions.ndim != 1
and result.ndim > 1
and len(result.columns) < len(self._obj_with_exclusions.columns)
):
warn_dropping_nuisance_columns_deprecated(
type(self), "idxmin", numeric_only
)
return result

boxplot = boxplot_frame_groupby

Expand Down
Loading