Skip to content

Commit ea9ec2b

Browse files
rhshadrach authored and yehoshuadimarsky committed
DEPR: groupby numeric_only default (pandas-dev#47025)
1 parent d99bce5 commit ea9ec2b

22 files changed

+544
-168
lines changed

doc/source/whatsnew/v1.5.0.rst

+4 -1
Original file line number | Diff line number | Diff line change
@@ -493,7 +493,8 @@ retained by specifying ``group_keys=False``.
493493
``numeric_only`` default value
494494
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
495495

496-
Across the DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default
496+
Across the DataFrame and DataFrameGroupBy operations such as
497+
``min``, ``sum``, and ``idxmax``, the default
497498
value of the ``numeric_only`` argument, if it exists at all, was inconsistent.
498499
Furthermore, operations with the default value ``None`` can lead to surprising
499500
results. (:issue:`46560`)
@@ -523,6 +524,8 @@ gained the ``numeric_only`` argument.
523524
- :meth:`DataFrame.cov`
524525
- :meth:`DataFrame.idxmin`
525526
- :meth:`DataFrame.idxmax`
527+
- :meth:`.DataFrameGroupBy.cummin`
528+
- :meth:`.DataFrameGroupBy.cummax`
526529
- :meth:`.DataFrameGroupBy.idxmin`
527530
- :meth:`.DataFrameGroupBy.idxmax`
528531
- :meth:`.GroupBy.var`

pandas/core/groupby/generic.py

+46 -17
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828

2929
from pandas._libs import (
3030
Interval,
31+
lib,
3132
reduction as libreduction,
3233
)
3334
from pandas._typing import (
@@ -1128,18 +1129,24 @@ def _wrap_applied_output_series(
11281129
return self._reindex_output(result)
11291130

11301131
def _cython_transform(
1131-
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
1132+
self,
1133+
how: str,
1134+
numeric_only: bool | lib.NoDefault = lib.no_default,
1135+
axis: int = 0,
1136+
**kwargs,
11321137
) -> DataFrame:
11331138
assert axis == 0 # handled by caller
11341139
# TODO: no tests with self.ndim == 1 for DataFrameGroupBy
1140+
numeric_only_bool = self._resolve_numeric_only(numeric_only, axis)
11351141

11361142
# With self.axis == 0, we have multi-block tests
11371143
# e.g. test_rank_min_int, test_cython_transform_frame
11381144
# test_transform_numeric_ret
11391145
# With self.axis == 1, _get_data_to_aggregate does a transpose
11401146
# so we always have a single block.
11411147
mgr: Manager2D = self._get_data_to_aggregate()
1142-
if numeric_only:
1148+
orig_mgr_len = len(mgr)
1149+
if numeric_only_bool:
11431150
mgr = mgr.get_numeric_data(copy=False)
11441151

11451152
def arr_func(bvalues: ArrayLike) -> ArrayLike:
@@ -1152,8 +1159,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
11521159
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
11531160
res_mgr.set_axis(1, mgr.axes[1])
11541161

1155-
if len(res_mgr) < len(mgr):
1156-
warn_dropping_nuisance_columns_deprecated(type(self), how)
1162+
if len(res_mgr) < orig_mgr_len:
1163+
warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
11571164

11581165
res_df = self.obj._constructor(res_mgr)
11591166
if self.axis == 1:
@@ -1269,7 +1276,9 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
12691276
output[i] = sgb.transform(wrapper)
12701277
except TypeError:
12711278
# e.g. trying to call nanmean with string values
1272-
warn_dropping_nuisance_columns_deprecated(type(self), "transform")
1279+
warn_dropping_nuisance_columns_deprecated(
1280+
type(self), "transform", numeric_only=False
1281+
)
12731282
else:
12741283
inds.append(i)
12751284

@@ -1559,53 +1568,73 @@ def nunique(self, dropna: bool = True) -> DataFrame:
15591568
_shared_docs["idxmax"],
15601569
numeric_only_default="True for axis=0, False for axis=1",
15611570
)
1562-
def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
1571+
def idxmax(
1572+
self,
1573+
axis=0,
1574+
skipna: bool = True,
1575+
numeric_only: bool | lib.NoDefault = lib.no_default,
1576+
):
15631577
axis = DataFrame._get_axis_number(axis)
1564-
if numeric_only is None:
1565-
numeric_only = None if axis == 0 else False
1578+
if numeric_only is lib.no_default:
1579+
# Cannot use self._resolve_numeric_only; we must pass None to
1580+
# DataFrame.idxmax for backwards compatibility
1581+
numeric_only_arg = None if axis == 0 else False
1582+
else:
1583+
numeric_only_arg = cast(bool, numeric_only)
15661584

15671585
def func(df):
1568-
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
15691586
res = df._reduce(
15701587
nanops.nanargmax,
15711588
"argmax",
15721589
axis=axis,
15731590
skipna=skipna,
1574-
numeric_only=numeric_only,
1591+
numeric_only=numeric_only_arg,
15751592
)
15761593
indices = res._values
15771594
index = df._get_axis(axis)
15781595
result = [index[i] if i >= 0 else np.nan for i in indices]
15791596
return df._constructor_sliced(result, index=res.index)
15801597

15811598
func.__name__ = "idxmax"
1582-
return self._python_apply_general(func, self._obj_with_exclusions)
1599+
result = self._python_apply_general(func, self._obj_with_exclusions)
1600+
self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only)
1601+
return result
15831602

15841603
@doc(
15851604
_shared_docs["idxmin"],
15861605
numeric_only_default="True for axis=0, False for axis=1",
15871606
)
1588-
def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
1607+
def idxmin(
1608+
self,
1609+
axis=0,
1610+
skipna: bool = True,
1611+
numeric_only: bool | lib.NoDefault = lib.no_default,
1612+
):
15891613
axis = DataFrame._get_axis_number(axis)
1590-
if numeric_only is None:
1591-
numeric_only = None if axis == 0 else False
1614+
if numeric_only is lib.no_default:
1615+
# Cannot use self._resolve_numeric_only; we must pass None to
1616+
# DataFrame.idxmin for backwards compatibility
1617+
numeric_only_arg = None if axis == 0 else False
1618+
else:
1619+
numeric_only_arg = cast(bool, numeric_only)
15921620

15931621
def func(df):
1594-
# NB: here we use numeric_only=None, in DataFrame it is False GH#46560
15951622
res = df._reduce(
15961623
nanops.nanargmin,
15971624
"argmin",
15981625
axis=axis,
15991626
skipna=skipna,
1600-
numeric_only=numeric_only,
1627+
numeric_only=numeric_only_arg,
16011628
)
16021629
indices = res._values
16031630
index = df._get_axis(axis)
16041631
result = [index[i] if i >= 0 else np.nan for i in indices]
16051632
return df._constructor_sliced(result, index=res.index)
16061633

16071634
func.__name__ = "idxmin"
1608-
return self._python_apply_general(func, self._obj_with_exclusions)
1635+
result = self._python_apply_general(func, self._obj_with_exclusions)
1636+
self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only)
1637+
return result
16091638

16101639
boxplot = boxplot_frame_groupby
16111640

0 commit comments

Comments
 (0)