Skip to content

Commit 82ccccd

Browse files
authored
REF: consolidate numeric_only checks in GroupBy (pandas-dev#51185)
1 parent 808e56e commit 82ccccd

File tree

5 files changed

+35 -48 lines changed

5 files changed

+35 -48 lines changed

pandas/core/groupby/generic.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
is_dict_like,
5959
is_integer_dtype,
6060
is_interval_dtype,
61+
is_numeric_dtype,
6162
is_scalar,
6263
)
6364
from pandas.core.dtypes.missing import (
@@ -172,9 +173,18 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series:
172173
# NB: caller is responsible for setting ser.index
173174
return ser
174175

175-
def _get_data_to_aggregate(self) -> SingleManager:
176+
def _get_data_to_aggregate(
177+
self, *, numeric_only: bool = False, name: str | None = None
178+
) -> SingleManager:
176179
ser = self._selected_obj
177180
single = ser._mgr
181+
if numeric_only and not is_numeric_dtype(ser.dtype):
182+
# GH#41291 match Series behavior
183+
kwd_name = "numeric_only"
184+
raise TypeError(
185+
f"Cannot use {kwd_name}=True with "
186+
f"{type(self).__name__}.{name} and non-numeric dtypes."
187+
)
178188
return single
179189

180190
def _iterate_slices(self) -> Iterable[Series]:
@@ -1542,9 +1552,9 @@ def _cython_transform(
15421552
# test_transform_numeric_ret
15431553
# With self.axis == 1, _get_data_to_aggregate does a transpose
15441554
# so we always have a single block.
1545-
mgr: Manager2D = self._get_data_to_aggregate()
1546-
if numeric_only:
1547-
mgr = mgr.get_numeric_data(copy=False)
1555+
mgr: Manager2D = self._get_data_to_aggregate(
1556+
numeric_only=numeric_only, name=how
1557+
)
15481558

15491559
def arr_func(bvalues: ArrayLike) -> ArrayLike:
15501560
return self.grouper._cython_operation(
@@ -1864,12 +1874,18 @@ def _gotitem(self, key, ndim: int, subset=None):
18641874

18651875
raise AssertionError("invalid ndim for _gotitem")
18661876

1867-
def _get_data_to_aggregate(self) -> Manager2D:
1877+
def _get_data_to_aggregate(
1878+
self, *, numeric_only: bool = False, name: str | None = None
1879+
) -> Manager2D:
18681880
obj = self._obj_with_exclusions
18691881
if self.axis == 1:
1870-
return obj.T._mgr
1882+
mgr = obj.T._mgr
18711883
else:
1872-
return obj._mgr
1884+
mgr = obj._mgr
1885+
1886+
if numeric_only:
1887+
mgr = mgr.get_numeric_data(copy=False)
1888+
return mgr
18731889

18741890
def _indexed_output_to_ndframe(
18751891
self, output: Mapping[base.OutputKey, ArrayLike]

pandas/core/groupby/groupby.py

Lines changed: 3 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1540,22 +1540,9 @@ def _cython_agg_general(
15401540
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
15411541
# that goes through SeriesGroupBy
15421542

1543-
data = self._get_data_to_aggregate()
1543+
data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
15441544
is_ser = data.ndim == 1
15451545

1546-
if numeric_only:
1547-
if is_ser and not is_numeric_dtype(self._selected_obj.dtype):
1548-
# GH#41291 match Series behavior
1549-
kwd_name = "numeric_only"
1550-
if how in ["any", "all"]:
1551-
kwd_name = "bool_only"
1552-
raise TypeError(
1553-
f"Cannot use {kwd_name}={numeric_only} with "
1554-
f"{type(self).__name__}.{how} and non-numeric types."
1555-
)
1556-
if not is_ser:
1557-
data = data.get_numeric_data(copy=False)
1558-
15591546
def array_func(values: ArrayLike) -> ArrayLike:
15601547
try:
15611548
result = self.grouper._cython_operation(
@@ -2034,15 +2021,6 @@ def std(
20342021

20352022
return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
20362023
else:
2037-
if (
2038-
numeric_only
2039-
and self.obj.ndim == 1
2040-
and not is_numeric_dtype(self.obj.dtype)
2041-
):
2042-
raise TypeError(
2043-
f"{type(self).__name__}.std called with "
2044-
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
2045-
)
20462024

20472025
def _preprocessing(values):
20482026
if isinstance(values, BaseMaskedArray):
@@ -3114,11 +3092,6 @@ def quantile(
31143092
a 2.0
31153093
b 3.0
31163094
"""
3117-
if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
3118-
raise TypeError(
3119-
f"{type(self).__name__}.quantile called with "
3120-
f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
3121-
)
31223095

31233096
def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
31243097
if is_object_dtype(vals):
@@ -3258,8 +3231,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
32583231

32593232
obj = self._obj_with_exclusions
32603233
is_ser = obj.ndim == 1
3261-
mgr = self._get_data_to_aggregate()
3262-
data = mgr.get_numeric_data() if numeric_only else mgr
3234+
data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
32633235
res_mgr = data.grouped_reduce(blk_func)
32643236

32653237
if is_ser:
@@ -3716,10 +3688,7 @@ def blk_func(values: ArrayLike) -> ArrayLike:
37163688

37173689
# Operate block-wise instead of column-by-column
37183690
is_ser = obj.ndim == 1
3719-
mgr = self._get_data_to_aggregate()
3720-
3721-
if numeric_only:
3722-
mgr = mgr.get_numeric_data()
3691+
mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
37233692

37243693
res_mgr = mgr.grouped_reduce(blk_func)
37253694

pandas/tests/groupby/aggregate/test_cython.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ def test_cython_agg_boolean():
9292
def test_cython_agg_nothing_to_agg():
9393
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
9494

95-
with pytest.raises(TypeError, match="Cannot use numeric_only=True"):
95+
msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
96+
with pytest.raises(TypeError, match=msg):
9697
frame.groupby("a")["b"].mean(numeric_only=True)
9798

9899
with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
@@ -117,7 +118,8 @@ def test_cython_agg_nothing_to_agg_with_dates():
117118
"dates": pd.date_range("now", periods=50, freq="T"),
118119
}
119120
)
120-
with pytest.raises(TypeError, match="Cannot use numeric_only=True"):
121+
msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
122+
with pytest.raises(TypeError, match=msg):
121123
frame.groupby("b").dates.mean(numeric_only=True)
122124

123125

pandas/tests/groupby/test_function.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1555,11 +1555,10 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
15551555
elif dtype is object:
15561556
msg = "|".join(
15571557
[
1558-
"Cannot use numeric_only=True",
1559-
"called with numeric_only=True and dtype object",
1558+
"SeriesGroupBy.sem called with numeric_only=True and dtype object",
15601559
"Series.skew does not allow numeric_only=True with non-numeric",
1561-
"got an unexpected keyword argument 'numeric_only'",
1562-
"is not supported for object dtype",
1560+
"cum(sum|prod|min|max) is not supported for object dtype",
1561+
r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
15631562
]
15641563
)
15651564
with pytest.raises(TypeError, match=msg):

pandas/tests/resample/test_resample_api.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,8 @@ def test_series_downsample_method(method, numeric_only, expected_data):
908908

909909
func = getattr(resampled, method)
910910
if numeric_only and numeric_only is not lib.no_default:
911-
with pytest.raises(TypeError, match="Cannot use numeric_only=True"):
911+
msg = rf"Cannot use numeric_only=True with SeriesGroupBy\.{method}"
912+
with pytest.raises(TypeError, match=msg):
912913
func(**kwargs)
913914
elif method == "prod":
914915
with pytest.raises(TypeError, match="can't multiply sequence by non-int"):

0 commit comments

Comments
 (0)