diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 815f9936057f4..77bc30d5512a6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -58,6 +58,7 @@ is_dict_like, is_integer_dtype, is_interval_dtype, + is_numeric_dtype, is_scalar, ) from pandas.core.dtypes.missing import ( @@ -172,9 +173,18 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series: # NB: caller is responsible for setting ser.index return ser - def _get_data_to_aggregate(self) -> SingleManager: + def _get_data_to_aggregate( + self, *, numeric_only: bool = False, name: str | None = None + ) -> SingleManager: ser = self._selected_obj single = ser._mgr + if numeric_only and not is_numeric_dtype(ser.dtype): + # GH#41291 match Series behavior + kwd_name = "numeric_only" + raise TypeError( + f"Cannot use {kwd_name}=True with " + f"{type(self).__name__}.{name} and non-numeric dtypes." + ) return single def _iterate_slices(self) -> Iterable[Series]: @@ -1542,9 +1552,9 @@ def _cython_transform( # test_transform_numeric_ret # With self.axis == 1, _get_data_to_aggregate does a transpose # so we always have a single block. - mgr: Manager2D = self._get_data_to_aggregate() - if numeric_only: - mgr = mgr.get_numeric_data(copy=False) + mgr: Manager2D = self._get_data_to_aggregate( + numeric_only=numeric_only, name=how + ) def arr_func(bvalues: ArrayLike) -> ArrayLike: return self.grouper._cython_operation( @@ -1864,12 +1874,18 @@ def _gotitem(self, key, ndim: int, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _get_data_to_aggregate(self) -> Manager2D: + def _get_data_to_aggregate( + self, *, numeric_only: bool = False, name: str | None = None + ) -> Manager2D: obj = self._obj_with_exclusions if self.axis == 1: - return obj.T._mgr + mgr = obj.T._mgr else: - return obj._mgr + mgr = obj._mgr + + if numeric_only: + mgr = mgr.get_numeric_data(copy=False) + return mgr def _indexed_output_to_ndframe( self, output: Mapping[base.OutputKey, ArrayLike] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fd9a06a06cfa7..3106ae1ec701b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1540,22 +1540,9 @@ def _cython_agg_general( # Note: we never get here with how="ohlc" for DataFrameGroupBy; # that goes through SeriesGroupBy - data = self._get_data_to_aggregate() + data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) is_ser = data.ndim == 1 - if numeric_only: - if is_ser and not is_numeric_dtype(self._selected_obj.dtype): - # GH#41291 match Series behavior - kwd_name = "numeric_only" - if how in ["any", "all"]: - kwd_name = "bool_only" - raise TypeError( - f"Cannot use {kwd_name}={numeric_only} with " - f"{type(self).__name__}.{how} and non-numeric types." - ) - if not is_ser: - data = data.get_numeric_data(copy=False) - def array_func(values: ArrayLike) -> ArrayLike: try: result = self.grouper._cython_operation( @@ -2034,15 +2021,6 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - if ( - numeric_only - and self.obj.ndim == 1 - and not is_numeric_dtype(self.obj.dtype) - ): - raise TypeError( - f"{type(self).__name__}.std called with " - f"numeric_only={numeric_only} and dtype {self.obj.dtype}" - ) def _preprocessing(values): if isinstance(values, BaseMaskedArray): @@ -3114,11 +3092,6 @@ def quantile( a 2.0 b 3.0 """ - if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): - raise TypeError( - f"{type(self).__name__}.quantile called with " - f"numeric_only={numeric_only} and dtype {self.obj.dtype}" - ) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: if is_object_dtype(vals): @@ -3258,8 +3231,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: obj = self._obj_with_exclusions is_ser = obj.ndim == 1 - mgr = self._get_data_to_aggregate() - data = mgr.get_numeric_data() if numeric_only else mgr + data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") res_mgr = data.grouped_reduce(blk_func) if is_ser: @@ -3716,10 +3688,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: # Operate block-wise instead of column-by-column is_ser = obj.ndim == 1 - mgr = self._get_data_to_aggregate() - - if numeric_only: - mgr = mgr.get_numeric_data() + mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how) res_mgr = mgr.grouped_reduce(blk_func) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 7e7d3d682f20f..d3f9dd31e9fa1 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -92,7 +92,8 @@ def test_cython_agg_boolean(): def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - with pytest.raises(TypeError, match="Cannot use numeric_only=True"): + msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" + with pytest.raises(TypeError, match=msg): frame.groupby("a")["b"].mean(numeric_only=True) with pytest.raises(TypeError, match="Could not convert (foo|bar)*"): @@ -117,7 +118,8 @@ def test_cython_agg_nothing_to_agg_with_dates(): "dates": pd.date_range("now", periods=50, freq="T"), } ) - with pytest.raises(TypeError, match="Cannot use numeric_only=True"): + msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" + with pytest.raises(TypeError, match=msg): frame.groupby("b").dates.mean(numeric_only=True) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1fcbc7c305a06..1fd61e6eb268e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1555,11 +1555,10 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): elif dtype is object: msg = "|".join( [ - "Cannot use numeric_only=True", - "called with numeric_only=True and dtype object", + "SeriesGroupBy.sem called with numeric_only=True and dtype object", "Series.skew does not allow numeric_only=True with non-numeric", - "got an unexpected keyword argument 'numeric_only'", - "is not supported for object dtype", + "cum(sum|prod|min|max) is not supported for object dtype", + r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 0f18c5c5774b7..e6e924793389d 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -908,7 +908,8 @@ def test_series_downsample_method(method, numeric_only, expected_data): func = getattr(resampled, method) if numeric_only and numeric_only is not lib.no_default: - with pytest.raises(TypeError, match="Cannot use numeric_only=True"): + msg = rf"Cannot use numeric_only=True with SeriesGroupBy\.{method}" + with pytest.raises(TypeError, match=msg): func(**kwargs) elif method == "prod": with pytest.raises(TypeError, match="can't multiply sequence by non-int"):