From 659f4651acf5609f806309987afcc1dd8dd29b85 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 24 Nov 2022 11:58:42 +0000 Subject: [PATCH 1/2] DEPR: Change numeric_only default to False in remaining groupby methods --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/groupby/generic.py | 31 +-- pandas/core/groupby/groupby.py | 227 +++--------------- pandas/core/internals/array_manager.py | 17 +- .../tests/groupby/aggregate/test_aggregate.py | 7 +- pandas/tests/groupby/aggregate/test_cython.py | 10 +- pandas/tests/groupby/aggregate/test_other.py | 3 +- pandas/tests/groupby/test_categorical.py | 3 + pandas/tests/groupby/test_function.py | 135 ++++------- pandas/tests/groupby/test_groupby.py | 7 +- pandas/tests/groupby/test_quantile.py | 20 +- .../tests/groupby/transform/test_transform.py | 67 +++--- pandas/tests/resample/test_resample_api.py | 12 +- 13 files changed, 152 insertions(+), 389 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 42170aaa09978..9abd08004edaa 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -572,7 +572,7 @@ Removal of prior version deprecations/changes - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) - Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`) -- Changed default of ``numeric_only`` to ``False`` in various :class:`.DataFrameGroupBy` methods (:issue:`46072`) +- Changed default of ``numeric_only`` in various :class:`.DataFrameGroupBy` methods; all methods now default to ``numeric_only=False`` (:issue:`46072`) - Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`) - diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7e9163b87cee6..a80892a145a70 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -87,7 +87,6 @@ _agg_template, _apply_docs, _transform_template, - warn_dropping_nuisance_columns_deprecated, ) from pandas.core.groupby.grouper import get_grouper from pandas.core.indexes.api import ( @@ -438,7 +437,7 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): ) def _cython_transform( - self, how: str, numeric_only: bool = True, axis: AxisInt = 0, **kwargs + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs ): assert axis == 0 # handled by caller @@ -1333,13 +1332,12 @@ def _wrap_applied_output_series( def _cython_transform( self, how: str, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, axis: AxisInt = 0, **kwargs, ) -> DataFrame: assert axis == 0 # handled by caller # TODO: no tests with self.ndim == 1 for DataFrameGroupBy - numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis) # With self.axis == 0, we have multi-block tests # e.g. test_rank_min_int, test_cython_transform_frame @@ -1347,8 +1345,7 @@ def _cython_transform( # With self.axis == 1, _get_data_to_aggregate does a transpose # so we always have a single block. mgr: Manager2D = self._get_data_to_aggregate() - orig_mgr_len = len(mgr) - if numeric_only_bool: + if numeric_only: mgr = mgr.get_numeric_data(copy=False) def arr_func(bvalues: ArrayLike) -> ArrayLike: @@ -1358,12 +1355,9 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: # We could use `mgr.apply` here and not have to set_axis, but # we would have to do shape gymnastics for ArrayManager compat - res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=False) + res_mgr = mgr.grouped_reduce(arr_func) res_mgr.set_axis(1, mgr.axes[1]) - if len(res_mgr) < orig_mgr_len: - warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) - res_df = self.obj._constructor(res_mgr) if self.axis == 1: res_df = res_df.T @@ -1493,15 +1487,8 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output = {} inds = [] for i, (colname, sgb) in enumerate(self._iterate_column_groupbys(obj)): - try: - output[i] = sgb.transform(wrapper) - except TypeError: - # e.g. trying to call nanmean with string values - warn_dropping_nuisance_columns_deprecated( - type(self), "transform", numeric_only=False - ) - else: - inds.append(i) + output[i] = sgb.transform(wrapper) + inds.append(i) if not output: raise TypeError("Transform function invalid for data types") @@ -2243,7 +2230,7 @@ def corr( self, method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", min_periods: int = 1, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: result = self._op_via_apply( "corr", method=method, min_periods=min_periods, numeric_only=numeric_only @@ -2255,7 +2242,7 @@ def cov( self, min_periods: int | None = None, ddof: int | None = 1, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: result = self._op_via_apply( "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only @@ -2316,7 +2303,7 @@ def corrwith( axis: Axis = 0, drop: bool = False, method: CorrelationMethod = "pearson", - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: result = self._op_via_apply( "corrwith", diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b3f6bb3edb9da..497e0ef724373 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1007,15 +1007,8 @@ def _op_via_apply(self, name: str, *args, **kwargs): if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default: kwargs["axis"] = self.axis - numeric_only = kwargs.get("numeric_only", lib.no_default) - def curried(x): - with warnings.catch_warnings(): - # Catch any warnings from dispatch to DataFrame; we'll emit - # a warning for groupby below - match = "The default value of numeric_only " - warnings.filterwarnings("ignore", match, FutureWarning) - return f(x, *args, **kwargs) + return f(x, *args, **kwargs) # preserve the name so we can detect it when calling plot methods, # to avoid duplicates @@ -1037,13 +1030,6 @@ def curried(x): not_indexed_same=not is_transform, ) - if self._selected_obj.ndim != 1 and self.axis != 1 and result.ndim != 1: - missing = self._obj_with_exclusions.columns.difference(result.columns) - if len(missing) > 0: - warn_dropping_nuisance_columns_deprecated( - type(self), name, numeric_only - ) - if self.grouper.has_dropped_na and is_transform: # result will have dropped rows due to nans, fill with null # and ensure index is ordered same as the input @@ -1308,80 +1294,6 @@ def _wrap_applied_output( ): raise AbstractMethodError(self) - def _resolve_numeric_only( - self, how: str, numeric_only: bool | lib.NoDefault, axis: AxisInt - ) -> bool: - """ - Determine subclass-specific default value for 'numeric_only'. - - For SeriesGroupBy we want the default to be False (to match Series behavior). - For DataFrameGroupBy we want it to be True (for backwards-compat). - - Parameters - ---------- - numeric_only : bool or lib.no_default - axis : int - Axis passed to the groupby op (not self.axis). - - Returns - ------- - bool - """ - # GH#41291 - if numeric_only is lib.no_default: - # i.e. not explicitly passed by user - if self.obj.ndim == 2: - # i.e. DataFrameGroupBy - numeric_only = axis != 1 - # GH#42395 GH#43108 GH#43154 - # Regression from 1.2.5 to 1.3 caused object columns to be dropped - if self.axis: - obj = self._obj_with_exclusions.T - else: - obj = self._obj_with_exclusions - check = obj._get_numeric_data() - if len(obj.columns) and not len(check.columns) and not obj.empty: - numeric_only = False - - else: - numeric_only = False - - if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): - # GH#47500 - warnings.warn( - f"{type(self).__name__}.{how} called with " - f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will " - "raise a TypeError in a future version of pandas", - category=FutureWarning, - stacklevel=find_stack_level(), - ) - raise NotImplementedError( - f"{type(self).__name__}.{how} does not implement numeric_only" - ) - - return numeric_only - - def _maybe_warn_numeric_only_depr( - self, how: str, result: DataFrame | Series, numeric_only: bool | lib.NoDefault - ) -> None: - """Emit warning on numeric_only behavior deprecation when appropriate. - - Parameters - ---------- - how : str - Groupby kernel name. - result : - Result of the groupby operation. - numeric_only : bool or lib.no_default - Argument as passed by user. - """ - if ( - self._obj_with_exclusions.ndim != 1 - and result.ndim > 1 - and len(result.columns) < len(self._obj_with_exclusions.columns) - ): - warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) - # ----------------------------------------------------------------- # numba @@ -1606,9 +1518,7 @@ def _python_apply_general( ) @final - def _python_agg_general( - self, func, *args, raise_on_typeerror: bool = False, **kwargs - ): + def _python_agg_general(self, func, *args, **kwargs): func = com.is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1621,18 +1531,7 @@ def _python_agg_general( for idx, obj in enumerate(self._iterate_slices()): name = obj.name - - try: - # if this function is invalid for this dtype, we will ignore it. - result = self.grouper.agg_series(obj, f) - except TypeError: - if raise_on_typeerror: - raise - warn_dropping_nuisance_columns_deprecated( - type(self), "agg", numeric_only=False - ) - continue - + result = self.grouper.agg_series(obj, f) key = base.OutputKey(label=name, position=idx) output[key] = result @@ -1644,7 +1543,7 @@ def _python_agg_general( @final def _agg_general( self, - numeric_only: bool | lib.NoDefault = True, + numeric_only: bool = False, min_count: int = -1, *, alias: str, @@ -1706,26 +1605,25 @@ def _cython_agg_general( self, how: str, alt: Callable, - numeric_only: bool | lib.NoDefault, + numeric_only: bool = False, min_count: int = -1, **kwargs, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; # that goes through SeriesGroupBy - numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0) data = self._get_data_to_aggregate() is_ser = data.ndim == 1 - orig_len = len(data) - if numeric_only_bool: + if numeric_only: if is_ser and not is_numeric_dtype(self._selected_obj.dtype): # GH#41291 match Series behavior kwd_name = "numeric_only" if how in ["any", "all"]: kwd_name = "bool_only" - raise NotImplementedError( - f"{type(self).__name__}.{how} does not implement {kwd_name}." + raise TypeError( + f"Cannot use {kwd_name}={numeric_only} with " + f"{type(self).__name__}.{how} and non-numeric types." ) if not is_ser: data = data.get_numeric_data(copy=False) @@ -1751,10 +1649,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block - new_mgr = data.grouped_reduce(array_func, ignore_failures=False) - - if not is_ser and len(new_mgr) < orig_len: - warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) + new_mgr = data.grouped_reduce(array_func) res = self._wrap_agged_manager(new_mgr) if is_ser: @@ -1764,7 +1659,7 @@ def array_func(values: ArrayLike) -> ArrayLike: return res def _cython_transform( - self, how: str, numeric_only: bool = True, axis: AxisInt = 0, **kwargs + self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs ): raise AbstractMethodError(self) @@ -2144,23 +2039,21 @@ def median(self, numeric_only: bool = False): Parameters ---------- - numeric_only : bool, default True + numeric_only : bool, default False Include only float, int, boolean columns. .. versionchanged:: 2.0.0 - numeric_only no longer accepts ``None``. + numeric_only no longer accepts ``None`` and defaults to False. Returns ------- Series or DataFrame Median of values within each group. """ - numeric_only_bool = self._resolve_numeric_only("median", numeric_only, axis=0) - result = self._cython_agg_general( "median", - alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), + alt=lambda x: Series(x).median(numeric_only=numeric_only), numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2221,10 +2114,8 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - # Resolve numeric_only so that var doesn't warn - numeric_only_bool = self._resolve_numeric_only("std", numeric_only, axis=0) if ( - numeric_only_bool + numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype) ): @@ -2235,7 +2126,7 @@ def std( result = self._get_cythonized_result( libgroupby.group_var, cython_dtype=np.dtype(np.float64), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, needs_counts=True, post_processing=lambda vals, inference: np.sqrt(vals), ddof=ddof, @@ -2319,7 +2210,7 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): ddof : int, default 1 Degrees of freedom. - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 @@ -2333,14 +2224,12 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): Series or DataFrame Standard error of the mean of values within each group. """ - # Reolve numeric_only so that std doesn't warn if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): raise TypeError( f"{type(self).__name__}.sem called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) result = self.std(ddof=ddof, numeric_only=numeric_only) - self._maybe_warn_numeric_only_depr("sem", result, numeric_only) if result.ndim == 1: result /= np.sqrt(self.count()) @@ -3107,7 +2996,7 @@ def quantile( self, q: float | AnyArrayLike = 0.5, interpolation: str = "linear", - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ): """ Return group values at the given quantile, a la numpy.percentile. @@ -3118,11 +3007,15 @@ def quantile( Value(s) between 0 and 1 providing the quantile(s) to compute. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} Method to use when the desired quantile falls between two points. - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + Returns ------- Series or DataFrame @@ -3146,12 +3039,7 @@ def quantile( a 2.0 b 3.0 """ - numeric_only_bool = self._resolve_numeric_only("quantile", numeric_only, axis=0) - if ( - numeric_only_bool - and self.obj.ndim == 1 - and not is_numeric_dtype(self.obj.dtype) - ): + if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): raise TypeError( f"{type(self).__name__}.quantile called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}" @@ -3296,25 +3184,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: obj = self._obj_with_exclusions is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() - data = mgr.get_numeric_data() if numeric_only_bool else mgr - res_mgr = data.grouped_reduce(blk_func, ignore_failures=False) - - if ( - numeric_only is lib.no_default - and not is_ser - and len(res_mgr.items) != len(mgr.items) - ): - warn_dropping_nuisance_columns_deprecated( - type(self), "quantile", numeric_only - ) - - if len(res_mgr.items) == 0: - # re-call grouped_reduce to get the desired exception message - mgr.grouped_reduce(blk_func, ignore_failures=False) - # grouped_reduce _should_ raise, so this should not be reached - raise TypeError( # pragma: no cover - "All columns were dropped in grouped_reduce" - ) + data = mgr.get_numeric_data() if numeric_only else mgr + res_mgr = data.grouped_reduce(blk_func) if is_ser: res = self._wrap_agged_manager(res_mgr) @@ -3613,9 +3484,8 @@ def cummin( skipna = kwargs.get("skipna", True) if axis != 0: f = lambda x: np.minimum.accumulate(x, axis) - numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) obj = self._selected_obj - if numeric_only_bool: + if numeric_only: obj = obj._get_numeric_data() return self._python_apply_general(f, obj, is_transform=True) @@ -3639,9 +3509,8 @@ def cummax( skipna = kwargs.get("skipna", True) if axis != 0: f = lambda x: np.maximum.accumulate(x, axis) - numeric_only_bool = self._resolve_numeric_only("cummax", numeric_only, axis) obj = self._selected_obj - if numeric_only_bool: + if numeric_only: obj = obj._get_numeric_data() return self._python_apply_general(f, obj, is_transform=True) @@ -3654,7 +3523,7 @@ def _get_cythonized_result( self, base_func: Callable, cython_dtype: np.dtype, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, needs_counts: bool = False, needs_nullable: bool = False, needs_mask: bool = False, @@ -3670,7 +3539,7 @@ def _get_cythonized_result( base_func : callable, Cythonized function to be called cython_dtype : np.dtype Type of the array that will be modified by the Cython call. - numeric_only : bool, default True + numeric_only : bool, default False Whether only numeric datatypes should be computed needs_counts : bool, default False Whether the counts should be a part of the Cython call @@ -3701,9 +3570,6 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ - how = base_func.__name__ - numeric_only_bool = self._resolve_numeric_only(how, numeric_only, axis=0) - if post_processing and not callable(post_processing): raise ValueError("'post_processing' must be a callable!") if pre_processing and not callable(pre_processing): @@ -3772,18 +3638,15 @@ def blk_func(values: ArrayLike) -> ArrayLike: mgr = self._get_data_to_aggregate() orig_mgr_len = len(mgr) - if numeric_only_bool: + if numeric_only: mgr = mgr.get_numeric_data() - res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=False) + res_mgr = mgr.grouped_reduce(blk_func) if not is_ser and len(res_mgr.items) != orig_mgr_len: - howstr = how.replace("group_", "") - warn_dropping_nuisance_columns_deprecated(type(self), howstr, numeric_only) - if len(res_mgr.items) == 0: # We re-call grouped_reduce to get the right exception message - mgr.grouped_reduce(blk_func, ignore_failures=False) + mgr.grouped_reduce(blk_func) # grouped_reduce _should_ raise, so this should not be reached raise TypeError( # pragma: no cover "All columns were dropped in grouped_reduce" @@ -4331,27 +4194,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde else: mi = MultiIndex.from_product([idx, qs]) return mi - - -def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None: - if numeric_only is not lib.no_default and not numeric_only: - # numeric_only was specified and falsey but still dropped nuisance columns - warnings.warn( - "Dropping invalid columns in " - f"{cls.__name__}.{how} is deprecated. " - "In a future version, a TypeError will be raised. " - f"Before calling .{how}, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) - elif numeric_only is lib.no_default: - warnings.warn( - "The default value of numeric_only in " - f"{cls.__name__}.{how} is deprecated. " - "In a future version, numeric_only will default to False. " - f"Either specify numeric_only or select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 37ae9d103c8b5..efb448eaa922a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -923,15 +923,13 @@ def idelete(self, indexer) -> ArrayManager: # -------------------------------------------------------------------- # Array-wise Operation - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + def grouped_reduce(self: T, func: Callable) -> T: """ Apply grouped reduction function columnwise, returning a new ArrayManager. Parameters ---------- func : grouped reduction function - ignore_failures : bool, default False - Whether to drop columns where func raises TypeError. Returns ------- @@ -943,13 +941,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: for i, arr in enumerate(self.arrays): # grouped_reduce functions all expect 2D arrays arr = ensure_block_shape(arr, ndim=2) - try: - res = func(arr) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - + res = func(arr) if res.ndim == 2: # reverse of ensure_block_shape assert res.shape[0] == 1 @@ -963,10 +955,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: else: index = Index(range(result_arrays[0].shape[0])) - if ignore_failures: - columns = self.items[np.array(result_indices, dtype="int64")] - else: - columns = self.items + columns = self.items # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; # expected "List[Union[ndarray, ExtensionArray]]" diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2d3ff95504371..03b917edd357b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -301,11 +301,12 @@ def test_wrap_agg_out(three_group): def func(ser): if ser.dtype == object: - raise TypeError + raise TypeError("Test error message") return ser.sum() - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): - result = grouped.aggregate(func) + with pytest.raises(TypeError, match="Test error message"): + grouped.aggregate(func) + result = grouped[[c for c in three_group if c != "C"]].aggregate(func) exp_grouped = three_group.loc[:, three_group.columns != "C"] expected = exp_grouped.groupby(["A", "B"]).aggregate(func) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index b8d2350cf6267..dc09a2e0ea6ad 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -92,9 +92,8 @@ def test_cython_agg_boolean(): def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"): - with pytest.raises(NotImplementedError, match="does not implement"): - frame.groupby("a")["b"].mean(numeric_only=True) + with pytest.raises(TypeError, match="Cannot use numeric_only=True"): + frame.groupby("a")["b"].mean(numeric_only=True) with pytest.raises(TypeError, match="Could not convert (foo|bar)*"): frame.groupby("a")["b"].mean() @@ -116,9 +115,8 @@ def test_cython_agg_nothing_to_agg_with_dates(): "dates": pd.date_range("now", periods=50, freq="T"), } ) - with tm.assert_produces_warning(FutureWarning, match="This will raise a TypeError"): - with pytest.raises(NotImplementedError, match="does not implement"): - frame.groupby("b").dates.mean(numeric_only=True) + with pytest.raises(TypeError, match="Cannot use numeric_only=True"): + frame.groupby("b").dates.mean(numeric_only=True) def test_cython_agg_frame_columns(): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 9aa58e919ce24..6a89c72354d04 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -293,8 +293,7 @@ def raiseException(df): raise TypeError("test") with pytest.raises(TypeError, match="test"): - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid"): - df.groupby(0).agg(raiseException) + df.groupby(0).agg(raiseException) def test_series_agg_multikey(): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5c250618bf3c4..b35c4158bf420 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1842,6 +1842,9 @@ def test_category_order_reducer( ): msg = "GH#10694 - idxmax/min fail with unused categories" request.node.add_marker(pytest.mark.xfail(reason=msg)) + elif reduction_func == "corrwith" and not as_index: + msg = "GH#49950 - corrwith with as_index=False may not have grouping column" + request.node.add_marker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: pytest.skip(reason="Result doesn't have categories, nothing to test") df = DataFrame( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 0f301e05dc898..ef39aabd83d22 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -263,7 +263,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # have no Python fallback exception = NotImplementedError if method.startswith("cum") else TypeError - if method in ("min", "max", "cummin", "cummax"): + if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): # The methods default to numeric_only=False and raise TypeError msg = "|".join( [ @@ -591,10 +591,8 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): method(*args, **kwargs) elif groupby_func not in has_axis: msg = "got an unexpected keyword argument 'axis'" - warn = FutureWarning if groupby_func == "skew" and not numeric_only else None - with tm.assert_produces_warning(warn, match="Dropping of nuisance columns"): - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) # fillna and shift are successful even on object dtypes elif (numeric_only is None or not numeric_only) and groupby_func not in ( "fillna", @@ -1374,46 +1372,44 @@ def test_groupby_sum_timedelta_with_nat(): @pytest.mark.parametrize( - "kernel, numeric_only_default, has_arg", + "kernel, has_arg", [ - ("all", False, False), - ("any", False, False), - ("bfill", False, False), - ("corr", True, True), - ("corrwith", True, True), - ("cov", True, True), - ("cummax", False, True), - ("cummin", False, True), - ("cumprod", True, True), - ("cumsum", True, True), - ("diff", False, False), - ("ffill", False, False), - ("fillna", False, False), - ("first", False, True), - ("idxmax", True, True), - ("idxmin", True, True), - ("last", False, True), - ("max", False, True), - ("mean", False, True), - ("median", False, True), - ("min", False, True), - ("nth", False, False), - ("nunique", False, False), - ("pct_change", False, False), - ("prod", False, True), - ("quantile", True, True), - ("sem", False, True), - ("skew", False, True), - ("std", False, True), - ("sum", False, True), - ("var", False, True), + ("all", False), + ("any", False), + ("bfill", False), + ("corr", True), + ("corrwith", True), + ("cov", True), + ("cummax", True), + ("cummin", True), + ("cumprod", True), + ("cumsum", True), + ("diff", False), + ("ffill", False), + ("fillna", False), + ("first", True), + ("idxmax", True), + ("idxmin", True), + ("last", True), + ("max", True), + ("mean", True), + ("median", True), + ("min", True), + ("nth", False), + ("nunique", False), + ("pct_change", False), + ("prod", True), + ("quantile", True), + ("sem", True), + ("skew", True), + ("std", True), + ("sum", True), + ("var", True), ], ) @pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) -def test_deprecate_numeric_only( - kernel, numeric_only_default, has_arg, numeric_only, keys -): +def test_numeric_only(kernel, has_arg, numeric_only, keys): # GH#46072 # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False # has_arg: Whether the op has a numeric_only arg @@ -1424,26 +1420,9 @@ def test_deprecate_numeric_only( gb = df.groupby(keys) method = getattr(gb, kernel) - if ( - has_arg - and (kernel not in ("idxmax", "idxmin") or numeric_only is True) - and ( - # Cases where b does not appear in the result - numeric_only is True - or (numeric_only is lib.no_default and numeric_only_default) - ) - ): - if numeric_only is True or not numeric_only_default: - warn = None - else: - warn = FutureWarning - if numeric_only is lib.no_default and numeric_only_default: - msg = f"The default value of numeric_only in DataFrameGroupBy.{kernel}" - else: - msg = f"Dropping invalid columns in DataFrameGroupBy.{kernel}" - with tm.assert_produces_warning(warn, match=msg): - result = method(*args, **kwargs) - + if has_arg and numeric_only is True: + # Cases where b does not appear in the result + result = method(*args, **kwargs) assert "b" not in result.columns elif ( # kernels that work on any dtype and have numeric_only arg @@ -1577,31 +1556,17 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): with pytest.raises(TypeError, match=msg): method(*args, numeric_only=True) elif dtype is object: - err_category = NotImplementedError - err_msg = f"{groupby_func} does not implement numeric_only" - if groupby_func.startswith("cum"): - # cum ops already exhibit future behavior - warn_category = None - warn_msg = "" - err_category = TypeError - err_msg = f"{groupby_func} is not supported for object dtype" - elif groupby_func == "skew": - warn_category = None - warn_msg = "" - err_category = TypeError - err_msg = "Series.skew does not allow numeric_only=True with non-numeric" - elif groupby_func == "sem": - warn_category = None - warn_msg = "" - err_category = TypeError - err_msg = "called with numeric_only=True and dtype object" - else: - warn_category = FutureWarning - warn_msg = "This will raise a TypeError" - - with tm.assert_produces_warning(warn_category, match=warn_msg): - with pytest.raises(err_category, match=err_msg): - method(*args, numeric_only=True) + msg = "|".join( + [ + "Cannot use numeric_only=True", + "called with numeric_only=True and dtype object", + "Series.skew does not allow numeric_only=True with non-numeric", + "got an unexpected keyword argument 'numeric_only'", + "is not supported for object dtype", + ] + ) + with pytest.raises(TypeError, match=msg): + method(*args, numeric_only=True) else: result = method(*args, numeric_only=True) expected = method(*args, numeric_only=False) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c35930ed43607..a7104c2e21049 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -996,12 +996,11 @@ def test_wrap_aggregated_output_multindex(mframe): def aggfun(ser): if ser.name == ("foo", "one"): - raise TypeError + raise TypeError("Test error message") return ser.sum() - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): - agged2 = df.groupby(keys).aggregate(aggfun) - assert len(agged2.columns) + 1 == len(df.columns) + with pytest.raises(TypeError, match="Test error message"): + df.groupby(keys).aggregate(aggfun) def test_groupby_level_apply(mframe): diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 5b0c0f671ae7c..56b9b35f1f688 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._libs import lib - import pandas as pd from pandas import ( DataFrame, @@ -160,10 +158,7 @@ def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): - with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid columns" - ): - df.groupby("key").quantile() + df.groupby("key").quantile() def test_quantile_out_of_bounds_q_raises(): @@ -242,16 +237,11 @@ def test_groupby_quantile_nullable_array(values, q): @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) -@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False]) -def test_groupby_quantile_skips_invalid_dtype(q, numeric_only): +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) - - if numeric_only is lib.no_default or numeric_only: - warn = FutureWarning if numeric_only is lib.no_default else None - msg = "The default value of numeric_only in DataFrameGroupBy.quantile" - with tm.assert_produces_warning(warn, match=msg): - result = df.groupby("a").quantile(q, numeric_only=numeric_only) - + if numeric_only: + result = df.groupby("a").quantile(q, numeric_only=numeric_only) expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) else: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 23005f291970b..8bdbc86d8659c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -20,7 +20,6 @@ date_range, ) import pandas._testing as tm -from pandas.core.groupby.generic import DataFrameGroupBy from pandas.tests.groupby import get_groupby_method_args @@ -409,31 +408,21 @@ def test_transform_select_columns(df): tm.assert_frame_equal(result, expected) -def test_transform_exclude_nuisance(df): +def test_transform_nuisance_raises(df): # case that goes through _transform_item_by_item df.columns = ["A", "B", "B", "D"] # this also tests orderings in transform between # series/frame to make sure it's consistent - expected = {} grouped = df.groupby("A") gbc = grouped["B"] - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): - expected["B"] = gbc.transform(lambda x: np.mean(x)) - # squeeze 1-column DataFrame down to Series - expected["B"] = expected["B"]["B"] - - assert isinstance(gbc.obj, DataFrame) - assert isinstance(gbc, DataFrameGroupBy) - - expected["D"] = grouped["D"].transform(np.mean) - expected = DataFrame(expected) - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): - result = df.groupby("A").transform(lambda x: np.mean(x)) + with pytest.raises(TypeError, match="Could not convert"): + gbc.transform(lambda x: np.mean(x)) - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Could not convert"): + df.groupby("A").transform(lambda x: np.mean(x)) def test_transform_function_aliases(df): @@ -519,10 +508,11 @@ def test_groupby_transform_with_int(): } ) with np.errstate(all="ignore"): - with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid columns" - ): - result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + with pytest.raises(TypeError, match="Could not convert"): + df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + result = df.groupby("A")[["B", "C"]].transform( + lambda x: (x - x.mean()) / x.std() + ) expected = DataFrame( {"B": np.nan, "C": Series([-1, 0, 1, -1, 0, 1], dtype="float64")} ) @@ -538,10 +528,11 @@ def test_groupby_transform_with_int(): } ) with np.errstate(all="ignore"): - with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid columns" - ): - result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + with pytest.raises(TypeError, match="Could not convert"): + df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + result = df.groupby("A")[["B", "C"]].transform( + lambda x: (x - x.mean()) / x.std() + ) expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]}) tm.assert_frame_equal(result, expected) @@ -549,10 +540,11 @@ def test_groupby_transform_with_int(): s = Series([2, 3, 4, 10, 5, -1]) df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"}) with np.errstate(all="ignore"): - with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid columns" - ): - result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + with pytest.raises(TypeError, match="Could not convert"): + df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + result = df.groupby("A")[["B", "C"]].transform( + lambda x: (x - x.mean()) / x.std() + ) s1 = s.iloc[0:3] s1 = (s1 - s1.mean()) / s1.std() @@ -562,8 +554,9 @@ def test_groupby_transform_with_int(): tm.assert_frame_equal(result, expected) # int doesn't get downcasted - with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): - result = df.groupby("A").transform(lambda x: x * 2 / 2) + with pytest.raises(TypeError, match="unsupported operand type"): + df.groupby("A").transform(lambda x: x * 2 / 2) + result = df.groupby("A")[["B", "C"]].transform(lambda x: x * 2 / 2) expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]}) tm.assert_frame_equal(result, expected) @@ -755,13 +748,15 @@ def test_cython_transform_frame(op, args, targop): expected = expected.sort_index(axis=1) - warn = None if op == "shift" else FutureWarning - msg = "The default value of numeric_only" - with tm.assert_produces_warning(warn, match=msg): - result = gb.transform(op, *args).sort_index(axis=1) + if op != "shift": + with pytest.raises(TypeError, match="datetime64 type does not support"): + gb.transform(op, *args).sort_index(axis=1) + result = gb[expected.columns].transform(op, *args).sort_index(axis=1) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(warn, match=msg): - result = getattr(gb, op)(*args).sort_index(axis=1) + if op != "shift": + with pytest.raises(TypeError, match="datetime64 type does not support"): + getattr(gb, op)(*args).sort_index(axis=1) + result = getattr(gb[expected.columns], op)(*args).sort_index(axis=1) tm.assert_frame_equal(result, expected) # individual columns for c in df: diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index e256b957699b7..5f1e0904b8c3c 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -903,18 +903,16 @@ def test_series_downsample_method(method, numeric_only, expected_data): expected_index = date_range("2018-12-31", periods=1, freq="Y") df = Series(["cat_1", "cat_2"], index=index) resampled = df.resample("Y") + kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} func = getattr(resampled, method) if numeric_only and numeric_only is not lib.no_default: - with tm.assert_produces_warning( - FutureWarning, match="This will raise a TypeError" - ): - with pytest.raises(NotImplementedError, match="not implement numeric_only"): - func(numeric_only=numeric_only) + with pytest.raises(TypeError, match="Cannot use numeric_only=True"): + func(**kwargs) elif method == "prod": with pytest.raises(TypeError, match="can't multiply sequence by non-int"): - func(numeric_only=numeric_only) + func(**kwargs) else: - result = func(numeric_only=numeric_only) + result = func(**kwargs) expected = Series(expected_data, index=expected_index) tm.assert_series_equal(result, expected) From 8b0c0397c575254fcd2e64ed811bd8ad464fe362 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 28 Nov 2022 20:44:06 -0500 Subject: [PATCH 2/2] More cleanup of ignore_failures --- pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/base.py | 7 +------ pandas/core/internals/managers.py | 27 ++++---------------------- 3 files changed, 6 insertions(+), 30 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index efb448eaa922a..feca755fd43db 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -213,7 +213,7 @@ def apply( ------- ArrayManager """ - assert "filter" not in kwargs and "ignore_failures" not in kwargs + assert "filter" not in kwargs align_keys = align_keys or [] result_arrays: list[np.ndarray] = [] diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 37aa60f1ee52d..8a0f2863d851f 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -189,12 +189,7 @@ def setitem_inplace(self, indexer, value) -> None: arr[indexer] = value - def grouped_reduce(self, func, ignore_failures: bool = False): - """ - ignore_failures : bool, default False - Not used; for compatibility with ArrayManager/BlockManager. - """ - + def grouped_reduce(self, func): arr = self.array res = func(arr) index = default_index(len(res)) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3eca3756e1678..20cc087adab23 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -320,7 +320,7 @@ def apply( ------- BlockManager """ - assert "filter" not in kwargs and "ignore_failures" not in kwargs + assert "filter" not in kwargs align_keys = align_keys or [] result_blocks: list[Block] = [] @@ -1466,44 +1466,29 @@ def idelete(self, indexer) -> BlockManager: # ---------------------------------------------------------------- # Block-wise Operation - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + def grouped_reduce(self: T, func: Callable) -> T: """ Apply grouped reduction function blockwise, returning a new BlockManager. Parameters ---------- func : grouped reduction function - ignore_failures : bool, default False - Whether to drop blocks where func raises TypeError. Returns ------- BlockManager """ result_blocks: list[Block] = [] - dropped_any = False for blk in self.blocks: if blk.is_object: # split on object-dtype blocks bc some columns may raise # while others do not. for sb in blk._split(): - try: - applied = sb.apply(func) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - dropped_any = True - continue + applied = sb.apply(func) result_blocks = extend_blocks(applied, result_blocks) else: - try: - applied = blk.apply(func) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - dropped_any = True - continue + applied = blk.apply(func) result_blocks = extend_blocks(applied, result_blocks) if len(result_blocks) == 0: @@ -1511,10 +1496,6 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: else: index = Index(range(result_blocks[0].values.shape[-1])) - if dropped_any: - # faster to skip _combine if we haven't dropped any blocks - return self._combine(result_blocks, copy=False, index=index) - return type(self).from_blocks(result_blocks, [self.axes[0], index]) def reduce(self: T, func: Callable) -> T: