diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 128fd68674f96..af30add139222 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -493,7 +493,8 @@ retained by specifying ``group_keys=False``.
 ``numeric_only`` default value
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Across the DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default
+Across the DataFrame and DataFrameGroupBy operations such as
+``min``, ``sum``, and ``idxmax``, the default
 value of the ``numeric_only`` argument, if it exists at all, was inconsistent.
 Furthermore, operations with the default value ``None`` can lead to surprising
 results. (:issue:`46560`)
@@ -523,6 +524,8 @@ gained the ``numeric_only`` argument.
 - :meth:`DataFrame.cov`
 - :meth:`DataFrame.idxmin`
 - :meth:`DataFrame.idxmax`
+- :meth:`.DataFrameGroupBy.cummin`
+- :meth:`.DataFrameGroupBy.cummax`
 - :meth:`.DataFrameGroupBy.idxmin`
 - :meth:`.DataFrameGroupBy.idxmax`
 - :meth:`.GroupBy.var`
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index f725ae061cedb..2acf5c826eb57 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -28,6 +28,7 @@
 
 from pandas._libs import (
     Interval,
+    lib,
     reduction as libreduction,
 )
 from pandas._typing import (
@@ -1128,10 +1129,15 @@ def _wrap_applied_output_series(
         return self._reindex_output(result)
 
     def _cython_transform(
-        self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
+        self,
+        how: str,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+        axis: int = 0,
+        **kwargs,
     ) -> DataFrame:
         assert axis == 0  # handled by caller
         # TODO: no tests with self.ndim == 1 for DataFrameGroupBy
+        numeric_only_bool = self._resolve_numeric_only(numeric_only, axis)
 
         # With self.axis == 0, we have multi-block tests
         # e.g. test_rank_min_int, test_cython_transform_frame
@@ -1139,7 +1145,8 @@ def _cython_transform(
         # With self.axis == 1, _get_data_to_aggregate does a transpose
         # so we always have a single block.
         mgr: Manager2D = self._get_data_to_aggregate()
-        if numeric_only:
+        orig_mgr_len = len(mgr)
+        if numeric_only_bool:
             mgr = mgr.get_numeric_data(copy=False)
 
         def arr_func(bvalues: ArrayLike) -> ArrayLike:
@@ -1152,8 +1159,8 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike:
         res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
         res_mgr.set_axis(1, mgr.axes[1])
 
-        if len(res_mgr) < len(mgr):
-            warn_dropping_nuisance_columns_deprecated(type(self), how)
+        if len(res_mgr) < orig_mgr_len:
+            warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only)
 
         res_df = self.obj._constructor(res_mgr)
         if self.axis == 1:
@@ -1269,7 +1276,9 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
                 output[i] = sgb.transform(wrapper)
             except TypeError:
                 # e.g. 
trying to call nanmean with string values - warn_dropping_nuisance_columns_deprecated(type(self), "transform") + warn_dropping_nuisance_columns_deprecated( + type(self), "transform", numeric_only=False + ) else: inds.append(i) @@ -1559,19 +1568,27 @@ def nunique(self, dropna: bool = True) -> DataFrame: _shared_docs["idxmax"], numeric_only_default="True for axis=0, False for axis=1", ) - def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None): + def idxmax( + self, + axis=0, + skipna: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, + ): axis = DataFrame._get_axis_number(axis) - if numeric_only is None: - numeric_only = None if axis == 0 else False + if numeric_only is lib.no_default: + # Cannot use self._resolve_numeric_only; we must pass None to + # DataFrame.idxmax for backwards compatibility + numeric_only_arg = None if axis == 0 else False + else: + numeric_only_arg = cast(bool, numeric_only) def func(df): - # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 res = df._reduce( nanops.nanargmax, "argmax", axis=axis, skipna=skipna, - numeric_only=numeric_only, + numeric_only=numeric_only_arg, ) indices = res._values index = df._get_axis(axis) @@ -1579,25 +1596,35 @@ def func(df): return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmax" - return self._python_apply_general(func, self._obj_with_exclusions) + result = self._python_apply_general(func, self._obj_with_exclusions) + self._maybe_warn_numeric_only_depr("idxmax", result, numeric_only) + return result @doc( _shared_docs["idxmin"], numeric_only_default="True for axis=0, False for axis=1", ) - def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None): + def idxmin( + self, + axis=0, + skipna: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, + ): axis = DataFrame._get_axis_number(axis) - if numeric_only is None: - numeric_only = None if axis == 0 else False + if numeric_only is lib.no_default: + # Cannot use self._resolve_numeric_only; we must pass None to + # DataFrame.idxmin for backwards compatibility + numeric_only_arg = None if axis == 0 else False + else: + numeric_only_arg = cast(bool, numeric_only) def func(df): - # NB: here we use numeric_only=None, in DataFrame it is False GH#46560 res = df._reduce( nanops.nanargmin, "argmin", axis=axis, skipna=skipna, - numeric_only=numeric_only, + numeric_only=numeric_only_arg, ) indices = res._values index = df._get_axis(axis) @@ -1605,7 +1632,9 @@ def func(df): return df._constructor_sliced(result, index=res.index) func.__name__ = "idxmin" - return self._python_apply_general(func, self._obj_with_exclusions) + result = self._python_apply_general(func, self._obj_with_exclusions) + self._maybe_warn_numeric_only_depr("idxmin", result, numeric_only) + return result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 70f8e0a752dcb..0203d54e0de86 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -939,8 +939,15 @@ def wrapper(*args, **kwargs): if kwargs.get("axis", None) is None: kwargs["axis"] = self.axis + numeric_only = kwargs.get("numeric_only", lib.no_default) + def curried(x): - return f(x, *args, **kwargs) + with warnings.catch_warnings(): + # Catch any warnings from dispatch to DataFrame; we'll emit + # a warning for groupby below + match = "The default value of numeric_only " + warnings.filterwarnings("ignore", match, FutureWarning) + return f(x, *args, **kwargs) # 
preserve the name so we can detect it when calling plot methods, # to avoid duplicates @@ -956,6 +963,13 @@ def curried(x): curried, self._obj_with_exclusions, is_transform=is_transform ) + if self._selected_obj.ndim != 1 and self.axis != 1: + missing = self._obj_with_exclusions.columns.difference(result.columns) + if len(missing) > 0: + warn_dropping_nuisance_columns_deprecated( + type(self), name, numeric_only + ) + if self.grouper.has_dropped_na and is_transform: # result will have dropped rows due to nans, fill with null # and ensure index is ordered same as the input @@ -1223,7 +1237,9 @@ def _wrap_applied_output( ): raise AbstractMethodError(self) - def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: + def _resolve_numeric_only( + self, numeric_only: bool | lib.NoDefault, axis: int + ) -> bool: """ Determine subclass-specific default value for 'numeric_only'. @@ -1233,6 +1249,8 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: Parameters ---------- numeric_only : bool or lib.no_default + axis : int + Axis passed to the groupby op (not self.axis). Returns ------- @@ -1243,7 +1261,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # i.e. not explicitly passed by user if self.obj.ndim == 2: # i.e. DataFrameGroupBy - numeric_only = True + numeric_only = axis != 1 # GH#42395 GH#43108 GH#43154 # Regression from 1.2.5 to 1.3 caused object columns to be dropped if self.axis: @@ -1253,7 +1271,6 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: check = obj._get_numeric_data() if len(obj.columns) and not len(check.columns) and not obj.empty: numeric_only = False - # TODO: v1.4+ Add FutureWarning else: numeric_only = False @@ -1262,6 +1279,27 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # expected "bool") return numeric_only # type: ignore[return-value] + def _maybe_warn_numeric_only_depr( + self, how: str, result: DataFrame | Series, numeric_only: bool | lib.NoDefault + ) -> None: + """Emit warning on numeric_only behavior deprecation when appropriate. + + Parameters + ---------- + how : str + Groupby kernel name. + result : + Result of the groupby operation. + numeric_only : bool or lib.no_default + Argument as passed by user. 
+ """ + if ( + self._obj_with_exclusions.ndim != 1 + and result.ndim > 1 + and len(result.columns) < len(self._obj_with_exclusions.columns) + ): + warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) + # ----------------------------------------------------------------- # numba @@ -1522,7 +1560,9 @@ def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs): except TypeError: if raise_on_typeerror: raise - warn_dropping_nuisance_columns_deprecated(type(self), "agg") + warn_dropping_nuisance_columns_deprecated( + type(self), "agg", numeric_only=False + ) continue key = base.OutputKey(label=name, position=idx) @@ -1536,7 +1576,7 @@ def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs): @final def _agg_general( self, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = True, min_count: int = -1, *, alias: str, @@ -1598,17 +1638,19 @@ def _cython_agg_general( self, how: str, alt: Callable, - numeric_only: bool, + numeric_only: bool | lib.NoDefault, min_count: int = -1, ignore_failures: bool = True, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; # that goes through SeriesGroupBy + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) data = self._get_data_to_aggregate() is_ser = data.ndim == 1 - if numeric_only: + orig_len = len(data) + if numeric_only_bool: if is_ser and not is_numeric_dtype(self._selected_obj.dtype): # GH#41291 match Series behavior kwd_name = "numeric_only" @@ -1638,8 +1680,8 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures) - if not is_ser and len(new_mgr) < len(data): - warn_dropping_nuisance_columns_deprecated(type(self), how) + if not is_ser and len(new_mgr) < orig_len: + warn_dropping_nuisance_columns_deprecated(type(self), how, numeric_only) res = self._wrap_agged_manager(new_mgr) if is_ser: @@ -1997,7 +2039,7 @@ def mean( 2 4.0 Name: B, dtype: float64 """ - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) if maybe_use_numba(engine): from pandas.core._numba.kernels import sliding_mean @@ -2007,7 +2049,7 @@ def mean( result = self._cython_agg_general( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only_bool), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2031,12 +2073,12 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): Series or DataFrame Median of values within each group. 
""" - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) result = self._cython_agg_general( "median", alt=lambda x: Series(x).median(numeric_only=numeric_only_bool), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2092,7 +2134,7 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - return self._get_cythonized_result( + result = self._get_cythonized_result( libgroupby.group_var, cython_dtype=np.dtype(np.float64), numeric_only=numeric_only, @@ -2100,6 +2142,8 @@ def std( post_processing=lambda vals, inference: np.sqrt(vals), ddof=ddof, ) + self._maybe_warn_numeric_only_depr("std", result, numeric_only) + return result @final @Substitution(name="groupby") @@ -2153,12 +2197,12 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) if ddof == 1: return self._cython_agg_general( "var", alt=lambda x: Series(x).var(ddof=ddof), - numeric_only=numeric_only_bool, + numeric_only=numeric_only, ignore_failures=numeric_only is lib.no_default, ) else: @@ -2193,6 +2237,8 @@ def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default Standard error of the mean of values within each group. """ result = self.std(ddof=ddof, numeric_only=numeric_only) + self._maybe_warn_numeric_only_depr("sem", result, numeric_only) + if result.ndim == 1: result /= np.sqrt(self.count()) else: @@ -2253,8 +2299,6 @@ def sum( engine_kwargs, ) else: - numeric_only = self._resolve_numeric_only(numeric_only) - # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in # _agg_general() returns. GH #31422 @@ -2273,8 +2317,6 @@ def sum( def prod( self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 ): - numeric_only = self._resolve_numeric_only(numeric_only) - return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -3050,7 +3092,7 @@ def quantile( a 2.0 b 3.0 """ - numeric_only_bool = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: if is_object_dtype(vals): @@ -3153,7 +3195,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: and not is_ser and len(res_mgr.items) != len(mgr.items) ): - warn_dropping_nuisance_columns_deprecated(type(self), "quantile") + warn_dropping_nuisance_columns_deprecated( + type(self), "quantile", numeric_only + ) if len(res_mgr.items) == 0: # re-call grouped_reduce to get the desired exception message @@ -3447,7 +3491,7 @@ def cumsum(self, axis=0, *args, **kwargs): @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummin(self, axis=0, **kwargs): + def cummin(self, axis=0, numeric_only=False, **kwargs): """ Cumulative min for each group. 
@@ -3460,12 +3504,14 @@ def cummin(self, axis=0, **kwargs): f = lambda x: np.minimum.accumulate(x, axis) return self._python_apply_general(f, self._selected_obj, is_transform=True) - return self._cython_transform("cummin", numeric_only=False, skipna=skipna) + return self._cython_transform( + "cummin", numeric_only=numeric_only, skipna=skipna + ) @final @Substitution(name="groupby") @Appender(_common_see_also) - def cummax(self, axis=0, **kwargs): + def cummax(self, axis=0, numeric_only=False, **kwargs): """ Cumulative max for each group. @@ -3478,7 +3524,9 @@ def cummax(self, axis=0, **kwargs): f = lambda x: np.maximum.accumulate(x, axis) return self._python_apply_general(f, self._selected_obj, is_transform=True) - return self._cython_transform("cummax", numeric_only=False, skipna=skipna) + return self._cython_transform( + "cummax", numeric_only=numeric_only, skipna=skipna + ) @final def _get_cythonized_result( @@ -3532,7 +3580,7 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ - numeric_only = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only, axis=0) if post_processing and not callable(post_processing): raise ValueError("'post_processing' must be a callable!") @@ -3601,15 +3649,16 @@ def blk_func(values: ArrayLike) -> ArrayLike: # Operate block-wise instead of column-by-column is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() + orig_mgr_len = len(mgr) - if numeric_only: + if numeric_only_bool: mgr = mgr.get_numeric_data() res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) - if not is_ser and len(res_mgr.items) != len(mgr.items): + if not is_ser and len(res_mgr.items) != orig_mgr_len: howstr = how.replace("group_", "") - warn_dropping_nuisance_columns_deprecated(type(self), howstr) + warn_dropping_nuisance_columns_deprecated(type(self), howstr, numeric_only) if len(res_mgr.items) == 0: # We re-call grouped_reduce to get the right exception message @@ -4155,13 +4204,27 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde return mi -def warn_dropping_nuisance_columns_deprecated(cls, how: str) -> None: - warnings.warn( - "Dropping invalid columns in " - f"{cls.__name__}.{how} is deprecated. " - "In a future version, a TypeError will be raised. " - f"Before calling .{how}, select only columns which " - "should be valid for the function.", - FutureWarning, - stacklevel=find_stack_level(), - ) +def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None: + if how == "add": + how = "sum" + if numeric_only is not lib.no_default and not numeric_only: + # numeric_only was specified and falsey but still dropped nuisance columns + warnings.warn( + "Dropping invalid columns in " + f"{cls.__name__}.{how} is deprecated. " + "In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif numeric_only is lib.no_default: + warnings.warn( + "The default value of numeric_only in " + f"{cls.__name__}.{how} is deprecated. " + "In a future version, numeric_only will default to False. 
" + f"Either specify numeric_only or select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=find_stack_level(), + ) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 336865d32167d..711f1835446a5 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,5 +1,7 @@ import pytest +from pandas.core.dtypes.common import is_numeric_dtype + import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests @@ -96,7 +98,15 @@ def test_in_numeric_groupby(self, data_for_grouping): "C": [1, 1, 1, 1, 1, 1, 1, 1], } ) - result = df.groupby("A").sum().columns + + dtype = data_for_grouping.dtype + if is_numeric_dtype(dtype) or dtype.name == "decimal": + warn = None + else: + warn = FutureWarning + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby("A").sum().columns if data_for_grouping.dtype._is_numeric: expected = pd.Index(["B", "C"]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index ba89a76a7f8c2..fedcc0e2a2284 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1785,7 +1785,9 @@ def test_stack_multiple_bug(self): multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - down = unst.resample("W-THU").mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + down = unst.resample("W-THU").mean() rs = down.stack("ID") xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 2b248afb42057..b4a3a60e72139 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -71,7 +71,9 @@ def test_metadata_propagation_indiv_groupby(self): "D": np.random.randn(8), } ) - result = df.groupby("A").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").sum() tm.assert_metadata_equivalent(df, result) def test_metadata_propagation_indiv_resample(self): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index bdb33bff5eadd..37b02571158b9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -238,7 +238,10 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype [[1, 2, 3, 4, 5, 6]] * 3, columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), ).astype({("a", "j"): dtype, ("b", "j"): dtype}) - result = df.groupby(level=1, axis=1).agg(func) + warn = FutureWarning if func == "std" else None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby(level=1, axis=1).agg(func) expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( result_dtype_dict ) @@ -262,7 +265,10 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): columns=Index([10, 20, 10, 20], name="x"), dtype="int64", ).astype({10: "Int64"}) - result = df.groupby("x", axis=1).agg(func) + warn = FutureWarning if func == "std" else None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = df.groupby("x", 
axis=1).agg(func) expected = DataFrame( data=expected_data, index=Index([0, 1, 0], name="y"), diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index 7c64d82608c9e..e541abb368a02 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -187,7 +187,9 @@ def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort): if op in AGG_FUNCTIONS_WITH_SKIPNA: grouped = frame.groupby(level=level, axis=axis, sort=sort) - with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): + with tm.assert_produces_warning( + warn, match="The 'mad' method is deprecated", raise_on_extra_warnings=False + ): result = getattr(grouped, op)(skipna=skipna) with tm.assert_produces_warning(FutureWarning): expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) @@ -196,8 +198,8 @@ def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort): tm.assert_frame_equal(result, expected) else: grouped = frame.groupby(level=level, axis=axis, sort=sort) - result = getattr(grouped, op)() with tm.assert_produces_warning(FutureWarning): + result = getattr(grouped, op)() expected = getattr(frame, op)(level=level, axis=axis) if sort: expected = expected.sort_index(axis=axis, level=level) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index abe1b8f13e32e..004e55f4d161f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -103,7 +103,9 @@ def test_basic(): # TODO: split this test gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) - result = gb.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 @@ -314,6 +316,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:.*value of numeric_only.*:FutureWarning") def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper @@ -807,8 +810,12 @@ def test_preserve_categorical_dtype(): } ) for col in ["C1", "C2"]: - result1 = df.groupby(by=col, as_index=False, observed=False).mean() - result2 = df.groupby(by=col, as_index=True, observed=False).mean().reset_index() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.groupby(by=col, as_index=False, observed=False).mean() + result2 = ( + df.groupby(by=col, as_index=True, observed=False).mean().reset_index() + ) expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index c99405dfccb66..206d37e1a800e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas._libs import lib from pandas.errors import UnsupportedFunctionCall import pandas as pd @@ -259,7 +260,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # these have numeric_only kwarg, but default to False warn = FutureWarning - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + with tm.assert_produces_warning( + warn, 
match="Dropping invalid columns", raise_on_extra_warnings=False + ): result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -297,24 +300,26 @@ def gni(self, df): return gni # TODO: non-unique columns, as_index=False - @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_idxmax(self, gb): # object dtype so idxmax goes through _aggregate_item_by_item # GH#5610 # non-cython calls should not include the grouper expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) expected.index.name = "A" - result = gb.idxmax() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.idxmax() tm.assert_frame_equal(result, expected) - @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") def test_idxmin(self, gb): # object dtype so idxmax goes through _aggregate_item_by_item # GH#5610 # non-cython calls should not include the grouper expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) expected.index.name = "A" - result = gb.idxmin() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.idxmin() tm.assert_frame_equal(result, expected) def test_mad(self, gb, gni): @@ -1238,3 +1243,114 @@ def test_groupby_sum_timedelta_with_nat(): res = gb["b"].sum(min_count=2) expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) tm.assert_series_equal(res, expected) + + +@pytest.mark.parametrize( + "kernel, numeric_only_default, drops_nuisance, has_arg", + [ + ("all", False, False, False), + ("any", False, False, False), + ("bfill", False, False, False), + ("corr", True, False, True), + ("corrwith", True, False, True), + ("cov", True, False, True), + ("cummax", False, True, True), + ("cummin", False, True, True), + ("cumprod", True, True, True), + ("cumsum", True, True, True), + ("diff", False, False, False), + ("ffill", False, False, False), + ("fillna", False, False, False), + ("first", False, False, True), + ("idxmax", True, False, True), + ("idxmin", True, False, True), + ("last", False, False, True), + ("max", False, True, True), + ("mean", True, True, True), + ("median", True, True, True), + ("min", False, True, True), + ("nth", False, False, False), + ("nunique", False, False, False), + ("pct_change", False, False, False), + ("prod", True, True, True), + ("quantile", True, False, True), + ("sem", True, True, True), + ("skew", True, False, True), + ("std", True, True, True), + ("sum", True, True, True), + ("var", True, False, True), + ], +) +@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_deprecate_numeric_only( + kernel, numeric_only_default, drops_nuisance, has_arg, numeric_only, keys +): + # GH#46072 + # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False + # has_arg: Whether the op has a numeric_only arg + df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]}) + + if kernel == "corrwith": + args = (df,) + elif kernel == "nth" or kernel == "fillna": + args = (0,) + else: + args = () + kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} + + gb = df.groupby(keys) + method = getattr(gb, kernel) + if has_arg and ( + # Cases where b does not appear in the result + numeric_only is True + or (numeric_only is lib.no_default and numeric_only_default) + or drops_nuisance + ): + if 
numeric_only is True or (not numeric_only_default and not drops_nuisance): + warn = None + else: + warn = FutureWarning + if numeric_only is lib.no_default and numeric_only_default: + msg = f"The default value of numeric_only in DataFrameGroupBy.{kernel}" + else: + msg = f"Dropping invalid columns in DataFrameGroupBy.{kernel}" + with tm.assert_produces_warning(warn, match=msg): + result = method(*args, **kwargs) + + assert "b" not in result.columns + elif ( + # kernels that work on any dtype and have numeric_only arg + kernel in ("first", "last", "corrwith") + or ( + # kernels that work on any dtype and don't have numeric_only arg + kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") + and numeric_only is lib.no_default + ) + ): + result = method(*args, **kwargs) + assert "b" in result.columns + elif has_arg: + assert numeric_only is not True + assert numeric_only is not lib.no_default or numeric_only_default is False + assert not drops_nuisance + # kernels that are successful on any dtype were above; this will fail + msg = ( + "(not allowed for this dtype" + "|must be a string or a number" + "|cannot be performed against 'object' dtypes" + "|must be a string or a real number)" + ) + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + elif not has_arg and numeric_only is not lib.no_default: + with pytest.raises( + TypeError, match="got an unexpected keyword argument 'numeric_only'" + ): + method(*args, **kwargs) + else: + assert kernel in ("diff", "pct_change") + assert numeric_only is lib.no_default + # Doesn't have numeric_only argument and fails on nuisance columns + with pytest.raises(TypeError, match=r"unsupported operand type"): + method(*args, **kwargs) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 016e817e43402..61951292d55a8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -474,13 +474,17 @@ def test_frame_groupby_columns(tsframe): def test_frame_set_name_single(df): grouped = df.groupby("A") - result = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.mean() assert result.index.name == "A" - result = df.groupby("A", as_index=False).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A", as_index=False).mean() assert result.index.name != "A" - result = grouped.agg(np.mean) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.mean) assert result.index.name == "A" result = grouped.agg({"C": np.mean, "D": np.std}) @@ -503,8 +507,10 @@ def test_multi_func(df): col2 = df["B"] grouped = df.groupby([col1.get, col2.get]) - agged = grouped.mean() - expected = df.groupby(["A", "B"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.mean() + expected = df.groupby(["A", "B"]).mean() # TODO groupby get drops names tm.assert_frame_equal( @@ -661,13 +667,16 @@ def test_groupby_as_index_agg(df): # single-key - result = grouped.agg(np.mean) - expected = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.mean) + expected = grouped.mean() tm.assert_frame_equal(result, expected) result2 = grouped.agg({"C": np.mean, "D": np.sum}) - expected2 = grouped.mean() - expected2["D"] = grouped.sum()["D"] + with 
tm.assert_produces_warning(FutureWarning, match=msg): + expected2 = grouped.mean() + expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) @@ -754,8 +763,10 @@ def test_as_index_series_return_frame(df): grouped = df.groupby("A", as_index=False) grouped2 = df.groupby(["A", "B"], as_index=False) - result = grouped["C"].agg(np.sum) - expected = grouped.agg(np.sum).loc[:, ["A", "C"]] + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped["C"].agg(np.sum) + expected = grouped.agg(np.sum).loc[:, ["A", "C"]] assert isinstance(result, DataFrame) tm.assert_frame_equal(result, expected) @@ -765,7 +776,8 @@ def test_as_index_series_return_frame(df): tm.assert_frame_equal(result2, expected2) result = grouped["C"].sum() - expected = grouped.sum().loc[:, ["A", "C"]] + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = grouped.sum().loc[:, ["A", "C"]] assert isinstance(result, DataFrame) tm.assert_frame_equal(result, expected) @@ -789,8 +801,10 @@ def test_groupby_as_index_cython(df): # single-key grouped = data.groupby("A", as_index=False) - result = grouped.mean() - expected = data.groupby(["A"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.mean() + expected = data.groupby(["A"]).mean() expected.insert(0, "A", expected.index) expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected) @@ -859,15 +873,18 @@ def test_groupby_multi_corner(df): def test_omit_nuisance(df): grouped = df.groupby("A") - agged = grouped.agg(np.mean) - exp = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.agg(np.mean) + exp = grouped.mean() tm.assert_frame_equal(agged, exp) df = df.loc[:, ["A", "C", "D"]] df["E"] = datetime.now() grouped = df.groupby("A") - result = grouped.agg(np.sum) - expected = grouped.sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.sum) + expected = grouped.sum() tm.assert_frame_equal(result, expected) # won't work with axis = 1 @@ -898,7 +915,7 @@ def test_keep_nuisance_agg(df, agg_function): @pytest.mark.parametrize("numeric_only", [lib.no_default, True, False]) def test_omit_nuisance_agg(df, agg_function, numeric_only): # GH 38774, GH 38815 - if not numeric_only and agg_function != "sum": + if numeric_only is lib.no_default or (not numeric_only and agg_function != "sum"): # sum doesn't drop strings warn = FutureWarning else: @@ -913,7 +930,13 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): with pytest.raises(klass, match="could not convert string to float"): getattr(grouped, agg_function)(numeric_only=numeric_only) else: - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + if numeric_only is lib.no_default: + msg = ( + f"The default value of numeric_only in DataFrameGroupBy.{agg_function}" + ) + else: + msg = "Dropping invalid columns" + with tm.assert_produces_warning(warn, match=msg): result = getattr(grouped, agg_function)(numeric_only=numeric_only) if ( (numeric_only is lib.no_default or not numeric_only) @@ -923,9 +946,18 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): columns = ["A", "B", "C", "D"] else: columns = ["A", "C", "D"] - expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( - numeric_only=numeric_only - ) + if agg_function == "sum" 
and numeric_only is False: + # sum doesn't drop nuisance string columns + warn = None + elif agg_function in ("sum", "std", "var", "sem") and numeric_only is not True: + warn = FutureWarning + else: + warn = None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( + numeric_only=numeric_only + ) tm.assert_frame_equal(result, expected) @@ -941,8 +973,10 @@ def test_omit_nuisance_warnings(df): def test_omit_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - agged = grouped.agg(np.mean) - exp = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = grouped.agg(np.mean) + exp = grouped.mean() tm.assert_frame_equal(agged, exp) @@ -959,8 +993,10 @@ def test_empty_groups_corner(mframe): ) grouped = df.groupby(["k1", "k2"]) - result = grouped.agg(np.mean) - expected = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.agg(np.mean) + expected = grouped.mean() tm.assert_frame_equal(result, expected) grouped = mframe[3:5].groupby(level=0) @@ -982,7 +1018,9 @@ def test_wrap_aggregated_output_multindex(mframe): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - agged = df.groupby(keys).agg(np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + agged = df.groupby(keys).agg(np.mean) assert isinstance(agged.columns, MultiIndex) def aggfun(ser): @@ -1143,15 +1181,19 @@ def test_groupby_with_hier_columns(): # add a nuisance column sorted_columns, _ = columns.sortlevel(0) df["A", "foo"] = "bar" - result = df.groupby(level=0).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(level=0).mean() tm.assert_index_equal(result.columns, df.columns[:-1]) def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) - result = grouped.sum() - expected = df.groupby("A").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = grouped.sum() + expected = df.groupby("A").sum() tm.assert_frame_equal( result, expected, check_names=False ) # Note: no names when grouping by value @@ -1179,8 +1221,10 @@ def test_groupby_wrong_multi_labels(): def test_groupby_series_with_name(df): - result = df.groupby(df["A"]).mean() - result2 = df.groupby(df["A"], as_index=False).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(df["A"]).mean() + result2 = df.groupby(df["A"], as_index=False).mean() assert result.index.name == "A" assert "A" in result2 @@ -1331,8 +1375,10 @@ def test_groupby_unit64_float_conversion(): def test_groupby_list_infer_array_like(df): - result = df.groupby(list(df["A"])).mean() - expected = df.groupby(df["A"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(list(df["A"])).mean() + expected = df.groupby(df["A"]).mean() tm.assert_frame_equal(result, expected, check_names=False) with pytest.raises(KeyError, match=r"^'foo'$"): @@ -1445,7 +1491,9 @@ def test_groupby_2d_malformed(): d["zeros"] = [0, 0] d["ones"] = [1, 1] d["label"] = ["l1", "l2"] - tmp = d.groupby(["group"]).mean() + msg = "The default value of 
numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + tmp = d.groupby(["group"]).mean() res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -1611,10 +1659,13 @@ def f(group): def test_no_dummy_key_names(df): # see gh-1291 - result = df.groupby(df["A"].values).sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([df["A"].values, df["B"].values]).sum() assert result.index.names == (None, None) @@ -2634,7 +2685,9 @@ def test_groupby_aggregation_numeric_with_non_numeric_dtype(): ) gb = df.groupby(by=["x"]) - result = gb.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.sum() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 54cde30ceac92..b665843728165 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -112,5 +112,7 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - result = df.groupby("Buyer").resample("5D").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index c6e4bec3f7b2c..85602fdf7274a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -59,8 +59,10 @@ def test_column_select_via_attr(self, df): tm.assert_series_equal(result, expected) df["mean"] = 1.5 - result = df.groupby("A").mean() - expected = df.groupby("A").agg(np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").mean() + expected = df.groupby("A").agg(np.mean) tm.assert_frame_equal(result, expected) def test_getitem_list_of_columns(self): @@ -284,25 +286,30 @@ def test_grouper_column_and_index(self): {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]}, index=idx, ) - result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean() - expected = df_multi.reset_index().groupby(["B", "inner"]).mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_multi.reset_index().groupby(["B", "inner"]).mean() tm.assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean() - expected = df_multi.reset_index().groupby(["inner", "B"]).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_multi.reset_index().groupby(["inner", "B"]).mean() tm.assert_frame_equal(result, expected) # Grouping a single-index frame by a column and the index should # be equivalent to resetting the index and grouping by two columns df_single = 
df_multi.reset_index("outer") - result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean() - expected = df_single.reset_index().groupby(["B", "inner"]).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_single.reset_index().groupby(["B", "inner"]).mean() tm.assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean() - expected = df_single.reset_index().groupby(["inner", "B"]).mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_single.reset_index().groupby(["inner", "B"]).mean() tm.assert_frame_equal(result, expected) def test_groupby_levels_and_columns(self): @@ -376,8 +383,10 @@ def test_empty_groups(self, df): def test_groupby_grouper(self, df): grouped = df.groupby("A") - result = df.groupby(grouped.grouper).mean() - expected = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby(grouped.grouper).mean() + expected = grouped.mean() tm.assert_frame_equal(result, expected) def test_groupby_dict_mapping(self): diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 971a447b84cae..501a21981a148 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -47,8 +47,11 @@ def series(): ], ) def test_grouper_index_level_as_string(frame, key_strs, groupers): - result = frame.groupby(key_strs).mean() - expected = frame.groupby(groupers).mean() + warn = FutureWarning if "B" not in key_strs or "outer" in frame.columns else None + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = frame.groupby(key_strs).mean() + expected = frame.groupby(groupers).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 1229251f88c7d..4f58bcb5ee763 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -60,7 +60,9 @@ def f(dfgb, arg1): ) def g(dfgb, arg2): - return dfgb.sum() / dfgb.sum().sum() + arg2 + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + return dfgb.sum() / dfgb.sum().sum() + arg2 def h(df, arg3): return df.x + df.y - arg3 diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 0f7e71c99584d..20328426a69b2 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -246,9 +246,10 @@ def test_groupby_quantile_nullable_array(values, q): def test_groupby_quantile_skips_invalid_dtype(q, numeric_only): df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) - if numeric_only is None or numeric_only: + if numeric_only is lib.no_default or numeric_only: warn = FutureWarning if numeric_only is lib.no_default else None - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + msg = "The default value of numeric_only in DataFrameGroupBy.quantile" + with tm.assert_produces_warning(warn, match=msg): result = df.groupby("a").quantile(q, numeric_only=numeric_only) expected = df.groupby("a")[["b"]].quantile(q) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 
7c9d6e7a73087..ae725cbb2b588 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -105,14 +105,18 @@ def test_groupby_with_timegrouper(self): ) expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") - result1 = df.resample("5D").sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result1 = df.resample("5D").sum() tm.assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = df_sorted.groupby(Grouper(freq="5D")).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = df_sorted.groupby(Grouper(freq="5D")).sum() tm.assert_frame_equal(result2, expected) - result3 = df.groupby(Grouper(freq="5D")).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result3 = df.groupby(Grouper(freq="5D")).sum() tm.assert_frame_equal(result3, expected) @pytest.mark.parametrize("should_sort", [True, False]) @@ -186,7 +190,9 @@ def test_timegrouper_with_reg_groups(self): } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="A"), "Buyer"]).sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([Grouper(freq="A"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -201,7 +207,8 @@ def test_timegrouper_with_reg_groups(self): ], } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) df_original = DataFrame( @@ -239,10 +246,13 @@ def test_timegrouper_with_reg_groups(self): } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum() + warn_msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum() expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -258,7 +268,8 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): @@ -266,9 +277,11 @@ def test_timegrouper_with_reg_groups(self): # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum() tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): @@ -277,7 +290,8 @@ def test_timegrouper_with_reg_groups(self): # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) 
- result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -306,18 +320,22 @@ def test_timegrouper_with_reg_groups(self): [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) - result = df.groupby(Grouper(freq="1M")).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby(Grouper(freq="1M")).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M")]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M")]).sum() tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() - result = df.groupby(Grouper(freq="1M", key="Date")).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby(Grouper(freq="1M", key="Date")).sum() tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", key="Date")]).sum() + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = df.groupby([Grouper(freq="1M", key="Date")]).sum() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 0492b143eaf1f..b325edaf2b1ea 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -203,15 +203,24 @@ def test_transform_axis_1_reducer(request, reduction_func): ): marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986") request.node.add_marker(marker) - warn = FutureWarning if reduction_func == "mad" else None + if reduction_func == "mad": + warn = FutureWarning + msg = "The 'mad' method is deprecated" + elif reduction_func in ("sem", "std"): + warn = FutureWarning + msg = "The default value of numeric_only" + else: + warn = None + msg = "" df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): + with tm.assert_produces_warning(warn, match=msg): result = df.groupby([0, 0, 1], axis=1).transform(reduction_func) if reduction_func == "size": # size doesn't behave in the same manner; hardcode expected result expected = DataFrame(2 * [[2, 2, 1]], index=df.index, columns=df.columns) else: + warn = FutureWarning if reduction_func == "mad" else None with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): expected = df.T.groupby([0, 0, 1]).transform(reduction_func).T tm.assert_equal(result, expected) @@ -462,8 +471,10 @@ def test_transform_exclude_nuisance(df): def test_transform_function_aliases(df): - result = df.groupby("A").transform("mean") - expected = df.groupby("A").transform(np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").transform("mean") + expected = df.groupby("A").transform(np.mean) tm.assert_frame_equal(result, expected) result = df.groupby("A")["C"].transform("mean") @@ -774,8 +785,15 @@ def test_cython_transform_frame(op, args, targop): expected = gb.apply(targop) expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) 
- tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) + + warn = None if op == "shift" else FutureWarning + msg = "The default value of numeric_only" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(op, *args).sort_index(axis=1) + tm.assert_frame_equal(result, expected) + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, op)(*args).sort_index(axis=1) + tm.assert_frame_equal(result, expected) # individual columns for c in df: if ( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index b5bae4759090a..21ef078bcf418 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -90,9 +90,10 @@ def test_groupby_resample_on_api(): } ) - expected = df.set_index("dates").groupby("key").resample("D").mean() - - result = df.groupby("key").resample("D", on="dates").mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.set_index("dates").groupby("key").resample("D").mean() + result = df.groupby("key").resample("D", on="dates").mean() tm.assert_frame_equal(result, expected) @@ -196,7 +197,9 @@ def tests_skip_nuisance(test_frame): tm.assert_frame_equal(result, expected) expected = r[["A", "B", "C"]].sum() - result = r.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = r.sum() tm.assert_frame_equal(result, expected) @@ -643,10 +646,15 @@ def test_selection_api_validation(): exp = df_exp.resample("2D").sum() exp.index.name = "date" - tm.assert_frame_equal(exp, df.resample("2D", on="date").sum()) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.resample("2D", on="date").sum() + tm.assert_frame_equal(exp, result) exp.index.name = "d" - tm.assert_frame_equal(exp, df.resample("2D", level="d").sum()) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.resample("2D", level="d").sum() + tm.assert_frame_equal(exp, result) @pytest.mark.parametrize( @@ -809,9 +817,13 @@ def test_frame_downsample_method(method, numeric_only, expected_data): func = getattr(resampled, method) if method == "prod" and numeric_only is not True: warn = FutureWarning + msg = "Dropping invalid columns in DataFrameGroupBy.prod is deprecated" + elif method == "sum" and numeric_only is lib.no_default: + warn = FutureWarning + msg = "The default value of numeric_only in DataFrameGroupBy.sum is deprecated" else: warn = None - msg = "Dropping invalid columns in DataFrameGroupBy.prod is deprecated" + msg = "" with tm.assert_produces_warning(warn, match=msg): result = func(numeric_only=numeric_only) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index cae2d77dfbd3f..5392ec88544a1 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -408,7 +408,9 @@ def test_resample_groupby_agg(): df["date"] = pd.to_datetime(df["date"]) resampled = df.groupby("cat").resample("Y", on="date") - expected = resampled.sum() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = resampled.sum() result = resampled.agg({"num": "sum"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 
5d6df078ee8c3..905c2af2d22a5 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -553,7 +553,9 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - mn = grouped.mean() + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + mn = grouped.mean() cn = grouped.count() # it works! diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 31f720b9ec336..0d3b9f4561b55 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -146,8 +146,10 @@ def test_pivot_table_nocols(self): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) - rs = df.pivot_table(columns="cols", aggfunc=np.sum) - xp = df.pivot_table(index="cols", aggfunc=np.sum).T + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = df.pivot_table(columns="cols", aggfunc=np.sum) + xp = df.pivot_table(index="cols", aggfunc=np.sum).T tm.assert_frame_equal(rs, xp) rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"}) @@ -903,12 +905,19 @@ def test_no_col(self): # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + table = self.data.pivot_table( + index=["AA", "BB"], margins=True, aggfunc=np.mean + ) for value_col in table.columns: totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") + with tm.assert_produces_warning(FutureWarning, match=msg): + table = self.data.pivot_table( + index=["AA", "BB"], margins=True, aggfunc="mean" + ) for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] assert totals == self.data[item].mean() @@ -964,7 +973,9 @@ def test_margin_with_only_columns_defined( } ) - result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) tm.assert_frame_equal(result, expected) @@ -1990,8 +2001,11 @@ def test_pivot_string_as_func(self): def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes - result = pivot_table(self.data, index="A", columns="B", aggfunc=f) - expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) + + msg = "The default value of numeric_only" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pivot_table(self.data, index="A", columns="B", aggfunc=f) + expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) tm.assert_frame_equal(result, expected) @pytest.mark.slow