diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index c5f2dbe71cb3c..fc8b59e11e001 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -383,12 +383,17 @@ this pathological behavior (:issue:`37827`):
 
 *New behavior*:
 
-.. ipython:: python
-    :okwarning:
+.. code-block:: ipython
 
-    df.mean()
+    In [3]: df.mean()
+    Out[3]:
+    A    1.0
+    dtype: float64
 
-    df[["A"]].mean()
+    In [4]: df[["A"]].mean()
+    Out[4]:
+    A    1.0
+    dtype: float64
 
 Moreover, DataFrame reductions with ``numeric_only=None`` will now be
 consistent with their Series counterparts. In particular, for
@@ -415,10 +420,10 @@ instead of casting to a NumPy array which may have different semantics (:issue:`
 
 *New behavior*:
 
-.. ipython:: python
-    :okwarning:
+.. code-block:: ipython
 
-    df.any()
+    In [5]: df.any()
+    Out[5]: Series([], dtype: bool)
 
 .. _whatsnew_120.api_breaking.python:
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 07a4e4af3dbe7..d63575ceca2a0 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -500,6 +500,7 @@ Removal of prior version deprecations/changes
 - Changed behavior of :meth:`Series.__setitem__` with an integer key and a :class:`Float64Index` when the key is not present in the index; previously we treated the key as positional (behaving like ``series.iloc[key] = val``), now we treat it is a label (behaving like ``series.loc[key] = val``), consistent with :meth:`Series.__getitem__`` behavior (:issue:`33469`)
 - Removed ``na_sentinel`` argument from :func:`factorize`, :meth:`.Index.factorize`, and :meth:`.ExtensionArray.factorize` (:issue:`47157`)
 - Changed behavior of :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` so that ``group_keys`` is respected even if a transformer is detected (:issue:`34998`)
+- Enforced deprecation of ``numeric_only=None`` (the default) in DataFrame reductions, which silently dropped columns on which the reduction raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -569,6 +570,7 @@ Timezones
 Numeric
 ^^^^^^^
 - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`)
+- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` where the result would not be coerced to float (:issue:`49551`)
 -
 
 Conversion
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 5cf061b98680c..1627a7add25ed 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -266,9 +266,8 @@ you to specify a location to update with some value.""",
 }
 
-_numeric_only_doc = """numeric_only : bool or None, default None
-    Include only float, int, boolean data. If None, will attempt to use
-    everything, then use only numeric data
+_numeric_only_doc = """numeric_only : bool, default False
+    Include only float, int, boolean data.
 """
 
 _merge_doc = """
@@ -10489,7 +10488,7 @@ def _reduce(
         *,
         axis: Axis = 0,
         skipna: bool = True,
-        numeric_only: bool | None = None,
+        numeric_only: bool = False,
         filter_type=None,
         **kwds,
     ):
@@ -10498,7 +10497,6 @@ def _reduce(
 
         # TODO: Make other agg func handle axis=None properly GH#21597
         axis = self._get_axis_number(axis)
-        labels = self._get_agg_axis(axis)
         assert axis in [0, 1]
 
         def func(values: np.ndarray):
@@ -10524,25 +10522,22 @@ def _get_data() -> DataFrame:
                 data = self._get_bool_data()
             return data
 
-        numeric_only_bool = com.resolve_numeric_only(numeric_only)
-        if numeric_only is not None or axis == 0:
+        if numeric_only or axis == 0:
             # For numeric_only non-None and axis non-None, we know
             #  which blocks to use and no try/except is needed.
             #  For numeric_only=None only the case with axis==0 and no object
             #  dtypes are unambiguous can be handled with BlockManager.reduce
             # Case with EAs see GH#35881
             df = self
-            if numeric_only_bool:
+            if numeric_only:
                 df = _get_data()
             if axis == 1:
                 df = df.T
                 axis = 0
 
-            ignore_failures = numeric_only is None
-
             # After possibly _get_data and transposing, we are now in the
             #  simple case where we can use BlockManager.reduce
-            res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures)
+            res, _ = df._mgr.reduce(blk_func, ignore_failures=False)
             out = df._constructor(res).iloc[0]
             if out_dtype is not None:
                 out = out.astype(out_dtype)
@@ -10559,36 +10554,11 @@ def _get_data() -> DataFrame:
 
             return out
 
-        assert numeric_only is None
+        assert not numeric_only and axis == 1
 
         data = self
        values = data.values
-
-        try:
-            result = func(values)
-
-        except TypeError:
-            # e.g. in nanops trying to convert strs to float
-
-            data = _get_data()
-            labels = data._get_agg_axis(axis)
-
-            values = data.values
-            with np.errstate(all="ignore"):
-                result = func(values)
-
-            # columns have been dropped GH#41480
-            arg_name = "numeric_only"
-            if name in ["all", "any"]:
-                arg_name = "bool_only"
-            warnings.warn(
-                "Dropping of nuisance columns in DataFrame reductions "
-                f"(with '{arg_name}=None') is deprecated; in a future "
-                "version this will raise TypeError. Select only valid "
-                "columns before calling the reduction.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
+        result = func(values)
 
         if hasattr(result, "dtype"):
             if filter_type == "bool" and notna(result).all():
@@ -10600,6 +10570,7 @@ def _get_data() -> DataFrame:
                     # try to coerce to the original dtypes item by item if we can
                     pass
 
+        labels = self._get_agg_axis(axis)
         result = self._constructor_sliced(result, index=labels)
         return result
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ee41d07c52774..d26a11eae9f7f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -10799,7 +10799,7 @@ def _logical_func(
         name: str,
         func,
         axis: Axis = 0,
-        bool_only: bool_t | None = None,
+        bool_only: bool_t = False,
         skipna: bool_t = True,
         level: Level | None = None,
         **kwargs,
@@ -10814,7 +10814,7 @@ def _logical_func(
                 FutureWarning,
                 stacklevel=find_stack_level(),
             )
-            if bool_only is not None:
+            if bool_only:
                 raise NotImplementedError(
                     "Option bool_only is not implemented with option level."
                 )
@@ -10833,7 +10833,6 @@ def _logical_func(
             and len(self._mgr.arrays) > 1
             # TODO(EA2D): special-case not needed
             and all(x.ndim == 2 for x in self._mgr.arrays)
-            and bool_only is not None
             and not kwargs
         ):
             # Fastpath avoiding potentially expensive transpose
@@ -10854,7 +10853,7 @@ def _logical_func(
     def any(
         self,
         axis: Axis = 0,
-        bool_only: bool_t | None = None,
+        bool_only: bool_t = False,
         skipna: bool_t = True,
         level: Level | None = None,
         **kwargs,
@@ -10866,7 +10865,7 @@ def any(
     def all(
         self,
         axis: Axis = 0,
-        bool_only: bool_t | None = None,
+        bool_only: bool_t = False,
         skipna: bool_t = True,
         level: Level | None = None,
         **kwargs,
@@ -10933,7 +10932,7 @@ def _stat_function_ddof(
         skipna: bool_t = True,
         level: Level | None = None,
         ddof: int = 1,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         nv.validate_stat_ddof_func((), kwargs, fname=name)
@@ -10961,7 +10960,7 @@ def sem(
         skipna: bool_t = True,
         level: Level | None = None,
         ddof: int = 1,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         return self._stat_function_ddof(
@@ -10974,7 +10973,7 @@ def var(
         skipna: bool_t = True,
         level: Level | None = None,
         ddof: int = 1,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         return self._stat_function_ddof(
@@ -10987,7 +10986,7 @@ def std(
         skipna: bool_t = True,
         level: Level | None = None,
         ddof: int = 1,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         return self._stat_function_ddof(
@@ -11002,7 +11001,7 @@ def _stat_function(
         axis: Axis | None | lib.NoDefault = None,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ):
         if name == "median":
@@ -11047,7 +11046,7 @@ def min(
         axis: Axis | None | lib.NoDefault = lib.no_default,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ):
         return self._stat_function(
@@ -11065,7 +11064,7 @@ def max(
         axis: Axis | None | lib.NoDefault = lib.no_default,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ):
         return self._stat_function(
@@ -11083,7 +11082,7 @@ def mean(
         axis: Axis | None | lib.NoDefault = lib.no_default,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         return self._stat_function(
@@ -11095,7 +11094,7 @@ def median(
         axis: Axis | None | lib.NoDefault = lib.no_default,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         return self._stat_function(
@@ -11107,7 +11106,7 @@ def skew(
         axis: Axis | None | lib.NoDefault = lib.no_default,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         return self._stat_function(
@@ -11119,7 +11118,7 @@ def kurt(
         axis: Axis | None | lib.NoDefault = lib.no_default,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         **kwargs,
     ) -> Series | float:
         return self._stat_function(
@@ -11136,7 +11135,7 @@ def _min_count_stat_function(
         axis: Axis | None = None,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         min_count: int = 0,
         **kwargs,
     ):
@@ -11182,7 +11181,7 @@ def sum(
         axis: Axis | None = None,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         min_count: int = 0,
         **kwargs,
     ):
@@ -11195,7 +11194,7 @@ def prod(
         axis: Axis | None = None,
         skipna: bool_t = True,
         level: Level | None = None,
-        numeric_only: bool_t | None = None,
+        numeric_only: bool_t = False,
         min_count: int = 0,
         **kwargs,
     ):
@@ -11288,7 +11287,7 @@ def sem(
             skipna: bool_t = True,
             level=None,
             ddof: int = 1,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs)
@@ -11311,7 +11310,7 @@ def var(
             skipna: bool_t = True,
             level=None,
             ddof: int = 1,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs)
@@ -11335,7 +11334,7 @@ def std(
             skipna: bool_t = True,
             level=None,
             ddof: int = 1,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs)
@@ -11423,7 +11422,7 @@ def sum(
             axis: Axis | None = None,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             min_count: int = 0,
             **kwargs,
         ):
@@ -11448,7 +11447,7 @@ def prod(
             axis: Axis | None = None,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             min_count: int = 0,
             **kwargs,
         ):
@@ -11474,7 +11473,7 @@ def mean(
             axis: AxisInt | None | lib.NoDefault = lib.no_default,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs)
@@ -11496,7 +11495,7 @@ def skew(
             axis: AxisInt | None | lib.NoDefault = lib.no_default,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs)
@@ -11521,7 +11520,7 @@ def kurt(
             axis: Axis | None | lib.NoDefault = lib.no_default,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs)
@@ -11544,7 +11543,7 @@ def median(
             axis: AxisInt | None | lib.NoDefault = lib.no_default,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs)
@@ -11568,7 +11567,7 @@ def max(
             axis: AxisInt | None | lib.NoDefault = lib.no_default,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs)
@@ -11592,7 +11591,7 @@ def min(
             axis: AxisInt | None | lib.NoDefault = lib.no_default,
             skipna: bool_t = True,
             level=None,
-            numeric_only=None,
+            numeric_only: bool_t = False,
             **kwargs,
         ):
             return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs)
@@ -11827,13 +11826,8 @@ def _doc_params(cls):
 
     .. deprecated:: 1.3.0
         The level keyword is deprecated. Use groupby instead.
-numeric_only : bool, default None
-    Include only float, int, boolean columns. If None, will attempt to use
-    everything, then use only numeric data. Not implemented for Series.
-
-    .. deprecated:: 1.5.0
-        Specifying ``numeric_only=None`` is deprecated. The default value will be
-        ``False`` in a future version of pandas.
+numeric_only : bool, default False
+    Include only float, int, boolean columns. Not implemented for Series.
 {min_count}\
 **kwargs
@@ -11865,13 +11859,8 @@ def _doc_params(cls):
 ddof : int, default 1
     Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
     where N represents the number of elements.
-numeric_only : bool, default None
-    Include only float, int, boolean columns. If None, will attempt to use
-    everything, then use only numeric data. Not implemented for Series.
-
-    .. deprecated:: 1.5.0
-        Specifying ``numeric_only=None`` is deprecated. The default value will be
-        ``False`` in a future version of pandas.
+numeric_only : bool, default False
+    Include only float, int, boolean columns. Not implemented for Series.
 
 Returns
 -------
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 7a768841afb25..2664988a7b8d4 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -4592,7 +4592,7 @@ def _reduce(
         *,
         axis: Axis = 0,
         skipna: bool = True,
-        numeric_only=None,
+        numeric_only: bool = False,
         filter_type=None,
         **kwds,
     ):
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 068ce32b5e7aa..28c776d0a6d35 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1283,8 +1283,11 @@ def test_nuiscance_columns():
     )
     tm.assert_frame_equal(result, expected)
 
-    with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-        result = df.agg("sum")
+    msg = "DataFrame constructor called with incompatible data and dtype"
+    with pytest.raises(TypeError, match=msg):
+        df.agg("sum")
+
+    result = df[["A", "B", "C"]].agg("sum")
     expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
     tm.assert_series_equal(result, expected)
 
@@ -1428,13 +1431,14 @@ def test_apply_datetime_tz_issue():
 
 @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})])
 @pytest.mark.parametrize("method", ["min", "max", "sum"])
-def test_consistency_of_aggregates_of_columns_with_missing_values(df, method):
+def test_mixed_column_raises(df, method):
     # GH 16832
-    with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-        none_in_first_column_result = getattr(df[["A", "B"]], method)()
-        none_in_second_column_result = getattr(df[["B", "A"]], method)()
-
-    tm.assert_series_equal(none_in_first_column_result, none_in_second_column_result)
+    if method == "sum":
+        msg = r'can only concatenate str \(not "int"\) to str'
+    else:
+        msg = "not supported between instances of 'str' and 'float'"
+    with pytest.raises(TypeError, match=msg):
+        getattr(df, method)()
 
 
 @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan])
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index 14b416011b956..b4661a92c8275 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -139,8 +139,7 @@ def test_non_numeric_exclusion(self, interp_method, request, using_array_manager
         rs = df.quantile(
             0.5, numeric_only=True, interpolation=interpolation, method=method
         )
-        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-            xp = df.median().rename(0.5)
+        xp = df.median(numeric_only=True).rename(0.5)
         if interpolation == "nearest":
             xp = (xp + 0.5).astype(np.int64)
         if method == "table" and using_array_manager:
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 8d4d705296f35..0e5c6057b9a61 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -169,15 +169,23 @@ class TestDataFrameAnalytics:
         ],
     )
     def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
-        if opname in ["sum", "min", "max"] and axis == 0:
-            warn = None
-        elif opname not in ["count", "nunique"]:
-            warn = FutureWarning
-        else:
-            warn = None
-        msg = "nuisance columns|default value of numeric_only"
-        with tm.assert_produces_warning(warn, match=msg):
+        if (opname in ("sum", "min", "max") and axis == 0) or opname in (
+            "count",
+            "nunique",
+        ):
             getattr(float_string_frame, opname)(axis=axis)
+        else:
+            msg = "|".join(
+                [
+                    "Could not convert",
+                    "could not convert",
+                    "can't multiply sequence by non-int",
+                    "unsupported operand type",
+                    "not supported between instances of",
+                ]
+            )
+            with pytest.raises(TypeError, match=msg):
+                getattr(float_string_frame, opname)(axis=axis)
 
         if opname != "nunique":
             getattr(float_string_frame, opname)(axis=axis, numeric_only=True)
@@ -323,9 +331,7 @@ def test_stat_operators_attempt_obj_array(self, method, df):
         assert df.values.dtype == np.object_
         result = getattr(df, method)(1)
         expected = getattr(df.astype("f8"), method)(1)
-
-        if method in ["sum", "prod"]:
-            tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
     def test_mixed_ops(self, op):
@@ -337,18 +343,26 @@ def test_mixed_ops(self, op):
                 "str": ["a", "b", "c", "d"],
             }
         )
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = getattr(df, op)()
-        assert len(result) == 2
+        msg = "|".join(
+            [
+                "Could not convert",
+                "could not convert",
+                "can't multiply sequence by non-int",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            getattr(df, op)()
 
         with pd.option_context("use_bottleneck", False):
-            with tm.assert_produces_warning(
-                FutureWarning, match="Select only valid columns"
-            ):
-                result = getattr(df, op)()
-            assert len(result) == 2
+            msg = "|".join(
+                [
+                    "Could not convert",
+                    "could not convert",
+                    "can't multiply sequence by non-int",
+                ]
+            )
+            with pytest.raises(TypeError, match=msg):
+                getattr(df, op)()
 
     def test_reduce_mixed_frame(self):
         # GH 6806
@@ -416,10 +430,9 @@ def test_mean_mixed_string_decimal(self):
 
         df = DataFrame(d)
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = df.mean()
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            df.mean()
+        result = df[["A", "C"]].mean()
         expected = Series([2.7, 681.6], index=["A", "C"])
         tm.assert_series_equal(result, expected)
 
@@ -648,9 +661,8 @@ def test_operators_timedelta64(self):
         )
         tm.assert_series_equal(result, expected)
 
-        # excludes numeric
-        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-            result = mixed.min(axis=1)
+        # excludes non-numeric
+        result = mixed.min(axis=1, numeric_only=True)
         expected = Series([1, 1, 1.0], index=[0, 1, 2])
         tm.assert_series_equal(result, expected)
 
@@ -819,25 +831,17 @@ def test_sum_mixed_datetime(self):
         df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex(
             [2, 3, 4]
         )
-        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-            result = df.sum()
-
-        expected = Series({"B": 7.0})
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="does not support reduction 'sum'"):
+            df.sum()
 
     def test_mean_corner(self, float_frame, float_string_frame):
         # unit test when have object data
-        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-            the_mean = float_string_frame.mean(axis=0)
-        the_sum = float_string_frame.sum(axis=0, numeric_only=True)
-        tm.assert_index_equal(the_sum.index, the_mean.index)
-        assert len(the_mean.index) < len(float_string_frame.columns)
+        with pytest.raises(TypeError, match="Could not convert"):
+            float_string_frame.mean(axis=0)
 
         # xs sum mixed type, just want to know it works...
-        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-            the_mean = float_string_frame.mean(axis=1)
-        the_sum = float_string_frame.sum(axis=1, numeric_only=True)
-        tm.assert_index_equal(the_sum.index, the_mean.index)
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            float_string_frame.mean(axis=1)
 
         # take mean of boolean column
         float_frame["bool"] = float_frame["A"] > 0
@@ -861,10 +865,8 @@ def test_mean_datetimelike(self):
         expected = Series({"A": 1.0})
         tm.assert_series_equal(result, expected)
 
-        with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-            result = df.mean()
-        expected = Series({"A": 1.0, "B": df.loc[1, "B"], "C": df.loc[1, "C"]})
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="mean is not implemented for PeriodArray"):
+            df.mean()
 
     def test_mean_datetimelike_numeric_only_false(self):
         df = DataFrame(
@@ -895,13 +897,13 @@ def test_mean_extensionarray_numeric_only_true(self):
         tm.assert_series_equal(result, expected)
 
     def test_stats_mixed_type(self, float_string_frame):
-        # don't blow up
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
+        with pytest.raises(TypeError, match="could not convert"):
             float_string_frame.std(1)
+        with pytest.raises(TypeError, match="could not convert"):
             float_string_frame.var(1)
+        with pytest.raises(TypeError, match="unsupported operand type"):
             float_string_frame.mean(1)
+        with pytest.raises(TypeError, match="could not convert"):
             float_string_frame.skew(1)
 
     def test_sum_bools(self):
@@ -1250,24 +1252,26 @@ def test_any_all_np_func(self, func, data, expected):
         # GH 19976
         data = DataFrame(data)
 
-        warn = None
         if any(is_categorical_dtype(x) for x in data.dtypes):
-            warn = FutureWarning
+            with pytest.raises(
+                TypeError, match="dtype category does not support reduction"
+            ):
+                func(data)
 
-        with tm.assert_produces_warning(
-            warn, match="Select only valid columns", check_stacklevel=False
-        ):
+            # method version
+            with pytest.raises(
+                TypeError, match="dtype category does not support reduction"
+            ):
+                getattr(DataFrame(data), func.__name__)(axis=None)
+        else:
             result = func(data)
-        assert isinstance(result, np.bool_)
-        assert result.item() is expected
+            assert isinstance(result, np.bool_)
+            assert result.item() is expected
 
-        # method version
-        with tm.assert_produces_warning(
-            warn, match="Select only valid columns", check_stacklevel=False
-        ):
+            # method version
             result = getattr(DataFrame(data), func.__name__)(axis=None)
-        assert isinstance(result, np.bool_)
-        assert result.item() is expected
+            assert isinstance(result, np.bool_)
+            assert result.item() is expected
 
     def test_any_all_object(self):
         # GH 19976
@@ -1512,20 +1516,11 @@ def test_any_all_categorical_dtype_nuisance_column(self, method):
         with pytest.raises(TypeError, match="does not support reduction"):
             getattr(df, method)(bool_only=False)
 
-        # With bool_only=None, operating on this column raises and is ignored,
-        #  so we expect an empty result.
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = getattr(df, method)(bool_only=None)
-        expected = Series([], index=Index([]), dtype=bool)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="does not support reduction"):
+            getattr(df, method)(bool_only=None)
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns", check_stacklevel=False
-        ):
-            result = getattr(np, method)(df, axis=0)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="does not support reduction"):
+            getattr(np, method)(df, axis=0)
 
     def test_median_categorical_dtype_nuisance_column(self):
         # GH#21020 DataFrame.median should match Series.median
@@ -1539,12 +1534,8 @@ def test_median_categorical_dtype_nuisance_column(self):
         with pytest.raises(TypeError, match="does not support reduction"):
             df.median(numeric_only=False)
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = df.median()
-        expected = Series([], index=Index([]), dtype=np.float64)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="does not support reduction"):
+            df.median()
 
         # same thing, but with an additional non-categorical column
         df["B"] = df["A"].astype(int)
@@ -1552,12 +1543,8 @@ def test_median_categorical_dtype_nuisance_column(self):
         with pytest.raises(TypeError, match="does not support reduction"):
             df.median(numeric_only=False)
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = df.median()
-        expected = Series([2.0], index=["B"])
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="does not support reduction"):
+            df.median()
 
         # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
         #  of expected.values
@@ -1579,58 +1566,19 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
         with pytest.raises(TypeError, match="is not ordered for operation"):
             getattr(df, method)(numeric_only=False)
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = getattr(df, method)()
-        expected = Series([], index=Index([]), dtype=np.float64)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(df, method)()
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns", check_stacklevel=False
-        ):
-            result = getattr(np, method)(df)
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(np, method)(df)
 
         # same thing, but with an additional non-categorical column
         df["B"] = df["A"].astype(object)
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = getattr(df, method)()
-        if method == "min":
-            expected = Series(["a"], index=["B"])
-        else:
-            expected = Series(["c"], index=["B"])
-        tm.assert_series_equal(result, expected)
-
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns", check_stacklevel=False
-        ):
-            result = getattr(np, method)(df)
-        tm.assert_series_equal(result, expected)
-
-    def test_reduction_object_block_splits_nuisance_columns(self):
-        # GH#37827
-        df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
-
-        # We should only exclude "B", not "A"
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = df.mean()
-        expected = Series([1.0], index=["A"])
-        tm.assert_series_equal(result, expected)
-
-        # Same behavior but heterogeneous dtype
-        df["C"] = df["A"].astype(int) + 4
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(df, method)()
 
-        with tm.assert_produces_warning(
-            FutureWarning, match="Select only valid columns"
-        ):
-            result = df.mean()
-        expected = Series([1.0, 5.0], index=["A", "C"])
-        tm.assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(np, method)(df)
 
 
 def test_sum_timedelta64_skipna_false(using_array_manager, request):
@@ -1710,12 +1658,8 @@ def test_groupby_regular_arithmetic_equivalent(meth):
 def test_frame_mixed_numeric_object_with_timestamp(ts_value):
     # GH 13912
     df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]})
-    with tm.assert_produces_warning(
-        FutureWarning, match="The default value of numeric_only"
-    ):
-        result = df.sum()
-    expected = Series([1, 1.1, "foo"], index=list("abc"))
-    tm.assert_series_equal(result, expected)
+    with pytest.raises(TypeError, match="does not support reduction"):
+        df.sum()
 
 
 def test_prod_sum_min_count_mixed_object():
@@ -1755,18 +1699,46 @@ def test_reduction_axis_none_deprecation(method):
         "corrwith",
         "count",
         "cov",
+        "mode",
+        "quantile",
+    ],
+)
+def test_numeric_only_deprecation(kernel):
+    # GH#46852
+    df = DataFrame({"a": [1, 2, 3], "b": object})
+    args = (df,) if kernel == "corrwith" else ()
+    signature = inspect.signature(getattr(DataFrame, kernel))
+    default = signature.parameters["numeric_only"].default
+    assert default is not True
+
+    if default is None or default is lib.no_default:
+        expected = getattr(df[["a"]], kernel)(*args)
+        warn = FutureWarning
+    else:
+        # default must be False and works on any nuisance columns
+        expected = getattr(df, kernel)(*args)
+        if kernel == "mode":
+            assert "b" in expected.columns
+        else:
+            assert "b" in expected.index
+        warn = None
+    msg = f"The default value of numeric_only in DataFrame.{kernel}"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = getattr(df, kernel)(*args)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "kernel",
+    [
         "idxmax",
         "idxmin",
         "kurt",
-        "kurt",
         "max",
         "mean",
         "median",
         "min",
-        "mode",
-        "prod",
         "prod",
-        "quantile",
         "sem",
         "skew",
         "std",
@@ -1774,32 +1746,17 @@ def test_reduction_axis_none_deprecation(method):
         "var",
     ],
 )
-def test_numeric_only_deprecation(kernel):
+def test_fails_on_non_numeric(kernel):
     # GH#46852
     df = DataFrame({"a": [1, 2, 3], "b": object})
-    args = (df,) if kernel == "corrwith" else ()
-    signature = inspect.signature(getattr(DataFrame, kernel))
-    default = signature.parameters["numeric_only"].default
-    assert default is not True
-
-    if kernel in ("idxmax", "idxmin"):
-        # kernels that default to numeric_only=False and fail on nuisance columns
-        assert default is False
-        with pytest.raises(TypeError, match="not allowed for this dtype"):
-            getattr(df, kernel)(*args)
-    else:
-        if default is None or default is lib.no_default:
-            expected = getattr(df[["a"]], kernel)(*args)
-            warn = FutureWarning
-        else:
-            # default must be False and works on any nuisance columns
-            expected = getattr(df, kernel)(*args)
-            if kernel == "mode":
-                assert "b" in expected.columns
-            else:
-                assert "b" in expected.index
-            warn = None
-        msg = f"The default value of numeric_only in DataFrame.{kernel}"
-        with tm.assert_produces_warning(warn, match=msg):
-            result = getattr(df, kernel)(*args)
-        tm.assert_equal(result, expected)
+    msg = "|".join(
+        [
+            "not allowed for this dtype",
+            "argument must be a string or a number",
+            "not supported between instances of",
+            "unsupported operand type",
+            "argument must be a string or a real number",
+        ]
+    )
+    with pytest.raises(TypeError, match=msg):
+        getattr(df, kernel)()
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 6f723e1be6fc6..fa51a291bf2f7 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -974,17 +974,15 @@ def test_apply_function_index_return(function):
 
 
 def test_apply_function_with_indexing_return_column():
-    # GH: 7002
+    # GH#7002, GH#41480
     df = DataFrame(
         {
            "foo1": ["one", "two", "two", "three", "one", "two"],
            "foo2": [1, 2, 4, 4, 5, 6],
        }
    )
-    with tm.assert_produces_warning(FutureWarning, match="Select only valid"):
-        result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
-    expected = DataFrame({"foo1": ["one", "three", "two"], "foo2": [3.0, 4.0, 4.0]})
-    tm.assert_frame_equal(result, expected)
+    with pytest.raises(TypeError, match="Could not convert"):
+        df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index f8c7cdf658ebf..8fe1dc010211a 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -294,13 +294,7 @@ def test_apply(ordered):
     idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
     expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])
 
-    # GH#21636 tracking down the xfail, in some builds np.mean(df.loc[[0]])
-    #  is coming back as Series([0., 1., 0.], index=["missing", "dense", "values"])
-    #  when we expect Series(0., index=["values"])
-    with tm.assert_produces_warning(
-        FutureWarning, match="Select only valid", check_stacklevel=False
-    ):
-        result = grouped.apply(lambda x: np.mean(x))
+    result = grouped.apply(lambda x: np.mean(x))
     tm.assert_frame_equal(result, expected)
 
     result = grouped.mean()
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 7a9d540ae08c4..5383a4d28c8ce 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -301,27 +301,19 @@ def gni(self, df):
         return gni
 
     # TODO: non-unique columns, as_index=False
-    def test_idxmax(self, gb):
-        # object dtype so idxmax goes through _aggregate_item_by_item
-        # GH#5610
-        # non-cython calls should not include the grouper
+    def test_idxmax_nuisance_raises(self, gb):
+        # GH#5610, GH#41480
         expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
         expected.index.name = "A"
-        msg = "The default value of numeric_only in DataFrameGroupBy.idxmax"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = gb.idxmax()
-        tm.assert_frame_equal(result, expected)
+        with pytest.raises(TypeError, match="not allowed for this dtype"):
+            gb.idxmax()
 
-    def test_idxmin(self, gb):
-        # object dtype so idxmax goes through _aggregate_item_by_item
-        # GH#5610
-        # non-cython calls should not include the grouper
+    def test_idxmin_nuisance_raises(self, gb):
+        # GH#5610, GH#41480
         expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
         expected.index.name = "A"
-        msg = "The default value of numeric_only in DataFrameGroupBy.idxmin"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            result = gb.idxmin()
-        tm.assert_frame_equal(result, expected)
+        with pytest.raises(TypeError, match="not allowed for this dtype"):
+            gb.idxmin()
 
     def test_describe(self, df, gb, gni):
         # describe
@@ -1382,11 +1374,15 @@ def test_deprecate_numeric_only(
 
     gb = df.groupby(keys)
     method = getattr(gb, kernel)
-    if has_arg and (
-        # Cases where b does not appear in the result
-        numeric_only is True
-        or (numeric_only is lib.no_default and numeric_only_default)
-        or drops_nuisance
+    if (
+        has_arg
+        and (kernel not in ("idxmax", "idxmin") or numeric_only is True)
+        and (
+            # Cases where b does not appear in the result
+            numeric_only is True
+            or (numeric_only is lib.no_default and numeric_only_default)
+            or drops_nuisance
+        )
     ):
         if numeric_only is True or (not numeric_only_default and not drops_nuisance):
             warn = None
@@ -1411,9 +1407,8 @@ def test_deprecate_numeric_only(
         ):
             result = method(*args, **kwargs)
         assert "b" in result.columns
-    elif has_arg:
+    elif has_arg or kernel in ("idxmax", "idxmin"):
        assert numeric_only is not True
-        assert numeric_only is not lib.no_default or numeric_only_default is False
        assert not drops_nuisance
        # kernels that are successful on any dtype were above; this will fail
        msg = (