diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index c85a087835b80..9daab20e1b809 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -113,7 +113,7 @@ Other enhancements
 - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
 - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
-- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
+- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`)
 - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 7270d73e29741..ef5e6dd1d6757 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -10605,11 +10605,17 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
         """
         return self.apply(Series.nunique, axis=axis, dropna=dropna)
 
-    @doc(_shared_docs["idxmin"])
-    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
+    @doc(_shared_docs["idxmin"], numeric_only_default="False")
+    def idxmin(
+        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
+    ) -> Series:
         axis = self._get_axis_number(axis)
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
 
-        res = self._reduce(
+        res = data._reduce(
             nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
         )
         indices = res._values
@@ -10619,15 +10625,22 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
         # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
         assert isinstance(indices, np.ndarray)  # for mypy
 
-        index = self._get_axis(axis)
+        index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
-        return self._constructor_sliced(result, index=self._get_agg_axis(axis))
+        return data._constructor_sliced(result, index=data._get_agg_axis(axis))
+
+    @doc(_shared_docs["idxmax"], numeric_only_default="False")
+    def idxmax(
+        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
+    ) -> Series:
 
-    @doc(_shared_docs["idxmax"])
-    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
         axis = self._get_axis_number(axis)
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
 
-        res = self._reduce(
+        res = data._reduce(
             nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
         )
         indices = res._values
@@ -10637,9 +10650,9 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
         # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
         assert isinstance(indices, np.ndarray)  # for mypy
 
-        index = self._get_axis(axis)
+        index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
-        return self._constructor_sliced(result, index=self._get_agg_axis(axis))
+        return data._constructor_sliced(result, index=data._get_agg_axis(axis))
 
     def _get_agg_axis(self, axis_num: int) -> Index:
         """
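For reference, here is a minimal sketch of the ``DataFrame.idxmin``/``DataFrame.idxmax`` behavior that the ``frame.py`` hunks above implement. The sketch is illustrative only, not part of the patch; the expected labels match the tests added to ``test_reductions.py`` further below::

    import pandas as pd

    df = pd.DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})

    # numeric_only=True drops the object column "c" before reducing, so the
    # result holds the row labels of the minima of "a" and "b" only.
    df.idxmin(numeric_only=True)  # a -> 2, b -> 1

    # With the default numeric_only=False the object column participates and
    # the reduction raises TypeError ("not allowed for this dtype").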
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 06a1aed8e3b09..245e33fb1a23b 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1555,10 +1555,14 @@ def nunique(self, dropna: bool = True) -> DataFrame:
 
         return results
 
-    @doc(_shared_docs["idxmax"])
-    def idxmax(self, axis=0, skipna: bool = True):
+    @doc(
+        _shared_docs["idxmax"],
+        numeric_only_default="True for axis=0, False for axis=1",
+    )
+    def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
         axis = DataFrame._get_axis_number(axis)
-        numeric_only = None if axis == 0 else False
+        if numeric_only is None:
+            numeric_only = None if axis == 0 else False
 
         def func(df):
             # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
@@ -1577,13 +1581,17 @@ def func(df):
         func.__name__ = "idxmax"
         return self._python_apply_general(func, self._obj_with_exclusions)
 
-    @doc(_shared_docs["idxmin"])
-    def idxmin(self, axis=0, skipna: bool = True):
+    @doc(
+        _shared_docs["idxmin"],
+        numeric_only_default="True for axis=0, False for axis=1",
+    )
+    def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
         axis = DataFrame._get_axis_number(axis)
-        numeric_only = None if axis == 0 else False
+        if numeric_only is None:
+            numeric_only = None if axis == 0 else False
 
         def func(df):
-            # NB: here we use numeric_only=None, in DataFrame it is False GH#38217
+            # NB: here we use numeric_only=None, in DataFrame it is False GH#46560
             res = df._reduce(
                 nanops.nanargmin,
                 "argmin",
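The groupby versions resolve ``numeric_only=None`` per axis, keeping the legacy drop-and-warn path when the keyword is omitted on ``axis=0``. A hedged sketch with made-up data::

    import pandas as pd

    df = pd.DataFrame(
        {"key": ["x", "x", "y"], "num": [1.0, 3.0, 2.0], "obj": list("abc")}
    )
    gb = df.groupby("key")

    # Explicit numeric_only=True restricts idxmax to the numeric column:
    gb.idxmax(numeric_only=True)  # num: x -> 1, y -> 2

    # Omitting numeric_only on axis=0 resolves to the pre-1.5 behavior:
    # columns that fail to reduce are dropped with a FutureWarning.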
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index b25781f87872a..70f8e0a752dcb 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1502,7 +1502,7 @@ def _python_apply_general(
         )
 
     @final
-    def _python_agg_general(self, func, *args, **kwargs):
+    def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs):
         func = com.is_builtin_func(func)
         f = lambda x: func(x, *args, **kwargs)
 
@@ -1520,6 +1520,8 @@ def _python_agg_general(self, func, *args, **kwargs):
                 # if this function is invalid for this dtype, we will ignore it.
                 result = self.grouper.agg_series(obj, f)
             except TypeError:
+                if raise_on_typeerror:
+                    raise
                 warn_dropping_nuisance_columns_deprecated(type(self), "agg")
                 continue
 
@@ -1593,7 +1595,12 @@ def _agg_py_fallback(
 
     @final
     def _cython_agg_general(
-        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
+        self,
+        how: str,
+        alt: Callable,
+        numeric_only: bool,
+        min_count: int = -1,
+        ignore_failures: bool = True,
     ):
         # Note: we never get here with how="ohlc" for DataFrameGroupBy;
         # that goes through SeriesGroupBy
@@ -1629,7 +1636,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
 
         # TypeError -> we may have an exception in trying to aggregate
         # continue and exclude the block
-        new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
+        new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)
 
         if not is_ser and len(new_mgr) < len(data):
             warn_dropping_nuisance_columns_deprecated(type(self), how)
@@ -2041,6 +2048,7 @@ def std(
         ddof: int = 1,
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
     ):
         """
         Compute standard deviation of groups, excluding missing values.
@@ -2069,6 +2077,11 @@ def std(
 
             .. versionadded:: 1.4.0
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         Series or DataFrame
@@ -2081,8 +2094,9 @@ def std(
         else:
             return self._get_cythonized_result(
                 libgroupby.group_var,
-                needs_counts=True,
                 cython_dtype=np.dtype(np.float64),
+                numeric_only=numeric_only,
+                needs_counts=True,
                 post_processing=lambda vals, inference: np.sqrt(vals),
                 ddof=ddof,
             )
@@ -2095,6 +2109,7 @@ def var(
         ddof: int = 1,
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
     ):
         """
         Compute variance of groups, excluding missing values.
@@ -2123,6 +2138,11 @@ def var(
 
             .. versionadded:: 1.4.0
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         Series or DataFrame
@@ -2133,22 +2153,25 @@ def var(
             return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
         else:
+            numeric_only_bool = self._resolve_numeric_only(numeric_only)
             if ddof == 1:
-                numeric_only = self._resolve_numeric_only(lib.no_default)
                 return self._cython_agg_general(
                     "var",
                     alt=lambda x: Series(x).var(ddof=ddof),
-                    numeric_only=numeric_only,
+                    numeric_only=numeric_only_bool,
+                    ignore_failures=numeric_only is lib.no_default,
                 )
             else:
                 func = lambda x: x.var(ddof=ddof)
                 with self._group_selection_context():
-                    return self._python_agg_general(func)
+                    return self._python_agg_general(
+                        func, raise_on_typeerror=not numeric_only_bool
+                    )
 
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def sem(self, ddof: int = 1):
+    def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
         """
         Compute standard error of the mean of groups, excluding missing values.
 
@@ -2159,12 +2182,17 @@ def sem(self, ddof: int = 1):
         ddof : int, default 1
             Degrees of freedom.
 
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
         Returns
         -------
         Series or DataFrame
             Standard error of the mean of values within each group.
         """
-        result = self.std(ddof=ddof)
+        result = self.std(ddof=ddof, numeric_only=numeric_only)
         if result.ndim == 1:
             result /= np.sqrt(self.count())
         else:
@@ -2979,7 +3007,12 @@ def nth(
         return result
 
     @final
-    def quantile(self, q=0.5, interpolation: str = "linear"):
+    def quantile(
+        self,
+        q=0.5,
+        interpolation: str = "linear",
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+    ):
         """
         Return group values at the given quantile, a la numpy.percentile.
 
@@ -2989,6 +3022,10 @@ def quantile(
             Value(s) between 0 and 1 providing the quantile(s) to compute.
         interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
             Method to use when the desired quantile falls between two points.
+        numeric_only : bool, default True
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
 
         Returns
         -------
@@ -3013,6 +3050,7 @@ def quantile(
         a    2.0
         b    3.0
         """
+        numeric_only_bool = self._resolve_numeric_only(numeric_only)
 
         def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
             if is_object_dtype(vals):
@@ -3106,9 +3144,15 @@ def blk_func(values: ArrayLike) -> ArrayLike:
         obj = self._obj_with_exclusions
         is_ser = obj.ndim == 1
         mgr = self._get_data_to_aggregate()
-
-        res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
-        if not is_ser and len(res_mgr.items) != len(mgr.items):
+        data = mgr.get_numeric_data() if numeric_only_bool else mgr
+        ignore_failures = numeric_only_bool
+        res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)
+
+        if (
+            numeric_only is lib.no_default
+            and not is_ser
+            and len(res_mgr.items) != len(mgr.items)
+        ):
             warn_dropping_nuisance_columns_deprecated(type(self), "quantile")
 
         if len(res_mgr.items) == 0:
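The groupby.py changes above thread ``numeric_only`` through ``std``, ``var``, ``sem``, and ``quantile``. Roughly, the resulting contract looks like this (a sketch consistent with the tests further below, not an excerpt from pandas)::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2], "b": [2.0, 3.0, 4.0], "c": ["x", "y", "z"]})
    gb = df.groupby("a")

    gb.std(numeric_only=True)            # computed over "b" only
    gb.quantile(0.5, numeric_only=True)  # likewise restricted to numeric columns

    # numeric_only=False no longer drops the string column silently:
    # std/sem raise ValueError and var raises TypeError ("could not convert
    # string to float"); quantile raises TypeError ("'quantile' cannot be
    # performed against 'object' dtypes!").
    # Leaving numeric_only unset keeps the old drop-with-FutureWarning path.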
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
index 354bab1217781..bece833066f89 100644
--- a/pandas/core/shared_docs.py
+++ b/pandas/core/shared_docs.py
@@ -749,6 +749,10 @@
 skipna : bool, default True
     Exclude NA/null values. If an entire row/column is NA, the result
     will be NA.
+numeric_only : bool, default {numeric_only_default}
+    Include only `float`, `int` or `boolean` data.
+
+    .. versionadded:: 1.5.0
 
 Returns
 -------
@@ -812,6 +816,10 @@
 skipna : bool, default True
     Exclude NA/null values. If an entire row/column is NA, the result
     will be NA.
+numeric_only : bool, default {numeric_only_default}
+    Include only `float`, `int` or `boolean` data.
+
+    .. versionadded:: 1.5.0
 
 Returns
 -------
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index b915d104d6365..41deeec7c4b57 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -897,6 +897,17 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis):
         expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_idxmin_numeric_only(self, numeric_only):
+        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
+        if numeric_only:
+            result = df.idxmin(numeric_only=numeric_only)
+            expected = Series([2, 1], index=["a", "b"])
+            tm.assert_series_equal(result, expected)
+        else:
+            with pytest.raises(TypeError, match="not allowed for this dtype"):
+                df.idxmin(numeric_only=numeric_only)
+
     def test_idxmin_axis_2(self, float_frame):
         frame = float_frame
         msg = "No axis named 2 for object type DataFrame"
@@ -914,6 +925,17 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis):
         expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_idxmax_numeric_only(self, numeric_only):
+        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
+        if numeric_only:
+            result = df.idxmax(numeric_only=numeric_only)
+            expected = Series([1, 0], index=["a", "b"])
+            tm.assert_series_equal(result, expected)
+        else:
+            with pytest.raises(TypeError, match="not allowed for this dtype"):
+                df.idxmax(numeric_only=numeric_only)
+
     def test_idxmax_axis_2(self, float_frame):
         frame = float_frame
         msg = "No axis named 2 for object type DataFrame"
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 102a3333035e5..c99405dfccb66 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -495,8 +495,9 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i):
         ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
     ],
 )
+@pytest.mark.parametrize("numeric_only", [True, False])
 @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
-def test_idxmin_idxmax_returns_int_types(func, values):
+def test_idxmin_idxmax_returns_int_types(func, values, numeric_only):
     # GH 25444
     df = DataFrame(
         {
@@ -513,12 +514,15 @@ def test_idxmin_idxmax_returns_int_types(func, values):
     df["c_Integer"] = df["c_int"].astype("Int64")
     df["c_Floating"] = df["c_float"].astype("Float64")
 
-    result = getattr(df.groupby("name"), func)()
+    result = getattr(df.groupby("name"), func)(numeric_only=numeric_only)
 
     expected = DataFrame(values, index=Index(["A", "B"], name="name"))
-    expected["c_date_tz"] = expected["c_date"]
-    expected["c_timedelta"] = expected["c_date"]
-    expected["c_period"] = expected["c_date"]
+    if numeric_only:
+        expected = expected.drop(columns=["c_date"])
+    else:
+        expected["c_date_tz"] = expected["c_date"]
+        expected["c_timedelta"] = expected["c_date"]
+        expected["c_period"] = expected["c_date"]
     expected["c_Integer"] = expected["c_int"]
     expected["c_Floating"] = expected["c_float"]
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 431c18ab6f4b2..016e817e43402 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import lib
 from pandas.compat import IS64
 from pandas.errors import PerformanceWarning
 
@@ -892,14 +893,40 @@ def test_keep_nuisance_agg(df, agg_function):
 
 @pytest.mark.parametrize(
     "agg_function",
-    ["sum", "mean", "prod", "std", "var", "median"],
+    ["sum", "mean", "prod", "std", "var", "sem", "median"],
 )
-def test_omit_nuisance_agg(df, agg_function):
+@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False])
+def test_omit_nuisance_agg(df, agg_function, numeric_only):
     # GH 38774, GH 38815
+    if not numeric_only and agg_function != "sum":
+        # sum doesn't drop strings
+        warn = FutureWarning
+    else:
+        warn = None
+
     grouped = df.groupby("A")
-    result = getattr(grouped, agg_function)()
-    expected = getattr(df.loc[:, ["A", "C", "D"]].groupby("A"), agg_function)()
-    tm.assert_frame_equal(result, expected)
+
+    if agg_function in ("var", "std", "sem") and numeric_only is False:
+        # Added numeric_only as part of GH#46560; these do not drop nuisance
+        # columns when numeric_only is False
+        klass = TypeError if agg_function == "var" else ValueError
+        with pytest.raises(klass, match="could not convert string to float"):
+            getattr(grouped, agg_function)(numeric_only=numeric_only)
+    else:
+        with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
+            result = getattr(grouped, agg_function)(numeric_only=numeric_only)
+        if (
+            (numeric_only is lib.no_default or not numeric_only)
+            # These methods drop non-numeric columns even when numeric_only is False
+            and agg_function not in ("mean", "prod", "median")
+        ):
+            columns = ["A", "B", "C", "D"]
+        else:
+            columns = ["A", "C", "D"]
+        expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
+            numeric_only=numeric_only
+        )
+        tm.assert_frame_equal(result, expected)
 
 
 def test_omit_nuisance_warnings(df):
diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py
index bf06495f935cd..0f7e71c99584d 100644
--- a/pandas/tests/groupby/test_quantile.py
+++ b/pandas/tests/groupby/test_quantile.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._libs import lib
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -240,14 +242,22 @@ def test_groupby_quantile_nullable_array(values, q):
 
 
 @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
-def test_groupby_quantile_skips_invalid_dtype(q):
+@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False])
+def test_groupby_quantile_skips_invalid_dtype(q, numeric_only):
     df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
 
-    with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"):
-        result = df.groupby("a").quantile(q)
+    if numeric_only is lib.no_default or numeric_only:
+        warn = FutureWarning if numeric_only is lib.no_default else None
+        with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
+            result = df.groupby("a").quantile(q, numeric_only=numeric_only)
 
-    expected = df.groupby("a")[["b"]].quantile(q)
-    tm.assert_frame_equal(result, expected)
+        expected = df.groupby("a")[["b"]].quantile(q)
+        tm.assert_frame_equal(result, expected)
+    else:
+        with pytest.raises(
+            TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
+        ):
+            df.groupby("a").quantile(q, numeric_only=numeric_only)
 
 
 def test_groupby_quantile_NA_float(any_float_dtype):
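Finally, a small usage note on the deprecation mechanics exercised by the quantile test above; this script is a hedged sketch under the same assumptions as that test, not part of the patch::

    import warnings

    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": [2.0], "c": ["x"]})

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        # numeric_only unset: "c" is dropped and a FutureWarning is emitted
        df.groupby("a").quantile(0.5)
    assert any(issubclass(rec.category, FutureWarning) for rec in w)

    # Passing numeric_only explicitly opts out of the warning:
    df.groupby("a").quantile(0.5, numeric_only=True)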