From f7b76f21dcdc87fa5aa0c7623db0ca39a574a8f1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 10 Apr 2022 11:44:56 -0400 Subject: [PATCH 1/7] ENH: Add numeric_only to certain groupby ops --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/frame.py | 149 ++++---------------------- pandas/core/groupby/generic.py | 23 ++-- pandas/core/groupby/groupby.py | 81 +++++++++++--- pandas/core/shared_docs.py | 134 +++++++++++++++++++++++ pandas/tests/frame/test_reductions.py | 22 ++++ pandas/tests/groupby/test_function.py | 14 ++- pandas/tests/groupby/test_groupby.py | 33 +++++- pandas/tests/groupby/test_quantile.py | 20 +++- 9 files changed, 315 insertions(+), 163 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8acf0ab3b7761..7d4950ccd6f14 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -94,7 +94,7 @@ Other enhancements - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`) - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) -- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) +- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a28d9f44ae28..f70e322c450fc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10536,70 +10536,17 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ return self.apply(Series.nunique, axis=axis, dropna=dropna) - def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series: - """ - Return index of first occurrence of minimum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - Returns - ------- - Series - Indexes of minima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmin : Return index of the minimum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmin``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the minimum value in each column. - - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object - - To return the index for the minimum value in each row, use ``axis="columns"``. 
- - >>> df.idxmin(axis="columns") - Pork consumption - Wheat Products co2_emissions - Beef consumption - dtype: object - """ + @doc(_shared_docs["idxmin"], numeric_only_default="False") + def idxmin( + self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False + ) -> Series: axis = self._get_axis_number(axis) + if numeric_only: + data = self._get_numeric_data() + else: + data = self - res = self._reduce( + res = data._reduce( nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False ) indices = res._values @@ -10609,74 +10556,22 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series: # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" assert isinstance(indices, np.ndarray) # for mypy - index = self._get_axis(axis) + index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] - return self._constructor_sliced(result, index=self._get_agg_axis(axis)) + return data._constructor_sliced(result, index=data._get_agg_axis(axis)) - def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series: - """ - Return index of first occurrence of maximum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - Returns - ------- - Series - Indexes of maxima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmax : Return index of the maximum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmax``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... 
index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the maximum value in each column. - - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object - - To return the index for the maximum value in each row, use ``axis="columns"``. + @doc(_shared_docs["idxmax"], numeric_only_default="False") + def idxmax( + self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False + ) -> Series: - >>> df.idxmax(axis="columns") - Pork co2_emissions - Wheat Products consumption - Beef co2_emissions - dtype: object - """ axis = self._get_axis_number(axis) + if numeric_only: + data = self._get_numeric_data() + else: + data = self - res = self._reduce( + res = data._reduce( nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False ) indices = res._values @@ -10686,9 +10581,9 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series: # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" assert isinstance(indices, np.ndarray) # for mypy - index = self._get_axis(axis) + index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] - return self._constructor_sliced(result, index=self._get_agg_axis(axis)) + return data._constructor_sliced(result, index=data._get_agg_axis(axis)) def _get_agg_axis(self, axis_num: int) -> Index: """ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6986c04ae8d37..cb5cdc96acc4d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -84,6 +84,7 @@ all_indexes_same, ) from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs from pandas.core.util.numba_ import maybe_use_numba from pandas.plotting import boxplot_frame_groupby @@ -1552,10 +1553,14 @@ def nunique(self, dropna: bool = True) -> DataFrame: return results - 
@Appender(DataFrame.idxmax.__doc__) - def idxmax(self, axis=0, skipna: bool = True): + @doc( + _shared_docs["idxmax"], + numeric_only_default="True for axis=0, False for axis=1", + ) + def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None): axis = DataFrame._get_axis_number(axis) - numeric_only = None if axis == 0 else False + if numeric_only is None: + numeric_only = None if axis == 0 else False def func(df): # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 @@ -1574,13 +1579,17 @@ def func(df): func.__name__ = "idxmax" return self._python_apply_general(func, self._obj_with_exclusions) - @Appender(DataFrame.idxmin.__doc__) - def idxmin(self, axis=0, skipna: bool = True): + @doc( + _shared_docs["idxmin"], + numeric_only_default="True for axis=0, False for axis=1", + ) + def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None): axis = DataFrame._get_axis_number(axis) - numeric_only = None if axis == 0 else False + if numeric_only is None: + numeric_only = None if axis == 0 else False def func(df): - # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 + # NB: here we use numeric_only=None, in DataFrame it is False GH#46560 res = df._reduce( nanops.nanargmin, "argmin", diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f649cce985474..7ecef5ebb5b33 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1502,7 +1502,7 @@ def _python_apply_general( ) @final - def _python_agg_general(self, func, *args, **kwargs): + def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs): func = com.is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) @@ -1520,6 +1520,8 @@ def _python_agg_general(self, func, *args, **kwargs): # if this function is invalid for this dtype, we will ignore it. 
result = self.grouper.agg_series(obj, f) except TypeError: + if raise_on_typeerror: + raise warn_dropping_nuisance_columns_deprecated(type(self), "agg") continue @@ -1593,7 +1595,12 @@ def _agg_py_fallback( @final def _cython_agg_general( - self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 + self, + how: str, + alt: Callable, + numeric_only: bool, + min_count: int = -1, + ignore_failures: bool = True, ): # Note: we never get here with how="ohlc" for DataFrameGroupBy; # that goes through SeriesGroupBy @@ -1629,7 +1636,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block - new_mgr = data.grouped_reduce(array_func, ignore_failures=True) + new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures) if not is_ser and len(new_mgr) < len(data): warn_dropping_nuisance_columns_deprecated(type(self), how) @@ -2041,6 +2048,7 @@ def std( ddof: int = 1, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool | lib.NoDefault = lib.no_default, ): """ Compute standard deviation of groups, excluding missing values. @@ -2069,6 +2077,11 @@ def std( .. versionadded:: 1.4.0 + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + Returns ------- Series or DataFrame @@ -2081,8 +2094,9 @@ def std( else: return self._get_cythonized_result( libgroupby.group_var, - needs_counts=True, cython_dtype=np.dtype(np.float64), + numeric_only=numeric_only, + needs_counts=True, post_processing=lambda vals, inference: np.sqrt(vals), ddof=ddof, ) @@ -2095,6 +2109,7 @@ def var( ddof: int = 1, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool | lib.NoDefault = lib.no_default, ): """ Compute variance of groups, excluding missing values. @@ -2123,6 +2138,11 @@ def var( .. 
versionadded:: 1.4.0 + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + Returns ------- Series or DataFrame @@ -2133,22 +2153,36 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: + numeric_only = self._resolve_numeric_only(numeric_only) if ddof == 1: - numeric_only = self._resolve_numeric_only(lib.no_default) return self._cython_agg_general( "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only, + ignore_failures=numeric_only, ) else: func = lambda x: x.var(ddof=ddof) - with self._group_selection_context(): - return self._python_agg_general(func) + if numeric_only: + nonnumeric_exclusions = frozenset( + self.obj.columns.difference(self.exclusions).difference( + self.obj._get_numeric_data().columns + ) + ) + else: + nonnumeric_exclusions = frozenset() + with com.temp_setattr( + self, "exclusions", self.exclusions | nonnumeric_exclusions + ): + with self._group_selection_context(): + return self._python_agg_general( + func, raise_on_typeerror=not numeric_only + ) @final @Substitution(name="groupby") @Appender(_common_see_also) - def sem(self, ddof: int = 1): + def sem(self, ddof: int = 1, numeric_only: bool | lib.no_default = lib.no_default): """ Compute standard error of the mean of groups, excluding missing values. @@ -2159,12 +2193,17 @@ def sem(self, ddof: int = 1): ddof : int, default 1 Degrees of freedom. + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + Returns ------- Series or DataFrame Standard error of the mean of values within each group. 
""" - result = self.std(ddof=ddof) + result = self.std(ddof=ddof, numeric_only=numeric_only) if result.ndim == 1: result /= np.sqrt(self.count()) else: @@ -2968,7 +3007,12 @@ def nth( return result @final - def quantile(self, q=0.5, interpolation: str = "linear"): + def quantile( + self, + q=0.5, + interpolation: str = "linear", + numeric_only: bool | lib.NoDefault = lib.no_default, + ): """ Return group values at the given quantile, a la numpy.percentile. @@ -2978,6 +3022,10 @@ def quantile(self, q=0.5, interpolation: str = "linear"): Value(s) between 0 and 1 providing the quantile(s) to compute. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} Method to use when the desired quantile falls between two points. + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 Returns ------- @@ -3002,6 +3050,7 @@ def quantile(self, q=0.5, interpolation: str = "linear"): a 2.0 b 3.0 """ + numeric_only_bool = self._resolve_numeric_only(numeric_only) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: if is_object_dtype(vals): @@ -3095,9 +3144,15 @@ def blk_func(values: ArrayLike) -> ArrayLike: obj = self._obj_with_exclusions is_ser = obj.ndim == 1 mgr = self._get_data_to_aggregate() - - res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True) - if not is_ser and len(res_mgr.items) != len(mgr.items): + data = mgr.get_numeric_data() if numeric_only_bool else mgr + ignore_failures = numeric_only_bool + res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures) + + if ( + numeric_only is lib.no_default + and not is_ser + and len(res_mgr.items) != len(mgr.items) + ): warn_dropping_nuisance_columns_deprecated(type(self), "quantile") if len(res_mgr.items) == 0: diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 17b5f0b70d34f..bece833066f89 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -734,3 +734,137 @@ .. 
versionchanged:: 1.4.0 Previously the explicit ``None`` was silently ignored. """ + +_shared_docs[ + "idxmin" +] = """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default {numeric_only_default} + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of minima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmin : Return index of the minimum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object +""" + +_shared_docs[ + "idxmax" +] = """ + Return index of first occurrence of maximum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + skipna : bool, default True + Exclude NA/null values. 
If an entire row/column is NA, the result + will be NA. + numeric_only : bool, default {numeric_only_default} + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + Returns + ------- + Series + Indexes of maxima along the specified axis. + + Raises + ------ + ValueError + * If the row/column is empty + + See Also + -------- + Series.idxmax : Return index of the maximum element. + + Notes + ----- + This method is the DataFrame version of ``ndarray.argmax``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``. 
+ + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object +""" diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index d2c47498b2fe5..53e24f5e5f925 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -897,6 +897,17 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis): expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_idxmin_numeric_only(self, numeric_only): + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) + if numeric_only: + result = df.idxmin(numeric_only=numeric_only) + expected = Series([2, 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + else: + with pytest.raises(TypeError, match="not allowed for this dtype"): + df.idxmin(numeric_only=numeric_only) + def test_idxmin_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" @@ -914,6 +925,17 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis): expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_idxmax_numeric_only(self, numeric_only): + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) + if numeric_only: + result = df.idxmax(numeric_only=numeric_only) + expected = Series([1, 0], index=["a", "b"]) + tm.assert_series_equal(result, expected) + else: + with pytest.raises(TypeError, match="not allowed for this dtype"): + df.idxmax(numeric_only=numeric_only) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 102a3333035e5..c99405dfccb66 --- 
a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -495,8 +495,9 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), ], ) +@pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") -def test_idxmin_idxmax_returns_int_types(func, values): +def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): # GH 25444 df = DataFrame( { @@ -513,12 +514,15 @@ def test_idxmin_idxmax_returns_int_types(func, values): df["c_Integer"] = df["c_int"].astype("Int64") df["c_Floating"] = df["c_float"].astype("Float64") - result = getattr(df.groupby("name"), func)() + result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) expected = DataFrame(values, index=Index(["A", "B"], name="name")) - expected["c_date_tz"] = expected["c_date"] - expected["c_timedelta"] = expected["c_date"] - expected["c_period"] = expected["c_date"] + if numeric_only: + expected = expected.drop(columns=["c_date"]) + else: + expected["c_date_tz"] = expected["c_date"] + expected["c_timedelta"] = expected["c_date"] + expected["c_period"] = expected["c_date"] expected["c_Integer"] = expected["c_int"] expected["c_Floating"] = expected["c_float"] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 750ba802547ca..21b5b026d4fd4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas._libs import lib from pandas.compat import IS64 from pandas.errors import PerformanceWarning @@ -892,14 +893,36 @@ def test_keep_nuisance_agg(df, agg_function): @pytest.mark.parametrize( "agg_function", - ["sum", "mean", "prod", "std", "var", "median"], + ["sum", "mean", "prod", "std", "var", "sem", "median"], ) -def test_omit_nuisance_agg(df, agg_function): +@pytest.mark.parametrize("numeric_only", 
[lib.no_default, True, False]) +def test_omit_nuisance_agg(df, agg_function, numeric_only): # GH 38774, GH 38815 + if not numeric_only and agg_function != "sum": + # sum doesn't drop strings + warn = FutureWarning + else: + warn = None + grouped = df.groupby("A") - result = getattr(grouped, agg_function)() - expected = getattr(df.loc[:, ["A", "C", "D"]].groupby("A"), agg_function)() - tm.assert_frame_equal(result, expected) + + if agg_function in ("var", "std", "sem") and numeric_only is False: + # Added numeric_only as part of GH#46560; these do not drop nuisance + # columns when numeric_only is False + klass = TypeError if agg_function == "var" else ValueError + with pytest.raises(klass, match="could not convert string to float"): + getattr(grouped, agg_function)(numeric_only=numeric_only) + else: + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = getattr(grouped, agg_function)(numeric_only=numeric_only) + if numeric_only is lib.no_default or not numeric_only: + columns = ["A", "B", "C", "D"] + elif numeric_only: + columns = ["A", "C", "D"] + expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( + numeric_only=numeric_only + ) + tm.assert_frame_equal(result, expected) def test_omit_nuisance_warnings(df): diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index bf06495f935cd..0f7e71c99584d 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._libs import lib + import pandas as pd from pandas import ( DataFrame, @@ -240,14 +242,22 @@ def test_groupby_quantile_nullable_array(values, q): @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) -def test_groupby_quantile_skips_invalid_dtype(q): +@pytest.mark.parametrize("numeric_only", [lib.no_default, True, False]) +def test_groupby_quantile_skips_invalid_dtype(q, numeric_only): df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) - 
with tm.assert_produces_warning(FutureWarning, match="Dropping invalid columns"): - result = df.groupby("a").quantile(q) + if numeric_only is lib.no_default or numeric_only: + warn = FutureWarning if numeric_only is lib.no_default else None + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = df.groupby("a").quantile(q, numeric_only=numeric_only) - expected = df.groupby("a")[["b"]].quantile(q) - tm.assert_frame_equal(result, expected) + expected = df.groupby("a")[["b"]].quantile(q) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises( + TypeError, match="'quantile' cannot be performed against 'object' dtypes!" + ): + df.groupby("a").quantile(q, numeric_only=numeric_only) def test_groupby_quantile_NA_float(any_float_dtype): From d434816a014e8870439b885d7b51a660b698c33b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 10 Apr 2022 21:44:40 -0400 Subject: [PATCH 2/7] Fix type-hint --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7ecef5ebb5b33..dd4558d78681f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2182,7 +2182,7 @@ def var( @final @Substitution(name="groupby") @Appender(_common_see_also) - def sem(self, ddof: int = 1, numeric_only: bool | lib.no_default = lib.no_default): + def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute standard error of the mean of groups, excluding missing values. 
From ebf777a7caf4bee00ab5699d53f3ed690b9c2330 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 11 Apr 2022 17:26:39 -0400 Subject: [PATCH 3/7] test fixup --- pandas/tests/groupby/test_groupby.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 21b5b026d4fd4..f4624481a71d3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -915,9 +915,13 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): else: with tm.assert_produces_warning(warn, match="Dropping invalid columns"): result = getattr(grouped, agg_function)(numeric_only=numeric_only) - if numeric_only is lib.no_default or not numeric_only: + if ( + (numeric_only is lib.no_default or not numeric_only) + # These methods drop non-numeric columns even when numeric_only is False + and agg_function not in ("mean", "prod", "median") + ): columns = ["A", "B", "C", "D"] - elif numeric_only: + else: columns = ["A", "C", "D"] expected = getattr(df.loc[:, columns].groupby("A"), agg_function)( numeric_only=numeric_only From 58e9ddcdc9dfcd785e3c4b82d36ebafd5ce1daad Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 14 Apr 2022 07:14:36 -0400 Subject: [PATCH 4/7] fixup --- pandas/core/groupby/groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dd4558d78681f..ac7ef7c19ccf6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2153,13 +2153,14 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: + ignore_failures = numeric_only is lib.no_default numeric_only = self._resolve_numeric_only(numeric_only) if ddof == 1: return self._cython_agg_general( "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only, - ignore_failures=numeric_only, + ignore_failures=ignore_failures, ) else: func = 
lambda x: x.var(ddof=ddof) @@ -2176,7 +2177,7 @@ def var( ): with self._group_selection_context(): return self._python_agg_general( - func, raise_on_typeerror=not numeric_only + func, raise_on_typeerror=not ignore_failures ) @final From 88caf9b394cb59bc69e768a3ddc1961e6eb5b6f6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 23 Apr 2022 17:40:17 -0400 Subject: [PATCH 5/7] Simplify var --- pandas/core/groupby/groupby.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ac7ef7c19ccf6..fd1405a369555 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2153,7 +2153,7 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: - ignore_failures = numeric_only is lib.no_default + ignore_failures = numeric_only is lib.no_default or numeric_only numeric_only = self._resolve_numeric_only(numeric_only) if ddof == 1: return self._cython_agg_general( @@ -2164,21 +2164,10 @@ def var( ) else: func = lambda x: x.var(ddof=ddof) - if numeric_only: - nonnumeric_exclusions = frozenset( - self.obj.columns.difference(self.exclusions).difference( - self.obj._get_numeric_data().columns - ) + with self._group_selection_context(): + return self._python_agg_general( + func, raise_on_typeerror=not ignore_failures ) - else: - nonnumeric_exclusions = frozenset() - with com.temp_setattr( - self, "exclusions", self.exclusions | nonnumeric_exclusions - ): - with self._group_selection_context(): - return self._python_agg_general( - func, raise_on_typeerror=not ignore_failures - ) @final @Substitution(name="groupby") From dbd81cd27304c77ec44b8eafcc4d7c3ef5c3e161 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 28 Apr 2022 16:58:28 -0400 Subject: [PATCH 6/7] fixup --- pandas/core/groupby/groupby.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py 
b/pandas/core/groupby/groupby.py index fd1405a369555..b587188936a51 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2153,20 +2153,19 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: - ignore_failures = numeric_only is lib.no_default or numeric_only numeric_only = self._resolve_numeric_only(numeric_only) if ddof == 1: return self._cython_agg_general( "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only, - ignore_failures=ignore_failures, + ignore_failures=False, ) else: func = lambda x: x.var(ddof=ddof) with self._group_selection_context(): return self._python_agg_general( - func, raise_on_typeerror=not ignore_failures + func, raise_on_typeerror=not numeric_only ) @final From c79c7007fcc79883c25ccfd65cce0579199d0a66 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 29 Apr 2022 17:53:13 -0400 Subject: [PATCH 7/7] fixup --- pandas/core/groupby/groupby.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 39af12aa91da3..70f8e0a752dcb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2153,19 +2153,19 @@ def var( return self._numba_agg_general(sliding_var, engine_kwargs, ddof) else: - numeric_only = self._resolve_numeric_only(numeric_only) + numeric_only_bool = self._resolve_numeric_only(numeric_only) if ddof == 1: return self._cython_agg_general( "var", alt=lambda x: Series(x).var(ddof=ddof), - numeric_only=numeric_only, - ignore_failures=False, + numeric_only=numeric_only_bool, + ignore_failures=numeric_only is lib.no_default, ) else: func = lambda x: x.var(ddof=ddof) with self._group_selection_context(): return self._python_agg_general( - func, raise_on_typeerror=not numeric_only + func, raise_on_typeerror=not numeric_only_bool ) @final