ENH: Add numeric_only to certain groupby ops #46728

Merged · 16 commits · Apr 30, 2022 · Changes from 11 commits
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.5.0.rst
@@ -94,7 +94,7 @@ Other enhancements
- :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`)
- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)

.. ---------------------------------------------------------------------------
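For illustration, a minimal usage sketch of the new keyword described in the whatsnew entry above (hypothetical data; assumes pandas 1.5 semantics):

```python
import pandas as pd

# Hypothetical frame with one grouping key, one numeric and one object column.
df = pd.DataFrame(
    {"key": [1, 1, 2], "x": [1.0, 2.0, 4.0], "name": ["a", "b", "c"]}
)

# New in 1.5: request numeric-only aggregation explicitly instead of
# relying on silent nuisance-column dropping.
print(df.groupby("key").std(numeric_only=True))

# DataFrame.idxmin/idxmax gain the same keyword.
print(df.idxmin(numeric_only=True))  # covers "key" and "x" only
```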
33 changes: 23 additions & 10 deletions pandas/core/frame.py
@@ -10536,11 +10536,17 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
"""
return self.apply(Series.nunique, axis=axis, dropna=dropna)

@doc(_shared_docs["idxmin"])
def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
@doc(_shared_docs["idxmin"], numeric_only_default="False")
def idxmin(
self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
) -> Series:
axis = self._get_axis_number(axis)
if numeric_only:
data = self._get_numeric_data()
else:
data = self

res = self._reduce(
res = data._reduce(
nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
)
indices = res._values
@@ -10550,15 +10556,22 @@ def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
# error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
assert isinstance(indices, np.ndarray) # for mypy

index = self._get_axis(axis)
index = data._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return self._constructor_sliced(result, index=self._get_agg_axis(axis))
return data._constructor_sliced(result, index=data._get_agg_axis(axis))

@doc(_shared_docs["idxmax"])
def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
@doc(_shared_docs["idxmax"], numeric_only_default="False")
def idxmax(
self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
) -> Series:
axis = self._get_axis_number(axis)
if numeric_only:
data = self._get_numeric_data()
else:
data = self

res = self._reduce(
res = data._reduce(
nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
)
indices = res._values
@@ -10568,9 +10581,9 @@ def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
# error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
assert isinstance(indices, np.ndarray) # for mypy

index = self._get_axis(axis)
index = data._get_axis(axis)
result = [index[i] if i >= 0 else np.nan for i in indices]
return self._constructor_sliced(result, index=self._get_agg_axis(axis))
return data._constructor_sliced(result, index=data._get_agg_axis(axis))

def _get_agg_axis(self, axis_num: int) -> Index:
"""
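A sketch of the resulting DataFrame.idxmin/idxmax behavior, mirroring the tests added later in this PR (made-up values):

```python
import pandas as pd

df = pd.DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})

# numeric_only=True drops the object column "c" via _get_numeric_data(),
# so only "a" and "b" are searched.
print(df.idxmin(numeric_only=True))  # a -> 2, b -> 1
print(df.idxmax(numeric_only=True))  # a -> 1, b -> 0

# The default numeric_only=False keeps "c", and argmin/argmax raise
# TypeError for object dtype.
try:
    df.idxmin(numeric_only=False)
except TypeError as err:
    print(f"TypeError: {err}")
```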
22 changes: 15 additions & 7 deletions pandas/core/groupby/generic.py
@@ -1555,10 +1555,14 @@ def nunique(self, dropna: bool = True) -> DataFrame:

return results

@doc(_shared_docs["idxmax"])
def idxmax(self, axis=0, skipna: bool = True):
@doc(
_shared_docs["idxmax"],
numeric_only_default="True for axis=0, False for axis=1",
)
def idxmax(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
axis = DataFrame._get_axis_number(axis)
numeric_only = None if axis == 0 else False
if numeric_only is None:
numeric_only = None if axis == 0 else False

def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
@@ -1577,13 +1581,17 @@ def func(df):
func.__name__ = "idxmax"
return self._python_apply_general(func, self._obj_with_exclusions)

@doc(_shared_docs["idxmin"])
def idxmin(self, axis=0, skipna: bool = True):
@doc(
_shared_docs["idxmin"],
numeric_only_default="True for axis=0, False for axis=1",
)
def idxmin(self, axis=0, skipna: bool = True, numeric_only: bool | None = None):
axis = DataFrame._get_axis_number(axis)
numeric_only = None if axis == 0 else False
if numeric_only is None:
numeric_only = None if axis == 0 else False

def func(df):
# NB: here we use numeric_only=None, in DataFrame it is False GH#38217
# NB: here we use numeric_only=None, in DataFrame it is False GH#46560
res = df._reduce(
nanops.nanargmin,
"argmin",
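Sketch of the new DataFrameGroupBy.idxmax signature in use (hypothetical frame). With axis=0 the default stays None, preserving the pre-1.5 nuisance-column behavior; passing a bool now makes the choice explicit:

```python
import pandas as pd

df = pd.DataFrame(
    {"key": ["g1", "g1", "g2"], "x": [1.0, 3.0, 2.0], "name": ["a", "b", "c"]}
)
gb = df.groupby("key")

# Default (numeric_only=None for axis=0): "name" is silently dropped.
print(gb.idxmax())

# Explicit True: same result, but opted in rather than implicit.
print(gb.idxmax(numeric_only=True))

# Explicit False: "name" participates and argmax on object dtype raises.
try:
    gb.idxmax(numeric_only=False)
except TypeError as err:
    print(f"TypeError: {err}")
```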
82 changes: 69 additions & 13 deletions pandas/core/groupby/groupby.py
@@ -1502,7 +1502,7 @@ def _python_apply_general(
)

@final
def _python_agg_general(self, func, *args, **kwargs):
def _python_agg_general(self, func, *args, raise_on_typeerror=False, **kwargs):
func = com.is_builtin_func(func)
f = lambda x: func(x, *args, **kwargs)

@@ -1520,6 +1520,8 @@ def _python_agg_general(self, func, *args, **kwargs):
# if this function is invalid for this dtype, we will ignore it.
result = self.grouper.agg_series(obj, f)
except TypeError:
if raise_on_typeerror:
raise
warn_dropping_nuisance_columns_deprecated(type(self), "agg")
continue

@@ -1593,7 +1595,12 @@ def _agg_py_fallback(

@final
def _cython_agg_general(
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
self,
how: str,
alt: Callable,
numeric_only: bool,
min_count: int = -1,
ignore_failures: bool = True,
):
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
# that goes through SeriesGroupBy
@@ -1629,7 +1636,7 @@ def array_func(values: ArrayLike) -> ArrayLike:

# TypeError -> we may have an exception in trying to aggregate
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
new_mgr = data.grouped_reduce(array_func, ignore_failures=ignore_failures)

if not is_ser and len(new_mgr) < len(data):
warn_dropping_nuisance_columns_deprecated(type(self), how)
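Both new flags (raise_on_typeerror and ignore_failures) implement the same policy: swallow TypeError per column only when the user did not pass numeric_only explicitly. A standalone sketch of that control flow (illustrative names, not the pandas internals):

```python
from typing import Any, Callable

def agg_columns(
    columns: dict[str, list[Any]],
    func: Callable[[list[Any]], Any],
    raise_on_typeerror: bool = False,
) -> dict[str, Any]:
    """Aggregate each column; skip failing columns unless told to raise."""
    results: dict[str, Any] = {}
    for name, values in columns.items():
        try:
            results[name] = func(values)
        except TypeError:
            if raise_on_typeerror:
                # An explicit numeric_only was passed: surface the failure.
                raise
            # Legacy path: silently exclude the nuisance column.
            continue
    return results

data = {"x": [1, 2, 3], "name": ["a", "b", "c"]}
print(agg_columns(data, sum))  # {'x': 6}; summing strings fails, so "name" is dropped
# agg_columns(data, sum, raise_on_typeerror=True)  # would raise TypeError
```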
@@ -2041,6 +2048,7 @@ def std(
ddof: int = 1,
engine: str | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool | lib.NoDefault = lib.no_default,
):
"""
Compute standard deviation of groups, excluding missing values.
@@ -2069,6 +2077,11 @@

.. versionadded:: 1.4.0

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Series or DataFrame
@@ -2081,8 +2094,9 @@
else:
return self._get_cythonized_result(
libgroupby.group_var,
needs_counts=True,
cython_dtype=np.dtype(np.float64),
numeric_only=numeric_only,
needs_counts=True,
post_processing=lambda vals, inference: np.sqrt(vals),
ddof=ddof,
)
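Usage sketch for the documented keyword (made-up data); sem simply forwards numeric_only to std, as the change further down shows:

```python
import pandas as pd

df = pd.DataFrame(
    {"key": [1, 1, 2, 2], "x": [1.0, 2.0, 3.0, 5.0], "name": list("abcd")}
)
gb = df.groupby("key")

print(gb.std(numeric_only=True))  # only "x" is aggregated
print(gb.sem(numeric_only=True))  # standard error, same column selection
```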
Expand All @@ -2095,6 +2109,7 @@ def var(
ddof: int = 1,
engine: str | None = None,
engine_kwargs: dict[str, bool] | None = None,
numeric_only: bool | lib.NoDefault = lib.no_default,
):
"""
Compute variance of groups, excluding missing values.
@@ -2123,6 +2138,11 @@

.. versionadded:: 1.4.0

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Series or DataFrame
@@ -2133,22 +2153,37 @@

return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
else:
ignore_failures = numeric_only is lib.no_default
numeric_only = self._resolve_numeric_only(numeric_only)
if ddof == 1:
numeric_only = self._resolve_numeric_only(lib.no_default)
return self._cython_agg_general(
"var",
alt=lambda x: Series(x).var(ddof=ddof),
numeric_only=numeric_only,
ignore_failures=ignore_failures,
)
else:
func = lambda x: x.var(ddof=ddof)
with self._group_selection_context():
return self._python_agg_general(func)
if numeric_only:
nonnumeric_exclusions = frozenset(
self.obj.columns.difference(self.exclusions).difference(
self.obj._get_numeric_data().columns
)
)
else:
nonnumeric_exclusions = frozenset()
with com.temp_setattr(
self, "exclusions", self.exclusions | nonnumeric_exclusions
[Inline review thread]
Contributor: what is the purpose here?
Member (author): Thanks - I was able to remove this hack by setting the appropriate value of raise_on_typeerror.
):
with self._group_selection_context():
return self._python_agg_general(
func, raise_on_typeerror=not ignore_failures
)
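The interplay of the flags in var, sketched with a hypothetical frame: leaving numeric_only unset keeps the deprecated silent dropping, while an explicit value makes failures propagate via raise_on_typeerror:

```python
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "x": [1.0, 2.0, 4.0], "name": ["a", "b", "c"]})
gb = df.groupby("key")

# Unset: legacy behavior, "name" is dropped (with a deprecation warning in 1.5).
print(gb.var())

# Explicit False: the object column is kept and the TypeError surfaces.
try:
    gb.var(numeric_only=False)
except TypeError as err:
    print(f"TypeError: {err}")

# ddof != 1 exercises the pure-Python branch shown above.
print(gb.var(ddof=0, numeric_only=True))
```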

@final
@Substitution(name="groupby")
@Appender(_common_see_also)
def sem(self, ddof: int = 1):
def sem(self, ddof: int = 1, numeric_only: bool | lib.NoDefault = lib.no_default):
"""
Compute standard error of the mean of groups, excluding missing values.

@@ -2159,12 +2194,17 @@ def sem(self, ddof: int = 1):
ddof : int, default 1
Degrees of freedom.

numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
Series or DataFrame
Standard error of the mean of values within each group.
"""
result = self.std(ddof=ddof)
result = self.std(ddof=ddof, numeric_only=numeric_only)
if result.ndim == 1:
result /= np.sqrt(self.count())
else:
@@ -2968,7 +3008,12 @@ def nth(
return result

@final
def quantile(self, q=0.5, interpolation: str = "linear"):
def quantile(
self,
q=0.5,
interpolation: str = "linear",
numeric_only: bool | lib.NoDefault = lib.no_default,
):
"""
Return group values at the given quantile, a la numpy.percentile.

@@ -2978,6 +3023,10 @@ def quantile(self, q=0.5, interpolation: str = "linear"):
Value(s) between 0 and 1 providing the quantile(s) to compute.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
Method to use when the desired quantile falls between two points.
numeric_only : bool, default True
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
@@ -3002,6 +3051,7 @@ def quantile(self, q=0.5, interpolation: str = "linear"):
a 2.0
b 3.0
"""
numeric_only_bool = self._resolve_numeric_only(numeric_only)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]:
if is_object_dtype(vals):
@@ -3095,9 +3145,15 @@ def blk_func(values: ArrayLike) -> ArrayLike:
obj = self._obj_with_exclusions
is_ser = obj.ndim == 1
mgr = self._get_data_to_aggregate()

res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
if not is_ser and len(res_mgr.items) != len(mgr.items):
data = mgr.get_numeric_data() if numeric_only_bool else mgr
ignore_failures = numeric_only_bool
res_mgr = data.grouped_reduce(blk_func, ignore_failures=ignore_failures)

if (
numeric_only is lib.no_default
and not is_ser
and len(res_mgr.items) != len(mgr.items)
):
warn_dropping_nuisance_columns_deprecated(type(self), "quantile")

if len(res_mgr.items) == 0:
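Quantile sketch (hypothetical frame): the deprecation warning about dropping nuisance columns now fires only when numeric_only was left unset:

```python
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "x": [1.0, 2.0, 4.0], "name": ["a", "b", "c"]})
gb = df.groupby("key")

# Explicit opt-in: "name" is excluded up front, no warning.
print(gb.quantile(0.5, numeric_only=True))

# Default: result is the same, but silently dropping "name" warns
# that the behavior is deprecated.
print(gb.quantile(0.5))
```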
8 changes: 8 additions & 0 deletions pandas/core/shared_docs.py
@@ -749,6 +749,10 @@
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.
numeric_only : bool, default {numeric_only_default}
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
@@ -812,6 +816,10 @@
skipna : bool, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA.
numeric_only : bool, default {numeric_only_default}
Include only `float`, `int` or `boolean` data.

.. versionadded:: 1.5.0

Returns
-------
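The {numeric_only_default} placeholder added above is filled in per method by pandas' @doc decorator. A simplified stand-in showing the substitution mechanism (not the actual pandas.util._decorators implementation):

```python
def doc(template: str, **kwargs):
    """Build a docstring by substituting placeholders into a shared template."""
    def decorator(func):
        func.__doc__ = template.format(**kwargs)
        return func
    return decorator

SHARED_IDXMIN = (
    "Return index of first occurrence of minimum over requested axis.\n\n"
    "numeric_only : bool, default {numeric_only_default}\n"
    "    Include only `float`, `int` or `boolean` data.\n"
)

@doc(SHARED_IDXMIN, numeric_only_default="False")
def idxmin(self, axis=0, skipna=True, numeric_only=False):
    ...

print(idxmin.__doc__)  # the placeholder is now the literal text "False"
```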
22 changes: 22 additions & 0 deletions pandas/tests/frame/test_reductions.py
@@ -897,6 +897,17 @@ def test_idxmin(self, float_frame, int_frame, skipna, axis):
expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmin_numeric_only(self, numeric_only):
df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
if numeric_only:
result = df.idxmin(numeric_only=numeric_only)
expected = Series([2, 1], index=["a", "b"])
tm.assert_series_equal(result, expected)
else:
with pytest.raises(TypeError, match="not allowed for this dtype"):
df.idxmin(numeric_only=numeric_only)

def test_idxmin_axis_2(self, float_frame):
frame = float_frame
msg = "No axis named 2 for object type DataFrame"
@@ -914,6 +925,17 @@ def test_idxmax(self, float_frame, int_frame, skipna, axis):
expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmax_numeric_only(self, numeric_only):
df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
if numeric_only:
result = df.idxmax(numeric_only=numeric_only)
expected = Series([1, 0], index=["a", "b"])
tm.assert_series_equal(result, expected)
else:
with pytest.raises(TypeError, match="not allowed for this dtype"):
df.idxmax(numeric_only=numeric_only)

def test_idxmax_axis_2(self, float_frame):
frame = float_frame
msg = "No axis named 2 for object type DataFrame"