BUG/API: SeriesGroupBy reduction with numeric_only=True #41342
@@ -1099,6 +1099,24 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]):
     def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
         raise AbstractMethodError(self)

+    def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
+        """
+        For SeriesGroupBy we want the default to be False (to match Series behavior).
+        For DataFrameGroupBy we want it to be True (for backwards-compat).
+
+        """
+        # GH#41291
+        if numeric_only is lib.no_default:
+            # i.e. not explicitly passed by user
+            if self.obj.ndim == 2:
+                # i.e. DataFrameGroupBy
+                numeric_only = True
+            else:
+                numeric_only = False
+
+        # error: Incompatible return value type (got "Union[bool, NoDefault]",
+        # expected "bool")
+        return numeric_only  # type: ignore[return-value]

     # -----------------------------------------------------------------
     # numba

Review comment (on `if numeric_only is lib.no_default:`): are we deprecating the dataframe groupby behavior here?

Reply: no, this is just a way of having the effective default be different for SeriesGroupBy vs DataFrameGroupBy.
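The exchange above captures the design: nothing is deprecated, the `lib.no_default` sentinel simply lets the effective default differ by object dimensionality. A minimal standalone sketch of that sentinel-default pattern (the names `_NoDefault`, `no_default`, and `resolve_numeric_only` below are illustrative stand-ins, not pandas internals):

```python
class _NoDefault:
    """Illustrative stand-in for pandas' lib.no_default sentinel."""

    def __repr__(self) -> str:
        return "<no_default>"


no_default = _NoDefault()


def resolve_numeric_only(numeric_only, ndim: int) -> bool:
    # If the caller did not pass numeric_only explicitly, choose the default
    # from the dimensionality of the grouped object: 2-D (DataFrameGroupBy)
    # keeps the legacy True, 1-D (SeriesGroupBy) gets False.
    if numeric_only is no_default:
        return ndim == 2
    return bool(numeric_only)


print(resolve_numeric_only(no_default, ndim=2))  # True  -> DataFrameGroupBy default
print(resolve_numeric_only(no_default, ndim=1))  # False -> SeriesGroupBy default
print(resolve_numeric_only(False, ndim=2))       # False -> explicit value always wins
```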
@@ -1288,6 +1306,7 @@ def _agg_general(
         alias: str,
         npfunc: Callable,
     ):

         with group_selection_context(self):
             # try a cython aggregation if we can
             result = None
@@ -1344,7 +1363,7 @@ def _agg_py_fallback(
         return ensure_block_shape(res_values, ndim=ndim)

     def _cython_agg_general(
-        self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
+        self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
     ):
         raise AbstractMethodError(self)
@@ -1564,7 +1583,7 @@ def count(self):
     @final
     @Substitution(name="groupby")
     @Substitution(see_also=_common_see_also)
-    def mean(self, numeric_only: bool = True):
+    def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default):
         """
         Compute mean of groups, excluding missing values.
@@ -1612,6 +1631,8 @@ def mean(self, numeric_only: bool = True):
         2    4.0
         Name: B, dtype: float64
         """
+        numeric_only = self._resolve_numeric_only(numeric_only)
+
         result = self._cython_agg_general(
             "mean",
             alt=lambda x: Series(x).mean(numeric_only=numeric_only),
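As a user-facing illustration of why mean() now resolves numeric_only this way, here is a hedged sketch mirroring the test changes further down (the error types and match patterns are taken from those tests and refer to the pandas version in this PR):

```python
import numpy as np
import pandas as pd

frame = pd.DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})

# SeriesGroupBy now resolves numeric_only to False by default, so taking the
# mean of a string column fails with an error that names the real problem
# instead of the old DataError("No numeric types to aggregate").
try:
    frame.groupby("a")["b"].mean()
except TypeError as err:
    print(err)  # matches "Could not convert (foo|bar)*" in the updated tests

# Explicitly asking for numeric_only=True on this SeriesGroupBy is rejected.
try:
    frame.groupby("a")["b"].mean(numeric_only=True)
except NotImplementedError as err:
    print(err)  # matches "does not implement" in the updated tests
```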
@@ -1622,7 +1643,7 @@ def mean(self, numeric_only: bool = True):
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def median(self, numeric_only=True):
+    def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
         """
         Compute median of groups, excluding missing values.
@@ -1639,6 +1660,8 @@ def median(self, numeric_only=True):
         Series or DataFrame
             Median of values within each group.
         """
+        numeric_only = self._resolve_numeric_only(numeric_only)
+
         result = self._cython_agg_general(
             "median",
             alt=lambda x: Series(x).median(numeric_only=numeric_only),
@@ -1696,8 +1719,9 @@ def var(self, ddof: int = 1):
             Variance of values within each group.
         """
         if ddof == 1:
+            numeric_only = self._resolve_numeric_only(lib.no_default)
             return self._cython_agg_general(
-                "var", alt=lambda x: Series(x).var(ddof=ddof)
+                "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only
             )
         else:
             func = lambda x: x.var(ddof=ddof)
@@ -1762,7 +1786,10 @@ def size(self) -> FrameOrSeriesUnion:

     @final
     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
-    def sum(self, numeric_only: bool = True, min_count: int = 0):
+    def sum(
+        self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
+    ):
+        numeric_only = self._resolve_numeric_only(numeric_only)

         # If we are grouping on categoricals we want unobserved categories to
         # return zero, rather than the default of NaN which the reindexing in
@@ -1779,7 +1806,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0):

     @final
     @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
-    def prod(self, numeric_only: bool = True, min_count: int = 0):
+    def prod(
+        self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
+    ):
+        numeric_only = self._resolve_numeric_only(numeric_only)
+
         return self._agg_general(
             numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
         )
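The comment inside sum() about unobserved categories refers to behavior like the following (a rough sketch assuming standard observed=False groupby semantics; not part of this diff):

```python
import pandas as pd

cat = pd.Categorical(["a", "a"], categories=["a", "b"])  # "b" is never observed
df = pd.DataFrame({"key": cat, "x": [1, 2]})

# With min_count=0, summing the empty "b" group yields 0 rather than NaN.
print(df.groupby("key", observed=False)["x"].sum())
# key
# a    3
# b    0
# Name: x, dtype: int64
```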
@@ -2708,7 +2739,7 @@ def _get_cythonized_result(
         how: str,
         cython_dtype: np.dtype,
         aggregate: bool = False,
-        numeric_only: bool = True,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
         needs_counts: bool = False,
         needs_values: bool = False,
         needs_2d: bool = False,
@@ -2776,6 +2807,8 @@ def _get_cythonized_result(
         -------
         `Series` or `DataFrame` with filled values
         """
+        numeric_only = self._resolve_numeric_only(numeric_only)
+
         if result_is_index and aggregate:
             raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
         if post_processing and not callable(post_processing):
@@ -91,7 +91,10 @@ def test_cython_agg_nothing_to_agg():
     frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
     msg = "No numeric types to aggregate"

-    with pytest.raises(DataError, match=msg):
+    with pytest.raises(NotImplementedError, match="does not implement"):
+        frame.groupby("a")["b"].mean(numeric_only=True)
+
+    with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
         frame.groupby("a")["b"].mean()

     frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})

Review comment (on `msg = "No numeric types to aggregate"`): I think this can be moved to the relevant section below.
@@ -107,9 +110,8 @@ def test_cython_agg_nothing_to_agg_with_dates():
             "dates": pd.date_range("now", periods=50, freq="T"),
         }
     )
-    msg = "No numeric types to aggregate"
-    with pytest.raises(DataError, match=msg):
-        frame.groupby("b").dates.mean()
+    with pytest.raises(NotImplementedError, match="does not implement"):
+        frame.groupby("b").dates.mean(numeric_only=True)


 def test_cython_agg_frame_columns():
@@ -170,7 +172,7 @@ def test__cython_agg_general(op, targop):
     df = DataFrame(np.random.randn(1000))
     labels = np.random.randint(0, 50, size=1000).astype(float)

-    result = df.groupby(labels)._cython_agg_general(op)
+    result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
     expected = df.groupby(labels).agg(targop)
     tm.assert_frame_equal(result, expected)
@@ -192,7 +194,7 @@ def test_cython_agg_empty_buckets(op, targop, observed):
     # calling _cython_agg_general directly, instead of via the user API
     # which sets different values for min_count, so do that here.
     g = df.groupby(pd.cut(df[0], grps), observed=observed)
-    result = g._cython_agg_general(op)
+    result = g._cython_agg_general(op, alt=None, numeric_only=True)

     g = df.groupby(pd.cut(df[0], grps), observed=observed)
     expected = g.agg(lambda x: targop(x))
@@ -209,7 +211,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
     grps = range(0, 25, 5)
     # add / sum
     result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
-        "add"
+        "add", alt=None, numeric_only=True
     )
     intervals = pd.interval_range(0, 20, freq=5)
     expected = DataFrame(
@@ -223,7 +225,7 @@ def test_cython_agg_empty_buckets_nanops(observed):

     # prod
     result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
-        "prod"
+        "prod", alt=None, numeric_only=True
     )
     expected = DataFrame(
         {"a": [1, 1, 1716, 1]},
@@ -1761,7 +1761,15 @@ def test_empty_groupby(columns, keys, values, method, op, request):
     # GH8093 & GH26411
     override_dtype = None

-    if isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
+    if (
+        isinstance(values, Categorical)
+        and not isinstance(columns, list)
+        and op in ["sum", "prod"]
+        and method != "apply"
+    ):
+        # handled below GH#41291
+        pass
+    elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
         mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable")
         request.node.add_marker(mark)
     elif (
@@ -1822,11 +1830,36 @@ def test_empty_groupby(columns, keys, values, method, op, request):
     df = df.iloc[:0]

     gb = df.groupby(keys)[columns]
-    if method == "attr":
-        result = getattr(gb, op)()
-    else:
-        result = getattr(gb, method)(op)
+
+    def get_result():
+        if method == "attr":
+            return getattr(gb, op)()
+        else:
+            return getattr(gb, method)(op)
+
+    if columns == "C":
+        # i.e. SeriesGroupBy
+        if op in ["prod", "sum"]:
+            # ops that require more than just ordered-ness
+            if method != "apply":
+                # FIXME: apply goes through different code path
+                if df.dtypes[0].kind == "M":
+                    # GH#41291
+                    # datetime64 -> prod and sum are invalid
+                    msg = "datetime64 type does not support"
+                    with pytest.raises(TypeError, match=msg):
+                        get_result()
+
+                    return
+                elif isinstance(values, Categorical):
+                    # GH#41291
+                    msg = "category type does not support"
+                    with pytest.raises(TypeError, match=msg):
+                        get_result()
+
+                    return
+
+    result = get_result()
     expected = df.set_index(keys)[columns]
     if override_dtype is not None:
         expected = expected.astype(override_dtype)

Review comment: this is a super complicated test. Can you try to split it in follow-ons?

Reply: agreed.
Review comment: can you give an example here that a user would understand?

Reply: updated + green
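For reference, a hypothetical user-level example of the overall change (a sketch only, not the exact example the author added): the effective numeric_only default now depends on whether you are aggregating a DataFrameGroupBy or a SeriesGroupBy.

```python
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "x": [1.0, 2.0, 3.0], "s": ["a", "b", "c"]})

# DataFrameGroupBy: numeric_only still effectively defaults to True, so the
# object column "s" is dropped and only "x" is aggregated (backwards-compatible).
print(df.groupby("key").sum())

# SeriesGroupBy: numeric_only now effectively defaults to False (matching
# Series.sum), so the selected string column is actually summed, i.e. the
# strings within each group are concatenated.
print(df.groupby("key")["s"].sum())
# key
# 1    ab
# 2     c
# Name: s, dtype: object
```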