Skip to content

BUG/API: SeriesGroupBy reduction with numeric_only=True #41342

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
May 26, 2021
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`)
- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
- Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`)
- Bug in :class:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`)
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)

Reshaping
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,15 +323,18 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
return output

def _cython_agg_general(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
):

obj = self._selected_obj
objvals = obj._values
data = obj._mgr

if numeric_only and not is_numeric_dtype(obj.dtype):
raise DataError("No numeric types to aggregate")
# GH#41291 match Series behavior
raise NotImplementedError(
f"{type(self).__name__}.{how} does not implement numeric_only."
)

# This is overkill because it is only called once, but is here to
# mirror the array_func used in DataFrameGroupBy._cython_agg_general
Expand Down Expand Up @@ -1056,7 +1059,7 @@ def _iterate_slices(self) -> Iterable[Series]:
yield values

def _cython_agg_general(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
) -> DataFrame:
# Note: we never get here with how="ohlc"; that goes through SeriesGroupBy

Expand Down
57 changes: 50 additions & 7 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,34 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]):
def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
raise AbstractMethodError(self)

def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
    """
    Determine subclass-specific default value for 'numeric_only'.

    For SeriesGroupBy we want the default to be False (to match Series behavior).
    For DataFrameGroupBy we want it to be True (for backwards-compat).

    Parameters
    ----------
    numeric_only : bool or lib.no_default
        The value passed by the user, or ``lib.no_default`` when the user
        did not pass anything explicitly.

    Returns
    -------
    bool
        ``numeric_only`` unchanged when explicitly passed; otherwise True
        for DataFrameGroupBy (ndim == 2) and False for SeriesGroupBy.
    """
    # GH#41291
    if numeric_only is lib.no_default:
        # i.e. not explicitly passed by user
        if self.obj.ndim == 2:
            # i.e. DataFrameGroupBy
            numeric_only = True
        else:
            numeric_only = False

    # error: Incompatible return value type (got "Union[bool, NoDefault]",
    # expected "bool")
    return numeric_only  # type: ignore[return-value]

# -----------------------------------------------------------------
# numba

Expand Down Expand Up @@ -1308,6 +1336,7 @@ def _agg_general(
alias: str,
npfunc: Callable,
):

with group_selection_context(self):
# try a cython aggregation if we can
result = None
Expand Down Expand Up @@ -1367,7 +1396,7 @@ def _agg_py_fallback(
return ensure_block_shape(res_values, ndim=ndim)

def _cython_agg_general(
    self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
):
    """
    Dispatch a cython-implemented aggregation.

    Abstract hook on the shared GroupBy base; the concrete implementations
    live on SeriesGroupBy and DataFrameGroupBy.
    """
    raise AbstractMethodError(self)

Expand Down Expand Up @@ -1587,7 +1616,7 @@ def count(self):
@final
@Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def mean(self, numeric_only: bool = True):
def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default):
"""
Compute mean of groups, excluding missing values.

Expand Down Expand Up @@ -1635,6 +1664,8 @@ def mean(self, numeric_only: bool = True):
2 4.0
Name: B, dtype: float64
"""
numeric_only = self._resolve_numeric_only(numeric_only)

result = self._cython_agg_general(
"mean",
alt=lambda x: Series(x).mean(numeric_only=numeric_only),
Expand All @@ -1645,7 +1676,7 @@ def mean(self, numeric_only: bool = True):
@final
@Substitution(name="groupby")
@Appender(_common_see_also)
def median(self, numeric_only=True):
def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
"""
Compute median of groups, excluding missing values.

Expand All @@ -1662,6 +1693,8 @@ def median(self, numeric_only=True):
Series or DataFrame
Median of values within each group.
"""
numeric_only = self._resolve_numeric_only(numeric_only)

result = self._cython_agg_general(
"median",
alt=lambda x: Series(x).median(numeric_only=numeric_only),
Expand Down Expand Up @@ -1719,8 +1752,9 @@ def var(self, ddof: int = 1):
Variance of values within each group.
"""
if ddof == 1:
numeric_only = self._resolve_numeric_only(lib.no_default)
return self._cython_agg_general(
"var", alt=lambda x: Series(x).var(ddof=ddof)
"var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only
)
else:
func = lambda x: x.var(ddof=ddof)
Expand Down Expand Up @@ -1785,7 +1819,10 @@ def size(self) -> FrameOrSeriesUnion:

@final
@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
def sum(self, numeric_only: bool = True, min_count: int = 0):
def sum(
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
):
numeric_only = self._resolve_numeric_only(numeric_only)

# If we are grouping on categoricals we want unobserved categories to
# return zero, rather than the default of NaN which the reindexing in
Expand All @@ -1802,7 +1839,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0):

@final
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
def prod(self, numeric_only: bool = True, min_count: int = 0):
def prod(
    self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
):
    # GH#41291: resolve the effective default for numeric_only —
    # True for DataFrameGroupBy, False for SeriesGroupBy.
    numeric_only = self._resolve_numeric_only(numeric_only)

    return self._agg_general(
        numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
    )
Expand Down Expand Up @@ -2731,7 +2772,7 @@ def _get_cythonized_result(
how: str,
cython_dtype: np.dtype,
aggregate: bool = False,
numeric_only: bool = True,
numeric_only: bool | lib.NoDefault = lib.no_default,
needs_counts: bool = False,
needs_values: bool = False,
needs_2d: bool = False,
Expand Down Expand Up @@ -2799,6 +2840,8 @@ def _get_cythonized_result(
-------
`Series` or `DataFrame` with filled values
"""
numeric_only = self._resolve_numeric_only(numeric_only)

if result_is_index and aggregate:
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
if post_processing and not callable(post_processing):
Expand Down
21 changes: 12 additions & 9 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,16 @@ def test_cython_agg_boolean():

def test_cython_agg_nothing_to_agg():
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
msg = "No numeric types to aggregate"

with pytest.raises(DataError, match=msg):
with pytest.raises(NotImplementedError, match="does not implement"):
frame.groupby("a")["b"].mean(numeric_only=True)

with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
frame.groupby("a")["b"].mean()

frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})

msg = "No numeric types to aggregate"
with pytest.raises(DataError, match=msg):
frame[["b"]].groupby(frame["a"]).mean()

Expand All @@ -107,9 +111,8 @@ def test_cython_agg_nothing_to_agg_with_dates():
"dates": pd.date_range("now", periods=50, freq="T"),
}
)
msg = "No numeric types to aggregate"
with pytest.raises(DataError, match=msg):
frame.groupby("b").dates.mean()
with pytest.raises(NotImplementedError, match="does not implement"):
frame.groupby("b").dates.mean(numeric_only=True)


def test_cython_agg_frame_columns():
Expand Down Expand Up @@ -170,7 +173,7 @@ def test__cython_agg_general(op, targop):
df = DataFrame(np.random.randn(1000))
labels = np.random.randint(0, 50, size=1000).astype(float)

result = df.groupby(labels)._cython_agg_general(op)
result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
expected = df.groupby(labels).agg(targop)
tm.assert_frame_equal(result, expected)

Expand All @@ -192,7 +195,7 @@ def test_cython_agg_empty_buckets(op, targop, observed):
# calling _cython_agg_general directly, instead of via the user API
# which sets different values for min_count, so do that here.
g = df.groupby(pd.cut(df[0], grps), observed=observed)
result = g._cython_agg_general(op)
result = g._cython_agg_general(op, alt=None, numeric_only=True)

g = df.groupby(pd.cut(df[0], grps), observed=observed)
expected = g.agg(lambda x: targop(x))
Expand All @@ -206,7 +209,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
grps = range(0, 25, 5)
# add / sum
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
"add"
"add", alt=None, numeric_only=True
)
intervals = pd.interval_range(0, 20, freq=5)
expected = DataFrame(
Expand All @@ -220,7 +223,7 @@ def test_cython_agg_empty_buckets_nanops(observed):

# prod
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
"prod"
"prod", alt=None, numeric_only=True
)
expected = DataFrame(
{"a": [1, 1, 1716, 1]},
Expand Down
43 changes: 38 additions & 5 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1764,7 +1764,15 @@ def test_empty_groupby(columns, keys, values, method, op, request):
# GH8093 & GH26411
override_dtype = None

if isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
if (
isinstance(values, Categorical)
and not isinstance(columns, list)
and op in ["sum", "prod"]
and method != "apply"
):
# handled below GH#41291
pass
elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable")
request.node.add_marker(mark)
elif (
Expand Down Expand Up @@ -1825,11 +1833,36 @@ def test_empty_groupby(columns, keys, values, method, op, request):
df = df.iloc[:0]

gb = df.groupby(keys)[columns]
if method == "attr":
result = getattr(gb, op)()
else:
result = getattr(gb, method)(op)

def get_result():
if method == "attr":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a super complicated test. can you try to split in followons.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed

return getattr(gb, op)()
else:
return getattr(gb, method)(op)

if columns == "C":
# i.e. SeriesGroupBy
if op in ["prod", "sum"]:
# ops that require more than just ordered-ness
if method != "apply":
# FIXME: apply goes through different code path
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
msg = "datetime64 type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return
elif isinstance(values, Categorical):
# GH#41291
msg = "category type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return

result = get_result()
expected = df.set_index(keys)[columns]
if override_dtype is not None:
expected = expected.astype(override_dtype)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,15 @@ def test_custom_grouper(index):
g.ohlc() # doesn't use _cython_agg_general
funcs = ["add", "mean", "prod", "min", "max", "var"]
for f in funcs:
g._cython_agg_general(f)
g._cython_agg_general(f, alt=None, numeric_only=True)

b = Grouper(freq=Minute(5), closed="right", label="right")
g = s.groupby(b)
# check all cython functions work
g.ohlc() # doesn't use _cython_agg_general
funcs = ["add", "mean", "prod", "min", "max", "var"]
for f in funcs:
g._cython_agg_general(f)
g._cython_agg_general(f, alt=None, numeric_only=True)

assert g.ngroups == 2593
assert notna(g.mean()).all()
Expand Down Expand Up @@ -417,7 +417,7 @@ def test_resample_frame_basic():
# check all cython functions work
funcs = ["add", "mean", "prod", "min", "max", "var"]
for f in funcs:
g._cython_agg_general(f)
g._cython_agg_general(f, alt=None, numeric_only=True)

result = df.resample("A").mean()
tm.assert_series_equal(result["A"], df["A"].resample("A").mean())
Expand Down