Skip to content

Commit fa8f68e

Browse files
authored
BUG/API: SeriesGroupBy reduction with numeric_only=True (#41342)
1 parent 4944939 commit fa8f68e

File tree

6 files changed

+110
-27
lines changed

6 files changed

+110
-27
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,7 @@ Groupby/resample/rolling
10381038
- Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`)
10391039
- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
10401040
- Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`)
1041+
- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`)
10411042
- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
10421043
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)
10431044

pandas/core/groupby/generic.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -323,15 +323,18 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
323323
return output
324324

325325
def _cython_agg_general(
326-
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
326+
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
327327
):
328328

329329
obj = self._selected_obj
330330
objvals = obj._values
331331
data = obj._mgr
332332

333333
if numeric_only and not is_numeric_dtype(obj.dtype):
334-
raise DataError("No numeric types to aggregate")
334+
# GH#41291 match Series behavior
335+
raise NotImplementedError(
336+
f"{type(self).__name__}.{how} does not implement numeric_only."
337+
)
335338

336339
# This is overkill because it is only called once, but is here to
337340
# mirror the array_func used in DataFrameGroupBy._cython_agg_general
@@ -1056,7 +1059,7 @@ def _iterate_slices(self) -> Iterable[Series]:
10561059
yield values
10571060

10581061
def _cython_agg_general(
1059-
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
1062+
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
10601063
) -> DataFrame:
10611064
# Note: we never get here with how="ohlc"; that goes through SeriesGroupBy
10621065

pandas/core/groupby/groupby.py

+50-7
Original file line numberDiff line numberDiff line change
@@ -1101,6 +1101,34 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]):
11011101
def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
11021102
raise AbstractMethodError(self)
11031103

1104+
def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
1105+
"""
1106+
Determine subclass-specific default value for 'numeric_only'.
1107+
1108+
For SeriesGroupBy we want the default to be False (to match Series behavior).
1109+
For DataFrameGroupBy we want it to be True (for backwards-compat).
1110+
1111+
Parameters
1112+
----------
1113+
numeric_only : bool or lib.no_default
1114+
1115+
Returns
1116+
-------
1117+
bool
1118+
"""
1119+
# GH#41291
1120+
if numeric_only is lib.no_default:
1121+
# i.e. not explicitly passed by user
1122+
if self.obj.ndim == 2:
1123+
# i.e. DataFrameGroupBy
1124+
numeric_only = True
1125+
else:
1126+
numeric_only = False
1127+
1128+
# error: Incompatible return value type (got "Union[bool, NoDefault]",
1129+
# expected "bool")
1130+
return numeric_only # type: ignore[return-value]
1131+
11041132
# -----------------------------------------------------------------
11051133
# numba
11061134

@@ -1308,6 +1336,7 @@ def _agg_general(
13081336
alias: str,
13091337
npfunc: Callable,
13101338
):
1339+
13111340
with group_selection_context(self):
13121341
# try a cython aggregation if we can
13131342
result = None
@@ -1367,7 +1396,7 @@ def _agg_py_fallback(
13671396
return ensure_block_shape(res_values, ndim=ndim)
13681397

13691398
def _cython_agg_general(
1370-
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
1399+
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
13711400
):
13721401
raise AbstractMethodError(self)
13731402

@@ -1587,7 +1616,7 @@ def count(self):
15871616
@final
15881617
@Substitution(name="groupby")
15891618
@Substitution(see_also=_common_see_also)
1590-
def mean(self, numeric_only: bool = True):
1619+
def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default):
15911620
"""
15921621
Compute mean of groups, excluding missing values.
15931622
@@ -1635,6 +1664,8 @@ def mean(self, numeric_only: bool = True):
16351664
2 4.0
16361665
Name: B, dtype: float64
16371666
"""
1667+
numeric_only = self._resolve_numeric_only(numeric_only)
1668+
16381669
result = self._cython_agg_general(
16391670
"mean",
16401671
alt=lambda x: Series(x).mean(numeric_only=numeric_only),
@@ -1645,7 +1676,7 @@ def mean(self, numeric_only: bool = True):
16451676
@final
16461677
@Substitution(name="groupby")
16471678
@Appender(_common_see_also)
1648-
def median(self, numeric_only=True):
1679+
def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
16491680
"""
16501681
Compute median of groups, excluding missing values.
16511682
@@ -1662,6 +1693,8 @@ def median(self, numeric_only=True):
16621693
Series or DataFrame
16631694
Median of values within each group.
16641695
"""
1696+
numeric_only = self._resolve_numeric_only(numeric_only)
1697+
16651698
result = self._cython_agg_general(
16661699
"median",
16671700
alt=lambda x: Series(x).median(numeric_only=numeric_only),
@@ -1719,8 +1752,9 @@ def var(self, ddof: int = 1):
17191752
Variance of values within each group.
17201753
"""
17211754
if ddof == 1:
1755+
numeric_only = self._resolve_numeric_only(lib.no_default)
17221756
return self._cython_agg_general(
1723-
"var", alt=lambda x: Series(x).var(ddof=ddof)
1757+
"var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only
17241758
)
17251759
else:
17261760
func = lambda x: x.var(ddof=ddof)
@@ -1785,7 +1819,10 @@ def size(self) -> FrameOrSeriesUnion:
17851819

17861820
@final
17871821
@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
1788-
def sum(self, numeric_only: bool = True, min_count: int = 0):
1822+
def sum(
1823+
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
1824+
):
1825+
numeric_only = self._resolve_numeric_only(numeric_only)
17891826

17901827
# If we are grouping on categoricals we want unobserved categories to
17911828
# return zero, rather than the default of NaN which the reindexing in
@@ -1802,7 +1839,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0):
18021839

18031840
@final
18041841
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
1805-
def prod(self, numeric_only: bool = True, min_count: int = 0):
1842+
def prod(
1843+
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
1844+
):
1845+
numeric_only = self._resolve_numeric_only(numeric_only)
1846+
18061847
return self._agg_general(
18071848
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
18081849
)
@@ -2731,7 +2772,7 @@ def _get_cythonized_result(
27312772
how: str,
27322773
cython_dtype: np.dtype,
27332774
aggregate: bool = False,
2734-
numeric_only: bool = True,
2775+
numeric_only: bool | lib.NoDefault = lib.no_default,
27352776
needs_counts: bool = False,
27362777
needs_values: bool = False,
27372778
needs_2d: bool = False,
@@ -2799,6 +2840,8 @@ def _get_cythonized_result(
27992840
-------
28002841
`Series` or `DataFrame` with filled values
28012842
"""
2843+
numeric_only = self._resolve_numeric_only(numeric_only)
2844+
28022845
if result_is_index and aggregate:
28032846
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
28042847
if post_processing and not callable(post_processing):

pandas/tests/groupby/aggregate/test_cython.py

+12-9
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,16 @@ def test_cython_agg_boolean():
8989

9090
def test_cython_agg_nothing_to_agg():
9191
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
92-
msg = "No numeric types to aggregate"
9392

94-
with pytest.raises(DataError, match=msg):
93+
with pytest.raises(NotImplementedError, match="does not implement"):
94+
frame.groupby("a")["b"].mean(numeric_only=True)
95+
96+
with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
9597
frame.groupby("a")["b"].mean()
9698

9799
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
100+
101+
msg = "No numeric types to aggregate"
98102
with pytest.raises(DataError, match=msg):
99103
frame[["b"]].groupby(frame["a"]).mean()
100104

@@ -107,9 +111,8 @@ def test_cython_agg_nothing_to_agg_with_dates():
107111
"dates": pd.date_range("now", periods=50, freq="T"),
108112
}
109113
)
110-
msg = "No numeric types to aggregate"
111-
with pytest.raises(DataError, match=msg):
112-
frame.groupby("b").dates.mean()
114+
with pytest.raises(NotImplementedError, match="does not implement"):
115+
frame.groupby("b").dates.mean(numeric_only=True)
113116

114117

115118
def test_cython_agg_frame_columns():
@@ -170,7 +173,7 @@ def test__cython_agg_general(op, targop):
170173
df = DataFrame(np.random.randn(1000))
171174
labels = np.random.randint(0, 50, size=1000).astype(float)
172175

173-
result = df.groupby(labels)._cython_agg_general(op)
176+
result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
174177
expected = df.groupby(labels).agg(targop)
175178
tm.assert_frame_equal(result, expected)
176179

@@ -192,7 +195,7 @@ def test_cython_agg_empty_buckets(op, targop, observed):
192195
# calling _cython_agg_general directly, instead of via the user API
193196
# which sets different values for min_count, so do that here.
194197
g = df.groupby(pd.cut(df[0], grps), observed=observed)
195-
result = g._cython_agg_general(op)
198+
result = g._cython_agg_general(op, alt=None, numeric_only=True)
196199

197200
g = df.groupby(pd.cut(df[0], grps), observed=observed)
198201
expected = g.agg(lambda x: targop(x))
@@ -206,7 +209,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
206209
grps = range(0, 25, 5)
207210
# add / sum
208211
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
209-
"add"
212+
"add", alt=None, numeric_only=True
210213
)
211214
intervals = pd.interval_range(0, 20, freq=5)
212215
expected = DataFrame(
@@ -220,7 +223,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
220223

221224
# prod
222225
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
223-
"prod"
226+
"prod", alt=None, numeric_only=True
224227
)
225228
expected = DataFrame(
226229
{"a": [1, 1, 1716, 1]},

pandas/tests/groupby/test_groupby.py

+38-5
Original file line numberDiff line numberDiff line change
@@ -1764,7 +1764,15 @@ def test_empty_groupby(columns, keys, values, method, op, request):
17641764
# GH8093 & GH26411
17651765
override_dtype = None
17661766

1767-
if isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
1767+
if (
1768+
isinstance(values, Categorical)
1769+
and not isinstance(columns, list)
1770+
and op in ["sum", "prod"]
1771+
and method != "apply"
1772+
):
1773+
# handled below GH#41291
1774+
pass
1775+
elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
17681776
mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable")
17691777
request.node.add_marker(mark)
17701778
elif (
@@ -1825,11 +1833,36 @@ def test_empty_groupby(columns, keys, values, method, op, request):
18251833
df = df.iloc[:0]
18261834

18271835
gb = df.groupby(keys)[columns]
1828-
if method == "attr":
1829-
result = getattr(gb, op)()
1830-
else:
1831-
result = getattr(gb, method)(op)
18321836

1837+
def get_result():
1838+
if method == "attr":
1839+
return getattr(gb, op)()
1840+
else:
1841+
return getattr(gb, method)(op)
1842+
1843+
if columns == "C":
1844+
# i.e. SeriesGroupBy
1845+
if op in ["prod", "sum"]:
1846+
# ops that require more than just ordered-ness
1847+
if method != "apply":
1848+
# FIXME: apply goes through different code path
1849+
if df.dtypes[0].kind == "M":
1850+
# GH#41291
1851+
# datetime64 -> prod and sum are invalid
1852+
msg = "datetime64 type does not support"
1853+
with pytest.raises(TypeError, match=msg):
1854+
get_result()
1855+
1856+
return
1857+
elif isinstance(values, Categorical):
1858+
# GH#41291
1859+
msg = "category type does not support"
1860+
with pytest.raises(TypeError, match=msg):
1861+
get_result()
1862+
1863+
return
1864+
1865+
result = get_result()
18331866
expected = df.set_index(keys)[columns]
18341867
if override_dtype is not None:
18351868
expected = expected.astype(override_dtype)

pandas/tests/resample/test_datetime_index.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,15 @@ def test_custom_grouper(index):
6161
g.ohlc() # doesn't use _cython_agg_general
6262
funcs = ["add", "mean", "prod", "min", "max", "var"]
6363
for f in funcs:
64-
g._cython_agg_general(f)
64+
g._cython_agg_general(f, alt=None, numeric_only=True)
6565

6666
b = Grouper(freq=Minute(5), closed="right", label="right")
6767
g = s.groupby(b)
6868
# check all cython functions work
6969
g.ohlc() # doesn't use _cython_agg_general
7070
funcs = ["add", "mean", "prod", "min", "max", "var"]
7171
for f in funcs:
72-
g._cython_agg_general(f)
72+
g._cython_agg_general(f, alt=None, numeric_only=True)
7373

7474
assert g.ngroups == 2593
7575
assert notna(g.mean()).all()
@@ -417,7 +417,7 @@ def test_resample_frame_basic():
417417
# check all cython functions work
418418
funcs = ["add", "mean", "prod", "min", "max", "var"]
419419
for f in funcs:
420-
g._cython_agg_general(f)
420+
g._cython_agg_general(f, alt=None, numeric_only=True)
421421

422422
result = df.resample("A").mean()
423423
tm.assert_series_equal(result["A"], df["A"].resample("A").mean())

0 commit comments

Comments
 (0)