From 15ce2193dfbadb127e8d8032f40acd5682f34fa4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 5 May 2021 15:13:18 -0700 Subject: [PATCH 1/5] BUG/API: SeriesGroupBy reduction with numeric_only=True --- pandas/core/groupby/generic.py | 9 ++-- pandas/core/groupby/groupby.py | 47 ++++++++++++++++--- pandas/tests/groupby/aggregate/test_cython.py | 18 +++---- pandas/tests/groupby/test_groupby.py | 26 ++++++++-- pandas/tests/resample/test_datetime_index.py | 6 +-- 5 files changed, 81 insertions(+), 25 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 18506b871bda6..c733fd7adbbf0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -346,7 +346,7 @@ def _aggregate_multiple_funcs(self, arg): return output def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): obj = self._selected_obj @@ -354,7 +354,10 @@ def _cython_agg_general( data = obj._mgr if numeric_only and not is_numeric_dtype(obj.dtype): - raise DataError("No numeric types to aggregate") + # GH#41291 match Series behavior + raise NotImplementedError( + f"{type(self).__name__}.{how} does not implement numeric_only." + ) # This is overkill because it is only called once, but is here to # mirror the array_func used in DataFrameGroupBy._cython_agg_general @@ -1069,7 +1072,7 @@ def _iterate_slices(self) -> Iterable[Series]: yield values def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ) -> DataFrame: # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b7254ffecb2bc..4d0c570feeb01 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1101,6 +1101,24 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: + """ + For SeriesGroupBy we want the default to be False (to match Series behavior). + For DataFrameGroupBy we want it to be True (for backwards-compat). + """ + # GH#41291 + if numeric_only is lib.no_default: + # i.e. not explicitly passed by user + if self.obj.ndim == 2: + # i.e. DataFrameGroupBy + numeric_only = True + else: + numeric_only = False + + # error: Incompatible return value type (got "Union[bool, NoDefault]", + # expected "bool") + return numeric_only # type: ignore[return-value] + # ----------------------------------------------------------------- # numba @@ -1290,6 +1308,7 @@ def _agg_general( alias: str, npfunc: Callable, ): + with group_selection_context(self): # try a cython aggregation if we can result = None @@ -1357,7 +1376,7 @@ def _agg_py_fallback( return ensure_block_shape(res_values, ndim=ndim) def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): raise AbstractMethodError(self) @@ -1599,7 +1618,7 @@ def count(self): @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def mean(self, numeric_only: bool = True): + def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute mean of groups, excluding missing values. @@ -1647,6 +1666,8 @@ def mean(self, numeric_only: bool = True): 2 4.0 Name: B, dtype: float64 """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only), @@ -1657,7 +1678,7 @@ def mean(self, numeric_only: bool = True): @final @Substitution(name="groupby") @Appender(_common_see_also) - def median(self, numeric_only=True): + def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute median of groups, excluding missing values. @@ -1674,6 +1695,8 @@ def median(self, numeric_only=True): Series or DataFrame Median of values within each group. """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "median", alt=lambda x: Series(x).median(numeric_only=numeric_only), @@ -1731,8 +1754,9 @@ def var(self, ddof: int = 1): Variance of values within each group. """ if ddof == 1: + numeric_only = self._resolve_numeric_only(lib.no_default) return self._cython_agg_general( - "var", alt=lambda x: Series(x).var(ddof=ddof) + "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only ) else: func = lambda x: x.var(ddof=ddof) @@ -1797,7 +1821,10 @@ def size(self) -> FrameOrSeriesUnion: @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) - def sum(self, numeric_only: bool = True, min_count: int = 0): + def sum( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in @@ -1814,7 +1841,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0): @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) - def prod(self, numeric_only: bool = True, min_count: int = 0): + def prod( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) + return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -2739,7 +2770,7 @@ def _get_cythonized_result( how: str, cython_dtype: np.dtype, aggregate: bool = False, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, needs_counts: bool = False, needs_values: bool = False, needs_2d: bool = False, @@ -2807,6 +2838,8 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ + numeric_only = self._resolve_numeric_only(numeric_only) + if result_is_index and aggregate: raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") if post_processing and not callable(post_processing): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index ded10ab11d5a8..8777615d4519b 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -91,7 +91,10 @@ def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): + with pytest.raises(NotImplementedError, match="does not implement"): + frame.groupby("a")["b"].mean(numeric_only=True) + + with pytest.raises(TypeError, match="Could not convert (foo|bar)*"): frame.groupby("a")["b"].mean() frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) @@ -107,9 +110,8 @@ def test_cython_agg_nothing_to_agg_with_dates(): "dates": pd.date_range("now", periods=50, freq="T"), } ) - msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): - frame.groupby("b").dates.mean() + with pytest.raises(NotImplementedError, match="does not implement"): + frame.groupby("b").dates.mean(numeric_only=True) def test_cython_agg_frame_columns(): @@ -170,7 +172,7 @@ def test__cython_agg_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - result = df.groupby(labels)._cython_agg_general(op) + result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True) expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) @@ -192,7 +194,7 @@ def test_cython_agg_empty_buckets(op, targop, observed): # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. g = df.groupby(pd.cut(df[0], grps), observed=observed) - result = g._cython_agg_general(op) + result = g._cython_agg_general(op, alt=None, numeric_only=True) g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) @@ -209,7 +211,7 @@ def test_cython_agg_empty_buckets_nanops(observed): grps = range(0, 25, 5) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( - "add" + "add", alt=None, numeric_only=True ) intervals = pd.interval_range(0, 20, freq=5) expected = DataFrame( @@ -223,7 +225,7 @@ def test_cython_agg_empty_buckets_nanops(observed): # prod result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( - "prod" + "prod", alt=None, numeric_only=True ) expected = DataFrame( {"a": [1, 1, 1716, 1]}, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index abfa2a23a4402..3eb2e361c52bc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1748,11 +1748,29 @@ def test_empty_groupby(columns, keys, values, method, op): df = df.iloc[:0] gb = df.groupby(keys)[columns] - if method == "attr": - result = getattr(gb, op)() - else: - result = getattr(gb, method)(op) + def get_result(): + if method == "attr": + return getattr(gb, op)() + else: + return getattr(gb, method)(op) + + if columns == "C": + # i.e. SeriesGroupBy + if op in ["prod", "sum"]: + # ops that require more than just ordered-ness + if method != "apply": + # FIXME: apply goes through different code path + if df.dtypes[0].kind == "M": + # GH#41291 + # datetime64 -> prod and sum are invalid + msg = "datetime64 type does not support" + with pytest.raises(TypeError, match=msg): + get_result() + + return + + result = get_result() expected = df.set_index(keys)[columns] if override_dtype is not None: expected = expected.astype(override_dtype) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 66cb2f2291e98..3bc7b539a3e07 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -61,7 +61,7 @@ def test_custom_grouper(index): g.ohlc() # doesn't use _cython_agg_general funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: - g._cython_agg_general(f) + g._cython_agg_general(f, alt=None, numeric_only=True) b = Grouper(freq=Minute(5), closed="right", label="right") g = s.groupby(b) @@ -69,7 +69,7 @@ def test_custom_grouper(index): g.ohlc() # doesn't use _cython_agg_general funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: - g._cython_agg_general(f) + g._cython_agg_general(f, alt=None, numeric_only=True) assert g.ngroups == 2593 assert notna(g.mean()).all() @@ -417,7 +417,7 @@ def test_resample_frame_basic(): # check all cython functions work funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: - g._cython_agg_general(f) + g._cython_agg_general(f, alt=None, numeric_only=True) result = df.resample("A").mean() tm.assert_series_equal(result["A"], df["A"].resample("A").mean()) From fd10c3b8af958538cd8fdae1391812c4ab585d8e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 May 2021 08:11:30 -0700 Subject: [PATCH 2/5] trim xfails --- pandas/tests/groupby/test_groupby.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9dcf56db3d0b4..7b526fe4c6605 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1739,7 +1739,15 @@ def test_pivot_table_values_key_error(): def test_empty_groupby(columns, keys, values, method, op, request): # GH8093 & GH26411 - if isinstance(values, Categorical) and len(keys) == 1 and method == "apply": + if ( + isinstance(values, Categorical) + and not isinstance(columns, list) + and op in ["sum", "prod"] + and method != "apply" + ): + # handled below GH#41291 + pass + elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply": mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable") request.node.add_marker(mark) elif ( @@ -1824,6 +1832,13 @@ def get_result(): get_result() return + elif isinstance(values, Categorical): + # GH#41291 + msg = "category type does not support" + with pytest.raises(TypeError, match=msg): + get_result() + + return result = get_result() expected = df.set_index(keys)[columns] From ef0a634238c5513fc76f6256afc5cd417f5135a2 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 10 May 2021 14:51:49 -0700 Subject: [PATCH 3/5] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5adc8540e6864..b01fbdcdc24a5 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -895,6 +895,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`) - Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:41111`) - Bug in :meth:`DataFrameGroupBy.rank` with the GroupBy object's ``axis=0`` and the ``rank`` method's keyword ``axis=1`` (:issue:`41320`) +- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``NotImplementedError`` on aggregations that are invalid for its dtype (:issue:`41342`) Reshaping ^^^^^^^^^ From 99ba977c978578659e84b9c4526254e689eaa374 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 11:04:29 -0700 Subject: [PATCH 4/5] expand whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d2f52b63ca581..4ad3c5ea67963 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -907,7 +907,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`) - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`) - Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`) -- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype (:issue:`41342`) +- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`) Reshaping ^^^^^^^^^ From ce59c54332ff5d787bff80370cba28c81ef54431 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 May 2021 10:22:51 -0700 Subject: [PATCH 5/5] formalize docstring --- pandas/core/groupby/groupby.py | 10 ++++++++++ pandas/tests/groupby/aggregate/test_cython.py | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fcfcbcc8e74ad..ad207c099f4bb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1102,8 +1102,18 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = Fals def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: """ + Determine subclass-specific default value for 'numeric_only'. + For SeriesGroupBy we want the default to be False (to match Series behavior). For DataFrameGroupBy we want it to be True (for backwards-compat). + + Parameters + ---------- + numeric_only : bool or lib.no_default + + Returns + ------- + bool """ # GH#41291 if numeric_only is lib.no_default: diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 79b31387b1afc..cf1177d231e37 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -89,7 +89,6 @@ def test_cython_agg_boolean(): def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - msg = "No numeric types to aggregate" with pytest.raises(NotImplementedError, match="does not implement"): frame.groupby("a")["b"].mean(numeric_only=True) @@ -98,6 +97,8 @@ def test_cython_agg_nothing_to_agg(): frame.groupby("a")["b"].mean() frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) + + msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): frame[["b"]].groupby(frame["a"]).mean()