From 3e5b8fa514ada58d975c029ce3d779b7be1eb0af Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 2 May 2021 10:01:18 -0700 Subject: [PATCH 1/3] REF: implement SeriesGroupBy._cython_agg_general in terms of array_func --- pandas/core/groupby/generic.py | 51 +++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b9f1ca0710872..5d84f9642bdac 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -350,37 +350,42 @@ def _cython_agg_general( # the index from enumeration as the key of output, but ohlc in particular # returns a (n x 4) array. Output requires 1D ndarrays as values, so we # need to slice that up into 1D arrays - idx = 0 - for obj in self._iterate_slices(): - name = obj.name - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - continue - objvals = obj._values + obj = self._selected_obj + objvals = obj._values + + if numeric_only and not is_numeric_dtype(obj.dtype): + raise DataError("No numeric types to aggregate") - if isinstance(objvals, Categorical): + # This is overkill because it is only called once, but is here to + # mirror the array_func used in DataFrameGroupBy._cython_agg_general + def array_func(values: ArrayLike) -> ArrayLike: + try: + result = self.grouper._cython_operation( + "aggregate", values, how, axis=0, min_count=min_count + ) + except NotImplementedError: + ser = Series(values) # equiv 'obj' from outer frame if self.grouper.ngroups > 0: - # without special-casing, we would raise, then in fallback - # would eventually call agg_series but without re-casting - # to Categorical - # equiv: res_values, _ = self.grouper.agg_series(obj, alt) - res_values, _ = self.grouper._aggregate_series_pure_python(obj, alt) + res_values, _ = self.grouper.agg_series(ser, alt) else: # equiv: res_values = self._python_agg_general(alt) - res_values = self._python_apply_general(alt, 
self._selected_obj) + res_values = self._python_apply_general(alt, ser) - result = type(objvals)._from_sequence(res_values, dtype=objvals.dtype) + if isinstance(values, Categorical): + # Because we only get here with known dtype-preserving + # reductions, we cast back to Categorical. + # TODO: if we ever get "rank" working, exclude it here. + result = type(values)._from_sequence(res_values, dtype=values.dtype) + else: + result = res_values + return result - else: - result = self.grouper._cython_operation( - "aggregate", obj._values, how, axis=0, min_count=min_count - ) + result = array_func(objvals) - assert result.ndim == 1 - key = base.OutputKey(label=name, position=idx) - output[key] = result - idx += 1 + assert result.ndim == 1 + key = base.OutputKey(label=obj.name, position=0) + output[key] = result if not output: raise DataError("No numeric types to aggregate") From 041602732e65f5edbd8bd1c71c744aa8016ba68b Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 2 May 2021 11:07:22 -0700 Subject: [PATCH 2/3] REF: don't catch NotImplementedError in _agg_general --- pandas/core/groupby/generic.py | 7 ++++++- pandas/core/groupby/groupby.py | 9 --------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5d84f9642bdac..8e4404b25483a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -370,7 +370,12 @@ def array_func(values: ArrayLike) -> ArrayLike: res_values, _ = self.grouper.agg_series(ser, alt) else: # equiv: res_values = self._python_agg_general(alt) - res_values = self._python_apply_general(alt, ser) + # error: Incompatible types in assignment (expression has + # type "Union[DataFrame, Series]", variable has type + # "Union[ExtensionArray, ndarray]") + res_values = self._python_apply_general( # type: ignore[assignment] + alt, ser + ) if isinstance(values, Categorical): # Because we only get here with known dtype-preserving diff --git
a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7fe9d7cb49eb5..cdd2cef1f2e59 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1281,15 +1281,6 @@ def _agg_general( ) except DataError: pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str( - err - ) or "category dtype not supported" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes - pass - else: - raise # apply a non-cython aggregation if result is None: From 3b691f961a1cd04fdff0fb6c09659859847722df Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 2 May 2021 12:54:22 -0700 Subject: [PATCH 3/3] REF: simplify _cython_agg_general --- pandas/core/groupby/generic.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8e4404b25483a..7edd458ced790 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -345,11 +345,6 @@ def _aggregate_multiple_funcs(self, arg): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output: dict[base.OutputKey, ArrayLike] = {} - # Ideally we would be able to enumerate self._iterate_slices and use - # the index from enumeration as the key of output, but ohlc in particular - # returns a (n x 4) array. 
Output requires 1D ndarrays as values, so we - # need to slice that up into 1D arrays obj = self._selected_obj objvals = obj._values @@ -388,14 +383,10 @@ def array_func(values: ArrayLike) -> ArrayLike: result = array_func(objvals) - assert result.ndim == 1 - key = base.OutputKey(label=obj.name, position=0) - output[key] = result - - if not output: - raise DataError("No numeric types to aggregate") - - return self._wrap_aggregated_output(output) + ser = self.obj._constructor( + result, index=self.grouper.result_index, name=obj.name + ) + return self._reindex_output(ser) def _wrap_aggregated_output( self,