diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 7796df98395a7..123b9e3350fda 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -539,9 +539,6 @@ def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
             object.__setattr__(group, "name", name)
             res = func(group, *args, **kwargs)
 
-            if isinstance(res, (DataFrame, Series)):
-                res = res._values
-
             results.append(klass(res, index=group.index))
 
         # check for empty "results" to avoid concat ValueError
@@ -1251,12 +1248,11 @@ def _wrap_applied_output_series(
                 columns = key_index
                 stacked_values = stacked_values.T
 
+        if stacked_values.dtype == object:
+            # We'll have the DataFrame constructor do inference
+            stacked_values = stacked_values.tolist()
         result = self.obj._constructor(stacked_values, index=index, columns=columns)
 
-        # if we have date/time like in the original, then coerce dates
-        # as we are stacking can easily have object dtypes here
-        result = result._convert(datetime=True)
-
         if not self.as_index:
             self._insert_inaxis_grouper_inplace(result)
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 2091d2fc484e1..0b07668a9fea2 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1329,7 +1329,10 @@ def _agg_py_fallback(
         #  reductions; see GH#28949
         ser = df.iloc[:, 0]
 
-        res_values = self.grouper.agg_series(ser, alt)
+        # We do not get here with UDFs, so we know that our dtype
+        #  should always be preserved by the implemented aggregations
+        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
+        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
 
         if isinstance(values, Categorical):
             # Because we only get here with known dtype-preserving
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 3045451974ee7..8b6136b3abc42 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -966,11 +966,24 @@ def _cython_operation(
         )
 
     @final
-    def agg_series(self, obj: Series, func: F) -> ArrayLike:
+    def agg_series(
+        self, obj: Series, func: F, preserve_dtype: bool = False
+    ) -> ArrayLike:
+        """
+        Parameters
+        ----------
+        obj : Series
+        func : function taking a Series and returning a scalar-like
+        preserve_dtype : bool
+            Whether the aggregation is known to be dtype-preserving.
+
+        Returns
+        -------
+        np.ndarray or ExtensionArray
+        """
         # test_groupby_empty_with_category gets here with self.ngroups == 0
         #  and len(obj) > 0
-        cast_back = True
         if len(obj) == 0:
             # SeriesGrouper would raise if we were to call _aggregate_series_fast
             result = self._aggregate_series_pure_python(obj, func)
 
@@ -982,17 +995,21 @@ def agg_series(self, obj: Series, func: F) -> ArrayLike:
             #  TODO: can we get a performant workaround for EAs backed by ndarray?
             result = self._aggregate_series_pure_python(obj, func)
 
+            # we can preserve a little bit more aggressively with EA dtype
+            #  because maybe_cast_pointwise_result will do a try/except
+            #  with _from_sequence.  NB we are assuming here that _from_sequence
+            #  is sufficiently strict that it casts appropriately.
+            preserve_dtype = True
+
         elif obj.index._has_complex_internals:
             # Preempt TypeError in _aggregate_series_fast
             result = self._aggregate_series_pure_python(obj, func)
 
         else:
             result = self._aggregate_series_fast(obj, func)
-            cast_back = False
 
         npvalues = lib.maybe_convert_objects(result, try_float=False)
-        if cast_back:
-            # TODO: Is there a documented reason why we dont always cast_back?
+        if preserve_dtype:
             out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
         else:
             out = npvalues
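
A minimal sketch (not part of the patch, just an illustration) of the constructor
behavior the _wrap_applied_output_series change relies on: the DataFrame
constructor performs per-column type inference on nested lists but not on an
object-dtype ndarray, which is why the new code calls stacked_values.tolist()
in place of the removed result._convert(datetime=True) step.

    import numpy as np
    import pandas as pd

    # Object-dtype 2D array of timestamps, as can come out of stacking
    # the per-group Series returned by an applied function.
    values = np.array(
        [[pd.Timestamp("2021-01-01"), pd.Timestamp("2021-01-02")]], dtype=object
    )

    # Passed as an object ndarray, the columns keep object dtype:
    pd.DataFrame(values).dtypes           # object

    # Passed as a list of lists, the constructor infers datetime64[ns]:
    pd.DataFrame(values.tolist()).dtypes  # datetime64[ns]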