Skip to content

REF: document casting behavior in groupby #41376

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,9 +539,6 @@ def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
object.__setattr__(group, "name", name)
res = func(group, *args, **kwargs)

if isinstance(res, (DataFrame, Series)):
res = res._values

results.append(klass(res, index=group.index))

# check for empty "results" to avoid concat ValueError
Expand Down Expand Up @@ -1251,12 +1248,11 @@ def _wrap_applied_output_series(
columns = key_index
stacked_values = stacked_values.T

if stacked_values.dtype == object:
# We'll have the DataFrame constructor do inference
stacked_values = stacked_values.tolist()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any performance impact from passing a list to the DataFrame constructor?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_apply_numeric_coercion_when_datetime goes through here. Running the usage through timeit, it looks like a wash.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we may be able to do better/cleaner following #40489.

result = self.obj._constructor(stacked_values, index=index, columns=columns)

# if we have date/time like in the original, then coerce dates
# as we are stacking can easily have object dtypes here
result = result._convert(datetime=True)

if not self.as_index:
self._insert_inaxis_grouper_inplace(result)

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1329,7 +1329,10 @@ def _agg_py_fallback(
# reductions; see GH#28949
ser = df.iloc[:, 0]

res_values = self.grouper.agg_series(ser, alt)
# We do not get here with UDFs, so we know that our dtype
# should always be preserved by the implemented aggregations
# TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

if isinstance(values, Categorical):
# Because we only get here with known dtype-preserving
Expand Down
27 changes: 22 additions & 5 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -966,11 +966,24 @@ def _cython_operation(
)

@final
def agg_series(self, obj: Series, func: F) -> ArrayLike:
def agg_series(
self, obj: Series, func: F, preserve_dtype: bool = False
) -> ArrayLike:
"""
Parameters
----------
obj : Series
func : function taking a Series and returning a scalar-like
preserve_dtype : bool
Whether the aggregation is known to be dtype-preserving.

Returns
-------
np.ndarray or ExtensionArray
"""
# test_groupby_empty_with_category gets here with self.ngroups == 0
# and len(obj) > 0

cast_back = True
if len(obj) == 0:
# SeriesGrouper would raise if we were to call _aggregate_series_fast
result = self._aggregate_series_pure_python(obj, func)
Expand All @@ -982,17 +995,21 @@ def agg_series(self, obj: Series, func: F) -> ArrayLike:
# TODO: can we get a performant workaround for EAs backed by ndarray?
result = self._aggregate_series_pure_python(obj, func)

# we can preserve a little bit more aggressively with EA dtype
# because maybe_cast_pointwise_result will do a try/except
# with _from_sequence. NB we are assuming here that _from_sequence
# is sufficiently strict that it casts appropriately.
preserve_dtype = True

elif obj.index._has_complex_internals:
# Preempt TypeError in _aggregate_series_fast
result = self._aggregate_series_pure_python(obj, func)

else:
result = self._aggregate_series_fast(obj, func)
cast_back = False

npvalues = lib.maybe_convert_objects(result, try_float=False)
if cast_back:
        # TODO: Is there a documented reason why we don't always cast back?
if preserve_dtype:
out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
else:
out = npvalues
Expand Down