Skip to content

REF: Groupby._get_cythonized_result operate blockwise in axis==1 case #43435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits on Sep 13, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,13 +375,18 @@ def _wrap_aggregated_output(
In the vast majority of cases output will only contain one element.
The exception is operations that expand dimensions, like ohlc.
"""
assert len(output) == 1
if isinstance(output, Series):
result = output
result.index = self.grouper.result_index
result.name = self.obj.name
else:
assert len(output) == 1

name = self.obj.name
index = self.grouper.result_index
values = next(iter(output.values()))
name = self.obj.name
index = self.grouper.result_index
values = next(iter(output.values()))

result = self.obj._constructor(values, index=index, name=name)
result = self.obj._constructor(values, index=index, name=name)
return self._reindex_output(result)

def _wrap_transformed_output(
Expand Down
40 changes: 37 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3010,15 +3010,49 @@ def blk_func(values: ArrayLike) -> ArrayLike:
return result.T

obj = self._obj_with_exclusions
if obj.ndim == 2 and self.axis == 0 and needs_2d and real_2d:
if needs_2d and real_2d:
# Operate block-wise instead of column-by-column
orig_ndim = obj.ndim
if orig_ndim == 1:
# Operate on DataFrame, then squeeze below
obj = obj.to_frame()

mgr = obj._mgr
if self.axis == 1:
mgr = obj.T._mgr

if numeric_only:
mgr = mgr.get_numeric_data()

# setting ignore_failures=False for troubleshooting
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=False)
res_mgr = mgr.grouped_reduce(blk_func, ignore_failures=True)
if len(res_mgr.items) != len(mgr.items):
howstr = how.replace("group_", "")
warnings.warn(
"Dropping invalid columns in "
f"{type(self).__name__}.{howstr} is deprecated. "
"In a future version, a TypeError will be raised. "
f"Before calling .{howstr}, select only columns which "
"should be valid for the function.",
FutureWarning,
stacklevel=3,
)
if len(res_mgr.items) == 0:
# We re-call grouped_reduce to get the right exception message
try:
mgr.grouped_reduce(blk_func, ignore_failures=False)
except Exception as err:
error_msg = str(err)
raise TypeError(error_msg)
# We should never get here
raise TypeError("All columns were dropped in grouped_reduce")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Luckily, we'll be able to get rid of L3028-L3047 (the "Dropping invalid columns" FutureWarning and the TypeError re-raise block just above) once the deprecation is enforced.


output = type(obj)(res_mgr)

if orig_ndim == 1:
assert output.ndim == 2
assert output.shape[1] == 1
output = output.iloc[:, 0]

if aggregate:
return self._wrap_aggregated_output(output)
else:
Expand Down