Skip to content

PERF: groupby iteration #51109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,32 @@ def _validate_dtype(cls, dtype) -> DtypeObj | None:

return dtype

@property
def _can_fast_construct(self) -> bool_t:
    """
    Check if we can avoid _constructor lookup and __finalize__ calls.

    Returns
    -------
    bool
        True only when ``self`` is exactly a Series or DataFrame (not a
        subclass) with the default ``_metadata``, empty ``attrs`` and
        ``flags.allows_duplicate_labels`` left at True, i.e. when calling
        ``__finalize__`` on a freshly constructed object would be a no-op.
    """
    # GH#46505 repeated __finalize__ calls caused perf regression,
    #  see if we can avoid those in some cases.
    # Imported locally rather than at module level.
    from pandas import (
        DataFrame,
        Series,
    )

    if self.attrs:
        # __finalize__ would copy attrs -> it is not a no-op
        return False

    if not self.flags.allows_duplicate_labels:
        # __finalize__ also propagates flags; skipping it would silently
        #  reset allows_duplicate_labels to True on the new object
        return False

    # Exact type checks on purpose: isinstance would also match subclasses.
    klass = type(self)
    if klass is Series:
        return self._metadata == ["name"]
    if klass is DataFrame:
        return not self._metadata

    # subclass, may have overridden _constructor -> cannot fast-construct
    return False

# ----------------------------------------------------------------------
# Construction

Expand Down
22 changes: 21 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1257,8 +1257,13 @@ def __iter__(self) -> Iterator:

starts, ends = lib.generate_slices(self._slabels, self.ngroups)

if sdata._can_fast_construct:
chop = self._chop_fast
else:
chop = self._chop

for start, end in zip(starts, ends):
yield self._chop(sdata, slice(start, end))
yield chop(sdata, slice(start, end))

@cache_readonly
def _sorted_data(self) -> NDFrameT:
Expand All @@ -1267,6 +1272,9 @@ def _sorted_data(self) -> NDFrameT:
def _chop(self, sdata, slice_obj: slice) -> NDFrame:
    """
    Return the piece of ``sdata`` selected by ``slice_obj``.

    Abstract here: always raises AbstractMethodError; SeriesSplitter and
    FrameSplitter provide the implementations.
    """
    raise AbstractMethodError(self)

def _chop_fast(self, sdata, slice_obj: slice) -> NDFrame:
    """
    Variant of ``_chop`` that ``__iter__`` selects when
    ``sdata._can_fast_construct`` is True.

    Abstract here: always raises AbstractMethodError; SeriesSplitter and
    FrameSplitter provide the implementations.
    """
    raise AbstractMethodError(self)


class SeriesSplitter(DataSplitter):
def _chop(self, sdata: Series, slice_obj: slice) -> Series:
Expand All @@ -1275,6 +1283,12 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series:
ser = sdata._constructor(mgr, name=sdata.name, fastpath=True)
return ser.__finalize__(sdata, method="groupby")

def _chop_fast(self, sdata: Series, slice_obj: slice) -> Series:
    """
    Build the Series for ``slice_obj`` directly from the sliced manager.

    Unlike ``_chop``, this constructs a plain ``Series`` instead of going
    through ``sdata._constructor`` and does not call ``__finalize__``.
    """
    # _chop specialized to the case where sdata._can_fast_construct is True
    sliced_mgr = sdata._mgr.get_slice(slice_obj)
    return Series(sliced_mgr, name=sdata.name, fastpath=True)


class FrameSplitter(DataSplitter):
def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
Expand All @@ -1287,6 +1301,12 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
df = sdata._constructor(mgr)
return df.__finalize__(sdata, method="groupby")

def _chop_fast(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
    """
    Build the DataFrame for ``slice_obj`` directly from the sliced manager.

    Unlike ``_chop``, this constructs a plain ``DataFrame`` instead of going
    through ``sdata._constructor`` and does not call ``__finalize__``.
    """
    # _chop specialized to the case where sdata._can_fast_construct is True
    mgr_axis = 1 - self.axis
    sliced_mgr = sdata._mgr.get_slice(slice_obj, axis=mgr_axis)
    return DataFrame(sliced_mgr)


def get_splitter(
data: NDFrame, labels: np.ndarray, ngroups: int, axis: AxisInt = 0
Expand Down