-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: Try fast/slow paths only once in DataFrameGroupby.transform #42195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
7b86fdb
b70e420
15d238a
36de30e
2c1d337
2d40eaa
ec9515b
2bea964
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1309,42 +1309,51 @@ def _transform_general(self, func, *args, **kwargs): | |
gen = self.grouper.get_iterator(obj, axis=self.axis) | ||
fast_path, slow_path = self._define_paths(func, *args, **kwargs) | ||
|
||
for name, group in gen: | ||
object.__setattr__(group, "name", name) | ||
|
||
# Try slow path and fast path. | ||
try: | ||
path, res = self._choose_path(fast_path, slow_path, group) | ||
except TypeError: | ||
return self._transform_item_by_item(obj, fast_path) | ||
except ValueError as err: | ||
msg = "transform must return a scalar value for each group" | ||
raise ValueError(msg) from err | ||
|
||
def process_result(applied, group, res): | ||
if isinstance(res, Series): | ||
|
||
# we need to broadcast across the | ||
# other dimension; this will preserve dtypes | ||
# GH14457 | ||
if not np.prod(group.shape): | ||
continue | ||
elif res.index.is_(obj.index): | ||
r = concat([res] * len(group.columns), axis=1) | ||
r.columns = group.columns | ||
r.index = group.index | ||
pass | ||
else: | ||
r = self.obj._constructor( | ||
np.concatenate([res.values] * len(group.index)).reshape( | ||
group.shape | ||
), | ||
columns=group.columns, | ||
index=group.index, | ||
) | ||
|
||
applied.append(r) | ||
if res.index.is_(obj.index): | ||
r = concat([res] * len(group.columns), axis=1) | ||
r.columns = group.columns | ||
r.index = group.index | ||
else: | ||
r = self.obj._constructor( | ||
np.concatenate([res.values] * len(group.index)).reshape( | ||
group.shape | ||
), | ||
columns=group.columns, | ||
index=group.index, | ||
) | ||
applied.append(r) | ||
else: | ||
applied.append(res) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can process_result return res and do this append outside the func There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One path uses There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Never mind, the path using continue can almost be moved ahead of any computation - it doesn't really depend on the result except for whether the result is a Series or not. I'm going to see if I can do a precursor cleanup of this method. Marking this as a draft for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. if the answer to my question is "no" thats OK too. |
||
|
||
try: | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
name, group = next(gen) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
except StopIteration: | ||
pass | ||
else: | ||
object.__setattr__(group, "name", name) | ||
try: | ||
path, res = self._choose_path(fast_path, slow_path, group) | ||
except TypeError: | ||
return self._transform_item_by_item(obj, fast_path) | ||
except ValueError as err: | ||
msg = "transform must return a scalar value for each group" | ||
raise ValueError(msg) from err | ||
process_result(applied, group, res) | ||
|
||
for name, group in gen: | ||
object.__setattr__(group, "name", name) | ||
res = path(group) | ||
process_result(applied, group, res) | ||
|
||
concat_index = obj.columns if self.axis == 0 else obj.index | ||
other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 | ||
concatenated = concat(applied, axis=self.axis, verify_integrity=False) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can this be changed to return the object to be appended to applied? rather than mutating