Skip to content

REF: de-duplicate result index construction in groupby #43466

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 2 additions & 9 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,16 +431,9 @@ def _wrap_applied_output(
)
assert values is not None

def _get_index() -> Index:
    """Build the result index from ``keys`` of the enclosing scope.

    With more than one grouping key the key tuples become a MultiIndex
    named after the grouper's names; otherwise a single Index is built
    through ``Index._with_infer`` and named after the only grouping key.
    """
    grouper = self.grouper
    if grouper.nkeys <= 1:
        return Index._with_infer(keys, name=grouper.names[0])
    return MultiIndex.from_tuples(keys, names=grouper.names)

if isinstance(values[0], dict):
# GH #823 #24880
index = _get_index()
index = self._group_keys_index
res_df = self.obj._constructor_expanddim(values, index=index)
res_df = self._reindex_output(res_df)
# if self.observed is False,
Expand All @@ -453,7 +446,7 @@ def _get_index() -> Index:
else:
# GH #6265 #24880
result = self.obj._constructor(
data=values, index=_get_index(), name=self.obj.name
data=values, index=self._group_keys_index, name=self.obj.name
)
return self._reindex_output(result)

Expand Down
20 changes: 14 additions & 6 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,6 +1177,18 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
# expected "bool")
return numeric_only # type: ignore[return-value]

@cache_readonly
def _group_keys_index(self) -> Index:
    """
    Index to use for the result of groupby aggregations.

    Built from the grouper's group keys: a MultiIndex carrying the
    grouper's names when there is more than one grouping key, otherwise
    a single Index (via ``Index._with_infer``) named after the only key.
    """
    # NOTE: this _may_ be redundant with self.grouper.result_index, but
    # that has not been conclusively proven yet.
    grouper = self.grouper
    group_keys = grouper._get_group_keys()
    if grouper.nkeys <= 1:
        return Index._with_infer(group_keys, name=grouper.names[0])
    return MultiIndex.from_tuples(group_keys, names=grouper.names)

# -----------------------------------------------------------------
# numba

Expand Down Expand Up @@ -1244,15 +1256,15 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs)
data and indices into a Numba jitted function.
"""
starts, ends, sorted_index, sorted_data = self._numba_prep(func, data)
group_keys = self.grouper._get_group_keys()
index = self._group_keys_index

numba_agg_func = numba_.generate_numba_agg_func(kwargs, func, engine_kwargs)
result = numba_agg_func(
sorted_data,
sorted_index,
starts,
ends,
len(group_keys),
len(index),
len(data.columns),
*args,
)
Expand All @@ -1261,10 +1273,6 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs)
if cache_key not in NUMBA_FUNC_CACHE:
NUMBA_FUNC_CACHE[cache_key] = numba_agg_func

if self.grouper.nkeys > 1:
index = MultiIndex.from_tuples(group_keys, names=self.grouper.names)
else:
index = Index(group_keys, name=self.grouper.names[0])
return result, index

# -----------------------------------------------------------------
Expand Down