Skip to content

REF: groupby Series selection with as_index=False #50744

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 18, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 52 additions & 26 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import abc
from collections import defaultdict
from contextlib import nullcontext
from functools import partial
import inspect
from typing import (
Expand Down Expand Up @@ -292,6 +293,10 @@ def agg_list_like(self) -> DataFrame | Series:
-------
Result of aggregation.
"""
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
)
from pandas.core.reshape.concat import concat

obj = self.obj
Expand All @@ -312,26 +317,35 @@ def agg_list_like(self) -> DataFrame | Series:
results = []
keys = []

# degenerate case
if selected_obj.ndim == 1:
for a in arg:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
new_res = colg.aggregate(a)
results.append(new_res)
is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
if is_groupby:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kinda unfortunate that groupby knowledge is leaking in pandas/core/apply.py, guessing there's no way to avoid this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I don't have thoughts on how to improve this just yet, but after removing obj_with_exclusions (we're almost there I think), I plan to try to clean up the paths through groupby.agg, etc. Without a refactor, we need this because other objects don't have as_index.

# When as_index=False, we combine all results using indices
# and adjust index after
context_manager = com.temp_setattr(obj, "as_index", True)
else:
context_manager = nullcontext()
with context_manager:
# degenerate case
if selected_obj.ndim == 1:

# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)
for a in arg:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
new_res = colg.aggregate(a)
results.append(new_res)

# multiples
else:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
new_res = colg.aggregate(arg)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)
# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)

# multiples
else:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
new_res = colg.aggregate(arg)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)

try:
concatenated = concat(results, keys=keys, axis=1, sort=False)
Expand Down Expand Up @@ -366,6 +380,10 @@ def agg_dict_like(self) -> DataFrame | Series:
Result of aggregation.
"""
from pandas import Index
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
)
from pandas.core.reshape.concat import concat

obj = self.obj
Expand All @@ -384,15 +402,23 @@ def agg_dict_like(self) -> DataFrame | Series:

arg = self.normalize_dictlike_arg("agg", selected_obj, arg)

if selected_obj.ndim == 1:
# key only used for output
colg = obj._gotitem(selection, ndim=1)
results = {key: colg.agg(how) for key, how in arg.items()}
is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
if is_groupby:
# When as_index=False, we combine all results using indices
# and adjust index after
context_manager = com.temp_setattr(obj, "as_index", True)
else:
# key used for column selection and output
results = {
key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
}
context_manager = nullcontext()
with context_manager:
if selected_obj.ndim == 1:
# key only used for output
colg = obj._gotitem(selection, ndim=1)
results = {key: colg.agg(how) for key, how in arg.items()}
else:
# key used for column selection and output
results = {
key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
}

# set the final keys
keys = list(arg.keys())
Expand Down
13 changes: 5 additions & 8 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@ def _obj_with_exclusions(self):
if self._selection is not None and isinstance(self.obj, ABCDataFrame):
return self.obj[self._selection_list]

if isinstance(self.obj, ABCSeries):
return self.obj

if len(self.exclusions) > 0:
# equivalent to `self.obj.drop(self.exclusions, axis=1)`
# but this avoids consolidating and making a copy
Expand All @@ -235,17 +238,11 @@ def __getitem__(self, key):
raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
return self._gotitem(list(key), ndim=2)

elif not getattr(self, "as_index", False):
if key not in self.obj.columns:
raise KeyError(f"Column not found: {key}")
return self._gotitem(key, ndim=2)

else:
if key not in self.obj:
raise KeyError(f"Column not found: {key}")
subset = self.obj[key]
ndim = subset.ndim
return self._gotitem(key, ndim=ndim, subset=subset)
ndim = self.obj[key].ndim
return self._gotitem(key, ndim=ndim)

def _gotitem(self, key, ndim: int, subset=None):
"""
Expand Down
84 changes: 48 additions & 36 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs
)
index = self.grouper.result_index
return self.obj._constructor(result.ravel(), index=index, name=data.name)
result = self.obj._constructor(result.ravel(), index=index, name=data.name)
if not self.as_index:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))
return result

relabeling = func is None
columns = None
Expand All @@ -268,6 +272,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
# columns is not narrowed by mypy from relabeling flag
assert columns is not None # for mypy
ret.columns = columns
if not self.as_index:
ret = self._insert_inaxis_grouper(ret)
ret.index = default_index(len(ret))
return ret

else:
Expand All @@ -287,23 +294,24 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

# result is a dict whose keys are the elements of result_index
index = self.grouper.result_index
return Series(result, index=index)
result = Series(result, index=index)
if not self.as_index:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))
return result

agg = aggregate

def _aggregate_multiple_funcs(self, arg) -> DataFrame:
if isinstance(arg, dict):

# show the deprecation, but only if we
# have not shown a higher level one
# GH 15931
raise SpecificationError("nested renamer is not supported")

if any(isinstance(x, (tuple, list)) for x in arg):
if self.as_index:
# GH 15931
raise SpecificationError("nested renamer is not supported")
else:
# GH#50684 - This accidentally worked in 1.x
arg = list(arg.items())
elif any(isinstance(x, (tuple, list)) for x in arg):
arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]

# indicated column order
columns = next(zip(*arg))
else:
# list of functions / function names
columns = []
Expand All @@ -313,10 +321,13 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
arg = zip(columns, arg)

results: dict[base.OutputKey, DataFrame | Series] = {}
for idx, (name, func) in enumerate(arg):
with com.temp_setattr(self, "as_index", True):
# Combine results using the index, need to adjust index after
# if as_index=False (GH#50724)
for idx, (name, func) in enumerate(arg):

key = base.OutputKey(label=name, position=idx)
results[key] = self.aggregate(func)
key = base.OutputKey(label=name, position=idx)
results[key] = self.aggregate(func)

if any(isinstance(x, DataFrame) for x in results.values()):
from pandas import concat
Expand Down Expand Up @@ -396,12 +407,18 @@ def _wrap_applied_output(
)
if isinstance(result, Series):
result.name = self.obj.name
if not self.as_index and not_indexed_same:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))
return result
else:
# GH #6265 #24880
result = self.obj._constructor(
data=values, index=self.grouper.result_index, name=self.obj.name
)
if not self.as_index:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))
return self._reindex_output(result)

def _aggregate_named(self, func, *args, **kwargs):
Expand Down Expand Up @@ -630,6 +647,9 @@ def nunique(self, dropna: bool = True) -> Series:
res[ids[idx]] = out

result = self.obj._constructor(res, index=ri, name=self.obj.name)
if not self.as_index:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))
return self._reindex_output(result, fill_value=0)

@doc(Series.describe)
Expand All @@ -643,12 +663,11 @@ def value_counts(
ascending: bool = False,
bins=None,
dropna: bool = True,
) -> Series:
) -> Series | DataFrame:
if bins is None:
result = self._value_counts(
normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
)
assert isinstance(result, Series)
return result

from pandas.core.reshape.merge import get_join_indexers
Expand Down Expand Up @@ -786,7 +805,11 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray:

if is_integer_dtype(out.dtype):
out = ensure_int64(out)
return self.obj._constructor(out, index=mi, name=self.obj.name)
result = self.obj._constructor(out, index=mi, name=self.obj.name)
if not self.as_index:
result.name = "proportion" if normalize else "count"
result = result.reset_index()
return result

def fillna(
self,
Expand Down Expand Up @@ -1274,7 +1297,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
result.columns = result.columns.droplevel(-1)

if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))

return result
Expand Down Expand Up @@ -1386,7 +1409,7 @@ def _wrap_applied_output(
return self.obj._constructor_sliced(values, index=key_index)
else:
result = self.obj._constructor(values, columns=[self._selection])
self._insert_inaxis_grouper_inplace(result)
result = self._insert_inaxis_grouper(result)
return result
else:
# values are Series
Expand Down Expand Up @@ -1443,7 +1466,7 @@ def _wrap_applied_output_series(
result = self.obj._constructor(stacked_values, index=index, columns=columns)

if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
result = self._insert_inaxis_grouper(result)

return self._reindex_output(result)

Expand Down Expand Up @@ -1774,7 +1797,9 @@ def _gotitem(self, key, ndim: int, subset=None):
subset,
level=self.level,
grouper=self.grouper,
exclusions=self.exclusions,
selection=key,
as_index=self.as_index,
sort=self.sort,
group_keys=self.group_keys,
observed=self.observed,
Expand All @@ -1790,19 +1815,6 @@ def _get_data_to_aggregate(self) -> Manager2D:
else:
return obj._mgr

def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None:
# Insert the in-axis grouping keys into ``result`` as leading columns.
#
# ``result`` is modified in place via ``result.insert``; nothing is returned.
# For each grouping, the name comes from ``self.grouper.names``, the values
# from ``self.grouper.get_group_levels()`` and the in-axis flag from the
# corresponding entry of ``self.grouper.groupings``.
#
# zip in reverse so we can always insert at loc 0
# ``columns`` is read once, before any insertion happens in the loop below.
columns = result.columns
for name, lev, in_axis in zip(
reversed(self.grouper.names),
reversed(self.grouper.get_group_levels()),
reversed([grp.in_axis for grp in self.grouper.groupings]),
):
# GH #28549
# When using .apply(-), name will be in columns already
# Only in-axis groupings whose name is not already a column are inserted.
if in_axis and name not in columns:
result.insert(0, name, lev)

def _indexed_output_to_ndframe(
self, output: Mapping[base.OutputKey, ArrayLike]
) -> DataFrame:
Expand All @@ -1825,7 +1837,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
mgr.set_axis(1, index)
result = self.obj._constructor(mgr)

self._insert_inaxis_grouper_inplace(result)
result = self._insert_inaxis_grouper(result)
result = result._consolidate()
else:
index = self.grouper.result_index
Expand Down Expand Up @@ -1918,7 +1930,7 @@ def nunique(self, dropna: bool = True) -> DataFrame:

if not self.as_index:
results.index = default_index(len(results))
self._insert_inaxis_grouper_inplace(results)
results = self._insert_inaxis_grouper(results)

return results

Expand Down
Loading