Fix 'observed' kwarg not doing anything on SeriesGroupBy #26461

Closed · wants to merge 3 commits · Changes from 2 commits
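For context, a minimal reproduction of GH 24880, the behaviour this PR targets (the frame below mirrors the one used in the new tests; the data is illustrative). With categorical groupers, a SeriesGroupBy should honour observed: observed=False (the default) should include unobserved category combinations such as ('y', 'b') as NaN, while observed=True should keep only the observed ones. Before this change the keyword was effectively ignored on SeriesGroupBy.

import pandas as pd

df = pd.DataFrame({'a': ['x', 'x', 'x', 'y'],
                   'b': ['a', 'a', 'b', 'a'],
                   'c': [1, 2, 3, 4]})
df['a'] = df['a'].astype('category')
df['b'] = df['b'].astype('category')

# With the fix, the first result has four rows, ('y', 'b') being NaN;
# the second keeps only the three observed combinations.
print(df.groupby(['a', 'b'], observed=False).c.agg(sum))
print(df.groupby(['a', 'b'], observed=True).c.agg(sum))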
90 changes: 15 additions & 75 deletions pandas/core/groupby/generic.py
@@ -25,15 +25,14 @@
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical
from pandas.core.base import DataError, SpecificationError
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
GroupBy, _apply_docs, _transform_template)
from pandas.core.index import CategoricalIndex, Index, MultiIndex
from pandas.core.index import Index, MultiIndex
import pandas.core.indexes.base as ibase
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
@@ -834,9 +833,10 @@ def _wrap_output(self, output, index, names=None):
return Series(output, index=index, name=name)

def _wrap_aggregated_output(self, output, names=None):
return self._wrap_output(output=output,
index=self.grouper.result_index,
names=names)
result = self._wrap_output(output=output,
index=self.grouper.result_index,
names=names)
return self._reindex_output(result)._convert(datetime=True)

def _wrap_transformed_output(self, output, names=None):
return self._wrap_output(output=output,
@@ -856,23 +856,25 @@ def _get_index():
return index

if isinstance(values[0], dict):
# GH #823
# GH #823 #24880
index = _get_index()
result = DataFrame(values, index=index).stack()
result = self._reindex_output(DataFrame(values, index=index))
# if self.observed is False,
# keep all-NaN rows created while re-indexing
dropna = self.observed
result = result.stack(dropna=dropna)
result.name = self._selection_name
return result

if isinstance(values[0], (Series, dict)):
if isinstance(values[0], Series):
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)
elif isinstance(values[0], DataFrame):
# possible that Series -> DataFrame by applied function
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)
else:
# GH #6265
return Series(values, index=_get_index(),
name=self._selection_name)
# GH #6265 #24880
result = Series(values, index=_get_index(),
name=self._selection_name)
return self._reindex_output(result)

def _aggregate_named(self, func, *args, **kwargs):
result = OrderedDict()
@@ -1335,7 +1337,8 @@ def _gotitem(self, key, ndim, subset=None):
if subset is None:
subset = self.obj[key]
return SeriesGroupBy(subset, selection=key,
grouper=self.grouper)
grouper=self.grouper,
observed=self.observed)

raise AssertionError("invalid ndim for _gotitem")

@@ -1407,69 +1410,6 @@ def _wrap_agged_blocks(self, items, blocks):

return self._reindex_output(result)._convert(datetime=True)

def _reindex_output(self, result):
"""
If we have categorical groupers, then we want to make sure that
we have a fully reindex-output to the levels. These may have not
participated in the groupings (e.g. may have all been
nan groups);

This can re-expand the output space
"""

# we need to re-expand the output space to accomodate all values
# whether observed or not in the cartesian product of our groupes
groupings = self.grouper.groupings
if groupings is None:
return result
elif len(groupings) == 1:
return result

# if we only care about the observed values
# we are done
elif self.observed:
return result

# reindexing only applies to a Categorical grouper
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
for ping in groupings):
return result

levels_list = [ping.group_index for ping in groupings]
index, _ = MultiIndex.from_product(
levels_list, names=self.grouper.names).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
return result.reindex(**d)

# GH 13204
# Here, the categorical in-axis groupers, which need to be fully
# expanded, are columns in `result`. An idea is to do:
# result = result.set_index(self.grouper.names)
# .reindex(index).reset_index()
# but special care has to be taken because of possible not-in-axis
# groupers.
# So, we manually select and drop the in-axis grouper columns,
# reindex `result`, and then reset the in-axis grouper columns.

# Select in-axis groupers
in_axis_grps = ((i, ping.name) for (i, ping)
in enumerate(groupings) if ping.in_axis)
g_nums, g_names = zip(*in_axis_grps)

result = result.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
result = result.set_index(self.grouper.result_index
).reindex(index, copy=False)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
result = result.reset_index(level=g_nums)

return result.reset_index(drop=True)

def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
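A rough sketch, separate from the PR's own code, of why the dict branch above now passes dropna=self.observed to stack(): after the intermediate DataFrame is reindexed onto the full category product, the unobserved rows are all-NaN, and stack(dropna=True) would silently drop them again. The values below are illustrative.

import numpy as np
import pandas as pd

# Intermediate frame after reindexing onto the full ('a', 'b') product;
# the ('y', 'b') combination was never observed, so its row is all-NaN.
reindexed = pd.DataFrame(
    {'min': [1.0, 3.0, 4.0, np.nan], 'max': [2.0, 3.0, 4.0, np.nan]},
    index=pd.MultiIndex.from_product([['x', 'y'], ['a', 'b']],
                                     names=['a', 'b']))

print(reindexed.stack(dropna=True))   # the ('y', 'b') entries vanish
print(reindexed.stack(dropna=False))  # the ('y', 'b') entries are kept as NaN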
66 changes: 65 additions & 1 deletion pandas/core/groupby/groupby.py
@@ -17,6 +17,7 @@ class providing the base-class of operations.

import numpy as np

from pandas.core.arrays import Categorical
from pandas._config.config import option_context

from pandas._libs import Timestamp
@@ -42,7 +43,7 @@ class providing the base-class of operations.
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base
from pandas.core.index import Index, MultiIndex
from pandas.core.index import CategoricalIndex, Index, MultiIndex
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter

@@ -2301,6 +2302,69 @@ def tail(self, n=5):
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]

def _reindex_output(self, result):
"""
If we have categorical groupers, then we want to make sure that
the output is fully re-indexed to include all levels. Some levels
may not have participated in the groupings (e.g. they may have been
all-NaN groups).

This can re-expand the output space.
"""

# we need to re-expand the output space to accommodate all values
# whether observed or not in the cartesian product of our groupers
groupings = self.grouper.groupings
if groupings is None:
return result
elif len(groupings) == 1:
return result

# if we only care about the observed values
# we are done
elif self.observed:
return result

# reindexing only applies to a Categorical grouper
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
for ping in groupings):
return result

levels_list = [ping.group_index for ping in groupings]
index, _ = MultiIndex.from_product(
levels_list, names=self.grouper.names).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
return result.reindex(**d)

# GH 13204
# Here, the categorical in-axis groupers, which need to be fully
# expanded, are columns in `result`. An idea is to do:
# result = result.set_index(self.grouper.names)
# .reindex(index).reset_index()
# but special care has to be taken because of possible not-in-axis
# groupers.
# So, we manually select and drop the in-axis grouper columns,
# reindex `result`, and then reset the in-axis grouper columns.

# Select in-axis groupers
in_axis_grps = ((i, ping.name) for (i, ping)
in enumerate(groupings) if ping.in_axis)
g_nums, g_names = zip(*in_axis_grps)

result = result.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
result = result.set_index(self.grouper.result_index
).reindex(index, copy=False)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
result = result.reset_index(level=g_nums)

return result.reset_index(drop=True)


GroupBy._add_numeric_operations()

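For intuition, a simplified sketch of the as_index=True path of the relocated _reindex_output: the aggregated result is reindexed onto the sorted cartesian product of the group levels, so unobserved combinations come back as NaN. Plain lists stand in here for the CategoricalIndex levels the real code takes from ping.group_index.

import pandas as pd

full_index, _ = pd.MultiIndex.from_product(
    [['x', 'y'], ['a', 'b']], names=['a', 'b']).sortlevel()

observed = pd.Series(
    [3, 3, 4],
    index=pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')],
                                    names=['a', 'b']),
    name='c')

# ('y', 'b') was never observed, so it reappears as NaN after reindexing.
print(observed.reindex(full_index))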
59 changes: 58 additions & 1 deletion pandas/tests/groupby/test_groupby.py
@@ -10,7 +10,7 @@

import pandas as pd
from pandas import (
DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv)
CategoricalIndex, DataFrame, Index, MultiIndex, Series, Timestamp,
date_range, read_csv)
import pandas.core.common as com
import pandas.util.testing as tm
from pandas.util.testing import (
@@ -1736,3 +1736,60 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis():
expected = pd.Series([3], index=ei)

assert_series_equal(result, expected)


def test_groupby_observed():
# GH 24880
df = DataFrame({'a': ['x', 'x', 'x', 'y'],
'b': ['a', 'a', 'b', 'a'],
'c': [1, 2, 3, 4]})
df['a'] = df['a'].astype('category')
df['b'] = df['b'].astype('category')

# test .agg and .apply when observed == False
levels = [
CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False),
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)]
index, _ = MultiIndex.from_product(levels, names=['a', 'b']).sortlevel()
expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c')
actual_agg = df.groupby(['a', 'b']).c.agg(sum)
actual_apply = df.groupby(['a', 'b']).c.apply(sum)
assert_series_equal(expected, actual_agg)
assert_series_equal(expected, actual_apply)

# test .agg when observed == True
index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates())
expected = pd.Series([3, 3, 4], index=index, name='c')
actual = df.groupby(['a', 'b'], observed=True).c.agg(sum)
assert_series_equal(expected, actual)

# test .apply when observed == True
index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')],
names=('a', 'b'))
expected = pd.Series([3, 3, 4], index=index, name='c')
actual = df.groupby(['a', 'b'], observed=True).c.apply(sum)
assert_series_equal(expected, actual)


def test_groupby_observed_apply_lambda_returns_dict():
# GH 24880
df = DataFrame({'a': ['x', 'x', 'x', 'y'],
'b': ['a', 'a', 'b', 'a'],
'c': [1, 2, 3, 4]})
df['a'] = df['a'].astype('category')
df['b'] = df['b'].astype('category')

# observed == False
levels = [
CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False),
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False),
Index(['min', 'max'])]
index, _ = MultiIndex.from_product(levels, names=['a', 'b', None]).sortlevel()
expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan],
index=index, name='c')
actual = df.groupby(['a', 'b']).c.apply(
lambda x: {'min': x.min(), 'max': x.max()})
assert_series_equal(expected, actual)

# observed == True
index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'),
('x', 'b', 'max'), ('x', 'b', 'min'),
('y', 'a', 'max'), ('y', 'a', 'min')],
names=('a', 'b', None))
expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c')
actual = df.groupby(['a', 'b'], observed=True).c.apply(
lambda x: {'min': x.min(), 'max': x.max()})
assert_series_equal(expected, actual)