diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 2f665975f96bd..dc414a588a2ce 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -25,7 +25,6 @@
 from pandas.core.dtypes.missing import isna, notna
 
 import pandas.core.algorithms as algorithms
-from pandas.core.arrays import Categorical
 from pandas.core.base import DataError, SpecificationError
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
@@ -33,7 +32,7 @@
 from pandas.core.groupby import base
 from pandas.core.groupby.groupby import (
     GroupBy, _apply_docs, _transform_template)
-from pandas.core.index import CategoricalIndex, Index, MultiIndex
+from pandas.core.index import Index, MultiIndex
 import pandas.core.indexes.base as ibase
 from pandas.core.internals import BlockManager, make_block
 from pandas.core.series import Series
@@ -834,9 +833,10 @@ def _wrap_output(self, output, index, names=None):
             return Series(output, index=index, name=name)
 
     def _wrap_aggregated_output(self, output, names=None):
-        return self._wrap_output(output=output,
-                                 index=self.grouper.result_index,
-                                 names=names)
+        result = self._wrap_output(output=output,
+                                   index=self.grouper.result_index,
+                                   names=names)
+        return self._reindex_output(result)._convert(datetime=True)
 
     def _wrap_transformed_output(self, output, names=None):
         return self._wrap_output(output=output,
@@ -856,13 +856,16 @@ def _get_index():
             return index
 
         if isinstance(values[0], dict):
-            # GH #823
+            # GH #823 #24880
             index = _get_index()
-            result = DataFrame(values, index=index).stack()
+            result = self._reindex_output(DataFrame(values, index=index))
+            # if self.observed is False,
+            # keep all-NaN rows created while re-indexing
+            result = result.stack(dropna=self.observed)
             result.name = self._selection_name
             return result
 
-        if isinstance(values[0], (Series, dict)):
+        if isinstance(values[0], Series):
             return self._concat_objects(keys, values,
                                         not_indexed_same=not_indexed_same)
         elif isinstance(values[0], DataFrame):
@@ -870,9 +873,11 @@ def _get_index():
             return self._concat_objects(keys, values,
                                         not_indexed_same=not_indexed_same)
         else:
-            # GH #6265
-            return Series(values, index=_get_index(),
-                          name=self._selection_name)
+            # GH #6265 #24880
+            result = Series(data=values,
+                            index=_get_index(),
+                            name=self._selection_name)
+            return self._reindex_output(result)
 
     def _aggregate_named(self, func, *args, **kwargs):
         result = OrderedDict()
@@ -1335,7 +1340,8 @@ def _gotitem(self, key, ndim, subset=None):
             if subset is None:
                 subset = self.obj[key]
             return SeriesGroupBy(subset, selection=key,
-                                 grouper=self.grouper)
+                                 grouper=self.grouper,
+                                 observed=self.observed)
 
         raise AssertionError("invalid ndim for _gotitem")
 
@@ -1407,69 +1413,6 @@ def _wrap_agged_blocks(self, items, blocks):
 
         return self._reindex_output(result)._convert(datetime=True)
 
-    def _reindex_output(self, result):
-        """
-        If we have categorical groupers, then we want to make sure that
-        we have a fully reindex-output to the levels. These may have not
-        participated in the groupings (e.g. may have all been
-        nan groups);
-
-        This can re-expand the output space
-        """
-
-        # we need to re-expand the output space to accomodate all values
-        # whether observed or not in the cartesian product of our groupes
-        groupings = self.grouper.groupings
-        if groupings is None:
-            return result
-        elif len(groupings) == 1:
-            return result
-
-        # if we only care about the observed values
-        # we are done
-        elif self.observed:
-            return result
-
-        # reindexing only applies to a Categorical grouper
-        elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
-                     for ping in groupings):
-            return result
-
-        levels_list = [ping.group_index for ping in groupings]
-        index, _ = MultiIndex.from_product(
-            levels_list, names=self.grouper.names).sortlevel()
-
-        if self.as_index:
-            d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
-            return result.reindex(**d)
-
-        # GH 13204
-        # Here, the categorical in-axis groupers, which need to be fully
-        # expanded, are columns in `result`. An idea is to do:
-        # result = result.set_index(self.grouper.names)
-        #                .reindex(index).reset_index()
-        # but special care has to be taken because of possible not-in-axis
-        # groupers.
-        # So, we manually select and drop the in-axis grouper columns,
-        # reindex `result`, and then reset the in-axis grouper columns.
-
-        # Select in-axis groupers
-        in_axis_grps = ((i, ping.name) for (i, ping)
-                        in enumerate(groupings) if ping.in_axis)
-        g_nums, g_names = zip(*in_axis_grps)
-
-        result = result.drop(labels=list(g_names), axis=1)
-
-        # Set a temp index and reindex (possibly expanding)
-        result = result.set_index(self.grouper.result_index
-                                  ).reindex(index, copy=False)
-
-        # Reset in-axis grouper columns
-        # (using level numbers `g_nums` because level names may not be unique)
-        result = result.reset_index(level=g_nums)
-
-        return result.reset_index(drop=True)
-
     def _iterate_column_groupbys(self):
         for i, colname in enumerate(self._selected_obj.columns):
             yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4e9e3b4963b6d..3d8716def20fb 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -17,6 +17,7 @@ class providing the base-class of operations.
 
 import numpy as np
 
+from pandas.core.arrays import Categorical
 from pandas._config.config import option_context
 
 from pandas._libs import Timestamp
@@ -42,7 +43,7 @@ class providing the base-class of operations.
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
 from pandas.core.groupby import base
-from pandas.core.index import Index, MultiIndex
+from pandas.core.index import CategoricalIndex, Index, MultiIndex
 from pandas.core.series import Series
 from pandas.core.sorting import get_group_index_sorter
 
@@ -2301,6 +2302,69 @@ def tail(self, n=5):
         mask = self._cumcount_array(ascending=False) < n
         return self._selected_obj[mask]
 
+    def _reindex_output(self, result):
+        """
+        If we have categorical groupers, then we want to make sure that
+        the output is fully reindexed to include all of the levels, since
+        some of them may not have participated in the groupings (e.g. may
+        have all been NaN groups).
+
+        This can re-expand the output space.
+        """
+
+        # we need to re-expand the output space to accommodate all values
+        # whether observed or not in the cartesian product of our groupers
+        groupings = self.grouper.groupings
+        if groupings is None:
+            return result
+        elif len(groupings) == 1:
+            return result
+
+        # if we only care about the observed values
+        # we are done
+        elif self.observed:
+            return result
+
+        # reindexing only applies to a Categorical grouper
+        elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
+                     for ping in groupings):
+            return result
+
+        levels_list = [ping.group_index for ping in groupings]
+        index, _ = MultiIndex.from_product(
+            levels_list, names=self.grouper.names).sortlevel()
+
+        if self.as_index:
+            d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
+            return result.reindex(**d)
+
+        # GH 13204
+        # Here, the categorical in-axis groupers, which need to be fully
+        # expanded, are columns in `result`. An idea is to do:
+        # result = result.set_index(self.grouper.names)
+        #                .reindex(index).reset_index()
+        # but special care has to be taken because of possible not-in-axis
+        # groupers.
+        # So, we manually select and drop the in-axis grouper columns,
+        # reindex `result`, and then reset the in-axis grouper columns.
+
+        # Select in-axis groupers
+        in_axis_grps = ((i, ping.name) for (i, ping)
+                        in enumerate(groupings) if ping.in_axis)
+        g_nums, g_names = zip(*in_axis_grps)
+
+        result = result.drop(labels=list(g_names), axis=1)
+
+        # Set a temp index and reindex (possibly expanding)
+        result = result.set_index(self.grouper.result_index
+                                  ).reindex(index, copy=False)
+
+        # Reset in-axis grouper columns
+        # (using level numbers `g_nums` because level names may not be unique)
+        result = result.reset_index(level=g_nums)
+
+        return result.reset_index(drop=True)
+
 
 GroupBy._add_numeric_operations()
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 2511063110f92..3690dc7bb048d 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -10,7 +10,8 @@
 import pandas as pd
 from pandas import (
-    DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv)
+    CategoricalIndex, DataFrame, Index, MultiIndex,
+    Series, Timestamp, date_range, read_csv)
 import pandas.core.common as com
 import pandas.util.testing as tm
 from pandas.util.testing import (
@@ -1736,3 +1737,67 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis():
     expected = pd.Series([3], index=ei)
 
     assert_series_equal(result, expected)
+
+
+def test_groupby_observed():
+    # GH 24880
+    df = DataFrame({'a': ['x', 'x', 'x', 'y'],
+                    'b': ['a', 'a', 'b', 'a'],
+                    'c': [1, 2, 3, 4]})
+    df['a'] = df['a'].astype('category')
+    df['b'] = df['b'].astype('category')
+
+    # test .agg and .apply when observed == False
+    lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False),
+            CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)]
+    index, _ = MultiIndex.from_product(lvls, names=['a', 'b']).sortlevel()
+    expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c')
+    actual_agg = df.groupby(['a', 'b']).c.agg(sum)
+    actual_apply = df.groupby(['a', 'b']).c.apply(sum)
+    assert_series_equal(expected, actual_agg)
+    assert_series_equal(expected, actual_apply)
+
+    # test .agg when observed == True
+    index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates())
+    expected = pd.Series([3, 3, 4], index=index, name='c')
+    actual = df.groupby(['a', 'b'], observed=True).c.agg(sum)
+    assert_series_equal(expected, actual)
+
+    # test .apply when observed == True
+    index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')],
+                                   names=('a', 'b'))
+    expected = pd.Series([3, 3, 4], index=index, name='c')
+    actual = df.groupby(['a', 'b'], observed=True).c.apply(sum)
+    assert_series_equal(expected, actual)
+
+
+def test_groupby_observed_apply_lambda_returns_dict():
+    # GH 24880
+    df = DataFrame({'a': ['x', 'x', 'x', 'y'],
+                    'b': ['a', 'a', 'b', 'a'],
+                    'c': [1, 2, 3, 4]})
+    df['a'] = df['a'].astype('category')
+    df['b'] = df['b'].astype('category')
+
+    # observed == False
+    lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False),
+            CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False),
+            Index(['min', 'max'])]
+    index, _ = MultiIndex.from_product(lvls,
+                                       names=['a', 'b', None]).sortlevel()
+    expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan],
+                         index=index,
+                         name='c')
+    actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(),
+                                                       'max': x.max()})
+    assert_series_equal(expected, actual)
+
+    # observed == True
+    index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'),
+                                    ('x', 'b', 'max'), ('x', 'b', 'min'),
+                                    ('y', 'a', 'max'), ('y', 'a', 'min')],
+                                   names=('a', 'b', None))
+    expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c')
+    actual = df.groupby(['a', 'b'], observed=True).c.apply(
+        lambda x: {'min': x.min(), 'max': x.max()})
+    assert_series_equal(expected, actual)
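
For reference, a minimal usage sketch of the behaviour the new tests exercise (GH 24880), assuming a pandas build with this patch applied; the frame and the expected results simply mirror test_groupby_observed and test_groupby_observed_apply_lambda_returns_dict above.

    import pandas as pd

    # Same data as the new tests: the ('y', 'b') category combination
    # never occurs in the frame.
    df = pd.DataFrame({'a': pd.Categorical(['x', 'x', 'x', 'y']),
                       'b': pd.Categorical(['a', 'a', 'b', 'a']),
                       'c': [1, 2, 3, 4]})

    # observed=False (the default): SeriesGroupBy.agg/apply re-expand the
    # result to the full cartesian product of the categories, so the
    # unobserved ('y', 'b') combination appears as NaN instead of being
    # dropped.
    print(df.groupby(['a', 'b']).c.agg(sum))
    # ('x', 'a') -> 3.0, ('x', 'b') -> 3.0, ('y', 'a') -> 4.0, ('y', 'b') -> NaN

    # A dict-returning apply goes through the DataFrame(...).stack() path
    # patched above; stack(dropna=self.observed) keeps the all-NaN rows
    # created while re-indexing when observed is False.
    print(df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(),
                                                    'max': x.max()}))

    # observed=True: only combinations actually present in the data are kept.
    print(df.groupby(['a', 'b'], observed=True).c.agg(sum))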