Fix 'observed' kwarg not doing anything on SeriesGroupBy #26463
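For context, a rough reproduction of the behavior reported in GH24880 that this PR addresses (a sketch with made-up data; exact output varies across pandas versions):

```python
import pandas as pd

df = pd.DataFrame({
    'a': pd.Categorical(['x', 'x', 'y'], categories=['x', 'y', 'z']),
    'c': [1, 2, 3],
})

# The DataFrameGroupBy path already honored observed=False: the unobserved
# category 'z' appears in the result index.
print(df.groupby('a', observed=False).sum())

# Before this fix, selecting the column first produced a SeriesGroupBy that
# silently ignored observed, so 'z' was dropped from the result.
print(df.groupby('a', observed=False)['c'].sum())
```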


Merged: 24 commits, merged on May 30, 2019.
Changes shown below are from 7 of the 24 commits.

Commits (24):
a5d6d1a  Fix 'observed' kwarg not doing anything on SeriesGroupBy (krsnik93, May 19, 2019)
41f49f4  Merge branch 'GH24880' (krsnik93, May 19, 2019)
2575c41  Wrap long lines (krsnik93, May 19, 2019)
1c02d9f  Move tests to test_categorical.py (krsnik93, May 19, 2019)
7350472  Merge remote-tracking branch 'upstream/master' (krsnik93, May 20, 2019)
0a949d5  Merge branch 'master' into GH24880 (krsnik93, May 20, 2019)
0e9f473  Parameterized tests for 'observed' kwarg on SeriesGroupBy (krsnik93, May 20, 2019)
1ef54f4  Merge remote-tracking branch 'upstream/master' into GH24880 (krsnik93, May 20, 2019)
cd481ad  Split test_groupby_series_observed to utilize fixtures better; Sort im… (krsnik93, May 20, 2019)
a515caf  Sort imports in core/groupby/groupby.py (krsnik93, May 20, 2019)
ff42dd7  Remove too specific fixtures and adjust tests (krsnik93, May 20, 2019)
c22875c  Merge remote-tracking branch 'upstream/master' into GH24880 (krsnik93, May 21, 2019)
cc0b725  Use literal values for indices in tests (krsnik93, May 21, 2019)
629a144  Merge remote-tracking branch 'upstream/master' into GH24880 (krsnik93, May 22, 2019)
e4fda22  Use MultiIndex.from_* to construct indices in tests (krsnik93, May 22, 2019)
8cfa4a1  Wrap long lines (krsnik93, May 22, 2019)
db176de  Merge remote-tracking branch 'upstream/master' into GH24880 (krsnik93, May 26, 2019)
d520952  Enhance docstring for _reindex_output (krsnik93, May 26, 2019)
3591dbc  Modify tests to reuse existing fixture (krsnik93, May 27, 2019)
f97c8a1  Merge remote-tracking branch 'upstream/master' into GH24880 (krsnik93, May 27, 2019)
d5c9c40  Refactor tests from a class to stand-alone functions (krsnik93, May 27, 2019)
ad16db8  Simplify a test, add a docstring for the fixture and drop pd.* prefix… (krsnik93, May 28, 2019)
7c525a1  Merge remote-tracking branch 'upstream/master' into GH24880 (krsnik93, May 28, 2019)
e6bca5e  Merge remote-tracking branch 'upstream/master' into GH24880 (krsnik93, May 29, 2019)
93 changes: 18 additions & 75 deletions pandas/core/groupby/generic.py
@@ -25,15 +25,14 @@
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical
from pandas.core.base import DataError, SpecificationError
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
GroupBy, _apply_docs, _transform_template)
from pandas.core.index import CategoricalIndex, Index, MultiIndex
from pandas.core.index import Index, MultiIndex
import pandas.core.indexes.base as ibase
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
@@ -834,9 +833,10 @@ def _wrap_aggregated_output(self, output, names=None):
return Series(output, index=index, name=name)

def _wrap_aggregated_output(self, output, names=None):
return self._wrap_output(output=output,
index=self.grouper.result_index,
names=names)
result = self._wrap_output(output=output,
index=self.grouper.result_index,
names=names)
return self._reindex_output(result)._convert(datetime=True)

def _wrap_transformed_output(self, output, names=None):
return self._wrap_output(output=output,
@@ -856,23 +856,28 @@ def _get_index():
return index

if isinstance(values[0], dict):
# GH #823
# GH #823 #24880
index = _get_index()
result = DataFrame(values, index=index).stack()
result = self._reindex_output(DataFrame(values, index=index))
# if self.observed is False,
# keep all-NaN rows created while re-indexing
result = result.stack(dropna=self.observed)
result.name = self._selection_name
return result

if isinstance(values[0], (Series, dict)):
if isinstance(values[0], Series):
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)
elif isinstance(values[0], DataFrame):
# possible that Series -> DataFrame by applied function
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)
else:
# GH #6265
return Series(values, index=_get_index(),
name=self._selection_name)
# GH #6265 #24880
result = Series(data=values,
index=_get_index(),
name=self._selection_name)
return self._reindex_output(result)

def _aggregate_named(self, func, *args, **kwargs):
result = OrderedDict()
@@ -1335,7 +1340,8 @@ def _gotitem(self, key, ndim, subset=None):
if subset is None:
subset = self.obj[key]
return SeriesGroupBy(subset, selection=key,
grouper=self.grouper)
grouper=self.grouper,
observed=self.observed)

raise AssertionError("invalid ndim for _gotitem")

@@ -1407,69 +1413,6 @@ def _wrap_agged_blocks(self, items, blocks):

return self._reindex_output(result)._convert(datetime=True)

def _reindex_output(self, result):
"""
If we have categorical groupers, then we want to make sure that
we have a fully reindex-output to the levels. These may have not
participated in the groupings (e.g. may have all been
nan groups);

This can re-expand the output space
"""

# we need to re-expand the output space to accomodate all values
# whether observed or not in the cartesian product of our groupes
groupings = self.grouper.groupings
if groupings is None:
return result
elif len(groupings) == 1:
return result

# if we only care about the observed values
# we are done
elif self.observed:
return result

# reindexing only applies to a Categorical grouper
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
for ping in groupings):
return result

levels_list = [ping.group_index for ping in groupings]
index, _ = MultiIndex.from_product(
levels_list, names=self.grouper.names).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
return result.reindex(**d)

# GH 13204
# Here, the categorical in-axis groupers, which need to be fully
# expanded, are columns in `result`. An idea is to do:
# result = result.set_index(self.grouper.names)
# .reindex(index).reset_index()
# but special care has to be taken because of possible not-in-axis
# groupers.
# So, we manually select and drop the in-axis grouper columns,
# reindex `result`, and then reset the in-axis grouper columns.

# Select in-axis groupers
in_axis_grps = ((i, ping.name) for (i, ping)
in enumerate(groupings) if ping.in_axis)
g_nums, g_names = zip(*in_axis_grps)

result = result.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
result = result.set_index(self.grouper.result_index
).reindex(index, copy=False)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
result = result.reset_index(level=g_nums)

return result.reset_index(drop=True)

def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
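The stack(dropna=self.observed) change above matters because, when observed=False, re-indexing introduces all-NaN rows for unobserved category combinations, and stacking with the default dropna=True would immediately discard them again. A standalone sketch of that effect with illustrative values (not pandas internals; uses the classic DataFrame.stack dropna keyword):

```python
import numpy as np
import pandas as pd

# A result frame as it might look after re-indexing with observed=False:
# group 'y' never occurred in the data, so its row is all NaN.
frame = pd.DataFrame({'min': [1.0, np.nan], 'max': [2.0, np.nan]},
                     index=pd.Index(['x', 'y'], name='a'))

print(frame.stack(dropna=True))   # default: drops the all-NaN 'y' row again
print(frame.stack(dropna=False))  # keeps ('y', 'min') and ('y', 'max') as NaN
```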
66 changes: 65 additions & 1 deletion pandas/core/groupby/groupby.py
@@ -17,6 +17,7 @@ class providing the base-class of operations.

import numpy as np

from pandas.core.arrays import Categorical
from pandas._config.config import option_context

from pandas._libs import Timestamp
@@ -42,7 +43,7 @@ class providing the base-class of operations.
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base
from pandas.core.index import Index, MultiIndex
from pandas.core.index import Index, CategoricalIndex, MultiIndex
from pandas.core.series import Series
from pandas.core.sorting import get_group_index_sorter

@@ -2301,6 +2302,69 @@ def tail(self, n=5):
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]

def _reindex_output(self, result):
"""
If we have categorical groupers, then we want to make sure that
[Review comment, Contributor]: can you update the doc-string with Parameters / Results; type things if you can

we have a fully reindex-output to the levels. These may have not
participated in the groupings (e.g. may have all been
nan groups);

This can re-expand the output space
"""

# we need to re-expand the output space to accomodate all values
# whether observed or not in the cartesian product of our groupes
groupings = self.grouper.groupings
if groupings is None:
return result
elif len(groupings) == 1:
return result

# if we only care about the observed values
# we are done
elif self.observed:
return result

# reindexing only applies to a Categorical grouper
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
for ping in groupings):
return result

levels_list = [ping.group_index for ping in groupings]
index, _ = MultiIndex.from_product(
levels_list, names=self.grouper.names).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
return result.reindex(**d)

# GH 13204
# Here, the categorical in-axis groupers, which need to be fully
# expanded, are columns in `result`. An idea is to do:
# result = result.set_index(self.grouper.names)
# .reindex(index).reset_index()
# but special care has to be taken because of possible not-in-axis
# groupers.
# So, we manually select and drop the in-axis grouper columns,
# reindex `result`, and then reset the in-axis grouper columns.

# Select in-axis groupers
in_axis_grps = ((i, ping.name) for (i, ping)
in enumerate(groupings) if ping.in_axis)
g_nums, g_names = zip(*in_axis_grps)

result = result.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
result = result.set_index(self.grouper.result_index
).reindex(index, copy=False)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
result = result.reset_index(level=g_nums)

return result.reset_index(drop=True)


GroupBy._add_numeric_operations()

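Relating to the review comment above asking for a Parameters/Returns section in the _reindex_output docstring, one possible shape given what the function does (a sketch only; the wording eventually merged may differ):

```python
def _reindex_output(self, result):
    """
    Re-expand the output to include unobserved categorical group levels.

    If any grouper is categorical and ``observed=False``, the result is
    reindexed to the full Cartesian product of the group levels, so that
    category combinations that never occurred in the data still appear
    (as missing values).

    Parameters
    ----------
    result : Series or DataFrame
        Object produced by a groupby aggregation.

    Returns
    -------
    Series or DataFrame
        ``result`` reindexed to the expanded output space, or returned
        unchanged when there is only a single grouping, ``observed=True``,
        or no categorical groupers are involved.
    """
```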
48 changes: 47 additions & 1 deletion pandas/tests/groupby/conftest.py
@@ -1,7 +1,7 @@
import numpy as np
import pytest

from pandas import DataFrame, MultiIndex
from pandas import DataFrame, CategoricalIndex, Index, MultiIndex
from pandas.util import testing as tm


@@ -76,3 +76,49 @@ def three_group():
'D': np.random.randn(11),
'E': np.random.randn(11),
'F': np.random.randn(11)})


@pytest.fixture
[Review comment, Contributor]: can this be more generally used in groupby/test_categorical.py?

[Reply, Contributor Author]: I would have to keep doing .astype('category') in each of my tests for this. Either that, or derive another fixture from three_group, so it would not decrease the number of fixtures. I also preferred literal values to random ones for easier equality checks.

[Reply, Contributor]: It's not about decreasing the number of fixtures, but rather reusing them across tests as much as possible.

[Reply, Contributor]: Literal values are fine (you could just replace the random values with fixed ones); changing an existing fixture to accommodate new tests is better than rolling a new one.

def df_cat():
df = DataFrame({'a': ['x', 'x', 'x', 'y'],
'b': ['a', 'a', 'b', 'a'],
'c': [1, 2, 3, 4]})
df['a'] = df['a'].astype('category')
df['b'] = df['b'].astype('category')
return df


@pytest.fixture
def multi_index_cat_complete():
lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False),
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)]
index = MultiIndex.from_product(lvls, names=['a', 'b'])
[Review comment, Member]: Since 'a' and 'b' are categories can you use distinct names? 'foo' and 'bar' are fine

return index


@pytest.fixture
def multi_index_cat_partial(df_cat):
[Review comment, Member]: Question on naming: what does "partial" mean here?

[Reply, Contributor Author]: By "partial" I mean that it is not the full product of the index level values. For example, if the first level has ['x', 'y'] and the second level has ['a', 'b'], then a complete index is [('x', 'a'), ('x', 'b'), ('y', 'a'), ('y', 'b')], which I construct with .from_product; if not all combinations are present, for example [('x', 'a'), ('x', 'b'), ('y', 'a')] where ('y', 'b') is missing, then it is partial.

[Reply, Member]: Hmm, OK. I find it rather confusing that the determination of whether the index is "partial" is done entirely outside the scope of the fixture (i.e. it is up to the injected test to only partially align). Is there not a way to make the fixture self-contained and provide a Series or Frame instead? It may require a rewrite of your tests, but I think this is just confusing.

return MultiIndex.from_frame(df_cat[['a', 'b']].drop_duplicates())


@pytest.fixture
def multi_index_non_cat_partial():
return MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')],
names=('a', 'b'))


@pytest.fixture
def multi_index_cat_compl_dict():
lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False),
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False),
Index(['min', 'max'])]
index = MultiIndex.from_product(lvls, names=['a', 'b', None])
return index


@pytest.fixture
def multi_index_non_cat_partial_dict():
return MultiIndex.from_tuples([('x', 'a', 'min'), ('x', 'a', 'max'),
('x', 'b', 'min'), ('x', 'b', 'max'),
('y', 'a', 'min'), ('y', 'a', 'max')],
names=('a', 'b', None))
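To make the "complete" versus "partial" naming from the review thread above concrete, a small sketch mirroring the two-level fixtures (illustrative only):

```python
from pandas import CategoricalIndex, MultiIndex

# "Complete": the full Cartesian product of the categorical levels.
complete = MultiIndex.from_product(
    [CategoricalIndex(['x', 'y'], categories=['x', 'y']),
     CategoricalIndex(['a', 'b'], categories=['a', 'b'])],
    names=['a', 'b'])
# -> ('x', 'a'), ('x', 'b'), ('y', 'a'), ('y', 'b')

# "Partial": only the combinations actually observed in df_cat, so the
# ('y', 'b') combination is missing.
partial = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')],
                                 names=('a', 'b'))
```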
31 changes: 31 additions & 0 deletions pandas/tests/groupby/test_categorical.py
@@ -1,4 +1,5 @@
from datetime import datetime
from collections import OrderedDict

import numpy as np
import pytest
@@ -963,3 +964,33 @@ def test_shift(fill_value):
categories=['a', 'b', 'c', 'd'], ordered=False)
res = ct.shift(1, fill_value=fill_value)
assert_equal(res, expected)


@pytest.mark.parametrize("observed, index, op, data", [
(True, 'multi_index_cat_partial', 'agg', [3, 3, 4]),
[Review comment, Member]: You can add a separate decorator for op parametrized on agg/apply rather than having to duplicate each time here.

[Review comment, Member]: Could also use the observed fixture separately.

[Reply, Contributor Author]: I think the different values for index prevent me from doing that: I don't want apply to run with index multi_index_cat_partial (line 970), and I don't want to run agg on multi_index_non_cat_partial (line 971). BaseGrouper.apply changes the index when observed=True and I did not find a simple way to keep it, hence the differences in indices. Looking at values only would make this simpler, but that does not feel sufficient. The same goes for observed: since observed=True and observed=False return different indices, I can't run all values of observed against all values of the expected indices.

[Reply, Member]: Oh OK, I misread those values. You can also split those out into a separate test if they should exhibit different behavior.

(True, 'multi_index_non_cat_partial', 'apply', [3, 3, 4]),
(False, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]),
(False, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan]),
(None, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]),
(None, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan])])
def test_groupby_series_observed(request, df_cat, observed, index, op, data):
# GH 24880
index = request.getfixturevalue(index)
expected = pd.Series(data=data, index=index, name='c')
grouped = df_cat.groupby(['a', 'b'], observed=observed).c
[Review comment, Member]: Stylistic nit, but can you use bracket notation instead of dot notation?

actual = getattr(grouped, op)(sum)
assert_series_equal(expected, actual)
[Review comment, Member]: Can you change actual to result and do assert_series_equal(result, expected)?



@pytest.mark.parametrize("observed, index, data", [
(True, 'multi_index_non_cat_partial_dict', [1, 2, 3, 3, 4, 4]),
(False, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan]),
(None, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan])])
def test_groupby_series_observed_apply_dict(request, df_cat, observed, index,
data):
# GH 24880
index = request.getfixturevalue(index)
expected = pd.Series(data=data, index=index, name='c')
actual = df_cat.groupby(['a', 'b'], observed=observed).c.\
[Review comment, Member]: Also stylistic, but we only use implicit line continuations in the code base, so you could just break after the opening parenthesis of apply here if not too long.

apply(lambda x: OrderedDict([('min', x.min()), ('max', x.max())]))
assert_series_equal(expected, actual)
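For readers less familiar with the dict-returning apply pattern exercised above, a rough sketch of the result the observed=True case checks, built on the same data as the df_cat fixture (exact rendering depends on the pandas version):

```python
from collections import OrderedDict

import pandas as pd

df = pd.DataFrame({'a': ['x', 'x', 'x', 'y'],
                   'b': ['a', 'a', 'b', 'a'],
                   'c': [1, 2, 3, 4]})
df['a'] = df['a'].astype('category')
df['b'] = df['b'].astype('category')

# Each group's OrderedDict becomes an extra, unnamed index level holding the
# 'min'/'max' keys, e.g. ('x', 'a', 'min') -> 1 and ('x', 'a', 'max') -> 2.
result = df.groupby(['a', 'b'], observed=True)['c'].apply(
    lambda x: OrderedDict([('min', x.min()), ('max', x.max())]))
print(result)
```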