diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2c66d3e4db321..2e9709f7bdd8f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -512,6 +512,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`) +- Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`) - Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2f665975f96bd..dc414a588a2ce 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -25,7 +25,6 @@ from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.frame import DataFrame @@ -33,7 +32,7 @@ from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, _apply_docs, _transform_template) -from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.index import Index, MultiIndex import pandas.core.indexes.base as ibase from 
pandas.core.internals import BlockManager, make_block from pandas.core.series import Series @@ -834,9 +833,10 @@ def _wrap_output(self, output, index, names=None): return Series(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): - return self._wrap_output(output=output, - index=self.grouper.result_index, - names=names) + result = self._wrap_output(output=output, + index=self.grouper.result_index, + names=names) + return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): return self._wrap_output(output=output, @@ -856,13 +856,16 @@ def _get_index(): return index if isinstance(values[0], dict): - # GH #823 + # GH #823 #24880 index = _get_index() - result = DataFrame(values, index=index).stack() + result = self._reindex_output(DataFrame(values, index=index)) + # if self.observed is False, + # keep all-NaN rows created while re-indexing + result = result.stack(dropna=self.observed) result.name = self._selection_name return result - if isinstance(values[0], (Series, dict)): + if isinstance(values[0], Series): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif isinstance(values[0], DataFrame): @@ -870,9 +873,11 @@ def _get_index(): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - # GH #6265 - return Series(values, index=_get_index(), - name=self._selection_name) + # GH #6265 #24880 + result = Series(data=values, + index=_get_index(), + name=self._selection_name) + return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): result = OrderedDict() @@ -1335,7 +1340,8 @@ def _gotitem(self, key, ndim, subset=None): if subset is None: subset = self.obj[key] return SeriesGroupBy(subset, selection=key, - grouper=self.grouper) + grouper=self.grouper, + observed=self.observed) raise AssertionError("invalid ndim for _gotitem") @@ -1407,69 +1413,6 @@ def _wrap_agged_blocks(self, items, blocks): 
return self._reindex_output(result)._convert(datetime=True) - def _reindex_output(self, result): - """ - If we have categorical groupers, then we want to make sure that - we have a fully reindex-output to the levels. These may have not - participated in the groupings (e.g. may have all been - nan groups); - - This can re-expand the output space - """ - - # we need to re-expand the output space to accomodate all values - # whether observed or not in the cartesian product of our groupes - groupings = self.grouper.groupings - if groupings is None: - return result - elif len(groupings) == 1: - return result - - # if we only care about the observed values - # we are done - elif self.observed: - return result - - # reindexing only applies to a Categorical grouper - elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) - for ping in groupings): - return result - - levels_list = [ping.group_index for ping in groupings] - index, _ = MultiIndex.from_product( - levels_list, names=self.grouper.names).sortlevel() - - if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d) - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `result`. An idea is to do: - # result = result.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `result`, and then reset the in-axis grouper columns. 
- - # Select in-axis groupers - in_axis_grps = ((i, ping.name) for (i, ping) - in enumerate(groupings) if ping.in_axis) - g_nums, g_names = zip(*in_axis_grps) - - result = result.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - result = result.set_index(self.grouper.result_index - ).reindex(index, copy=False) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - result = result.reset_index(level=g_nums) - - return result.reset_index(drop=True) - def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aa04b7505afe4..91bb71a1a8af7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -36,13 +36,14 @@ class providing the base-class of operations. from pandas.api.types import ( is_datetime64_dtype, is_integer_dtype, is_object_dtype) import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical from pandas.core.base import ( DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.index import Index, MultiIndex +from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -2301,6 +2302,79 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] + def _reindex_output(self, output): + """ + If we have categorical groupers, then we might want to make sure that + we have a fully re-indexed output to the levels. 
This means expanding + the output space to accommodate all values in the cartesian product of + our groups, regardless of whether they were observed in the data or + not. This will expand the output space if there are missing groups. + + The method returns early without modifying the input if the number of + groupings is less than 2, self.observed == True or none of the groupers + are categorical. + + Parameters + ---------- + output : Series or DataFrame + Object resulting from grouping and applying an operation. + + Returns + ------- + Series or DataFrame + Object (potentially) re-indexed to include all possible groups. + """ + groupings = self.grouper.groupings + if groupings is None: + return output + elif len(groupings) == 1: + return output + + # if we only care about the observed values + # we are done + elif self.observed: + return output + + # reindexing only applies to a Categorical grouper + elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) + for ping in groupings): + return output + + levels_list = [ping.group_index for ping in groupings] + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names).sortlevel() + + if self.as_index: + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + return output.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `output`. An idea is to do: + # output = output.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `output`, and then reset the in-axis grouper columns.
+ + # Select in-axis groupers + in_axis_grps = ((i, ping.name) for (i, ping) + in enumerate(groupings) if ping.in_axis) + g_nums, g_names = zip(*in_axis_grps) + + output = output.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + output = output.set_index(self.grouper.result_index + ).reindex(index, copy=False) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + output = output.reset_index(level=g_nums) + + return output.reset_index(drop=True) + GroupBy._add_numeric_operations() diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 112f7629d735a..f24fa0daa5b18 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1,3 +1,4 @@ +from collections import OrderedDict from datetime import datetime import numpy as np @@ -25,7 +26,7 @@ def f(a): ordered=a.ordered) return a - index = pd.MultiIndex.from_product(map(f, args), names=names) + index = MultiIndex.from_product(map(f, args), names=names) return result.reindex(index).sort_index() @@ -189,7 +190,7 @@ def test_level_get_group(observed): # GH15155 df = DataFrame(data=np.arange(2, 22, 2), index=MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + levels=[CategoricalIndex(["a", "b"]), range(10)], codes=[[0] * 5 + [1] * 5, range(10)], names=["Index1", "Index2"])) g = df.groupby(level=["Index1"], observed=observed) @@ -197,7 +198,7 @@ def test_level_get_group(observed): # expected should equal test.loc[["a"]] # GH15166 expected = DataFrame(data=np.arange(2, 12, 2), - index=pd.MultiIndex(levels=[pd.CategoricalIndex( + index=MultiIndex(levels=[CategoricalIndex( ["a", "b"]), range(5)], codes=[[0] * 5, range(5)], names=["Index1", "Index2"])) @@ -265,7 +266,7 @@ def test_observed(observed): # multiple groupers with a non-cat gb = df.groupby(['A', 'B', 'C'], observed=observed) - exp_index = pd.MultiIndex.from_arrays( 
+ exp_index = MultiIndex.from_arrays( [cat1, cat2, ['foo', 'bar'] * 2], names=['A', 'B', 'C']) expected = DataFrame({'values': Series( @@ -280,7 +281,7 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) gb = df.groupby(['A', 'B'], observed=observed) - exp_index = pd.MultiIndex.from_arrays( + exp_index = MultiIndex.from_arrays( [cat1, cat2], names=['A', 'B']) expected = DataFrame({'values': [1, 2, 3, 4]}, @@ -296,25 +297,25 @@ def test_observed(observed): # https://github.com/pandas-dev/pandas/issues/8138 d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), + Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), 'ints': [1, 1, 2, 2], 'val': [10, 20, 30, 40]} - df = pd.DataFrame(d) + df = DataFrame(d) # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() - exp_index = pd.CategoricalIndex(list('ab'), name="cat", - categories=list('abc'), - ordered=True) + exp_index = CategoricalIndex(list('ab'), name="cat", + categories=list('abc'), + ordered=True) expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, index=exp_index) if not observed: - index = pd.CategoricalIndex(list('abc'), name="cat", - categories=list('abc'), - ordered=True) + index = CategoricalIndex(list('abc'), name="cat", + categories=list('abc'), + ordered=True) expected = expected.reindex(index) tm.assert_frame_equal(result, expected) @@ -324,9 +325,9 @@ def test_observed(observed): result = groups_double_key.agg('mean') expected = DataFrame( {"val": [10, 30, 20, 40], - "cat": pd.Categorical(['a', 'a', 'b', 'b'], - categories=['a', 'b', 'c'], - ordered=True), + "cat": Categorical(['a', 'a', 'b', 'b'], + categories=['a', 'b', 'c'], + ordered=True), "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) if not observed: expected = cartesian_product_for_groupers( @@ -347,7 +348,7 @@ def test_observed(observed): # with as_index d = {'foo': [10, 8, 4, 8, 4, 
1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} - df = pd.DataFrame(d) + df = DataFrame(d) cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) df['range'] = cat groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) @@ -360,7 +361,7 @@ def test_observed(observed): def test_observed_codes_remap(observed): d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - df = pd.DataFrame(d) + df = DataFrame(d) values = pd.cut(df['C1'], [1, 2, 3, 6]) values.name = "cat" groups_double_key = df.groupby([values, 'C2'], observed=observed) @@ -401,8 +402,8 @@ def test_observed_groups(observed): # gh-20583 # test that we have the appropriate groups - cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) - df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]}) + cat = Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) + df = DataFrame({'cat': cat, 'vals': [1, 2, 3]}) g = df.groupby('cat', observed=observed) result = g.groups @@ -419,9 +420,9 @@ def test_observed_groups(observed): def test_observed_groups_with_nan(observed): # GH 24740 - df = pd.DataFrame({'cat': pd.Categorical(['a', np.nan, 'a'], - categories=['a', 'b', 'd']), - 'vals': [1, 2, 3]}) + df = DataFrame({'cat': Categorical(['a', np.nan, 'a'], + categories=['a', 'b', 'd']), + 'vals': [1, 2, 3]}) g = df.groupby('cat', observed=observed) result = g.groups if observed: @@ -435,16 +436,16 @@ def test_observed_groups_with_nan(observed): def test_dataframe_categorical_with_nan(observed): # GH 21151 - s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'], - categories=['a', 'b', 'c']) - s2 = pd.Series([1, 2, 3, 4]) - df = pd.DataFrame({'s1': s1, 's2': s2}) + s1 = Categorical([np.nan, 'a', np.nan, 'a'], + categories=['a', 'b', 'c']) + s2 = Series([1, 2, 3, 4]) + df = DataFrame({'s1': s1, 's2': s2}) result = df.groupby('s1', observed=observed).first().reset_index() if observed: - expected = DataFrame({'s1': pd.Categorical(['a'], + expected = 
DataFrame({'s1': Categorical(['a'], categories=['a', 'b', 'c']), 's2': [2]}) else: - expected = DataFrame({'s1': pd.Categorical(['a', 'b', 'c'], + expected = DataFrame({'s1': Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']), 's2': [2, np.nan, np.nan]}) tm.assert_frame_equal(result, expected) @@ -459,11 +460,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # Build a dataframe with cat having one unobserved category ('missing'), # and a Series with identical values - label = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'], - categories=['a', 'b', 'missing', 'd'], - ordered=ordered) - val = pd.Series(['d', 'a', 'b', 'a', 'd', 'b']) - df = pd.DataFrame({'label': label, 'val': val}) + label = Categorical(['d', 'a', 'b', 'a', 'd', 'b'], + categories=['a', 'b', 'missing', 'd'], + ordered=ordered) + val = Series(['d', 'a', 'b', 'a', 'd', 'b']) + df = DataFrame({'label': label, 'val': val}) # aggregate on the Categorical result = (df.groupby('label', observed=observed, sort=sort)['val'] @@ -471,8 +472,8 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # If ordering works, we expect index labels equal to aggregation results, # except for 'observed=False': label 'missing' has aggregation None - label = pd.Series(result.index.array, dtype='object') - aggr = pd.Series(result.array) + label = Series(result.index.array, dtype='object') + aggr = Series(result.array) if not observed: aggr[aggr.isna()] = 'missing' if not all(label == aggr): @@ -555,9 +556,9 @@ def test_categorical_index(): def test_describe_categorical_columns(): # GH 11558 - cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) + cats = CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) df = DataFrame(np.random.randn(20, 4), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() @@ -567,22 +568,22 @@ def 
test_describe_categorical_columns(): def test_unstack_categorical(): # GH11558 (example is taken from the original issue) - df = pd.DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) + df = DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) df['medium'] = df['medium'].astype('category') gcat = df.groupby( ['artist', 'medium'], observed=False)['a'].count().unstack() result = gcat.describe() - exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, - name='medium') + exp_columns = CategoricalIndex(['A', 'B'], ordered=False, + name='medium') tm.assert_index_equal(result.columns, exp_columns) tm.assert_categorical_equal(result.columns.values, exp_columns.values) result = gcat['A'] + gcat['B'] - expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + expected = Series([6, 4], index=Index(['X', 'Y'], name='artist')) tm.assert_series_equal(result, expected) @@ -644,22 +645,22 @@ def test_preserve_categories(): categories = list('abc') # ordered=True - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=True)}) - index = pd.CategoricalIndex(categories, categories, ordered=True) + df = DataFrame({'A': Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = CategoricalIndex(categories, categories, ordered=True) tm.assert_index_equal( df.groupby('A', sort=True, observed=False).first().index, index) tm.assert_index_equal( df.groupby('A', sort=False, observed=False).first().index, index) # ordered=False - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=False)}) - sort_index = pd.CategoricalIndex(categories, categories, ordered=False) - nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), - ordered=False) + df = DataFrame({'A': Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = CategoricalIndex(categories, categories, ordered=False) + nosort_index = 
CategoricalIndex(list('bac'), list('bac'), + ordered=False) tm.assert_index_equal( df.groupby('A', sort=True, observed=False).first().index, sort_index) @@ -857,94 +858,94 @@ def test_sort_datetimelike(): def test_empty_sum(): # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + df = DataFrame({"A": Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') # 0 by default result = df.groupby("A", observed=False).B.sum() - expected = pd.Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.sum(min_count=0) - expected = pd.Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.sum(min_count=1) - expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + expected = Series([3, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count>1 result = df.groupby("A", observed=False).B.sum(min_count=2) - expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') + expected = Series([3, np.nan, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) def test_empty_prod(): # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) + df = DataFrame({"A": Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') # 1 by default result = df.groupby("A", 
observed=False).B.prod() - expected = pd.Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.prod(min_count=0) - expected = pd.Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.prod(min_count=1) - expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + expected = Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) def test_groupby_multiindex_categorical_datetime(): # https://github.com/pandas-dev/pandas/issues/21390 - df = pd.DataFrame({ - 'key1': pd.Categorical(list('abcbabcba')), - 'key2': pd.Categorical( + df = DataFrame({ + 'key1': Categorical(list('abcbabcba')), + 'key2': Categorical( list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), 'values': np.arange(9), }) result = df.groupby(['key1', 'key2']).mean() - idx = pd.MultiIndex.from_product( - [pd.Categorical(['a', 'b', 'c']), - pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], + idx = MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], names=['key1', 'key2']) - expected = pd.DataFrame( + expected = DataFrame( {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) assert_frame_equal(result, expected) @pytest.mark.parametrize("as_index, expected", [ - (True, pd.Series( - index=pd.MultiIndex.from_arrays( - [pd.Series([1, 1, 2], dtype='category'), - [1, 2, 2]], names=['a', 'b'] + (True, Series( + index=MultiIndex.from_arrays( + [Series([1, 1, 2], dtype='category'), + [1, 2, 2]], names=['a', 'b'] ), data=[1, 2, 3], name='x' )), - (False, pd.DataFrame({ - 'a': pd.Series([1, 1, 2], dtype='category'), + (False, DataFrame({ + 'a': Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 
'x': [1, 2, 3] })) ]) def test_groupby_agg_observed_true_single_column(as_index, expected): # GH-23970 - df = pd.DataFrame({ - 'a': pd.Series([1, 1, 2], dtype='category'), + df = DataFrame({ + 'a': Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3] }) @@ -957,9 +958,92 @@ def test_groupby_agg_observed_true_single_column(as_index, expected): @pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT]) def test_shift(fill_value): - ct = pd.Categorical(['a', 'b', 'c', 'd'], - categories=['a', 'b', 'c', 'd'], ordered=False) - expected = pd.Categorical([None, 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], ordered=False) + ct = Categorical(['a', 'b', 'c', 'd'], + categories=['a', 'b', 'c', 'd'], ordered=False) + expected = Categorical([None, 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], ordered=False) res = ct.shift(1, fill_value=fill_value) assert_equal(res, expected) + + +@pytest.fixture +def df_cat(df): + """ + DataFrame with multiple categorical columns and a column of integers. + Shortened so as not to contain all possible combinations of categories. + Useful for testing `observed` kwarg functionality on GroupBy objects. 
+ + Parameters + ---------- + df : DataFrame + Non-categorical, longer DataFrame from another fixture, used to derive + this one + + Returns + ------- + df_cat : DataFrame + """ + df_cat = df.copy()[:4] # leave out some groups + df_cat['A'] = df_cat['A'].astype('category') + df_cat['B'] = df_cat['B'].astype('category') + df_cat['C'] = Series([1, 2, 3, 4]) + df_cat = df_cat.drop(['D'], axis=1) + return df_cat + + +@pytest.mark.parametrize('operation, kwargs', [ + ('agg', dict(dtype='category')), + ('apply', dict())]) +def test_seriesgroupby_observed_true(df_cat, operation, kwargs): + # GH 24880 + index = MultiIndex.from_frame( + DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'one', 'three'] + }, **kwargs)) + expected = Series(data=[1, 3, 2, 4], index=index, name='C') + grouped = df_cat.groupby(['A', 'B'], observed=True)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('operation', ['agg', 'apply']) +@pytest.mark.parametrize('observed', [False, None]) +def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): + # GH 24880 + index, _ = MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False)], + names=['A', 'B']).sortlevel() + + expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], + index=index, name='C') + grouped = df_cat.groupby(['A', 'B'], observed=observed)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize("observed, index, data", [ + (True, MultiIndex.from_tuples( + [('foo', 'one', 'min'), ('foo', 'one', 'max'), + ('foo', 'two', 'min'), ('foo', 'two', 'max'), + ('bar', 'one', 'min'), ('bar', 'one', 'max'), + ('bar', 'three', 'min'), ('bar', 'three', 'max')], + names=['A', 'B', None]), [1, 1, 3, 3, 2, 2, 4, 4]), + (False, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), +
CategoricalIndex(['one', 'three', 'two'], ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3]), + (None, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) +def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): + # GH 24880 + expected = Series(data=data, index=index, name='C') + result = df_cat.groupby(['A', 'B'], observed=observed)['C'].apply( + lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) + assert_series_equal(result, expected)