From a5d6d1a935bc17a29a42f16abe0827aaa60381e2 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Sun, 19 May 2019 21:38:05 +0100 Subject: [PATCH 01/14] Fix 'observed' kwarg not doing anything on SeriesGroupBy --- pandas/core/groupby/generic.py | 90 +++++----------------------- pandas/core/groupby/groupby.py | 66 +++++++++++++++++++- pandas/tests/groupby/test_groupby.py | 59 +++++++++++++++++- 3 files changed, 138 insertions(+), 77 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2f665975f96bd..32933c3385e25 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -25,7 +25,6 @@ from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.frame import DataFrame @@ -33,7 +32,7 @@ from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, _apply_docs, _transform_template) -from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.index import Index, MultiIndex import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series @@ -834,9 +833,10 @@ def _wrap_output(self, output, index, names=None): return Series(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): - return self._wrap_output(output=output, - index=self.grouper.result_index, - names=names) + result = self._wrap_output(output=output, + index=self.grouper.result_index, + names=names) + return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): return self._wrap_output(output=output, @@ -856,13 +856,15 @@ def _get_index(): return index if isinstance(values[0], dict): - # GH #823 + # GH #823 #24880 index = _get_index() - result = DataFrame(values, index=index).stack() + result = self._reindex_output(DataFrame(values, index=index)) + dropna = self.observed # if self.observed is False, keep all-NaN rows created while re-indexing + result = result.stack(dropna=dropna) result.name = self._selection_name return result - if isinstance(values[0], (Series, dict)): + if isinstance(values[0], Series): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif isinstance(values[0], DataFrame): @@ -870,9 +872,9 @@ def _get_index(): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - # GH #6265 - return Series(values, index=_get_index(), - name=self._selection_name) + # GH #6265 #24880 + result = Series(values, index=_get_index(), name=self._selection_name) + return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): result = OrderedDict() @@ -1335,7 +1337,8 @@ def _gotitem(self, key, ndim, subset=None): if subset is None: subset = self.obj[key] return SeriesGroupBy(subset, selection=key, - grouper=self.grouper) + grouper=self.grouper, + observed=self.observed) raise AssertionError("invalid ndim for _gotitem") @@ -1407,69 +1410,6 @@ def _wrap_agged_blocks(self, items, blocks): return self._reindex_output(result)._convert(datetime=True) - def _reindex_output(self, result): - """ - If we have categorical groupers, then we want to make sure that - we have a fully reindex-output to the levels. These may have not - participated in the groupings (e.g. may have all been - nan groups); - - This can re-expand the output space - """ - - # we need to re-expand the output space to accomodate all values - # whether observed or not in the cartesian product of our groupes - groupings = self.grouper.groupings - if groupings is None: - return result - elif len(groupings) == 1: - return result - - # if we only care about the observed values - # we are done - elif self.observed: - return result - - # reindexing only applies to a Categorical grouper - elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) - for ping in groupings): - return result - - levels_list = [ping.group_index for ping in groupings] - index, _ = MultiIndex.from_product( - levels_list, names=self.grouper.names).sortlevel() - - if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d) - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `result`. An idea is to do: - # result = result.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `result`, and then reset the in-axis grouper columns. - - # Select in-axis groupers - in_axis_grps = ((i, ping.name) for (i, ping) - in enumerate(groupings) if ping.in_axis) - g_nums, g_names = zip(*in_axis_grps) - - result = result.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - result = result.set_index(self.grouper.result_index - ).reindex(index, copy=False) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - result = result.reset_index(level=g_nums) - - return result.reset_index(drop=True) - def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4e9e3b4963b6d..3d8716def20fb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -17,6 +17,7 @@ class providing the base-class of operations. import numpy as np +from pandas.core.arrays import Categorical from pandas._config.config import option_context from pandas._libs import Timestamp @@ -42,7 +43,7 @@ class providing the base-class of operations. from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -2301,6 +2302,69 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] + def _reindex_output(self, result): + """ + If we have categorical groupers, then we want to make sure that + we have a fully reindex-output to the levels. These may have not + participated in the groupings (e.g. may have all been + nan groups); + + This can re-expand the output space + """ + + # we need to re-expand the output space to accomodate all values + # whether observed or not in the cartesian product of our groupes + groupings = self.grouper.groupings + if groupings is None: + return result + elif len(groupings) == 1: + return result + + # if we only care about the observed values + # we are done + elif self.observed: + return result + + # reindexing only applies to a Categorical grouper + elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) + for ping in groupings): + return result + + levels_list = [ping.group_index for ping in groupings] + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names).sortlevel() + + if self.as_index: + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + return result.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `result`. An idea is to do: + # result = result.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `result`, and then reset the in-axis grouper columns. + + # Select in-axis groupers + in_axis_grps = ((i, ping.name) for (i, ping) + in enumerate(groupings) if ping.in_axis) + g_nums, g_names = zip(*in_axis_grps) + + result = result.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + result = result.set_index(self.grouper.result_index + ).reindex(index, copy=False) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + result = result.reset_index(level=g_nums) + + return result.reset_index(drop=True) + GroupBy._add_numeric_operations() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2511063110f92..6aa07eac681bf 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv) + DataFrame, Index, MultiIndex, CategoricalIndex, Series, Timestamp, date_range, read_csv) import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( @@ -1736,3 +1736,60 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis(): expected = pd.Series([3], index=ei) assert_series_equal(result, expected) + + +def test_groupby_observed(): + # GH 24880 + df = DataFrame({'a': ['x', 'x', 'x', 'y'], + 'b': ['a', 'a', 'b', 'a'], + 'c': [1, 2, 3, 4]}) + df['a'] = df['a'].astype('category') + df['b'] = df['b'].astype('category') + + # test .agg and .apply when observed == False + levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] + index, _ = MultiIndex.from_product(levels, names=['a', 'b']).sortlevel() + expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') + actual_agg = df.groupby(['a', 'b']).c.agg(sum) + actual_apply = df.groupby(['a', 'b']).c.apply(sum) + assert_series_equal(expected, actual_agg) + assert_series_equal(expected, actual_apply) + + # test .agg when observed == True + index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates()) + expected = pd.Series([3, 3, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.agg(sum) + assert_series_equal(expected, actual) + + # test .apply when observed == True + index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], names=('a', 'b')) + expected = pd.Series([3, 3, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) + assert_series_equal(expected, actual) + + +def test_groupby_observed_apply_lambda_returns_dict(): + # GH 24880 + df = DataFrame({'a': ['x', 'x', 'x', 'y'], + 'b': ['a', 'a', 'b', 'a'], + 'c': [1, 2, 3, 4]}) + df['a'] = df['a'].astype('category') + df['b'] = df['b'].astype('category') + + # observed == False + levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), + Index(['min', 'max'])] + index, _ = MultiIndex.from_product(levels, names=['a', 'b', None]).sortlevel() + expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], index=index, name='c') + actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + assert_series_equal(expected, actual) + + # observed == True + index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), + ('x', 'b', 'max'), ('x', 'b', 'min'), + ('y', 'a', 'max'), ('y', 'a', 'min')],names=('a', 'b', None)) + expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + assert_series_equal(expected, actual) From 2575c41727168039b7c2c62629e8a0226590176f Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Sun, 19 May 2019 21:56:47 +0100 Subject: [PATCH 02/14] Wrap long lines --- pandas/core/groupby/generic.py | 9 +++++--- pandas/tests/groupby/test_groupby.py | 34 +++++++++++++++++----------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 32933c3385e25..dc414a588a2ce 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -859,8 +859,9 @@ def _get_index(): # GH #823 #24880 index = _get_index() result = self._reindex_output(DataFrame(values, index=index)) - dropna = self.observed # if self.observed is False, keep all-NaN rows created while re-indexing - result = result.stack(dropna=dropna) + # if self.observed is False, + # keep all-NaN rows created while re-indexing + result = result.stack(dropna=self.observed) result.name = self._selection_name return result @@ -873,7 +874,9 @@ def _get_index(): not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = Series(values, index=_get_index(), name=self._selection_name) + result = Series(data=values, + index=_get_index(), + name=self._selection_name) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6aa07eac681bf..3690dc7bb048d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,7 +10,8 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, CategoricalIndex, Series, Timestamp, date_range, read_csv) + DataFrame, Index, MultiIndex, CategoricalIndex, + Series, Timestamp, date_range, read_csv) import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( @@ -1747,9 +1748,9 @@ def test_groupby_observed(): df['b'] = df['b'].astype('category') # test .agg and .apply when observed == False - levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] - index, _ = MultiIndex.from_product(levels, names=['a', 'b']).sortlevel() + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] + index, _ = MultiIndex.from_product(lvls, names=['a', 'b']).sortlevel() expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') actual_agg = df.groupby(['a', 'b']).c.agg(sum) actual_apply = df.groupby(['a', 'b']).c.apply(sum) @@ -1763,7 +1764,8 @@ def test_groupby_observed(): assert_series_equal(expected, actual) # test .apply when observed == True - index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], names=('a', 'b')) + index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], + names=('a', 'b')) expected = pd.Series([3, 3, 4], index=index, name='c') actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) assert_series_equal(expected, actual) @@ -1778,18 +1780,24 @@ def test_groupby_observed_apply_lambda_returns_dict(): df['b'] = df['b'].astype('category') # observed == False - levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), - Index(['min', 'max'])] - index, _ = MultiIndex.from_product(levels, names=['a', 'b', None]).sortlevel() - expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], index=index, name='c') - actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), + Index(['min', 'max'])] + index, _ = MultiIndex.from_product(lvls, + names=['a', 'b', None]).sortlevel() + expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], + index=index, + name='c') + actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), + 'max': x.max()}) assert_series_equal(expected, actual) # observed == True index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), ('x', 'b', 'max'), ('x', 'b', 'min'), - ('y', 'a', 'max'), ('y', 'a', 'min')],names=('a', 'b', None)) + ('y', 'a', 'max'), ('y', 'a', 'min')], + names=('a', 'b', None)) expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + actual = df.groupby(['a', 'b'], observed=True).c.\ + apply(lambda x: {'min': x.min(), 'max': x.max()}) assert_series_equal(expected, actual) From 1c02d9fb0e893f72273846809f273171fa08be0b Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Mon, 20 May 2019 00:33:20 +0100 Subject: [PATCH 03/14] Move tests to test_categorical.py --- pandas/tests/groupby/test_categorical.py | 64 ++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 67 +----------------------- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 112f7629d735a..0f86e360fea6a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -963,3 +963,67 @@ def test_shift(fill_value): categories=['a', 'b', 'c', 'd'], ordered=False) res = ct.shift(1, fill_value=fill_value) assert_equal(res, expected) + + +def test_groupby_series_observed(): + # GH 24880 + df = DataFrame({'a': ['x', 'x', 'x', 'y'], + 'b': ['a', 'a', 'b', 'a'], + 'c': [1, 2, 3, 4]}) + df['a'] = df['a'].astype('category') + df['b'] = df['b'].astype('category') + + # test .agg and .apply when observed == False + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] + index, _ = MultiIndex.from_product(lvls, names=['a', 'b']).sortlevel() + expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') + actual_agg = df.groupby(['a', 'b']).c.agg(sum) + actual_apply = df.groupby(['a', 'b']).c.apply(sum) + assert_series_equal(expected, actual_agg) + assert_series_equal(expected, actual_apply) + + # test .agg when observed == True + index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates()) + expected = pd.Series([3, 3, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.agg(sum) + assert_series_equal(expected, actual) + + # test .apply when observed == True + index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], + names=('a', 'b')) + expected = pd.Series([3, 3, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) + assert_series_equal(expected, actual) + + +def test_groupby_series_observed_apply_dict(): + # GH 24880 + df = DataFrame({'a': ['x', 'x', 'x', 'y'], + 'b': ['a', 'a', 'b', 'a'], + 'c': [1, 2, 3, 4]}) + df['a'] = df['a'].astype('category') + df['b'] = df['b'].astype('category') + + # observed == False + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), + Index(['min', 'max'])] + index, _ = MultiIndex.from_product(lvls, + names=['a', 'b', None]).sortlevel() + expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], + index=index, + name='c') + actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), + 'max': x.max()}) + assert_series_equal(expected, actual) + + # observed == True + index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), + ('x', 'b', 'max'), ('x', 'b', 'min'), + ('y', 'a', 'max'), ('y', 'a', 'min')], + names=('a', 'b', None)) + expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.\ + apply(lambda x: {'min': x.min(), 'max': x.max()}) + assert_series_equal(expected, actual) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3690dc7bb048d..2511063110f92 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,8 +10,7 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, CategoricalIndex, - Series, Timestamp, date_range, read_csv) + DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv) import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( @@ -1737,67 +1736,3 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis(): expected = pd.Series([3], index=ei) assert_series_equal(result, expected) - - -def test_groupby_observed(): - # GH 24880 - df = DataFrame({'a': ['x', 'x', 'x', 'y'], - 'b': ['a', 'a', 'b', 'a'], - 'c': [1, 2, 3, 4]}) - df['a'] = df['a'].astype('category') - df['b'] = df['b'].astype('category') - - # test .agg and .apply when observed == False - lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] - index, _ = MultiIndex.from_product(lvls, names=['a', 'b']).sortlevel() - expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') - actual_agg = df.groupby(['a', 'b']).c.agg(sum) - actual_apply = df.groupby(['a', 'b']).c.apply(sum) - assert_series_equal(expected, actual_agg) - assert_series_equal(expected, actual_apply) - - # test .agg when observed == True - index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates()) - expected = pd.Series([3, 3, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.agg(sum) - assert_series_equal(expected, actual) - - # test .apply when observed == True - index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], - names=('a', 'b')) - expected = pd.Series([3, 3, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) - assert_series_equal(expected, actual) - - -def test_groupby_observed_apply_lambda_returns_dict(): - # GH 24880 - df = DataFrame({'a': ['x', 'x', 'x', 'y'], - 'b': ['a', 'a', 'b', 'a'], - 'c': [1, 2, 3, 4]}) - df['a'] = df['a'].astype('category') - df['b'] = df['b'].astype('category') - - # observed == False - lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), - Index(['min', 'max'])] - index, _ = MultiIndex.from_product(lvls, - names=['a', 'b', None]).sortlevel() - expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], - index=index, - name='c') - actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), - 'max': x.max()}) - assert_series_equal(expected, actual) - - # observed == True - index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), - ('x', 'b', 'max'), ('x', 'b', 'min'), - ('y', 'a', 'max'), ('y', 'a', 'min')], - names=('a', 'b', None)) - expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.\ - apply(lambda x: {'min': x.min(), 'max': x.max()}) - assert_series_equal(expected, actual) From 0e9f4737f19e6e5ab538481781fc9b371297173b Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Mon, 20 May 2019 13:46:59 +0100 Subject: [PATCH 04/14] Parameterized tests for 'observed' kwarg on SeriesGroupBy --- pandas/tests/groupby/conftest.py | 48 +++++++++++++- pandas/tests/groupby/test_categorical.py | 79 +++++++----------------- 2 files changed, 70 insertions(+), 57 deletions(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index cb4fe511651ee..ca0ea5bc49d27 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex +from pandas import DataFrame, CategoricalIndex, Index, MultiIndex from pandas.util import testing as tm @@ -76,3 +76,49 @@ def three_group(): 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) + + +@pytest.fixture +def df_cat(): + df = DataFrame({'a': ['x', 'x', 'x', 'y'], + 'b': ['a', 'a', 'b', 'a'], + 'c': [1, 2, 3, 4]}) + df['a'] = df['a'].astype('category') + df['b'] = df['b'].astype('category') + return df + + +@pytest.fixture +def multi_index_cat_complete(): + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] + index = MultiIndex.from_product(lvls, names=['a', 'b']) + return index + + +@pytest.fixture +def multi_index_cat_partial(df_cat): + return MultiIndex.from_frame(df_cat[['a', 'b']].drop_duplicates()) + + +@pytest.fixture +def multi_index_non_cat_partial(): + return MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], + names=('a', 'b')) + + +@pytest.fixture +def multi_index_cat_compl_dict(): + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), + Index(['min', 'max'])] + index = MultiIndex.from_product(lvls, names=['a', 'b', None]) + return index + + +@pytest.fixture +def multi_index_non_cat_partial_dict(): + return MultiIndex.from_tuples([('x', 'a', 'min'), ('x', 'a', 'max'), + ('x', 'b', 'min'), ('x', 'b', 'max'), + ('y', 'a', 'min'), ('y', 'a', 'max')], + names=('a', 'b', None)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0f86e360fea6a..292e606390f34 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1,4 +1,5 @@ from datetime import datetime +from collections import OrderedDict import numpy as np import pytest @@ -965,65 +966,31 @@ def test_shift(fill_value): assert_equal(res, expected) -def test_groupby_series_observed(): +@pytest.mark.parametrize("observed, index, op, data", [ + (True, 'multi_index_cat_partial', 'agg', [3, 3, 4]), + (True, 'multi_index_non_cat_partial', 'apply', [3, 3, 4]), + (False, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]), + (False, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan]), + (None, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]), + (None, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan])]) +def test_groupby_series_observed(request, df_cat, observed, index, op, data): # GH 24880 - df = DataFrame({'a': ['x', 'x', 'x', 'y'], - 'b': ['a', 'a', 'b', 'a'], - 'c': [1, 2, 3, 4]}) - df['a'] = df['a'].astype('category') - df['b'] = df['b'].astype('category') - - # test .agg and .apply when observed == False - lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] - index, _ = MultiIndex.from_product(lvls, names=['a', 'b']).sortlevel() - expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') - actual_agg = df.groupby(['a', 'b']).c.agg(sum) - actual_apply = df.groupby(['a', 'b']).c.apply(sum) - assert_series_equal(expected, actual_agg) - assert_series_equal(expected, actual_apply) - - # test .agg when observed == True - index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates()) - expected = pd.Series([3, 3, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.agg(sum) - assert_series_equal(expected, actual) - - # test .apply when observed == True - index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], - names=('a', 'b')) - expected = pd.Series([3, 3, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) + index = request.getfixturevalue(index) + expected = pd.Series(data=data, index=index, name='c') + grouped = df_cat.groupby(['a', 'b'], observed=observed).c + actual = getattr(grouped, op)(sum) assert_series_equal(expected, actual) -def test_groupby_series_observed_apply_dict(): +@pytest.mark.parametrize("observed, index, data", [ + (True, 'multi_index_non_cat_partial_dict', [1, 2, 3, 3, 4, 4]), + (False, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan]), + (None, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan])]) +def test_groupby_series_observed_apply_dict(request, df_cat, observed, index, + data): # GH 24880 - df = DataFrame({'a': ['x', 'x', 'x', 'y'], - 'b': ['a', 'a', 'b', 'a'], - 'c': [1, 2, 3, 4]}) - df['a'] = df['a'].astype('category') - df['b'] = df['b'].astype('category') - - # observed == False - lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), - Index(['min', 'max'])] - index, _ = MultiIndex.from_product(lvls, - names=['a', 'b', None]).sortlevel() - expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], - index=index, - name='c') - actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), - 'max': x.max()}) - assert_series_equal(expected, actual) - - # observed == True - index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), - ('x', 'b', 'max'), ('x', 'b', 'min'), - ('y', 'a', 'max'), ('y', 'a', 'min')], - names=('a', 'b', None)) - expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.\ - apply(lambda x: {'min': x.min(), 'max': x.max()}) + index = request.getfixturevalue(index) + expected = pd.Series(data=data, index=index, name='c') + actual = df_cat.groupby(['a', 'b'], observed=observed).c.\ + apply(lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) assert_series_equal(expected, actual) From cd481ad5c016d7fd919d77d96ea7ab120d3a65c2 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Mon, 20 May 2019 18:09:19 +0100 Subject: [PATCH 05/14] Split test_groupby_series_observed to utilize fixtures better;Sort imports --- pandas/tests/groupby/conftest.py | 2 +- pandas/tests/groupby/test_categorical.py | 40 ++++++++++++++---------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index ca0ea5bc49d27..42c0080bf60e6 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, CategoricalIndex, Index, MultiIndex +from pandas import CategoricalIndex, DataFrame, Index, MultiIndex from pandas.util import testing as tm diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 292e606390f34..dfa4872b9daf7 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1,5 +1,5 @@ -from datetime import datetime from collections import OrderedDict +from datetime import datetime import numpy as np import pytest @@ -966,20 +966,28 @@ def test_shift(fill_value): assert_equal(res, expected) -@pytest.mark.parametrize("observed, index, op, data", [ - (True, 'multi_index_cat_partial', 'agg', [3, 3, 4]), - (True, 'multi_index_non_cat_partial', 'apply', [3, 3, 4]), - (False, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]), - (False, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan]), - (None, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]), - (None, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan])]) -def test_groupby_series_observed(request, df_cat, observed, index, op, data): +@pytest.mark.parametrize("index, op", [ + ('multi_index_cat_partial', 'agg'), + ('multi_index_non_cat_partial', 'apply')]) +def test_groupby_series_observed_true(request, df_cat, index, op): # GH 24880 index = request.getfixturevalue(index) - expected = pd.Series(data=data, index=index, name='c') - grouped = df_cat.groupby(['a', 'b'], observed=observed).c - actual = getattr(grouped, op)(sum) - assert_series_equal(expected, actual) + expected = pd.Series(data=[3, 3, 4], index=index, name='c') + grouped = df_cat.groupby(['a', 'b'], observed=True)['c'] + result = getattr(grouped, op)(sum) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize("op", ['agg', 'apply']) +@pytest.mark.parametrize("observed", [False, None]) +def test_groupby_series_observed_false_or_none( + df_cat, multi_index_cat_complete, observed, op): + # GH 24880 + index = multi_index_cat_complete + expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') + grouped = df_cat.groupby(['a', 'b'], observed=observed)['c'] + result = getattr(grouped, op)(sum) + assert_series_equal(result, expected) @pytest.mark.parametrize("observed, index, data", [ @@ -991,6 +999,6 @@ def test_groupby_series_observed_apply_dict(request, df_cat, observed, index, # GH 24880 index = request.getfixturevalue(index) expected = pd.Series(data=data, index=index, name='c') - actual = df_cat.groupby(['a', 'b'], observed=observed).c.\ - apply(lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) - assert_series_equal(expected, actual) + result = df_cat.groupby(['a', 'b'], observed=observed)['c'].apply( + lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) + assert_series_equal(result, expected) From a515cafd95047ab9408064b3c67fe2c6bb44c07c Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Mon, 20 May 2019 19:47:23 +0100 Subject: [PATCH 06/14] Sort imports in core/groupby/groupby.py --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3d8716def20fb..75d678ae277e1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -17,7 +17,6 @@ class providing the base-class of operations. import numpy as np -from pandas.core.arrays import Categorical from pandas._config.config import option_context from pandas._libs import Timestamp @@ -37,13 +36,14 @@ class providing the base-class of operations. from pandas.api.types import ( is_datetime64_dtype, is_integer_dtype, is_object_dtype) import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical from pandas.core.base import ( DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.index import Index, CategoricalIndex, MultiIndex +from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter From ff42dd7bde58abb004214b1d53a9f5e171b31a13 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Mon, 20 May 2019 23:40:49 +0100 Subject: [PATCH 07/14] Remove too specific fixtures and adjust tests --- pandas/tests/groupby/conftest.py | 42 ++-------------- pandas/tests/groupby/test_categorical.py | 64 ++++++++++++++++-------- 2 files changed, 47 insertions(+), 59 deletions(-) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 42c0080bf60e6..3a8ca2383a091 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import CategoricalIndex, DataFrame, Index, MultiIndex +from pandas import DataFrame, MultiIndex from pandas.util import testing as tm @@ -80,45 +80,9 @@ def three_group(): @pytest.fixture def df_cat(): - df = DataFrame({'a': ['x', 'x', 'x', 'y'], - 'b': ['a', 'a', 'b', 'a'], + df = DataFrame({'a': ['one', 'one', 'one', 'two'], + 'b': ['foo', 'foo', 'bar', 'foo'], 'c': [1, 2, 3, 4]}) df['a'] = df['a'].astype('category') df['b'] = df['b'].astype('category') return df - - -@pytest.fixture -def multi_index_cat_complete(): - lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] - index = MultiIndex.from_product(lvls, names=['a', 'b']) - return index - - -@pytest.fixture -def multi_index_cat_partial(df_cat): - return MultiIndex.from_frame(df_cat[['a', 'b']].drop_duplicates()) - - -@pytest.fixture -def multi_index_non_cat_partial(): - return MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], - names=('a', 'b')) - - -@pytest.fixture -def multi_index_cat_compl_dict(): - lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), - Index(['min', 'max'])] - index = MultiIndex.from_product(lvls, names=['a', 'b', None]) - return index - - -@pytest.fixture -def multi_index_non_cat_partial_dict(): - return MultiIndex.from_tuples([('x', 'a', 'min'), ('x', 'a', 'max'), - ('x', 'b', 'min'), ('x', 'b', 'max'), - ('y', 'a', 'min'), ('y', 'a', 'max')], - names=('a', 'b', None)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index dfa4872b9daf7..484dac298362f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -966,38 +966,62 @@ def test_shift(fill_value): assert_equal(res, expected) -@pytest.mark.parametrize("index, op", [ - ('multi_index_cat_partial', 'agg'), - ('multi_index_non_cat_partial', 'apply')]) -def test_groupby_series_observed_true(request, df_cat, index, op): +@pytest.mark.parametrize('operation', ['agg', 'apply']) +def test_groupby_series_observed_true(df_cat, operation): # GH 24880 - index = request.getfixturevalue(index) + index = { + 'agg': MultiIndex.from_frame(df_cat[['a', 'b']].drop_duplicates()), + 'apply': MultiIndex.from_tuples( + [tuple(grp) for grp in + df_cat.select_dtypes('category').drop_duplicates().values], + names=df_cat.select_dtypes('category')) + }[operation] + expected = pd.Series(data=[3, 3, 4], index=index, name='c') grouped = df_cat.groupby(['a', 'b'], observed=True)['c'] - result = getattr(grouped, op)(sum) + result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) -@pytest.mark.parametrize("op", ['agg', 'apply']) -@pytest.mark.parametrize("observed", [False, None]) -def test_groupby_series_observed_false_or_none( - df_cat, multi_index_cat_complete, observed, op): +@pytest.mark.parametrize('operation', ['agg', 'apply']) +@pytest.mark.parametrize('observed', [False, None]) +def test_groupby_series_observed_false_or_none(df_cat, observed, operation): # GH 24880 - index = multi_index_cat_complete - expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') + index, _ = MultiIndex.from_product( + iterables=(CategoricalIndex(data=d) + for d in np.apply_along_axis( + np.unique, 1, df_cat.select_dtypes('category').T.values)), + names=df_cat.select_dtypes('category').columns).sortlevel() + + expected = pd.Series(data=[3, 3, np.nan, 4], index=index, name='c') grouped = df_cat.groupby(['a', 'b'], observed=observed)['c'] - result = getattr(grouped, op)(sum) + result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) -@pytest.mark.parametrize("observed, index, data", [ - (True, 'multi_index_non_cat_partial_dict', [1, 2, 3, 3, 4, 4]), - (False, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan]), - (None, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan])]) -def test_groupby_series_observed_apply_dict(request, df_cat, observed, index, - data): +@pytest.mark.parametrize("observed, data", [ + (True, [1, 2, 3, 3, 4, 4]), + (False, [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0]), + (None, [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0])]) +def test_groupby_series_observed_apply_dict(df_cat, observed, data): # GH 24880 - index = request.getfixturevalue(index) + index_names = df_cat.select_dtypes( + 'category').columns.values.tolist() + [None] + index = { + True: MultiIndex.from_tuples( + [tuple(list(grp) + [p]) + for grp in df_cat.select_dtypes( + 'category').drop_duplicates().values + for p in ('min', 'max')], + names=index_names), + False: MultiIndex.from_product( + [CategoricalIndex(data=d) + for d in np.apply_along_axis( + np.unique, 1, df_cat.select_dtypes('category').T.values) + ] + [Index(['min', 'max'])], + names=index_names) + }[bool(observed)] + expected = pd.Series(data=data, index=index, name='c') result = df_cat.groupby(['a', 'b'], observed=observed)['c'].apply( lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) From cc0b72590989253f23e8eab606266f4fbc3d0d94 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Tue, 21 May 2019 09:33:02 +0100 Subject: [PATCH 08/14] Use literal values for indices in tests --- pandas/tests/groupby/test_categorical.py | 61 ++++++++++++++---------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 484dac298362f..1a2b62e556981 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -970,11 +970,17 @@ def test_shift(fill_value): def test_groupby_series_observed_true(df_cat, operation): # GH 24880 index = { - 'agg': MultiIndex.from_frame(df_cat[['a', 'b']].drop_duplicates()), - 'apply': MultiIndex.from_tuples( - [tuple(grp) for grp in - df_cat.select_dtypes('category').drop_duplicates().values], - names=df_cat.select_dtypes('category')) + 'agg': MultiIndex(levels=[CategoricalIndex(['one', 'two'], + categories=['one', 'two'], + ordered=False), + CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False)], + codes=[[0, 0, 1], [1, 0, 1]], + names=['a', 'b']), + 'apply': MultiIndex(levels=[['one', 'two'], ['bar', 'foo']], + codes=[[0, 0, 1], [1, 0, 1]], + names=['a', 'b']) }[operation] expected = pd.Series(data=[3, 3, 4], index=index, name='c') @@ -987,11 +993,14 @@ def test_groupby_series_observed_true(df_cat, operation): @pytest.mark.parametrize('observed', [False, None]) def test_groupby_series_observed_false_or_none(df_cat, observed, operation): # GH 24880 - index, _ = MultiIndex.from_product( - iterables=(CategoricalIndex(data=d) - for d in np.apply_along_axis( - np.unique, 1, df_cat.select_dtypes('category').T.values)), - names=df_cat.select_dtypes('category').columns).sortlevel() + index, _ = MultiIndex(levels=[CategoricalIndex(['one', 'two'], + categories=['one', 'two'], + ordered=False), + CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False)], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=['a', 'b']).sortlevel() expected = pd.Series(data=[3, 3, np.nan, 4], index=index, name='c') grouped = df_cat.groupby(['a', 'b'], observed=observed)['c'] @@ -1005,21 +1014,25 @@ def test_groupby_series_observed_false_or_none(df_cat, observed, operation): (None, [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0])]) def test_groupby_series_observed_apply_dict(df_cat, observed, data): # GH 24880 - index_names = df_cat.select_dtypes( - 'category').columns.values.tolist() + [None] index = { - True: MultiIndex.from_tuples( - [tuple(list(grp) + [p]) - for grp in df_cat.select_dtypes( - 'category').drop_duplicates().values - for p in ('min', 'max')], - names=index_names), - False: MultiIndex.from_product( - [CategoricalIndex(data=d) - for d in np.apply_along_axis( - np.unique, 1, df_cat.select_dtypes('category').T.values) - ] + [Index(['min', 'max'])], - names=index_names) + True: MultiIndex(levels=[['one', 'two'], + ['bar', 'foo'], + ['max', 'min']], + codes=[[0, 0, 0, 0, 1, 1], + [1, 1, 0, 0, 1, 1], + [1, 0, 1, 0, 1, 0]], + names=['a', 'b', None]), + False: MultiIndex(levels=[CategoricalIndex(['one', 'two'], + categories=['one', 'two'], + ordered=False), + CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False), + Index(['max', 'min'])], + codes=[[0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [1, 0, 1, 0, 1, 0, 1, 0]], + names=['a', 'b', None]) }[bool(observed)] expected = pd.Series(data=data, index=index, name='c') From e4fda22837922e900947af3e7ffb1a2e195fb5f9 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Wed, 22 May 2019 09:48:23 +0100 Subject: [PATCH 09/14] Use MultiIndex.from_* to construct indices in tests --- pandas/tests/groupby/test_categorical.py | 78 +++++++++--------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1a2b62e556981..8e58a6a45be8b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -966,23 +966,14 @@ def test_shift(fill_value): assert_equal(res, expected) -@pytest.mark.parametrize('operation', ['agg', 'apply']) -def test_groupby_series_observed_true(df_cat, operation): +@pytest.mark.parametrize('operation, index', [ + ('agg', MultiIndex.from_frame(pd.DataFrame({'a': ['one', 'one', 'two'], + 'b': ['foo', 'bar', 'foo']}, + dtype='category'))), + ('apply', MultiIndex.from_frame(pd.DataFrame({'a': ['one', 'one', 'two'], + 'b': ['foo', 'bar', 'foo']})))]) +def test_groupby_series_observed_true(df_cat, operation, index): # GH 24880 - index = { - 'agg': MultiIndex(levels=[CategoricalIndex(['one', 'two'], - categories=['one', 'two'], - ordered=False), - CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False)], - codes=[[0, 0, 1], [1, 0, 1]], - names=['a', 'b']), - 'apply': MultiIndex(levels=[['one', 'two'], ['bar', 'foo']], - codes=[[0, 0, 1], [1, 0, 1]], - names=['a', 'b']) - }[operation] - expected = pd.Series(data=[3, 3, 4], index=index, name='c') grouped = df_cat.groupby(['a', 'b'], observed=True)['c'] result = getattr(grouped, operation)(sum) @@ -993,14 +984,10 @@ def test_groupby_series_observed_true(df_cat, operation): @pytest.mark.parametrize('observed', [False, None]) def test_groupby_series_observed_false_or_none(df_cat, observed, operation): # GH 24880 - index, _ = MultiIndex(levels=[CategoricalIndex(['one', 'two'], - categories=['one', 'two'], - ordered=False), - CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False)], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=['a', 'b']).sortlevel() + index, _ = MultiIndex.from_product( + [CategoricalIndex(['one', 'two'], categories=['one', 'two'], ordered=False), + CategoricalIndex(['bar', 'foo'], categories=['bar', 'foo'], ordered=False)], + names=['a', 'b']).sortlevel() expected = pd.Series(data=[3, 3, np.nan, 4], index=index, name='c') grouped = df_cat.groupby(['a', 'b'], observed=observed)['c'] @@ -1008,33 +995,24 @@ def test_groupby_series_observed_false_or_none(df_cat, observed, operation): assert_series_equal(result, expected) -@pytest.mark.parametrize("observed, data", [ - (True, [1, 2, 3, 3, 4, 4]), - (False, [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0]), - (None, [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0])]) -def test_groupby_series_observed_apply_dict(df_cat, observed, data): +@pytest.mark.parametrize("observed, index, data", [ + (True, MultiIndex.from_tuples( + [('one', 'foo', 'min'), ('one', 'foo', 'max'), + ('one', 'bar', 'min'), ('one', 'bar', 'max'), + ('two', 'foo', 'min'), ('two', 'foo', 'max')], + names=['a', 'b', None]), [1, 2, 3, 3, 4, 4]), + (False, MultiIndex.from_product( + [CategoricalIndex(['one', 'two'], categories=['one', 'two'], ordered=False), + CategoricalIndex(['bar', 'foo'], categories=['bar', 'foo'], ordered=False), + Index(['min', 'max'])], + names=['a', 'b', None]), [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0]), + (None, MultiIndex.from_product( + [CategoricalIndex(['one', 'two'], categories=['one', 'two'], ordered=False), + CategoricalIndex(['bar', 'foo'], categories=['bar', 'foo'], ordered=False), + Index(['min', 'max'])], + names=['a', 'b', None]), [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0])]) +def test_groupby_series_observed_apply_dict(df_cat, observed, index, data): # GH 24880 - index = { - True: MultiIndex(levels=[['one', 'two'], - ['bar', 'foo'], - ['max', 'min']], - codes=[[0, 0, 0, 0, 1, 1], - [1, 1, 0, 0, 1, 1], - [1, 0, 1, 0, 1, 0]], - names=['a', 'b', None]), - False: MultiIndex(levels=[CategoricalIndex(['one', 'two'], - categories=['one', 'two'], - ordered=False), - CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False), - Index(['max', 'min'])], - codes=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], - [1, 0, 1, 0, 1, 0, 1, 0]], - names=['a', 'b', None]) - }[bool(observed)] - expected = pd.Series(data=data, index=index, name='c') result = df_cat.groupby(['a', 'b'], observed=observed)['c'].apply( lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) From 8cfa4a13a46377658cb6bae6bb81f2be745e6be5 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Wed, 22 May 2019 09:58:22 +0100 Subject: [PATCH 10/14] Wrap long lines --- pandas/tests/groupby/test_categorical.py | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8e58a6a45be8b..4b6af5da403dc 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -971,7 +971,8 @@ def test_shift(fill_value): 'b': ['foo', 'bar', 'foo']}, dtype='category'))), ('apply', MultiIndex.from_frame(pd.DataFrame({'a': ['one', 'one', 'two'], - 'b': ['foo', 'bar', 'foo']})))]) + 'b': ['foo', 'bar', 'foo']})) + )]) def test_groupby_series_observed_true(df_cat, operation, index): # GH 24880 expected = pd.Series(data=[3, 3, 4], index=index, name='c') @@ -985,8 +986,12 @@ def test_groupby_series_observed_true(df_cat, operation, index): def test_groupby_series_observed_false_or_none(df_cat, observed, operation): # GH 24880 index, _ = MultiIndex.from_product( - [CategoricalIndex(['one', 'two'], categories=['one', 'two'], ordered=False), - CategoricalIndex(['bar', 'foo'], categories=['bar', 'foo'], ordered=False)], + [CategoricalIndex(['one', 'two'], + categories=['one', 'two'], + ordered=False), + CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False)], names=['a', 'b']).sortlevel() expected = pd.Series(data=[3, 3, np.nan, 4], index=index, name='c') @@ -1002,13 +1007,21 @@ def test_groupby_series_observed_false_or_none(df_cat, observed, operation): ('two', 'foo', 'min'), ('two', 'foo', 'max')], names=['a', 'b', None]), [1, 2, 3, 3, 4, 4]), (False, MultiIndex.from_product( - [CategoricalIndex(['one', 'two'], categories=['one', 'two'], ordered=False), - CategoricalIndex(['bar', 'foo'], categories=['bar', 'foo'], ordered=False), + [CategoricalIndex(['one', 'two'], + categories=['one', 'two'], + ordered=False), + CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False), Index(['min', 'max'])], - names=['a', 'b', None]), [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0]), + names=['a', 'b', None]), [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0]), (None, MultiIndex.from_product( - [CategoricalIndex(['one', 'two'], categories=['one', 'two'], ordered=False), - CategoricalIndex(['bar', 'foo'], categories=['bar', 'foo'], ordered=False), + [CategoricalIndex(['one', 'two'], + categories=['one', 'two'], + ordered=False), + CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False), Index(['min', 'max'])], names=['a', 'b', None]), [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0])]) def test_groupby_series_observed_apply_dict(df_cat, observed, index, data): From d520952b745bdf631d7c87cc6309b5a0dec4b063 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Sun, 26 May 2019 23:45:16 +0100 Subject: [PATCH 11/14] Enhance docstring for _reindex_output --- pandas/core/groupby/groupby.py | 52 ++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 737e411b64e88..91bb71a1a8af7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2302,33 +2302,43 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] - def _reindex_output(self, result): + def _reindex_output(self, output): """ - If we have categorical groupers, then we want to make sure that - we have a fully reindex-output to the levels. These may have not - participated in the groupings (e.g. may have all been - nan groups); + If we have categorical groupers, then we might want to make sure that + we have a fully re-indexed output to the levels. This means expanding + the output space to accommodate all values in the cartesian product of + our groups, regardless of whether they were observed in the data or + not. This will expand the output space if there are missing groups. - This can re-expand the output space - """ + The method returns early without modifying the input if the number of + groupings is less than 2, self.observed == True or none of the groupers + are categorical. + + Parameters + ---------- + output: Series or DataFrame + Object resulting from grouping and applying an operation. - # we need to re-expand the output space to accomodate all values - # whether observed or not in the cartesian product of our groupes + Returns + ------- + Series or DataFrame + Object (potentially) re-indexed to include all possible groups. + """ groupings = self.grouper.groupings if groupings is None: - return result + return output elif len(groupings) == 1: - return result + return output # if we only care about the observed values # we are done elif self.observed: - return result + return output # reindexing only applies to a Categorical grouper elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) for ping in groupings): - return result + return output levels_list = [ping.group_index for ping in groupings] index, _ = MultiIndex.from_product( @@ -2336,34 +2346,34 @@ def _reindex_output(self, result): if self.as_index: d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d) + return output.reindex(**d) # GH 13204 # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `result`. An idea is to do: - # result = result.set_index(self.grouper.names) + # expanded, are columns in `output`. An idea is to do: + # output = output.set_index(self.grouper.names) # .reindex(index).reset_index() # but special care has to be taken because of possible not-in-axis # groupers. # So, we manually select and drop the in-axis grouper columns, - # reindex `result`, and then reset the in-axis grouper columns. + # reindex `output`, and then reset the in-axis grouper columns. # Select in-axis groupers in_axis_grps = ((i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis) g_nums, g_names = zip(*in_axis_grps) - result = result.drop(labels=list(g_names), axis=1) + output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - result = result.set_index(self.grouper.result_index + output = output.set_index(self.grouper.result_index ).reindex(index, copy=False) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) - result = result.reset_index(level=g_nums) + output = output.reset_index(level=g_nums) - return result.reset_index(drop=True) + return output.reset_index(drop=True) GroupBy._add_numeric_operations() From 3591dbc34c273d13bd0496308ed020d5f01d6219 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Mon, 27 May 2019 14:51:32 +0100 Subject: [PATCH 12/14] Modify tests to reuse existing fixture --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/tests/groupby/conftest.py | 10 -- pandas/tests/groupby/test_categorical.py | 136 ++++++++++++----------- 3 files changed, 75 insertions(+), 72 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 32faf7115f0fd..537cc69bf8469 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -483,6 +483,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`) +- Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`) - Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 3a8ca2383a091..cb4fe511651ee 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -76,13 +76,3 @@ def three_group(): 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) - - -@pytest.fixture -def df_cat(): - df = DataFrame({'a': ['one', 'one', 'one', 'two'], - 'b': ['foo', 'foo', 'bar', 'foo'], - 'c': [1, 2, 3, 4]}) - df['a'] = df['a'].astype('category') - df['b'] = df['b'].astype('category') - return df diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 4b6af5da403dc..0b7bc5dc0efb9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -966,67 +966,79 @@ def test_shift(fill_value): assert_equal(res, expected) -@pytest.mark.parametrize('operation, index', [ - ('agg', MultiIndex.from_frame(pd.DataFrame({'a': ['one', 'one', 'two'], - 'b': ['foo', 'bar', 'foo']}, - dtype='category'))), - ('apply', MultiIndex.from_frame(pd.DataFrame({'a': ['one', 'one', 'two'], - 'b': ['foo', 'bar', 'foo']})) - )]) -def test_groupby_series_observed_true(df_cat, operation, index): +class TestSeriesGroupByObservedKwarg: # GH 24880 - expected = pd.Series(data=[3, 3, 4], index=index, name='c') - grouped = df_cat.groupby(['a', 'b'], observed=True)['c'] - result = getattr(grouped, operation)(sum) - assert_series_equal(result, expected) - - -@pytest.mark.parametrize('operation', ['agg', 'apply']) -@pytest.mark.parametrize('observed', [False, None]) -def test_groupby_series_observed_false_or_none(df_cat, observed, operation): - # GH 24880 - index, _ = MultiIndex.from_product( - [CategoricalIndex(['one', 'two'], - categories=['one', 'two'], - ordered=False), - CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False)], - names=['a', 'b']).sortlevel() - - expected = pd.Series(data=[3, 3, np.nan, 4], index=index, name='c') - grouped = df_cat.groupby(['a', 'b'], observed=observed)['c'] - result = getattr(grouped, operation)(sum) - assert_series_equal(result, expected) - -@pytest.mark.parametrize("observed, index, data", [ - (True, MultiIndex.from_tuples( - [('one', 'foo', 'min'), ('one', 'foo', 'max'), - ('one', 'bar', 'min'), ('one', 'bar', 'max'), - ('two', 'foo', 'min'), ('two', 'foo', 'max')], - names=['a', 'b', None]), [1, 2, 3, 3, 4, 4]), - (False, MultiIndex.from_product( - [CategoricalIndex(['one', 'two'], - categories=['one', 'two'], - ordered=False), - CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False), - Index(['min', 'max'])], - names=['a', 'b', None]), [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0]), - (None, MultiIndex.from_product( - [CategoricalIndex(['one', 'two'], - categories=['one', 'two'], - ordered=False), - CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False), - Index(['min', 'max'])], - names=['a', 'b', None]), [3, 3, 1, 2, np.nan, np.nan, 4.0, 4.0])]) -def test_groupby_series_observed_apply_dict(df_cat, observed, index, data): - # GH 24880 - expected = pd.Series(data=data, index=index, name='c') - result = df_cat.groupby(['a', 'b'], observed=observed)['c'].apply( - lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) - assert_series_equal(result, expected) + @pytest.fixture(autouse=True) + def setup_method(self, df): + self.df = df.copy()[:4] # leave out some groups + self.df['A'] = self.df['A'].astype('category') + self.df['B'] = self.df['B'].astype('category') + self.df['C'] = pd.Series([1, 2, 3, 4]) + + @pytest.mark.parametrize('operation, index', [ + ('agg', MultiIndex.from_frame( + pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'one', 'three'] + }, dtype='category'))), + ('apply', MultiIndex.from_frame( + pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'one', 'three'] + })))]) + def test_true(self, operation, index): + expected = pd.Series(data=[1, 3, 2, 4], index=index, name='C') + grouped = self.df.groupby(['A', 'B'], observed=True)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + + @pytest.mark.parametrize('operation', ['agg', 'apply']) + @pytest.mark.parametrize('observed', [False, None]) + def test_false_or_none(self, observed, operation): + index, _ = MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False), + CategoricalIndex(['one', 'three', 'two'], + categories=['one', 'three', 'two'], + ordered=False), + ], + names=['A', 'B']).sortlevel() + + expected = pd.Series(data=[2, 4, np.nan, 1, np.nan, 3], + index=index, name='C') + grouped = self.df.groupby(['A', 'B'], observed=observed)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + + @pytest.mark.parametrize("observed, index, data", [ + (True, MultiIndex.from_tuples( + [('foo', 'one', 'min'), ('foo', 'one', 'max'), + ('foo', 'two', 'min'), ('foo', 'two', 'max'), + ('bar', 'one', 'min'), ('bar', 'one', 'max'), + ('bar', 'three', 'min'), ('bar', 'three', 'max')], + names=['A', 'B', None]), [1, 1, 3, 3, 2, 2, 4, 4]), + (False, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False), + CategoricalIndex(['one', 'three', 'two'], + categories=['one', 'three', 'two'], + ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3]), + (None, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], + categories=['bar', 'foo'], + ordered=False), + CategoricalIndex(['one', 'three', 'two'], + categories=['one', 'three', 'two'], + ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) + def test_apply_dict(self, observed, index, data): + expected = pd.Series(data=data, index=index, name='C') + result = self.df.groupby(['A', 'B'], observed=observed)['C'].apply( + lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) + assert_series_equal(result, expected) From d5c9c40c92a3cb7d32b5ef8a02dc944cfd936ab6 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Mon, 27 May 2019 21:57:03 +0100 Subject: [PATCH 13/14] Refactor tests from a class to stand-alone functions --- pandas/tests/groupby/test_categorical.py | 139 +++++++++++------------ 1 file changed, 65 insertions(+), 74 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0b7bc5dc0efb9..f3778c4289558 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -966,79 +966,70 @@ def test_shift(fill_value): assert_equal(res, expected) -class TestSeriesGroupByObservedKwarg: +@pytest.fixture +def df_cat(df): + df_cat = df.copy()[:4] # leave out some groups + df_cat['A'] = df_cat['A'].astype('category') + df_cat['B'] = df_cat['B'].astype('category') + df_cat['C'] = pd.Series([1, 2, 3, 4]) + yield df_cat + + +@pytest.mark.parametrize('operation, index', [ + ('agg', MultiIndex.from_frame( + pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'one', 'three'] + }, dtype='category'))), + ('apply', MultiIndex.from_frame( + pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'one', 'three'] + })))]) +def test_seriesgroupby_observed_true(df_cat, operation, index): # GH 24880 + expected = pd.Series(data=[1, 3, 2, 4], index=index, name='C') + grouped = df_cat.groupby(['A', 'B'], observed=True)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('operation', ['agg', 'apply']) +@pytest.mark.parametrize('observed', [False, None]) +def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): + # GH 24880 + index, _ = MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False)], + names=['A', 'B']).sortlevel() + + expected = pd.Series(data=[2, 4, np.nan, 1, np.nan, 3], + index=index, name='C') + grouped = df_cat.groupby(['A', 'B'], observed=observed)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + - @pytest.fixture(autouse=True) - def setup_method(self, df): - self.df = df.copy()[:4] # leave out some groups - self.df['A'] = self.df['A'].astype('category') - self.df['B'] = self.df['B'].astype('category') - self.df['C'] = pd.Series([1, 2, 3, 4]) - - @pytest.mark.parametrize('operation, index', [ - ('agg', MultiIndex.from_frame( - pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'one', 'three'] - }, dtype='category'))), - ('apply', MultiIndex.from_frame( - pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'one', 'three'] - })))]) - def test_true(self, operation, index): - expected = pd.Series(data=[1, 3, 2, 4], index=index, name='C') - grouped = self.df.groupby(['A', 'B'], observed=True)['C'] - result = getattr(grouped, operation)(sum) - assert_series_equal(result, expected) - - @pytest.mark.parametrize('operation', ['agg', 'apply']) - @pytest.mark.parametrize('observed', [False, None]) - def test_false_or_none(self, observed, operation): - index, _ = MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False), - CategoricalIndex(['one', 'three', 'two'], - categories=['one', 'three', 'two'], - ordered=False), - ], - names=['A', 'B']).sortlevel() - - expected = pd.Series(data=[2, 4, np.nan, 1, np.nan, 3], - index=index, name='C') - grouped = self.df.groupby(['A', 'B'], observed=observed)['C'] - result = getattr(grouped, operation)(sum) - assert_series_equal(result, expected) - - @pytest.mark.parametrize("observed, index, data", [ - (True, MultiIndex.from_tuples( - [('foo', 'one', 'min'), ('foo', 'one', 'max'), - ('foo', 'two', 'min'), ('foo', 'two', 'max'), - ('bar', 'one', 'min'), ('bar', 'one', 'max'), - ('bar', 'three', 'min'), ('bar', 'three', 'max')], - names=['A', 'B', None]), [1, 1, 3, 3, 2, 2, 4, 4]), - (False, MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False), - CategoricalIndex(['one', 'three', 'two'], - categories=['one', 'three', 'two'], - ordered=False), - Index(['min', 'max'])], - names=['A', 'B', None]), - [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3]), - (None, MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], - categories=['bar', 'foo'], - ordered=False), - CategoricalIndex(['one', 'three', 'two'], - categories=['one', 'three', 'two'], - ordered=False), - Index(['min', 'max'])], - names=['A', 'B', None]), - [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) - def test_apply_dict(self, observed, index, data): - expected = pd.Series(data=data, index=index, name='C') - result = self.df.groupby(['A', 'B'], observed=observed)['C'].apply( - lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) - assert_series_equal(result, expected) +@pytest.mark.parametrize("observed, index, data", [ + (True, MultiIndex.from_tuples( + [('foo', 'one', 'min'), ('foo', 'one', 'max'), + ('foo', 'two', 'min'), ('foo', 'two', 'max'), + ('bar', 'one', 'min'), ('bar', 'one', 'max'), + ('bar', 'three', 'min'), ('bar', 'three', 'max')], + names=['A', 'B', None]), [1, 1, 3, 3, 2, 2, 4, 4]), + (False, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3]), + (None, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) +def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): + # GH 24880 + expected = pd.Series(data=data, index=index, name='C') + result = df_cat.groupby(['A', 'B'], observed=observed)['C'].apply( + lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) + assert_series_equal(result, expected) From ad16db89134d82a6383d31abeb9a5558e11b46c1 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Tue, 28 May 2019 01:48:21 +0100 Subject: [PATCH 14/14] Simplify a test, add a docstring for the fixture and drop pd.* prefix for already imported --- pandas/tests/groupby/test_categorical.py | 230 ++++++++++++----------- 1 file changed, 122 insertions(+), 108 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f3778c4289558..f24fa0daa5b18 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -26,7 +26,7 @@ def f(a): ordered=a.ordered) return a - index = pd.MultiIndex.from_product(map(f, args), names=names) + index = MultiIndex.from_product(map(f, args), names=names) return result.reindex(index).sort_index() @@ -190,7 +190,7 @@ def test_level_get_group(observed): # GH15155 df = DataFrame(data=np.arange(2, 22, 2), index=MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + levels=[CategoricalIndex(["a", "b"]), range(10)], codes=[[0] * 5 + [1] * 5, range(10)], names=["Index1", "Index2"])) g = df.groupby(level=["Index1"], observed=observed) @@ -198,7 +198,7 @@ def test_level_get_group(observed): # expected should equal test.loc[["a"]] # GH15166 expected = DataFrame(data=np.arange(2, 12, 2), - index=pd.MultiIndex(levels=[pd.CategoricalIndex( + index=MultiIndex(levels=[CategoricalIndex( ["a", "b"]), range(5)], codes=[[0] * 5, range(5)], names=["Index1", "Index2"])) @@ -266,7 +266,7 @@ def test_observed(observed): # multiple groupers with a non-cat gb = df.groupby(['A', 'B', 'C'], observed=observed) - exp_index = pd.MultiIndex.from_arrays( + exp_index = MultiIndex.from_arrays( [cat1, cat2, ['foo', 'bar'] * 2], names=['A', 'B', 'C']) expected = DataFrame({'values': Series( @@ -281,7 +281,7 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) gb = df.groupby(['A', 'B'], observed=observed) - exp_index = pd.MultiIndex.from_arrays( + exp_index = MultiIndex.from_arrays( [cat1, cat2], names=['A', 'B']) expected = DataFrame({'values': [1, 2, 3, 4]}, @@ -297,25 +297,25 @@ def test_observed(observed): # https://github.com/pandas-dev/pandas/issues/8138 d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), + Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), 'ints': [1, 1, 2, 2], 'val': [10, 20, 30, 40]} - df = pd.DataFrame(d) + df = DataFrame(d) # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() - exp_index = pd.CategoricalIndex(list('ab'), name="cat", - categories=list('abc'), - ordered=True) + exp_index = CategoricalIndex(list('ab'), name="cat", + categories=list('abc'), + ordered=True) expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, index=exp_index) if not observed: - index = pd.CategoricalIndex(list('abc'), name="cat", - categories=list('abc'), - ordered=True) + index = CategoricalIndex(list('abc'), name="cat", + categories=list('abc'), + ordered=True) expected = expected.reindex(index) tm.assert_frame_equal(result, expected) @@ -325,9 +325,9 @@ def test_observed(observed): result = groups_double_key.agg('mean') expected = DataFrame( {"val": [10, 30, 20, 40], - "cat": pd.Categorical(['a', 'a', 'b', 'b'], - categories=['a', 'b', 'c'], - ordered=True), + "cat": Categorical(['a', 'a', 'b', 'b'], + categories=['a', 'b', 'c'], + ordered=True), "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) if not observed: expected = cartesian_product_for_groupers( @@ -348,7 +348,7 @@ def test_observed(observed): # with as_index d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} - df = pd.DataFrame(d) + df = DataFrame(d) cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) df['range'] = cat groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) @@ -361,7 +361,7 @@ def test_observed(observed): def test_observed_codes_remap(observed): d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - df = pd.DataFrame(d) + df = DataFrame(d) values = pd.cut(df['C1'], [1, 2, 3, 6]) values.name = "cat" groups_double_key = df.groupby([values, 'C2'], observed=observed) @@ -402,8 +402,8 @@ def test_observed_groups(observed): # gh-20583 # test that we have the appropriate groups - cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) - df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]}) + cat = Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) + df = DataFrame({'cat': cat, 'vals': [1, 2, 3]}) g = df.groupby('cat', observed=observed) result = g.groups @@ -420,9 +420,9 @@ def test_observed_groups(observed): def test_observed_groups_with_nan(observed): # GH 24740 - df = pd.DataFrame({'cat': pd.Categorical(['a', np.nan, 'a'], - categories=['a', 'b', 'd']), - 'vals': [1, 2, 3]}) + df = DataFrame({'cat': Categorical(['a', np.nan, 'a'], + categories=['a', 'b', 'd']), + 'vals': [1, 2, 3]}) g = df.groupby('cat', observed=observed) result = g.groups if observed: @@ -436,16 +436,16 @@ def test_observed_groups_with_nan(observed): def test_dataframe_categorical_with_nan(observed): # GH 21151 - s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'], - categories=['a', 'b', 'c']) - s2 = pd.Series([1, 2, 3, 4]) - df = pd.DataFrame({'s1': s1, 's2': s2}) + s1 = Categorical([np.nan, 'a', np.nan, 'a'], + categories=['a', 'b', 'c']) + s2 = Series([1, 2, 3, 4]) + df = DataFrame({'s1': s1, 's2': s2}) result = df.groupby('s1', observed=observed).first().reset_index() if observed: - expected = DataFrame({'s1': pd.Categorical(['a'], + expected = DataFrame({'s1': Categorical(['a'], categories=['a', 'b', 'c']), 's2': [2]}) else: - expected = DataFrame({'s1': pd.Categorical(['a', 'b', 'c'], + expected = DataFrame({'s1': Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']), 's2': [2, np.nan, np.nan]}) tm.assert_frame_equal(result, expected) @@ -460,11 +460,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # Build a dataframe with cat having one unobserved category ('missing'), # and a Series with identical values - label = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'], - categories=['a', 'b', 'missing', 'd'], - ordered=ordered) - val = pd.Series(['d', 'a', 'b', 'a', 'd', 'b']) - df = pd.DataFrame({'label': label, 'val': val}) + label = Categorical(['d', 'a', 'b', 'a', 'd', 'b'], + categories=['a', 'b', 'missing', 'd'], + ordered=ordered) + val = Series(['d', 'a', 'b', 'a', 'd', 'b']) + df = DataFrame({'label': label, 'val': val}) # aggregate on the Categorical result = (df.groupby('label', observed=observed, sort=sort)['val'] @@ -472,8 +472,8 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # If ordering works, we expect index labels equal to aggregation results, # except for 'observed=False': label 'missing' has aggregation None - label = pd.Series(result.index.array, dtype='object') - aggr = pd.Series(result.array) + label = Series(result.index.array, dtype='object') + aggr = Series(result.array) if not observed: aggr[aggr.isna()] = 'missing' if not all(label == aggr): @@ -556,9 +556,9 @@ def test_categorical_index(): def test_describe_categorical_columns(): # GH 11558 - cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) + cats = CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) df = DataFrame(np.random.randn(20, 4), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() @@ -568,22 +568,22 @@ def test_describe_categorical_columns(): def test_unstack_categorical(): # GH11558 (example is taken from the original issue) - df = pd.DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) + df = DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) df['medium'] = df['medium'].astype('category') gcat = df.groupby( ['artist', 'medium'], observed=False)['a'].count().unstack() result = gcat.describe() - exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, - name='medium') + exp_columns = CategoricalIndex(['A', 'B'], ordered=False, + name='medium') tm.assert_index_equal(result.columns, exp_columns) tm.assert_categorical_equal(result.columns.values, exp_columns.values) result = gcat['A'] + gcat['B'] - expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + expected = Series([6, 4], index=Index(['X', 'Y'], name='artist')) tm.assert_series_equal(result, expected) @@ -645,22 +645,22 @@ def test_preserve_categories(): categories = list('abc') # ordered=True - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=True)}) - index = pd.CategoricalIndex(categories, categories, ordered=True) + df = DataFrame({'A': Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = CategoricalIndex(categories, categories, ordered=True) tm.assert_index_equal( df.groupby('A', sort=True, observed=False).first().index, index) tm.assert_index_equal( df.groupby('A', sort=False, observed=False).first().index, index) # ordered=False - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=False)}) - sort_index = pd.CategoricalIndex(categories, categories, ordered=False) - nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), - ordered=False) + df = DataFrame({'A': Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = CategoricalIndex(categories, categories, ordered=False) + nosort_index = CategoricalIndex(list('bac'), list('bac'), + ordered=False) tm.assert_index_equal( df.groupby('A', sort=True, observed=False).first().index, sort_index) @@ -858,94 +858,94 @@ def test_sort_datetimelike(): def test_empty_sum(): # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + df = DataFrame({"A": Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') # 0 by default result = df.groupby("A", observed=False).B.sum() - expected = pd.Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.sum(min_count=0) - expected = pd.Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.sum(min_count=1) - expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + expected = Series([3, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count>1 result = df.groupby("A", observed=False).B.sum(min_count=2) - expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') + expected = Series([3, np.nan, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) def test_empty_prod(): # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) + df = DataFrame({"A": Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') # 1 by default result = df.groupby("A", observed=False).B.prod() - expected = pd.Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.prod(min_count=0) - expected = pd.Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.prod(min_count=1) - expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + expected = Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) def test_groupby_multiindex_categorical_datetime(): # https://github.com/pandas-dev/pandas/issues/21390 - df = pd.DataFrame({ - 'key1': pd.Categorical(list('abcbabcba')), - 'key2': pd.Categorical( + df = DataFrame({ + 'key1': Categorical(list('abcbabcba')), + 'key2': Categorical( list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), 'values': np.arange(9), }) result = df.groupby(['key1', 'key2']).mean() - idx = pd.MultiIndex.from_product( - [pd.Categorical(['a', 'b', 'c']), - pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], + idx = MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], names=['key1', 'key2']) - expected = pd.DataFrame( + expected = DataFrame( {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) assert_frame_equal(result, expected) @pytest.mark.parametrize("as_index, expected", [ - (True, pd.Series( - index=pd.MultiIndex.from_arrays( - [pd.Series([1, 1, 2], dtype='category'), - [1, 2, 2]], names=['a', 'b'] + (True, Series( + index=MultiIndex.from_arrays( + [Series([1, 1, 2], dtype='category'), + [1, 2, 2]], names=['a', 'b'] ), data=[1, 2, 3], name='x' )), - (False, pd.DataFrame({ - 'a': pd.Series([1, 1, 2], dtype='category'), + (False, DataFrame({ + 'a': Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3] })) ]) def test_groupby_agg_observed_true_single_column(as_index, expected): # GH-23970 - df = pd.DataFrame({ - 'a': pd.Series([1, 1, 2], dtype='category'), + df = DataFrame({ + 'a': Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3] }) @@ -958,35 +958,49 @@ def test_groupby_agg_observed_true_single_column(as_index, expected): @pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT]) def test_shift(fill_value): - ct = pd.Categorical(['a', 'b', 'c', 'd'], - categories=['a', 'b', 'c', 'd'], ordered=False) - expected = pd.Categorical([None, 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], ordered=False) + ct = Categorical(['a', 'b', 'c', 'd'], + categories=['a', 'b', 'c', 'd'], ordered=False) + expected = Categorical([None, 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], ordered=False) res = ct.shift(1, fill_value=fill_value) assert_equal(res, expected) @pytest.fixture def df_cat(df): + """ + DataFrame with multiple categorical columns and a column of integers. + Shortened so as not to contain all possible combinations of categories. + Useful for testing `observed` kwarg functionality on GroupBy objects. + + Parameters + ---------- + df: DataFrame + Non-categorical, longer DataFrame from another fixture, used to derive + this one + + Returns + ------- + df_cat: DataFrame + """ df_cat = df.copy()[:4] # leave out some groups df_cat['A'] = df_cat['A'].astype('category') df_cat['B'] = df_cat['B'].astype('category') - df_cat['C'] = pd.Series([1, 2, 3, 4]) - yield df_cat - - -@pytest.mark.parametrize('operation, index', [ - ('agg', MultiIndex.from_frame( - pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'one', 'three'] - }, dtype='category'))), - ('apply', MultiIndex.from_frame( - pd.DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'one', 'three'] - })))]) -def test_seriesgroupby_observed_true(df_cat, operation, index): + df_cat['C'] = Series([1, 2, 3, 4]) + df_cat = df_cat.drop(['D'], axis=1) + return df_cat + + +@pytest.mark.parametrize('operation, kwargs', [ + ('agg', dict(dtype='category')), + ('apply', dict())]) +def test_seriesgroupby_observed_true(df_cat, operation, kwargs): # GH 24880 - expected = pd.Series(data=[1, 3, 2, 4], index=index, name='C') + index = MultiIndex.from_frame( + DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'one', 'three'] + }, **kwargs)) + expected = Series(data=[1, 3, 2, 4], index=index, name='C') grouped = df_cat.groupby(['A', 'B'], observed=True)['C'] result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) @@ -1001,8 +1015,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): CategoricalIndex(['one', 'three', 'two'], ordered=False)], names=['A', 'B']).sortlevel() - expected = pd.Series(data=[2, 4, np.nan, 1, np.nan, 3], - index=index, name='C') + expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], + index=index, name='C') grouped = df_cat.groupby(['A', 'B'], observed=observed)['C'] result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) @@ -1029,7 +1043,7 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): # GH 24880 - expected = pd.Series(data=data, index=index, name='C') + expected = Series(data=data, index=index, name='C') result = df_cat.groupby(['A', 'B'], observed=observed)['C'].apply( lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) assert_series_equal(result, expected)