From a5d6d1a935bc17a29a42f16abe0827aaa60381e2 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Sun, 19 May 2019 21:38:05 +0100 Subject: [PATCH 1/2] Fix 'observed' kwarg not doing anything on SeriesGroupBy --- pandas/core/groupby/generic.py | 90 +++++----------------------- pandas/core/groupby/groupby.py | 66 +++++++++++++++++++- pandas/tests/groupby/test_groupby.py | 59 +++++++++++++++++- 3 files changed, 138 insertions(+), 77 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2f665975f96bd..32933c3385e25 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -25,7 +25,6 @@ from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.frame import DataFrame @@ -33,7 +32,7 @@ from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, _apply_docs, _transform_template) -from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.index import Index, MultiIndex import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series @@ -834,9 +833,10 @@ def _wrap_output(self, output, index, names=None): return Series(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): - return self._wrap_output(output=output, - index=self.grouper.result_index, - names=names) + result = self._wrap_output(output=output, + index=self.grouper.result_index, + names=names) + return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): return self._wrap_output(output=output, @@ -856,13 +856,15 @@ def _get_index(): return index if isinstance(values[0], dict): - # GH #823 + # GH #823 #24880 index = _get_index() - result = DataFrame(values, index=index).stack() + result = self._reindex_output(DataFrame(values, index=index)) + dropna = self.observed # if self.observed is False, keep all-NaN rows created while re-indexing + result = result.stack(dropna=dropna) result.name = self._selection_name return result - if isinstance(values[0], (Series, dict)): + if isinstance(values[0], Series): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif isinstance(values[0], DataFrame): @@ -870,9 +872,9 @@ def _get_index(): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - # GH #6265 - return Series(values, index=_get_index(), - name=self._selection_name) + # GH #6265 #24880 + result = Series(values, index=_get_index(), name=self._selection_name) + return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): result = OrderedDict() @@ -1335,7 +1337,8 @@ def _gotitem(self, key, ndim, subset=None): if subset is None: subset = self.obj[key] return SeriesGroupBy(subset, selection=key, - grouper=self.grouper) + grouper=self.grouper, + observed=self.observed) raise AssertionError("invalid ndim for _gotitem") @@ -1407,69 +1410,6 @@ def _wrap_agged_blocks(self, items, blocks): return self._reindex_output(result)._convert(datetime=True) - def _reindex_output(self, result): - """ - If we have categorical groupers, then we want to make sure that - we have a fully reindex-output to the levels. These may have not - participated in the groupings (e.g. may have all been - nan groups); - - This can re-expand the output space - """ - - # we need to re-expand the output space to accomodate all values - # whether observed or not in the cartesian product of our groupes - groupings = self.grouper.groupings - if groupings is None: - return result - elif len(groupings) == 1: - return result - - # if we only care about the observed values - # we are done - elif self.observed: - return result - - # reindexing only applies to a Categorical grouper - elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) - for ping in groupings): - return result - - levels_list = [ping.group_index for ping in groupings] - index, _ = MultiIndex.from_product( - levels_list, names=self.grouper.names).sortlevel() - - if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d) - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `result`. An idea is to do: - # result = result.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `result`, and then reset the in-axis grouper columns. - - # Select in-axis groupers - in_axis_grps = ((i, ping.name) for (i, ping) - in enumerate(groupings) if ping.in_axis) - g_nums, g_names = zip(*in_axis_grps) - - result = result.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - result = result.set_index(self.grouper.result_index - ).reindex(index, copy=False) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - result = result.reset_index(level=g_nums) - - return result.reset_index(drop=True) - def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4e9e3b4963b6d..3d8716def20fb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -17,6 +17,7 @@ class providing the base-class of operations. import numpy as np +from pandas.core.arrays import Categorical from pandas._config.config import option_context from pandas._libs import Timestamp @@ -42,7 +43,7 @@ class providing the base-class of operations. from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -2301,6 +2302,69 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] + def _reindex_output(self, result): + """ + If we have categorical groupers, then we want to make sure that + we have a fully reindex-output to the levels. These may have not + participated in the groupings (e.g. may have all been + nan groups); + + This can re-expand the output space + """ + + # we need to re-expand the output space to accomodate all values + # whether observed or not in the cartesian product of our groupes + groupings = self.grouper.groupings + if groupings is None: + return result + elif len(groupings) == 1: + return result + + # if we only care about the observed values + # we are done + elif self.observed: + return result + + # reindexing only applies to a Categorical grouper + elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) + for ping in groupings): + return result + + levels_list = [ping.group_index for ping in groupings] + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names).sortlevel() + + if self.as_index: + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + return result.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `result`. An idea is to do: + # result = result.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `result`, and then reset the in-axis grouper columns. + + # Select in-axis groupers + in_axis_grps = ((i, ping.name) for (i, ping) + in enumerate(groupings) if ping.in_axis) + g_nums, g_names = zip(*in_axis_grps) + + result = result.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + result = result.set_index(self.grouper.result_index + ).reindex(index, copy=False) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + result = result.reset_index(level=g_nums) + + return result.reset_index(drop=True) + GroupBy._add_numeric_operations() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2511063110f92..6aa07eac681bf 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv) + DataFrame, Index, MultiIndex, CategoricalIndex, Series, Timestamp, date_range, read_csv) import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( @@ -1736,3 +1736,60 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis(): expected = pd.Series([3], index=ei) assert_series_equal(result, expected) + + +def test_groupby_observed(): + # GH 24880 + df = DataFrame({'a': ['x', 'x', 'x', 'y'], + 'b': ['a', 'a', 'b', 'a'], + 'c': [1, 2, 3, 4]}) + df['a'] = df['a'].astype('category') + df['b'] = df['b'].astype('category') + + # test .agg and .apply when observed == False + levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] + index, _ = MultiIndex.from_product(levels, names=['a', 'b']).sortlevel() + expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') + actual_agg = df.groupby(['a', 'b']).c.agg(sum) + actual_apply = df.groupby(['a', 'b']).c.apply(sum) + assert_series_equal(expected, actual_agg) + assert_series_equal(expected, actual_apply) + + # test .agg when observed == True + index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates()) + expected = pd.Series([3, 3, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.agg(sum) + assert_series_equal(expected, actual) + + # test .apply when observed == True + index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], names=('a', 'b')) + expected = pd.Series([3, 3, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) + assert_series_equal(expected, actual) + + +def test_groupby_observed_apply_lambda_returns_dict(): + # GH 24880 + df = DataFrame({'a': ['x', 'x', 'x', 'y'], + 'b': ['a', 'a', 'b', 'a'], + 'c': [1, 2, 3, 4]}) + df['a'] = df['a'].astype('category') + df['b'] = df['b'].astype('category') + + # observed == False + levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), + Index(['min', 'max'])] + index, _ = MultiIndex.from_product(levels, names=['a', 'b', None]).sortlevel() + expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], index=index, name='c') + actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + assert_series_equal(expected, actual) + + # observed == True + index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), + ('x', 'b', 'max'), ('x', 'b', 'min'), + ('y', 'a', 'max'), ('y', 'a', 'min')],names=('a', 'b', None)) + expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') + actual = df.groupby(['a', 'b'], observed=True).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + assert_series_equal(expected, actual) From 2575c41727168039b7c2c62629e8a0226590176f Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Sun, 19 May 2019 21:56:47 +0100 Subject: [PATCH 2/2] Wrap long lines --- pandas/core/groupby/generic.py | 9 +++++--- pandas/tests/groupby/test_groupby.py | 34 +++++++++++++++++----------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 32933c3385e25..dc414a588a2ce 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -859,8 +859,9 @@ def _get_index(): # GH #823 #24880 index = _get_index() result = self._reindex_output(DataFrame(values, index=index)) - dropna = self.observed # if self.observed is False, keep all-NaN rows created while re-indexing - result = result.stack(dropna=dropna) + # if self.observed is False, + # keep all-NaN rows created while re-indexing + result = result.stack(dropna=self.observed) result.name = self._selection_name return result @@ -873,7 +874,9 @@ def _get_index(): not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = Series(values, index=_get_index(), name=self._selection_name) + result = Series(data=values, + index=_get_index(), + name=self._selection_name) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6aa07eac681bf..3690dc7bb048d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,7 +10,8 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, CategoricalIndex, Series, Timestamp, date_range, read_csv) + DataFrame, Index, MultiIndex, CategoricalIndex, + Series, Timestamp, date_range, read_csv) import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( @@ -1747,9 +1748,9 @@ def test_groupby_observed(): df['b'] = df['b'].astype('category') # test .agg and .apply when observed == False - levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] - index, _ = MultiIndex.from_product(levels, names=['a', 'b']).sortlevel() + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] + index, _ = MultiIndex.from_product(lvls, names=['a', 'b']).sortlevel() expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') actual_agg = df.groupby(['a', 'b']).c.agg(sum) actual_apply = df.groupby(['a', 'b']).c.apply(sum) @@ -1763,7 +1764,8 @@ def test_groupby_observed(): assert_series_equal(expected, actual) # test .apply when observed == True - index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], names=('a', 'b')) + index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], + names=('a', 'b')) expected = pd.Series([3, 3, 4], index=index, name='c') actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) assert_series_equal(expected, actual) @@ -1778,18 +1780,24 @@ def test_groupby_observed_apply_lambda_returns_dict(): df['b'] = df['b'].astype('category') # observed == False - levels = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), - CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), - Index(['min', 'max'])] - index, _ = MultiIndex.from_product(levels, names=['a', 'b', None]).sortlevel() - expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], index=index, name='c') - actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), + CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), + Index(['min', 'max'])] + index, _ = MultiIndex.from_product(lvls, + names=['a', 'b', None]).sortlevel() + expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], + index=index, + name='c') + actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), + 'max': x.max()}) assert_series_equal(expected, actual) # observed == True index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), ('x', 'b', 'max'), ('x', 'b', 'min'), - ('y', 'a', 'max'), ('y', 'a', 'min')],names=('a', 'b', None)) + ('y', 'a', 'max'), ('y', 'a', 'min')], + names=('a', 'b', None)) expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') - actual = df.groupby(['a', 'b'], observed=True).c.apply(lambda x: {'min': x.min(), 'max': x.max()}) + actual = df.groupby(['a', 'b'], observed=True).c.\ + apply(lambda x: {'min': x.min(), 'max': x.max()}) assert_series_equal(expected, actual)