From ed304a15e7429840ae237958d6570488e7e9d968 Mon Sep 17 00:00:00 2001 From: kpflugshaupt Date: Tue, 26 Mar 2019 17:55:24 +0100 Subject: [PATCH 1/4] BUG: Fix groupby on ordered Categoricals (GH25871) As documented in #25871, groupby() on an ordered Categorical messes up category order when 'observed=True' is specified. Specifically, group labels will be ordered by first occurrence (as for an unordered Categorical), but grouped aggregation results will retain the Categorical's order. The fix is a modified subset of #25173, which fixes a related case, but has not been merged yet. --- pandas/core/groupby/grouper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e470a32b85cd6..335d40dd16949 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -302,6 +302,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] + if sort or self.grouper.ordered: + codes = np.sort(codes) else: codes = np.arange(len(categories)) From 18837583e9ec8583be9390b551d95861910655af Mon Sep 17 00:00:00 2001 From: kpflugshaupt Date: Tue, 26 Mar 2019 18:14:35 +0100 Subject: [PATCH 2/4] BUG: Fix groupby on ordered Categoricals (GH25871) --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 561562f367db2..8d246b16ea4c4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -347,7 +347,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Ensured that ordering of outputs in ``groupby`` 
aggregation functions is consistent across all versions of Python (:issue:`25692`) - +- Ensured that result group order is correct when grouping on an ordered Categorical and specifying ``observed=True`` (:issue:`25871`) Reshaping ^^^^^^^^^ From 6c04a70a46b36595ec62dfc63b196e33d05fe0fa Mon Sep 17 00:00:00 2001 From: Pflugshaupt Kaspar kpf Date: Wed, 27 Mar 2019 16:37:06 +0100 Subject: [PATCH 3/4] TST: Add test for groupby on ordered Categoricals (GH25871) --- pandas/tests/groupby/test_categorical.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e118135ccc75d..e47f1bca284e0 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -453,6 +453,28 @@ def test_dataframe_categorical_with_nan(observed): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("observed", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_dataframe_categorical_ordered_observed(observed, sort): + # GH 25871 + cat = pd.Categorical([3, 1, 2, 1, 3, 2], categories=[1, 2, 3, 4], ordered=True) + val = pd.Series([1.5, 0.5, 1.0, 0.5, 1.5, 1.0]) + df = pd.DataFrame({'cat': cat, 'val': val}) + result = df.groupby('cat', observed=observed, sort=sort)['val'].agg('sum') + + # For ordered Categoricals, sort must have no influence on the result (they always sort) + if observed: + expected = pd.Series(data=[1.0, 2.0, 3.0], + index=pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'), + dtype='float64', name='val') + else: + expected = pd.Series(data=[1.0, 2.0, 3.0, 0.0], + index=pd.CategoricalIndex([1, 2, 3, 4], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'), + dtype='float64', name='val') + + tm.assert_series_equal(result, expected) + + def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4) From 2c2500b04db9d6800a4d73a350efee0eb4984f2d Mon 
Sep 17 00:00:00 2001 From: kpflugshaupt Date: Thu, 28 Mar 2019 11:35:33 +0100 Subject: [PATCH 4/4] Fix groupby on ordered Categoricals (GH25871) Testing all combinations of: - ordered vs. unordered grouping column - 'observed' True vs. False - 'sort' True vs. False In all cases, result group ordering must be correct. The test is built such that the result index labels are equal to aggregation results if all goes well (except for the one unobserved category) --- pandas/tests/groupby/test_categorical.py | 34 ++++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e47f1bca284e0..8dfb27ee34232 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -453,28 +453,28 @@ def test_dataframe_categorical_with_nan(observed): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("ordered", [True, False]) @pytest.mark.parametrize("observed", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -def test_dataframe_categorical_ordered_observed(observed, sort): - # GH 25871 - cat = pd.Categorical([3, 1, 2, 1, 3, 2], categories=[1, 2, 3, 4], ordered=True) - val = pd.Series([1.5, 0.5, 1.0, 0.5, 1.5, 1.0]) +def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): + # GH 25871: Fix groupby sorting on ordered Categoricals + # Build a dataframe with a Categorical having one unobserved category ('AWOL'), and a Series with identical values + cat = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'], categories=['a', 'b', 'AWOL', 'd'], ordered=ordered) + val = pd.Series (['d', 'a', 'b', 'a', 'd', 'b']) df = pd.DataFrame({'cat': cat, 'val': val}) - result = df.groupby('cat', observed=observed, sort=sort)['val'].agg('sum') - - # For ordered Categoricals, sort must have no influence on the result (they always sort) - if observed: - expected = pd.Series(data=[1.0, 2.0, 3.0], - 
index=pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'), - dtype='float64', name='val') - else: - expected = pd.Series(data=[1.0, 2.0, 3.0, 0.0], - index=pd.CategoricalIndex([1, 2, 3, 4], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'), - dtype='float64', name='val') - - tm.assert_series_equal(result, expected) + # aggregate on the Categorical + result = df.groupby('cat', observed=observed, sort=sort)['val'].agg('first') + + # If ordering is correct, we expect index labels equal to aggregation results, + # except for 'observed=False', where the index contains 'AWOL' and its aggregated value is missing + label = pd.Series(result.index.array, dtype='object') + aggr = pd.Series(result.array) + if not observed: + aggr[aggr.isna()] = 'AWOL' + tm.assert_equal(label, aggr) + def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4)