diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1ef05ae5f9c6b..2fe3c21f20236 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -374,10 +374,10 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) +- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`) - Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) - Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d02775cd4b328..561856ace2d9f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -301,6 +301,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] + if sort or self.grouper.ordered: + codes = np.sort(codes) else: codes = np.arange(len(categories)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1c18e6b1d25e0..c876700cabea4 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -451,6 +451,38 @@ def test_dataframe_categorical_with_nan(observed): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("observed", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): + # GH 25871: Fix groupby sorting on ordered Categoricals + # GH 25167: Groupby with observed=True doesn't sort + + # Build a dataframe with cat having one unobserved category ('missing'), + # and a Series with identical values + label = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'], + categories=['a', 'b', 'missing', 'd'], + ordered=ordered) + val = pd.Series(['d', 'a', 'b', 'a', 'd', 'b']) + df = pd.DataFrame({'label': label, 'val': val}) + + # aggregate on the Categorical + result = (df.groupby('label', observed=observed, sort=sort)['val'] + .aggregate('first')) + + # If ordering works, we expect index labels equal to aggregation results, + # except for 'observed=False': label 'missing' has aggregation None + label = pd.Series(result.index.array, dtype='object') + aggr = pd.Series(result.array) + if not observed: + aggr[aggr.isna()] = 'missing' + if not all(label == aggr): + msg = ('Labels and aggregation results not consistently sorted\n' + + 'for (ordered={}, observed={}, sort={})\n' + + 'Result:\n{}').format(ordered, observed, sort, result) + assert False, msg + + def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index f3363e1baad19..8382111ec9901 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -253,28 +253,27 @@ def test_groupby_levels_and_columns(self): tm.assert_frame_equal(by_levels, by_columns) def test_groupby_categorical_index_and_columns(self, observed): - # GH18432 + # GH18432, adapted for GH25871 columns = ['A', 'B', 'A', 'B'] categories = ['B', 'A'] - data = np.ones((5, 4), int) + data = np.array([[1, 2, 1, 2], + [1, 2, 1, 2], + [1, 2, 1, 2], + [1, 2, 1, 2], + [1, 2, 1, 2]], int) cat_columns = CategoricalIndex(columns, categories=categories, ordered=True) df = DataFrame(data=data, columns=cat_columns) result = df.groupby(axis=1, level=0, observed=observed).sum() - expected_data = 2 * np.ones((5, 2), int) - - if observed: - # if we are not-observed we undergo a reindex - # so need to adjust the output as our expected sets us up - # to be non-observed - expected_columns = CategoricalIndex(['A', 'B'], - categories=categories, - ordered=True) - else: - expected_columns = CategoricalIndex(categories, - categories=categories, - ordered=True) + expected_data = np.array([[4, 2], + [4, 2], + [4, 2], + [4, 2], + [4, 2]], int) + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) expected = DataFrame(data=expected_data, columns=expected_columns) assert_frame_equal(result, expected)