Skip to content

Commit 1a30601

Browse files
kpflugshaupt and jreback
authored and committed
BUG: Fix groupby sorting on ordered Categoricals (GH25871) (#25908)
1 parent 35156dc commit 1a30601

File tree

4 files changed

+49
-16
lines changed

4 files changed

+49
-16
lines changed

doc/source/whatsnew/v0.25.0.rst

+1 -1
Original file line numberDiff line numberDiff line change
@@ -378,10 +378,10 @@ Groupby/Resample/Rolling
378378
- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
379379
- Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`)
380380
- Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
381+
- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)
381382
- Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`)
382383
- Bug in :func:`idxmax` and :func:`idxmin` on :meth:`DataFrame.groupby` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`)
383384

384-
385385
Reshaping
386386
^^^^^^^^^
387387

pandas/core/groupby/grouper.py

+2
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
301301
if observed:
302302
codes = algorithms.unique1d(self.grouper.codes)
303303
codes = codes[codes != -1]
304+
if sort or self.grouper.ordered:
305+
codes = np.sort(codes)
304306
else:
305307
codes = np.arange(len(categories))
306308

pandas/tests/groupby/test_categorical.py

+32
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,38 @@ def test_dataframe_categorical_with_nan(observed):
451451
tm.assert_frame_equal(result, expected)
452452

453453

454+
@pytest.mark.parametrize("ordered", [True, False])
455+
@pytest.mark.parametrize("observed", [True, False])
456+
@pytest.mark.parametrize("sort", [True, False])
457+
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
458+
# GH 25871: Fix groupby sorting on ordered Categoricals
459+
# GH 25167: Groupby with observed=True doesn't sort
460+
461+
# Build a dataframe with cat having one unobserved category ('missing'),
462+
# and a Series with identical values
463+
label = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'],
464+
categories=['a', 'b', 'missing', 'd'],
465+
ordered=ordered)
466+
val = pd.Series(['d', 'a', 'b', 'a', 'd', 'b'])
467+
df = pd.DataFrame({'label': label, 'val': val})
468+
469+
# aggregate on the Categorical
470+
result = (df.groupby('label', observed=observed, sort=sort)['val']
471+
.aggregate('first'))
472+
473+
# If ordering works, we expect index labels equal to aggregation results,
474+
# except for 'observed=False': label 'missing' has aggregation None
475+
label = pd.Series(result.index.array, dtype='object')
476+
aggr = pd.Series(result.array)
477+
if not observed:
478+
aggr[aggr.isna()] = 'missing'
479+
if not all(label == aggr):
480+
msg = ('Labels and aggregation results not consistently sorted\n' +
481+
'for (ordered={}, observed={}, sort={})\n' +
482+
'Result:\n{}').format(ordered, observed, sort, result)
483+
assert False, msg
484+
485+
454486
def test_datetime():
455487
# GH9049: ensure backward compatibility
456488
levels = pd.date_range('2014-01-01', periods=4)

pandas/tests/groupby/test_grouping.py

+14 -15
Original file line numberDiff line numberDiff line change
@@ -253,28 +253,27 @@ def test_groupby_levels_and_columns(self):
253253
tm.assert_frame_equal(by_levels, by_columns)
254254

255255
def test_groupby_categorical_index_and_columns(self, observed):
256-
# GH18432
256+
# GH18432, adapted for GH25871
257257
columns = ['A', 'B', 'A', 'B']
258258
categories = ['B', 'A']
259-
data = np.ones((5, 4), int)
259+
data = np.array([[1, 2, 1, 2],
260+
[1, 2, 1, 2],
261+
[1, 2, 1, 2],
262+
[1, 2, 1, 2],
263+
[1, 2, 1, 2]], int)
260264
cat_columns = CategoricalIndex(columns,
261265
categories=categories,
262266
ordered=True)
263267
df = DataFrame(data=data, columns=cat_columns)
264268
result = df.groupby(axis=1, level=0, observed=observed).sum()
265-
expected_data = 2 * np.ones((5, 2), int)
266-
267-
if observed:
268-
# if we are not-observed we undergo a reindex
269-
# so need to adjust the output as our expected sets us up
270-
# to be non-observed
271-
expected_columns = CategoricalIndex(['A', 'B'],
272-
categories=categories,
273-
ordered=True)
274-
else:
275-
expected_columns = CategoricalIndex(categories,
276-
categories=categories,
277-
ordered=True)
269+
expected_data = np.array([[4, 2],
270+
[4, 2],
271+
[4, 2],
272+
[4, 2],
273+
[4, 2]], int)
274+
expected_columns = CategoricalIndex(categories,
275+
categories=categories,
276+
ordered=True)
278277
expected = DataFrame(data=expected_data, columns=expected_columns)
279278
assert_frame_equal(result, expected)
280279

0 commit comments

Comments
 (0)