Skip to content

Commit 2c2500b

Browse files
authored
Fix groupby on ordered Categoricals (GH25871)
Test all combinations of: (1) ordered vs. unordered grouping column; (2) 'observed' True vs. False; (3) 'sort' True vs. False. In all cases, the resulting group ordering must be correct. The test is built such that, if all goes well, the result index labels are equal to the aggregation results (except for the one unobserved category).
1 parent 6c04a70 commit 2c2500b

File tree

1 file changed

+17
-17
lines changed

1 file changed

+17
-17
lines changed

pandas/tests/groupby/test_categorical.py

+17-17
Original file line numberDiff line numberDiff line change
@@ -453,28 +453,28 @@ def test_dataframe_categorical_with_nan(observed):
453453
tm.assert_frame_equal(result, expected)
454454

455455

456+
@pytest.mark.parametrize("ordered", [True, False])
456457
@pytest.mark.parametrize("observed", [True, False])
457458
@pytest.mark.parametrize("sort", [True, False])
458-
def test_dataframe_categorical_ordered_observed(observed, sort):
459-
# GH 25871
460-
cat = pd.Categorical([3, 1, 2, 1, 3, 2], categories=[1, 2, 3, 4], ordered=True)
461-
val = pd.Series([1.5, 0.5, 1.0, 0.5, 1.5, 1.0])
459+
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
460+
# GH 25871: Fix groupby sorting on ordered Categoricals
461+
# Build a dataframe with a Categorical having one unobserved category ('AWOL'), and a Series with identical values
462+
cat = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'], categories=['a', 'b', 'AWOL', 'd'], ordered=ordered)
463+
val = pd.Series (['d', 'a', 'b', 'a', 'd', 'b'])
462464
df = pd.DataFrame({'cat': cat, 'val': val})
463-
result = df.groupby('cat', observed=observed, sort=sort)['val'].agg('sum')
464-
465-
# For ordered Categoricals, sort must have no influence on the result (they always sort)
466-
if observed:
467-
expected = pd.Series(data=[1.0, 2.0, 3.0],
468-
index=pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'),
469-
dtype='float64', name='val')
470-
else:
471-
expected = pd.Series(data=[1.0, 2.0, 3.0, 0.0],
472-
index=pd.CategoricalIndex([1, 2, 3, 4], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'),
473-
dtype='float64', name='val')
474-
475-
tm.assert_series_equal(result, expected)
476465

466+
# aggregate on the Categorical
467+
result = df.groupby('cat', observed=observed, sort=sort)['val'].agg('first')
468+
469+
# If ordering is correct, we expect index labels equal to aggregation results,
470+
# except for 'observed=False', when index contains 'AWOL' and aggregation None
471+
label = pd.Series(result.index.array, dtype='object')
472+
aggr = pd.Series(result.array)
473+
if not observed:
474+
aggr[aggr.isna()] = 'AWOL'
475+
tm.assert_equal(label, aggr)
477476

477+
478478
def test_datetime():
479479
# GH9049: ensure backward compatibility
480480
levels = pd.date_range('2014-01-01', periods=4)

0 commit comments

Comments
 (0)