Fix groupby on ordered Categoricals (GH25871)

kpflugshaupt · web-flow · commit 2c2500b04db9 · 2019-03-28T11:35:33.000+01:00
Testing all combinations of:
- ordered vs. unordered grouping column
- 'observed' True vs. False
- 'sort' True vs. False
In all cases, the ordering of the result groups must be correct.
The test is built so that, if the ordering is correct, each result index label equals its aggregation result (except for the one unobserved category, which has no values to aggregate).
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -453,28 +453,28 @@ def test_dataframe_categorical_with_nan(observed):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("ordered",  [True, False])
 @pytest.mark.parametrize("observed", [True, False])
 @pytest.mark.parametrize("sort",     [True, False])
-def test_dataframe_categorical_ordered_observed(observed, sort):
-    # GH 25871
-    cat = pd.Categorical([3, 1, 2, 1, 3, 2], categories=[1, 2, 3, 4], ordered=True)
-    val = pd.Series([1.5, 0.5, 1.0, 0.5, 1.5, 1.0])
+def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
+    # GH 25871: Fix groupby sorting on ordered Categoricals 
+    # Build a dataframe with a Categorical having one unobserved category ('AWOL'), and a Series with identical values
+    cat = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'], categories=['a', 'b', 'AWOL', 'd'], ordered=ordered)
+    val = pd.Series     (['d', 'a', 'b', 'a', 'd', 'b'])
     df  = pd.DataFrame({'cat': cat, 'val': val})
-    result = df.groupby('cat', observed=observed, sort=sort)['val'].agg('sum')
-    
-    # For ordered Categoricals, sort must have no influence on the result (they always sort)
-    if observed:
-        expected = pd.Series(data=[1.0, 2.0, 3.0], 
-                             index=pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'), 
-                             dtype='float64', name='val')
-    else:
-        expected = pd.Series(data=[1.0, 2.0, 3.0, 0.0], 
-                             index=pd.CategoricalIndex([1, 2, 3, 4], categories=[1, 2, 3, 4], ordered=True, name='cat', dtype='category'), 
-                             dtype='float64', name='val')
-    
-    tm.assert_series_equal(result, expected)
 
+    # aggregate on the Categorical
+    result = df.groupby('cat', observed=observed, sort=sort)['val'].agg('first')
+    
+    # If the ordering is correct, each index label equals its aggregation result,
+    # except with observed=False, where the label 'AWOL' has a missing aggregation result
+    label = pd.Series(result.index.array, dtype='object')
+    aggr  = pd.Series(result.array)
+    if not observed:
+        aggr[aggr.isna()] = 'AWOL'
+    tm.assert_equal(label, aggr)    
 
+    
 def test_datetime():
     # GH9049: ensure backward compatibility
     levels = pd.date_range('2014-01-01', periods=4)