@@ -487,6 +487,60 @@ def test_observed_groups(observed):
487
487
tm .assert_dict_equal (result , expected )
488
488
489
489
490
@pytest.mark.parametrize(
    "keys, expected_values, expected_index_levels",
    [
        # Single key: one categorical level, category 3 is never observed.
        ("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
        # Categorical index level combined with a regular column.
        (
            ["a", "b"],
            [7, 8, 0, 0, 0, 9, 0, 0, 0],
            [CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
        ),
        # Two categorical index levels; the level names given here are
        # replaced by ``names=keys`` when the expected MultiIndex is built.
        (
            ["a", "a2"],
            [15, 0, 0, 0, 9, 0, 0, 0, 0],
            [
                CategoricalIndex([1, 2, 3], name="a"),
                CategoricalIndex([1, 2, 3], name="a"),
            ],
        ),
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
    # GH#49354 - ensure unobserved cats occur when grouping by index levels
    raw = {
        "a": Categorical([1, 1, 2], categories=[1, 2, 3]),
        "a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
        "b": [4, 5, 6],
        "c": [7, 8, 9],
    }
    frame = DataFrame(raw).set_index(["a", "a2"])
    if "b" not in keys:
        # Only keep b when it is used for grouping for consistent columns in the result
        frame = frame.drop(columns="b")

    grouped = frame.groupby(keys, observed=False)
    # Exercise either the SeriesGroupBy or the DataFrameGroupBy path.
    target = grouped["c"] if test_series else grouped
    result = target.sum()

    if len(keys) == 1:
        # Single grouping key: the parametrized value is already the index.
        expected_index = expected_index_levels
    else:
        # Full 3x3 product of the two levels: the outer code repeats each
        # position three times, the inner code cycles through 0, 1, 2.
        outer_codes = [pos for pos in range(3) for _ in range(3)]
        inner_codes = [0, 1, 2] * 3
        expected_index = MultiIndex(
            expected_index_levels,
            codes=[outer_codes, inner_codes],
            names=keys,
        )

    expected_frame = DataFrame({"c": expected_values}, index=expected_index)
    expected = expected_frame["c"] if test_series else expected_frame
    tm.assert_equal(result, expected)
542
+
543
+
490
544
def test_observed_groups_with_nan (observed ):
491
545
# GH 24740
492
546
df = DataFrame (
@@ -1234,11 +1288,12 @@ def df_cat(df):
1234
1288
1235
1289
@pytest .mark .parametrize ("operation" , ["agg" , "apply" ])
1236
1290
def test_seriesgroupby_observed_true (df_cat , operation ):
1237
- # GH 24880
1238
- lev_a = Index (["foo" , "foo" , "bar" , "bar" ], dtype = df_cat ["A" ].dtype , name = "A" )
1239
- lev_b = Index (["one" , "two" , "one" , "three" ], dtype = df_cat ["B" ].dtype , name = "B" )
1291
+ # GH#24880
1292
+ # GH#49223 - order of results was wrong when grouping by index levels
1293
+ lev_a = Index (["bar" , "bar" , "foo" , "foo" ], dtype = df_cat ["A" ].dtype , name = "A" )
1294
+ lev_b = Index (["one" , "three" , "one" , "two" ], dtype = df_cat ["B" ].dtype , name = "B" )
1240
1295
index = MultiIndex .from_arrays ([lev_a , lev_b ])
1241
- expected = Series (data = [1 , 3 , 2 , 4 ], index = index , name = "C" )
1296
+ expected = Series (data = [2 , 4 , 1 , 3 ], index = index , name = "C" )
1242
1297
1243
1298
grouped = df_cat .groupby (["A" , "B" ], observed = True )["C" ]
1244
1299
result = getattr (grouped , operation )(sum )
@@ -1249,6 +1304,7 @@ def test_seriesgroupby_observed_true(df_cat, operation):
1249
1304
@pytest .mark .parametrize ("observed" , [False , None ])
1250
1305
def test_seriesgroupby_observed_false_or_none (df_cat , observed , operation ):
1251
1306
# GH 24880
1307
+ # GH#49223 - order of results was wrong when grouping by index levels
1252
1308
index , _ = MultiIndex .from_product (
1253
1309
[
1254
1310
CategoricalIndex (["bar" , "foo" ], ordered = False ),
@@ -1272,16 +1328,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
1272
1328
True ,
1273
1329
MultiIndex .from_arrays (
1274
1330
[
1275
- Index (["foo " ] * 4 + ["bar " ] * 4 , dtype = "category" , name = "A" ),
1331
+ Index (["bar " ] * 4 + ["foo " ] * 4 , dtype = "category" , name = "A" ),
1276
1332
Index (
1277
- ["one" , "one" , "two " , "two " , "one" , "one" , "three " , "three " ],
1333
+ ["one" , "one" , "three " , "three " , "one" , "one" , "two " , "two " ],
1278
1334
dtype = "category" ,
1279
1335
name = "B" ,
1280
1336
),
1281
1337
Index (["min" , "max" ] * 4 ),
1282
1338
]
1283
1339
),
1284
- [1 , 1 , 3 , 3 , 2 , 2 , 4 , 4 ],
1340
+ [2 , 2 , 4 , 4 , 1 , 1 , 3 , 3 ],
1285
1341
),
1286
1342
(
1287
1343
False ,
@@ -1857,7 +1913,7 @@ def test_category_order_reducer(
1857
1913
if (
1858
1914
reduction_func in ("idxmax" , "idxmin" )
1859
1915
and not observed
1860
- and index_kind == "range "
1916
+ and index_kind != "multi "
1861
1917
):
1862
1918
msg = "GH#10694 - idxmax/min fail with unused categories"
1863
1919
request .node .add_marker (pytest .mark .xfail (reason = msg ))
@@ -2005,10 +2061,13 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde
2005
2061
2006
2062
2007
2063
@pytest .mark .parametrize ("index_kind" , ["range" , "single" , "multi" ])
2008
- def test_many_categories (as_index , sort , index_kind , ordered ):
2064
+ def test_many_categories (request , as_index , sort , index_kind , ordered ):
2009
2065
# GH#48749 - Test when the grouper has many categories
2010
2066
if index_kind != "range" and not as_index :
2011
2067
pytest .skip (reason = "Result doesn't have categories, nothing to test" )
2068
+ if index_kind == "multi" and as_index and not sort and ordered :
2069
+ msg = "GH#48749 - values are unsorted even though the Categorical is ordered"
2070
+ request .node .add_marker (pytest .mark .xfail (reason = msg ))
2012
2071
categories = np .arange (9999 , - 1 , - 1 )
2013
2072
grouper = Categorical ([2 , 1 , 2 , 3 ], categories = categories , ordered = ordered )
2014
2073
df = DataFrame ({"a" : grouper , "b" : range (4 )})
@@ -2025,11 +2084,7 @@ def test_many_categories(as_index, sort, index_kind, ordered):
2025
2084
result = gb .sum ()
2026
2085
2027
2086
# Test is setup so that data and index are the same values
2028
- # TODO: GH#49223 - Order of values should be the same for all index_kinds
2029
- if index_kind == "range" :
2030
- data = [3 , 2 , 1 ] if ordered else [2 , 1 , 3 ]
2031
- else :
2032
- data = [3 , 2 , 1 ] if sort else [2 , 1 , 3 ]
2087
+ data = [3 , 2 , 1 ] if sort or ordered else [2 , 1 , 3 ]
2033
2088
2034
2089
index = CategoricalIndex (
2035
2090
data , categories = grouper .categories , ordered = ordered , name = "a"
0 commit comments