@@ -36,6 +36,41 @@ def f(a):
36
36
return result .reindex (index ).sort_index ()
37
37
38
38
39
# Expected value, per builtin groupby reduction, for categories that never
# occur in the data when the grouper is categorical and observed=False.
# Some reductions are expected to return NaN for such groups, some zero.
# The expectations are the same for SeriesGroupBy and DataFrameGroupBy, so
# they are hardcoded here, in one place, and shared across several tests.
# NaN is spelled ``np.nan`` because the ``np.NaN`` alias was removed in
# NumPy 2.0.
_results_for_groupbys_with_missing_categories = {
    "all": np.nan,
    "any": np.nan,
    "count": 0,
    "corrwith": np.nan,
    "first": np.nan,
    "idxmax": np.nan,
    "idxmin": np.nan,
    "last": np.nan,
    "mad": np.nan,
    "max": np.nan,
    "mean": np.nan,
    "median": np.nan,
    "min": np.nan,
    "nth": np.nan,
    "nunique": 0,
    "prod": np.nan,
    "quantile": np.nan,
    "sem": np.nan,
    "size": 0,
    "skew": np.nan,
    "std": np.nan,
    "sum": 0,
    "var": np.nan,
}
72
+
73
+
39
74
def test_apply_use_categorical_name (df ):
40
75
cats = qcut (df .C , 4 )
41
76
@@ -1263,12 +1298,13 @@ def test_series_groupby_on_2_categoricals_unobserved(
1263
1298
reduction_func : str , observed : bool , request
1264
1299
):
1265
1300
# GH 17605
1266
-
1267
1301
if reduction_func == "ngroup" :
1268
1302
pytest .skip ("ngroup is not truly a reduction" )
1269
1303
1270
1304
if reduction_func == "corrwith" : # GH 32293
1271
- mark = pytest .mark .xfail (reason = "TODO: implemented SeriesGroupBy.corrwith" )
1305
+ mark = pytest .mark .xfail (
1306
+ reason = "TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
1307
+ )
1272
1308
request .node .add_marker (mark )
1273
1309
1274
1310
df = pd .DataFrame (
@@ -1289,36 +1325,30 @@ def test_series_groupby_on_2_categoricals_unobserved(
1289
1325
assert len (result ) == expected_length
1290
1326
1291
1327
1292
- @pytest .mark .parametrize (
1293
- "func, zero_or_nan" ,
1294
- [
1295
- ("all" , np .NaN ),
1296
- ("any" , np .NaN ),
1297
- ("count" , 0 ),
1298
- ("first" , np .NaN ),
1299
- ("idxmax" , np .NaN ),
1300
- ("idxmin" , np .NaN ),
1301
- ("last" , np .NaN ),
1302
- ("mad" , np .NaN ),
1303
- ("max" , np .NaN ),
1304
- ("mean" , np .NaN ),
1305
- ("median" , np .NaN ),
1306
- ("min" , np .NaN ),
1307
- ("nth" , np .NaN ),
1308
- ("nunique" , 0 ),
1309
- ("prod" , np .NaN ),
1310
- ("quantile" , np .NaN ),
1311
- ("sem" , np .NaN ),
1312
- ("size" , 0 ),
1313
- ("skew" , np .NaN ),
1314
- ("std" , np .NaN ),
1315
- ("sum" , np .NaN ),
1316
- ("var" , np .NaN ),
1317
- ],
1318
- )
1319
- def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans (func , zero_or_nan ):
1328
+ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans (
1329
+ reduction_func : str , request
1330
+ ):
1320
1331
# GH 17605
1321
1332
# Tests whether the unobserved categories in the result contain 0 or NaN
1333
+
1334
+ if reduction_func == "ngroup" :
1335
+ pytest .skip ("ngroup is not truly a reduction" )
1336
+
1337
+ if reduction_func == "corrwith" : # GH 32293
1338
+ mark = pytest .mark .xfail (
1339
+ reason = "TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
1340
+ )
1341
+ request .node .add_marker (mark )
1342
+
1343
+ if reduction_func == "sum" : # GH 31422
1344
+ mark = pytest .mark .xfail (
1345
+ reason = (
1346
+ "sum should return 0 but currently returns NaN. "
1347
+ "This is a known bug. See GH 31422."
1348
+ )
1349
+ )
1350
+ request .node .add_marker (mark )
1351
+
1322
1352
df = pd .DataFrame (
1323
1353
{
1324
1354
"cat_1" : pd .Categorical (list ("AABB" ), categories = list ("ABC" )),
@@ -1327,12 +1357,14 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
1327
1357
}
1328
1358
)
1329
1359
unobserved = [tuple ("AC" ), tuple ("BC" ), tuple ("CA" ), tuple ("CB" ), tuple ("CC" )]
1330
- args = {"nth" : [0 ]}.get (func , [])
1360
+ args = {"nth" : [0 ]}.get (reduction_func , [])
1331
1361
1332
1362
series_groupby = df .groupby (["cat_1" , "cat_2" ], observed = False )["value" ]
1333
- agg = getattr (series_groupby , func )
1363
+ agg = getattr (series_groupby , reduction_func )
1334
1364
result = agg (* args )
1335
1365
1366
+ zero_or_nan = _results_for_groupbys_with_missing_categories [reduction_func ]
1367
+
1336
1368
for idx in unobserved :
1337
1369
val = result .loc [idx ]
1338
1370
assert (pd .isna (zero_or_nan ) and pd .isna (val )) or (val == zero_or_nan )
@@ -1342,6 +1374,84 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
1342
1374
assert np .issubdtype (result .dtype , np .integer )
1343
1375
1344
1376
1377
+ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true (reduction_func : str ):
1378
+ # GH 23865
1379
+ # GH 27075
1380
+ # Ensure that df.groupby, when 'by' is two pd.Categorical variables,
1381
+ # does not return the categories that are not in df when observed=True
1382
+ if reduction_func == "ngroup" :
1383
+ pytest .skip ("ngroup does not return the Categories on the index" )
1384
+
1385
+ df = pd .DataFrame (
1386
+ {
1387
+ "cat_1" : pd .Categorical (list ("AABB" ), categories = list ("ABC" )),
1388
+ "cat_2" : pd .Categorical (list ("1111" ), categories = list ("12" )),
1389
+ "value" : [0.1 , 0.1 , 0.1 , 0.1 ],
1390
+ }
1391
+ )
1392
+ unobserved_cats = [("A" , "2" ), ("B" , "2" ), ("C" , "1" ), ("C" , "2" )]
1393
+
1394
+ df_grp = df .groupby (["cat_1" , "cat_2" ], observed = True )
1395
+
1396
+ args = {"nth" : [0 ], "corrwith" : [df ]}.get (reduction_func , [])
1397
+ res = getattr (df_grp , reduction_func )(* args )
1398
+
1399
+ for cat in unobserved_cats :
1400
+ assert cat not in res .index
1401
+
1402
+
1403
@pytest.mark.parametrize("observed", [False, None])
def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
    reduction_func: str, observed: bool, request
):
    """Check the values reported for unobserved category pairs.

    GH 23865, GH 27075: when ``by`` is two categorical columns and
    ``observed`` is False or None, the result of a DataFrameGroupBy reduction
    must contain the category combinations that never occur in the frame.
    The value expected for those rows (NaN or 0) is looked up in
    ``_results_for_groupbys_with_missing_categories``.
    """
    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    if reduction_func == "count":  # GH 35028
        mark = pytest.mark.xfail(
            reason=(
                "DataFrameGroupBy.count returns np.NaN for missing "
                "categories, when it should return 0. See GH 35028"
            )
        )
        request.node.add_marker(mark)

    if reduction_func == "sum":  # GH 31422
        mark = pytest.mark.xfail(
            reason=(
                "sum should return 0 but currently returns NaN. "
                "This is a known bug. See GH 31422."
            )
        )
        request.node.add_marker(mark)

    df = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    # Only ("A", "1") and ("B", "1") occur in the frame.
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

    # A few reductions need positional arguments; everything else takes none.
    args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
    res = getattr(df_grp, reduction_func)(*args)

    expected = _results_for_groupbys_with_missing_categories[reduction_func]
    unobserved_rows = res.loc[unobserved_cats]

    # Use pd.isna rather than ``expected is np.nan``: an identity check only
    # passes when the table stores the np.nan singleton itself, and would
    # silently take the equality branch (NaN == NaN is always False) for any
    # other NaN object.
    if pd.isna(expected):
        assert unobserved_rows.isnull().all().all()
    else:
        assert (unobserved_rows == expected).all().all()
1453
+
1454
+
1345
1455
def test_series_groupby_categorical_aggregation_getitem ():
1346
1456
# GH 8870
1347
1457
d = {"foo" : [10 , 8 , 4 , 1 ], "bar" : [10 , 20 , 30 , 40 ], "baz" : ["d" , "c" , "d" , "c" ]}
0 commit comments