@@ -1259,11 +1259,100 @@ def test_get_nonexistent_category():
1259
1259
)
1260
1260
1261
1261
1262
+ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true (
1263
+ reduction_func :str ):
1264
+
1265
+ if reduction_func == 'ngroup' :
1266
+ pytest .skip ("ngroup does not return the Categories on the index" )
1267
+
1268
+ res , unobserved_cats = _dataframe_groupby_on_2_categoricals (
1269
+ reduction_func , observed = True )
1270
+
1271
+ for cat in unobserved_cats :
1272
+ assert cat not in res .index
1273
+
1274
+
1275
+ def _dataframe_groupby_on_2_categoricals (reduction_func :str , observed :bool ):
1276
+
1277
+ df = pd .DataFrame ({
1278
+ "cat_1" : pd .Categorical (list ("AABB" ), categories = list ("ABC" )),
1279
+ "cat_2" : pd .Categorical (list ("1111" ), categories = list ("12" )),
1280
+ "value" : [.1 , .1 , .1 , .1 ]
1281
+ })
1282
+ unobserved_cats = [
1283
+ ('A' , '2' ),
1284
+ ('B' , '2' ),
1285
+ ('C' , '1' ),
1286
+ ('C' , '2' )
1287
+ ]
1288
+
1289
+ df_grp = df .groupby (['cat_1' , 'cat_2' ], observed = observed )
1290
+
1291
+ args = {
1292
+ 'nth' : [0 ],
1293
+ 'corrwith' : [df ]
1294
+ }.get (reduction_func , [])
1295
+ res = getattr (df_grp , reduction_func )(* args )
1296
+
1297
+ return res , unobserved_cats
1298
+
1299
+
1300
+ _results_for_groupbys_with_missing_categories = dict ([
1301
+ ("all" , np .NaN ),
1302
+ ("any" , np .NaN ),
1303
+ ("count" , 0 ),
1304
+ ("corrwith" , np .NaN ),
1305
+ ("first" , np .NaN ),
1306
+ ("idxmax" , np .NaN ),
1307
+ ("idxmin" , np .NaN ),
1308
+ ("last" , np .NaN ),
1309
+ ("mad" , np .NaN ),
1310
+ ("max" , np .NaN ),
1311
+ ("mean" , np .NaN ),
1312
+ ("median" , np .NaN ),
1313
+ ("min" , np .NaN ),
1314
+ ("nth" , np .NaN ),
1315
+ ("nunique" , 0 ),
1316
+ ("prod" , np .NaN ),
1317
+ ("quantile" , np .NaN ),
1318
+ ("sem" , np .NaN ),
1319
+ ("size" , 0 ),
1320
+ ("skew" , np .NaN ),
1321
+ ("std" , np .NaN ),
1322
+ ("sum" , np .NaN ),
1323
+ ("var" , np .NaN ),
1324
+ ])
1325
+
1326
+
1327
+ @pytest .mark .parametrize ('observed' , [False , None ])
1328
+ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false (
1329
+ reduction_func :str , observed :bool , request ):
1330
+
1331
+ if reduction_func == 'ngroup' :
1332
+ pytest .skip ("ngroup does not return the Categories on the index" )
1333
+
1334
+ if reduction_func == 'count' :
1335
+ mark = pytest .mark .xfail (
1336
+ reason = ("DataFrameGroupBy.count returns np.NaN for missing "
1337
+ "categories, when it should return 0" ))
1338
+ request .node .add_marker (mark )
1339
+
1340
+ res , unobserved_cats = _dataframe_groupby_on_2_categoricals (
1341
+ reduction_func , observed )
1342
+
1343
+ expected = _results_for_groupbys_with_missing_categories [reduction_func ]
1344
+
1345
+ if expected is np .nan :
1346
+ assert res .loc [unobserved_cats ].isnull ().all ().all ()
1347
+ else :
1348
+ assert (res .loc [unobserved_cats ] == expected ).all ().all ()
1349
+
1350
+
1351
+
1262
1352
def test_series_groupby_on_2_categoricals_unobserved (
1263
1353
reduction_func : str , observed : bool , request
1264
1354
):
1265
1355
# GH 17605
1266
-
1267
1356
if reduction_func == "ngroup" :
1268
1357
pytest .skip ("ngroup is not truly a reduction" )
1269
1358
@@ -1289,36 +1378,18 @@ def test_series_groupby_on_2_categoricals_unobserved(
1289
1378
assert len (result ) == expected_length
1290
1379
1291
1380
1292
- @pytest .mark .parametrize (
1293
- "func, zero_or_nan" ,
1294
- [
1295
- ("all" , np .NaN ),
1296
- ("any" , np .NaN ),
1297
- ("count" , 0 ),
1298
- ("first" , np .NaN ),
1299
- ("idxmax" , np .NaN ),
1300
- ("idxmin" , np .NaN ),
1301
- ("last" , np .NaN ),
1302
- ("mad" , np .NaN ),
1303
- ("max" , np .NaN ),
1304
- ("mean" , np .NaN ),
1305
- ("median" , np .NaN ),
1306
- ("min" , np .NaN ),
1307
- ("nth" , np .NaN ),
1308
- ("nunique" , 0 ),
1309
- ("prod" , np .NaN ),
1310
- ("quantile" , np .NaN ),
1311
- ("sem" , np .NaN ),
1312
- ("size" , 0 ),
1313
- ("skew" , np .NaN ),
1314
- ("std" , np .NaN ),
1315
- ("sum" , np .NaN ),
1316
- ("var" , np .NaN ),
1317
- ],
1318
- )
1319
- def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans (func , zero_or_nan ):
1381
+ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans (
1382
+ reduction_func :str , request ):
1320
1383
# GH 17605
1321
1384
# Tests whether the unobserved categories in the result contain 0 or NaN
1385
+
1386
+ if reduction_func == "ngroup" :
1387
+ pytest .skip ("ngroup is not truly a reduction" )
1388
+
1389
+ if reduction_func == "corrwith" : # GH 32293
1390
+ mark = pytest .mark .xfail (reason = "TODO: implemented SeriesGroupBy.corrwith" )
1391
+ request .node .add_marker (mark )
1392
+
1322
1393
df = pd .DataFrame (
1323
1394
{
1324
1395
"cat_1" : pd .Categorical (list ("AABB" ), categories = list ("ABC" )),
@@ -1327,11 +1398,13 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
1327
1398
}
1328
1399
)
1329
1400
unobserved = [tuple ("AC" ), tuple ("BC" ), tuple ("CA" ), tuple ("CB" ), tuple ("CC" )]
1330
- args = {"nth" : [0 ]}.get (func , [])
1401
+ args = {"nth" : [0 ]}.get (reduction_func , [])
1331
1402
1332
1403
series_groupby = df .groupby (["cat_1" , "cat_2" ], observed = False )["value" ]
1333
- agg = getattr (series_groupby , func )
1404
+ agg = getattr (series_groupby , reduction_func )
1334
1405
result = agg (* args )
1406
+
1407
+ zero_or_nan = _results_for_groupbys_with_missing_categories [reduction_func ]
1335
1408
1336
1409
for idx in unobserved :
1337
1410
val = result .loc [idx ]
0 commit comments