19
19
import pandas ._testing as tm
20
20
21
21
22
- def cartesian_product_for_groupers (result , args , names ):
22
+ def cartesian_product_for_groupers (result , args , names , fill_value = np . NaN ):
23
23
""" Reindex to a cartesian product for the groupers,
24
24
preserving the nature (Categorical) of each grouper
25
25
"""
@@ -33,7 +33,7 @@ def f(a):
33
33
return a
34
34
35
35
index = MultiIndex .from_product (map (f , args ), names = names )
36
- return result .reindex (index ).sort_index ()
36
+ return result .reindex (index , fill_value = fill_value ).sort_index ()
37
37
38
38
39
39
_results_for_groupbys_with_missing_categories = dict (
@@ -309,7 +309,7 @@ def test_observed(observed):
309
309
result = gb .sum ()
310
310
if not observed :
311
311
expected = cartesian_product_for_groupers (
312
- expected , [cat1 , cat2 , ["foo" , "bar" ]], list ("ABC" )
312
+ expected , [cat1 , cat2 , ["foo" , "bar" ]], list ("ABC" ), fill_value = 0
313
313
)
314
314
315
315
tm .assert_frame_equal (result , expected )
@@ -319,7 +319,9 @@ def test_observed(observed):
319
319
expected = DataFrame ({"values" : [1 , 2 , 3 , 4 ]}, index = exp_index )
320
320
result = gb .sum ()
321
321
if not observed :
322
- expected = cartesian_product_for_groupers (expected , [cat1 , cat2 ], list ("AB" ))
322
+ expected = cartesian_product_for_groupers (
323
+ expected , [cat1 , cat2 ], list ("AB" ), fill_value = 0
324
+ )
323
325
324
326
tm .assert_frame_equal (result , expected )
325
327
@@ -1188,9 +1190,10 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
1188
1190
names = ["A" , "B" ],
1189
1191
).sortlevel ()
1190
1192
1191
- expected = Series (data = [2 , 4 , np . nan , 1 , np . nan , 3 ], index = index , name = "C" )
1193
+ expected = Series (data = [2 , 4 , 0 , 1 , 0 , 3 ], index = index , name = "C" )
1192
1194
grouped = df_cat .groupby (["A" , "B" ], observed = observed )["C" ]
1193
1195
result = getattr (grouped , operation )(sum )
1196
+
1194
1197
tm .assert_series_equal (result , expected )
1195
1198
1196
1199
@@ -1340,15 +1343,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
1340
1343
)
1341
1344
request .node .add_marker (mark )
1342
1345
1343
- if reduction_func == "sum" : # GH 31422
1344
- mark = pytest .mark .xfail (
1345
- reason = (
1346
- "sum should return 0 but currently returns NaN. "
1347
- "This is a known bug. See GH 31422."
1348
- )
1349
- )
1350
- request .node .add_marker (mark )
1351
-
1352
1346
df = pd .DataFrame (
1353
1347
{
1354
1348
"cat_1" : pd .Categorical (list ("AABB" ), categories = list ("ABC" )),
@@ -1369,8 +1363,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
1369
1363
val = result .loc [idx ]
1370
1364
assert (pd .isna (zero_or_nan ) and pd .isna (val )) or (val == zero_or_nan )
1371
1365
1372
- # If we expect unobserved values to be zero, we also expect the dtype to be int
1373
- if zero_or_nan == 0 :
1366
+ # If we expect unobserved values to be zero, we also expect the dtype to be int.
1367
+ # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
1368
+ # sums have decimals), then the zeros for the missing categories should also be
1369
+ # floats.
1370
+ if zero_or_nan == 0 and reduction_func != "sum" :
1374
1371
assert np .issubdtype (result .dtype , np .integer )
1375
1372
1376
1373
@@ -1412,15 +1409,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
1412
1409
if reduction_func == "ngroup" :
1413
1410
pytest .skip ("ngroup does not return the Categories on the index" )
1414
1411
1415
- if reduction_func == "sum" : # GH 31422
1416
- mark = pytest .mark .xfail (
1417
- reason = (
1418
- "sum should return 0 but currently returns NaN. "
1419
- "This is a known bug. See GH 31422."
1420
- )
1421
- )
1422
- request .node .add_marker (mark )
1423
-
1424
1412
df = pd .DataFrame (
1425
1413
{
1426
1414
"cat_1" : pd .Categorical (list ("AABB" ), categories = list ("ABC" )),
0 commit comments