@@ -308,66 +308,130 @@ def test_data_frame_value_counts_dropna(
308
308
tm .assert_series_equal (result_frame_groupby , expected )
309
309
310
310
311
- @pytest .mark .parametrize ("as_index" , [False , True ])
311
+ @pytest .mark .parametrize ("as_index" , [True , False ])
312
312
@pytest .mark .parametrize (
313
313
"observed, expected_index" ,
314
314
[
315
315
(
316
316
False ,
317
317
[
318
- ("FR" , "male " , "low " ),
319
- ("FR" , "female " , "high " ),
320
- ("FR" , "male " , "medium " ),
321
- ("FR" , "female " , "low " ),
322
- ("FR" , "female " , "medium " ),
323
- ("FR" , "male " , "high " ),
324
- ("US" , "female " , "high " ),
325
- ("US" , "male " , "low " ),
326
- ("US" , "female " , "low " ),
327
- ("US" , "female " , "medium " ),
328
- ("US" , "male " , "high " ),
329
- ("US" , "male " , "medium " ),
318
+ ("FR" , "high " , "female " ),
319
+ ("FR" , "high " , "male " ),
320
+ ("FR" , "low " , "male " ),
321
+ ("FR" , "low " , "female " ),
322
+ ("FR" , "medium " , "male " ),
323
+ ("FR" , "medium " , "female " ),
324
+ ("US" , "high " , "female " ),
325
+ ("US" , "high " , "male " ),
326
+ ("US" , "low " , "male " ),
327
+ ("US" , "low " , "female " ),
328
+ ("US" , "medium " , "female " ),
329
+ ("US" , "medium " , "male " ),
330
330
],
331
331
),
332
332
(
333
333
True ,
334
334
[
335
- ("FR" , "male " , "low " ),
336
- ("FR" , "female " , "high " ),
337
- ("FR" , "male " , "medium " ),
338
- ("US" , "female " , "high " ),
339
- ("US" , "male " , "low " ),
335
+ ("FR" , "high " , "female " ),
336
+ ("FR" , "low " , "male " ),
337
+ ("FR" , "medium " , "male " ),
338
+ ("US" , "high " , "female " ),
339
+ ("US" , "low " , "male " ),
340
340
],
341
341
),
342
342
],
343
343
)
344
344
@pytest .mark .parametrize (
345
345
"normalize, expected_data" ,
346
346
[
347
- (False , np .array ([2 , 1 , 1 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 ], dtype = np .int64 )),
347
+ (False , np .array ([1 , 0 , 2 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 0 , 0 ], dtype = np .int64 )),
348
348
(
349
349
True ,
350
- np .array ([0.5 , 0.25 , 0.25 , 0.0 , 0.0 , 0.0 , 0.5 , 0.5 , 0.0 , 0.0 , 0.0 , 0.0 ]),
350
+ # NaN values correspond to non-observed groups
351
+ np .array (
352
+ [1.0 , 0.0 , 1.0 , 0.0 , 1.0 , 0.0 , 1.0 , 0.0 , 1.0 , 0.0 , np .nan , np .nan ]
353
+ ),
351
354
),
352
355
],
353
356
)
354
- def test_categorical (
357
+ def test_categorical_groupers (
355
358
education_df , as_index , observed , expected_index , normalize , expected_data
356
359
):
357
- # Test categorical data whether or not observed
358
- gp = education_df .astype ("category" ).groupby (
359
- "country" , as_index = as_index , observed = observed
360
+ education_df = education_df .copy ()
361
+ education_df ["country" ] = education_df ["country" ].astype ("category" )
362
+ education_df ["education" ] = education_df ["education" ].astype ("category" )
363
+
364
+ gp = education_df .groupby (
365
+ ["country" , "education" ], as_index = as_index , observed = observed
360
366
)
361
367
result = gp .value_counts (normalize = normalize )
362
368
363
369
expected_series = Series (
364
370
data = expected_data [expected_data > 0.0 ] if observed else expected_data ,
371
+ index = MultiIndex .from_tuples (
372
+ expected_index ,
373
+ names = ["country" , "education" , "gender" ],
374
+ ),
375
+ )
376
+ for i in range (2 ):
377
+ expected_series .index = expected_series .index .set_levels (
378
+ CategoricalIndex (expected_series .index .levels [i ]), level = i
379
+ )
380
+
381
+ if as_index :
382
+ tm .assert_series_equal (result , expected_series )
383
+ else :
384
+ expected = expected_series .reset_index (
385
+ name = "proportion" if normalize else "count"
386
+ )
387
+ tm .assert_frame_equal (result , expected )
388
+
389
+
390
+ @pytest .mark .parametrize ("as_index" , [False , True ])
391
+ @pytest .mark .parametrize ("observed" , [False , True ])
392
+ @pytest .mark .parametrize (
393
+ "normalize, expected_data" ,
394
+ [
395
+ (False , np .array ([2 , 1 , 1 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 ], dtype = np .int64 )),
396
+ (
397
+ True ,
398
+ # 0.0 values correspond to non-observed category combinations
399
+ np .array ([0.5 , 0.25 , 0.25 , 0.0 , 0.0 , 0.0 , 0.5 , 0.5 , 0.0 , 0.0 , 0.0 , 0.0 ]),
400
+ ),
401
+ ],
402
+ )
403
+ def test_categorical_values (education_df , as_index , observed , normalize , expected_data ):
404
+ # Test non-observed categories are included in the result,
405
+ # regardless of `observed`
406
+ education_df = education_df .copy ()
407
+ education_df ["gender" ] = education_df ["gender" ].astype ("category" )
408
+ education_df ["education" ] = education_df ["education" ].astype ("category" )
409
+
410
+ gp = education_df .groupby ("country" , as_index = as_index , observed = observed )
411
+ result = gp .value_counts (normalize = normalize )
412
+
413
+ expected_index = [
414
+ ("FR" , "male" , "low" ),
415
+ ("FR" , "female" , "high" ),
416
+ ("FR" , "male" , "medium" ),
417
+ ("FR" , "female" , "low" ),
418
+ ("FR" , "female" , "medium" ),
419
+ ("FR" , "male" , "high" ),
420
+ ("US" , "female" , "high" ),
421
+ ("US" , "male" , "low" ),
422
+ ("US" , "female" , "low" ),
423
+ ("US" , "female" , "medium" ),
424
+ ("US" , "male" , "high" ),
425
+ ("US" , "male" , "medium" ),
426
+ ]
427
+ expected_series = Series (
428
+ data = expected_data ,
365
429
index = MultiIndex .from_tuples (
366
430
expected_index ,
367
431
names = ["country" , "gender" , "education" ],
368
432
),
369
433
)
370
- for i in range (3 ):
434
+ for i in range (1 , 3 ):
371
435
expected_series .index = expected_series .index .set_levels (
372
436
CategoricalIndex (expected_series .index .levels [i ]), level = i
373
437
)
0 commit comments