@@ -393,74 +393,91 @@ def test_groupby_drop_nan_with_multi_index():
393
393
tm .assert_frame_equal (result , expected )
394
394
395
395
396
# sequence_index enumerates all strings made up of x, y, z of length 4
@pytest.mark.parametrize("sequence_index", range(3**4))
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "UInt8",
        "Int8",
        "UInt16",
        "Int16",
        "UInt32",
        "Int32",
        "UInt64",
        "Int64",
        "Float32",
        "Float64",
        "category",
        "string",
        pytest.param(
            "string[pyarrow]",
            marks=pytest.mark.skipif(
                pa_version_under1p01, reason="pyarrow is not installed"
            ),
        ),
        "datetime64[ns]",
        "period[d]",
        "Sparse[float]",
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
    # GH#46584, GH#48794
    #
    # With sort=False and dropna=False, groupby must keep null groups and
    # preserve first-appearance order of the keys, for every supported dtype.

    # Convert sequence_index into a string sequence, e.g. 5 becomes "zyxx"
    # (base-3 digits, least-significant first). This sequence is used for
    # the grouper, so all 81 arrangements of x/y/null keys are exercised.
    sequence = "".join(
        [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
    )

    if dtype == "category" and "z" in sequence:
        # Only xfail when nulls are present
        msg = "dropna=False not correct for categorical, GH#48645"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    # Unique values to use for grouper, depends on dtype; "z" always maps to
    # the dtype's null value.
    if dtype in ("string", "string[pyarrow]"):
        uniques = {"x": "x", "y": "y", "z": pd.NA}
    elif dtype in ("datetime64[ns]", "period[d]"):
        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
    else:
        uniques = {"x": 1, "y": 2, "z": np.nan}

    df = pd.DataFrame(
        {
            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
            "a": [0, 1, 2, 3],
        }
    )
    gb = df.groupby("key", dropna=False, sort=False)
    if test_series:
        gb = gb["a"]
    result = gb.sum()

    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
    # issues with hashing np.nan
    summed = {}
    for idx, label in enumerate(sequence):
        summed[label] = summed.get(label, 0) + idx
    if dtype == "category":
        # Categories exclude the null; the dict-comprehension deduplicates
        # while keeping first-appearance order.
        index = pd.CategoricalIndex(
            [uniques[e] for e in summed],
            list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
            name="key",
        )
    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
        index = pd.Index(
            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
        )
    else:
        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
    if not test_series:
        expected = expected.to_frame()

    tm.assert_equal(result, expected)
465
482
466
483
0 commit comments