@@ -484,3 +484,75 @@ def f(g):
484
484
485
485
# Aggregation with the Python builtins sum/min/max, grouping first by a single
# key and then by two keys.  `setup` is whatever setup string was assigned
# earlier in the file (not visible in this section).
groupby_agg_builtins1 = Benchmark(
    "df.groupby('jim').agg([sum, min, max])", setup)
groupby_agg_builtins2 = Benchmark(
    "df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
487
+
488
#----------------------------------------------------------------------
# groupby with a large value for ngroups

# Setup source for the large-ngroups benchmarks: it is appended to
# `common_setup` (defined elsewhere in the file) and handed to Benchmark as
# the setup code.  It builds a two-column frame of `size` = ngroups * 10 rows:
#   timestamp -- values of arange(ngroups) picked at random positions
#   value     -- random integers drawn from [0, size)
# The seed is fixed so every run sees the same data.
setup = common_setup + """
np.random.seed(1234)
ngroups = 10000
size = ngroups * 10
rng = np.arange(ngroups)
df = DataFrame(dict(
    timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
    value=np.random.randint(0, size, size=size)
))
"""
501
+
502
# Names of the groupby methods that are benchmarked with an empty argument
# list.  One benchmark is generated per entry, in this order.
no_arg_func_list = [
    'all',
    'any',
    'count',
    'cumcount',
    'cummax',
    'cummin',
    'cumprod',
    'cumsum',
    'describe',
    'diff',
    'first',
    'head',
    'last',
    'mad',
    'max',
    'mean',
    'median',
    'min',
    'nunique',
    'pct_change',
    'prod',
    'rank',
    'sem',
    'size',
    'skew',
    'std',
    'sum',
    'tail',
    'unique',
    'var',
    'value_counts',
]
535
+
536
+
537
# Statement skeleton for the large-ngroups benchmarks; the single %s slot is
# filled with the method call text, e.g. "sum()".
_stmt_template = "df.groupby('value')['timestamp'].%s"
# Passed to every generated benchmark as its `start_date`.
START_DATE = datetime(2011, 7, 1)
539
+
540
+
541
def make_large_ngroups_bmark(func_name, func_args=''):
    """Build the large-ngroups benchmark for one groupby method.

    The timed statement is
    ``df.groupby('value')['timestamp'].<func_name>(<func_args>)``.

    Parameters
    ----------
    func_name : str
        Name of the method called on the grouped ``timestamp`` column.
    func_args : str, optional
        Source text placed between the call parentheses.  Empty by default,
        so the method is called without arguments.

    Returns
    -------
    Benchmark
        The benchmark, with ``name`` set to
        ``'groupby_large_ngroups_<func_name>'``.
    """
    bmark_name = 'groupby_large_ngroups_%s' % func_name
    stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
    # `setup` is the module-level setup string; it is looked up when this
    # function is called, not when it is defined.
    bmark = Benchmark(stmt, setup, start_date=START_DATE)
    # MUST set name: inject_bmark_into_globals rejects a benchmark without one
    bmark.name = bmark_name
    return bmark
548
+
549
+
550
def inject_bmark_into_globals(bmark):
    """Bind ``bmark`` as a module-level global named after ``bmark.name``.

    Parameters
    ----------
    bmark : object
        Benchmark to register.  Its ``name`` attribute becomes the global
        variable name; an existing global of that name is overwritten.

    Raises
    ------
    AssertionError
        If ``bmark`` has no ``name`` attribute, or the name is falsy
        (``None`` or ``''``).
    """
    # getattr so that an object lacking `name` altogether gets the same
    # descriptive error as one whose name is None or empty.
    name = getattr(bmark, 'name', None)
    if not name:
        raise AssertionError('benchmark must have a name')
    globals()[name] = bmark
554
+
555
+
556
# Generate one benchmark per no-argument method and publish it as a module
# global under its own name.  The benchmark is passed straight through rather
# than stored in a loop variable, so no extra module-level name stays bound to
# the last benchmark once the loop finishes.
# NOTE(review): presumably the suite runner discovers benchmarks by scanning
# module globals, in which case a leftover variable would register the last
# benchmark twice -- confirm against the runner.
for func_name in no_arg_func_list:
    inject_bmark_into_globals(make_large_ngroups_bmark(func_name))
0 commit comments