Commit 3d5db57

BENCH: programmatically create benchmarks for large ngroups (GH6787)
1 parent e487a30 commit 3d5db57

File tree

doc/source/v0.15.0.txt
vb_suite/groupby.py

2 files changed: +73 −0 lines changed

doc/source/v0.15.0.txt (+1)
@@ -813,6 +813,7 @@ Performance
 - Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
 - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
 - Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
+- Performance benchmarking of groupby for large value of ngroups (:issue:`6787`)

vb_suite/groupby.py (+72)
@@ -484,3 +484,75 @@ def f(g):
 
 groupby_agg_builtins1 = Benchmark("df.groupby('jim').agg([sum, min, max])", setup)
 groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
+
+#----------------------------------------------------------------------
+# groupby with a large value for ngroups
+
+setup = common_setup + """
+np.random.seed(1234)
+ngroups = 10000
+size = ngroups * 10
+rng = np.arange(ngroups)
+df = DataFrame(dict(
+    timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
+    value=np.random.randint(0, size, size=size)
+))
+"""
+
+no_arg_func_list = [
+    'all',
+    'any',
+    'count',
+    'cumcount',
+    'cummax',
+    'cummin',
+    'cumprod',
+    'cumsum',
+    'describe',
+    'diff',
+    'first',
+    'head',
+    'last',
+    'mad',
+    'max',
+    'mean',
+    'median',
+    'min',
+    'nunique',
+    'pct_change',
+    'prod',
+    'rank',
+    'sem',
+    'size',
+    'skew',
+    'std',
+    'sum',
+    'tail',
+    'unique',
+    'var',
+    'value_counts',
+]
+
+
+_stmt_template = "df.groupby('value')['timestamp'].%s"
+START_DATE = datetime(2011, 7, 1)
+
+
+def make_large_ngroups_bmark(func_name, func_args=''):
+    bmark_name = 'groupby_large_ngroups_%s' % func_name
+    stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
+    bmark = Benchmark(stmt, setup, start_date=START_DATE)
+    # MUST set name
+    bmark.name = bmark_name
+    return bmark
+
+
+def inject_bmark_into_globals(bmark):
+    if not bmark.name:
+        raise AssertionError('benchmark must have a name')
+    globals()[bmark.name] = bmark
+
+
+for func_name in no_arg_func_list:
+    bmark = make_large_ngroups_bmark(func_name)
+    inject_bmark_into_globals(bmark)
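
For reference, a minimal sketch (not part of the commit) of what the generation loop above produces: each entry in no_arg_func_list is formatted into the statement template as a no-argument call, and the resulting Benchmark is bound to a predictable module-level name so the vbench runner can discover it. The preview helper below is hypothetical and only mirrors the string handling; the real objects are built by make_large_ngroups_bmark with vbench's Benchmark class.

# Hypothetical helper mirroring make_large_ngroups_bmark's string handling;
# it does not construct vbench Benchmark objects.
_stmt_template = "df.groupby('value')['timestamp'].%s"

def preview(func_name, func_args=''):
    # Same formatting as the commit: the benchmark's global name and the statement it times.
    name = 'groupby_large_ngroups_%s' % func_name
    stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
    return name, stmt

print(preview('sum'))
# ('groupby_large_ngroups_sum', "df.groupby('value')['timestamp'].sum()")
print(preview('value_counts'))
# ('groupby_large_ngroups_value_counts', "df.groupby('value')['timestamp'].value_counts()")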
