diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index 8c0e193ec6348..0d003b9f80588 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -813,6 +813,7 @@ Performance
 - Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
 - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
 - Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
+- Performance benchmarking of ``groupby`` for large values of ``ngroups`` (:issue:`6787`)
 
 
 
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index c9746359b6ecd..ec1befa53d383 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -484,3 +484,109 @@ def f(g):
 
 groupby_agg_builtins1 = Benchmark("df.groupby('jim').agg([sum, min, max])", setup)
 groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
+
+#----------------------------------------------------------------------
+# groupby with a variable value for ngroups
+
+
+ngroups_list = [100, 10000]
+no_arg_func_list = [
+    'all',
+    'any',
+    'count',
+    'cumcount',
+    'cummax',
+    'cummin',
+    'cumprod',
+    'cumsum',
+    'describe',
+    'diff',
+    'first',
+    'head',
+    'last',
+    'mad',
+    'max',
+    'mean',
+    'median',
+    'min',
+    'nunique',
+    'pct_change',
+    'prod',
+    'rank',
+    'sem',
+    'size',
+    'skew',
+    'std',
+    'sum',
+    'tail',
+    'unique',
+    'var',
+    'value_counts',
+]
+
+
+_stmt_template = "df.groupby('value')['timestamp'].%s"
+# NOTE(review): the frame is grouped by 'value', which is drawn from
+# [0, size), so the number of distinct groups grows with ngroups but is
+# not equal to it.
+_setup_template = common_setup + """
+np.random.seed(1234)
+ngroups = %s
+size = ngroups * 10
+rng = np.arange(ngroups)
+df = DataFrame(dict(
+    timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
+    value=np.random.randint(0, size, size=size)
+))
+"""
+START_DATE = datetime(2011, 7, 1)
+
+
+def make_large_ngroups_bmark(ngroups, func_name, func_args=''):
+    """Build a named Benchmark that times one groupby method call.
+
+    Parameters
+    ----------
+    ngroups : int
+        Value substituted for ``ngroups`` in the setup code.
+    func_name : str
+        Name of the method called on the grouped column, e.g. ``'sum'``.
+    func_args : str, optional
+        Source text placed inside the call parentheses (empty by default).
+
+    Returns
+    -------
+    Benchmark
+        Benchmark named ``groupby_ngroups_<ngroups>_<func_name>``.
+    """
+    call = '%s(%s)' % (func_name, func_args)
+    bmark = Benchmark(_stmt_template % call, _setup_template % (ngroups,),
+                      start_date=START_DATE)
+    # The name is not passed to the constructor, so it MUST be set here:
+    # inject_bmark_into_globals() rejects a benchmark without one.
+    bmark.name = 'groupby_ngroups_%s_%s' % (ngroups, func_name)
+    return bmark
+
+
+def inject_bmark_into_globals(bmark):
+    """Publish ``bmark`` as a module-level global named ``bmark.name``.
+
+    Raises
+    ------
+    AssertionError
+        If ``bmark.name`` is empty or None.
+    """
+    if not bmark.name:
+        raise AssertionError('benchmark must have a name')
+    globals()[bmark.name] = bmark
+
+
+# Register one benchmark per (ngroups, method) pair.  The result is passed
+# straight to inject_bmark_into_globals() instead of being bound to a
+# module-level ``bmark`` variable, which would leave a second global
+# reference to the last Benchmark (a problem for any collector that scans
+# module globals for Benchmark instances).
+for ngroups in ngroups_list:
+    for func_name in no_arg_func_list:
+        inject_bmark_into_globals(
+            make_large_ngroups_bmark(ngroups, func_name))