Commit 3d5db57

BENCH: programmatically create benchmarks for large ngroups (GH6787)
1 parent e487a30 commit 3d5db57

File tree

doc/source/v0.15.0.txt
vb_suite/groupby.py

2 files changed: +73 −0 lines changed

doc/source/v0.15.0.txt (+1)
@@ -813,6 +813,7 @@ Performance
 - Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
 - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
 - Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
+- Performance benchmarking of groupby for large value of ngroups (:issue:`6787`)

vb_suite/groupby.py (+72)
@@ -484,3 +484,75 @@ def f(g):
 
 groupby_agg_builtins1 = Benchmark("df.groupby('jim').agg([sum, min, max])", setup)
 groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
+
+#----------------------------------------------------------------------
+# groupby with a large value for ngroups
+
+setup = common_setup + """
+np.random.seed(1234)
+ngroups = 10000
+size = ngroups * 10
+rng = np.arange(ngroups)
+df = DataFrame(dict(
+    timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
+    value=np.random.randint(0, size, size=size)
+))
+"""
+
+no_arg_func_list = [
+    'all',
+    'any',
+    'count',
+    'cumcount',
+    'cummax',
+    'cummin',
+    'cumprod',
+    'cumsum',
+    'describe',
+    'diff',
+    'first',
+    'head',
+    'last',
+    'mad',
+    'max',
+    'mean',
+    'median',
+    'min',
+    'nunique',
+    'pct_change',
+    'prod',
+    'rank',
+    'sem',
+    'size',
+    'skew',
+    'std',
+    'sum',
+    'tail',
+    'unique',
+    'var',
+    'value_counts',
+]
+
+
+_stmt_template = "df.groupby('value')['timestamp'].%s"
+START_DATE = datetime(2011, 7, 1)
+
+
+def make_large_ngroups_bmark(func_name, func_args=''):
+    bmark_name = 'groupby_large_ngroups_%s' % func_name
+    stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
+    bmark = Benchmark(stmt, setup, start_date=START_DATE)
+    # MUST set name
+    bmark.name = bmark_name
+    return bmark
+
+
+def inject_bmark_into_globals(bmark):
+    if not bmark.name:
+        raise AssertionError('benchmark must have a name')
+    globals()[bmark.name] = bmark
+
+
+for func_name in no_arg_func_list:
+    bmark = make_large_ngroups_bmark(func_name)
+    inject_bmark_into_globals(bmark)
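
For reference, a minimal sketch (not part of the commit) of what the generation loop above produces: each entry in no_arg_func_list is formatted into the statement template as a no-argument call, and the resulting Benchmark is bound to a predictable module-level name so the vbench runner can discover it. The preview helper below is hypothetical and only mirrors the string handling; the real objects are built by make_large_ngroups_bmark with vbench's Benchmark class.

# Hypothetical helper mirroring make_large_ngroups_bmark's string handling;
# it does not construct vbench Benchmark objects.
_stmt_template = "df.groupby('value')['timestamp'].%s"

def preview(func_name, func_args=''):
    # Same formatting as the commit: the benchmark's global name and the statement it times.
    name = 'groupby_large_ngroups_%s' % func_name
    stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
    return name, stmt

print(preview('sum'))
# ('groupby_large_ngroups_sum', "df.groupby('value')['timestamp'].sum()")
print(preview('value_counts'))
# ('groupby_large_ngroups_value_counts', "df.groupby('value')['timestamp'].value_counts()")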
