Skip to content

BLD: add benchmarks for all single-axis indexers #6450

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions vb_suite/extras_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#----------------------------------------------------------------------
# Thorough checks of all containers and all indexing types

from vbench.benchmark import Benchmark

SECTION = 'Exhaustive check of indexing and scalar value access'

common_setup = """from pandas_vb_common import *
"""


import pandas.util.testing as tm

MAX_ENTRIES = 100000

# FIXME: makeCustomIndexWithCache reimplements (sort of) tm.makeCustomIndex,
# because the latter doesn't offer customization of date/period index
# frequencies and integer index offset.

setup_template = common_setup + """
import sys
import pandas as pd

try:
make_index = tm.makeCustomIndexWithCache
except AttributeError:
MAX_ENTRIES = %(MAX_ENTRIES)s
_indices = {}

def makeCustomIndexWithCache(nentries, idx_type):
assert nentries <= MAX_ENTRIES

key = idx_type
try:
full_idx = _indices[key]
except KeyError:
if idx_type == 'mi':
full_idx = tm.makeCustomIndex(nentries=MAX_ENTRIES, nlevels=2)
elif idx_type == 'dt':
full_idx = pd.date_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
elif idx_type == 'p':
full_idx = pd.period_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
elif idx_type == 's':
full_idx = tm.makeStringIndex(k=MAX_ENTRIES)
elif idx_type == 'u':
full_idx = tm.makeUnicodeIndex(k=MAX_ENTRIES)
elif idx_type == 'i':
full_idx = pd.Index(np.arange(MAX_ENTRIES) + MAX_ENTRIES)
elif idx_type == 'f':
full_idx = tm.makeFloatIndex(MAX_ENTRIES)
else:
raise ValueError('Wrong idx type: %%s' %% idx_type)

_indices[key] = full_idx

return full_idx[:nentries]

make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache

obj = %(class_name)s(%(ctor_args)s)

pos = -1
axis = obj._get_axis(%(axis)r)
label = axis[pos]
arr_pos = np.arange(int(len(axis) / 2))
arr_label = axis[arr_pos].values
mask = tm.np.arange(len(axis)) %% 3 == 0
series_mask = Series(mask)
"""

# generate_index_benchmarks(
# klass, long_axis=axis, idx_type=idx_type, is_dup=is_dup)


def generate_index_benchmarks(klass, idx_type, long_axis):
    """Build a dict of vbench Benchmark objects, keyed by benchmark name,
    covering every single-axis indexer on an instance of ``klass``.

    Axis ``long_axis`` is given MAX_ENTRIES entries of index type
    ``idx_type``; every other axis stays a short (10-entry) integer index.
    """
    ndim = klass().ndim

    # Every axis is a 10-entry integer index except the designated one.
    shape = tuple(MAX_ENTRIES if i == long_axis else 10
                  for i in range(ndim))
    types = tuple(idx_type if i == long_axis else 'i'
                  for i in range(ndim))

    # Keyword arguments for the container constructor, one index per axis.
    ctor_args = ',\n    '.join(
        '%s=make_index(nentries=%r, idx_type=%r)' % spec
        for spec in zip(klass._AXIS_ORDERS, shape, types))

    def get_benchmark_name(indexer, axis):
        # e.g. "10ix100000s" for a (10, 100000) frame with a string axis.
        dims = 'x'.join(str(n) + str(t) for n, t in zip(shape, types))
        parts = ['indexing_', klass.__name__.lower(), indexer, dims]
        if axis is not None:
            parts.append("ax%s" % axis)
        return '_'.join(parts)

    def make_suffix(attrname, indexer_str, axis):
        # axis=None means "apply the indexer directly"; otherwise build a
        # full slice expression with the indexer at the requested axis.
        if axis is not None:
            per_axis = [':,'] * ndim
            per_axis[axis] = indexer_str + ','
            indexer_str = ''.join(per_axis)
        return '%s[%s]' % (attrname, indexer_str)

    # (benchmark-name fragment, accessor, indexer expression) triples.
    specs = [('basic_pos', '.iloc', 'pos'),
             ('basic_label', '.loc', 'label'),
             ('slice_pos', '.iloc', ':pos'),
             ('slice_label', '.loc', ':label'),
             ('arr_pos', '.iloc', 'arr_pos'),
             ('arr_label', '.loc', 'arr_label'),
             ('iloc_mask', '.iloc', 'mask'),
             ('loc_mask', '.loc', 'mask')]

    result = {}
    # Only the first and the last axis are benchmarked (they coincide for
    # Series, hence the set), plus the "direct indexing" case (None).
    for axis in set([None, 0, ndim - 1]):
        for indexer, attr, expr in specs:
            bench = Benchmark(
                'obj%s' % make_suffix(attr, expr, axis),
                setup_template % {'class_name': klass.__name__,
                                  'ctor_args': ctor_args,
                                  'axis': axis or 0,
                                  'MAX_ENTRIES': MAX_ENTRIES},
                name=get_benchmark_name(indexer, axis))
            result[bench.name] = bench

    return result

# Benchmarks are generated as follows: given a container type, generate an
# instance of it with one of the axes long enough to produce statistically
# significant timing values and try different kinds of indexing on it.
#
# Generated benchmark set involves a cartesian product of
# - container types
# - designated "long" axis (minor or major one)
# - "long" axis type (string, integer, datetime, period, multiindex)
# - indexer type (positional, slice, fancy, etc.)
# - indexer axis (indexing is not limited to "long" axis)
# - label/positional indexer
#
# FIXME: add multiindex indexers?
# FIXME: add non-unique axes?
# FIXME: add non-unique non-monotonic axes?
for klass in (tm.Series, tm.DataFrame, tm.Panel):
    # Only the first and last axes are interesting; for Series they
    # coincide, hence the set() de-duplication.
    for axis in set([0, klass().ndim - 1]):
        for idx_type in ('s', 'i', 'dt', 'p', 'mi'):
            bms = generate_index_benchmarks(
                klass, long_axis=axis, idx_type=idx_type)
            # vbench discovers benchmarks by scanning module attributes,
            # so inject the generated Benchmark objects into this module.
            globals().update(bms)
120 changes: 120 additions & 0 deletions vb_suite/indexing_exhaustive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#----------------------------------------------------------------------
# Thorough checks of all containers and all indexing types

from vbench.benchmark import Benchmark

SECTION = 'Exhaustive check of indexing and scalar value access'

common_setup = """from pandas_vb_common import *
"""


import pandas.util.testing as tm

setup_template = common_setup + """
import sys

try:
make_index = tm.makeCustomIndexWithCache
except AttributeError:
MAX_ENTRIES = 1000000
_indices = {}

def makeCustomIndexWithCache(nentries, **kwargs):
assert nentries < MAX_ENTRIES

key = tuple(kwargs.items())
try:
full_idx = _indices[key]
except KeyError:
full_idx = _indices[key] = tm.makeCustomIndex(nentries=MAX_ENTRIES,
**kwargs)
return full_idx[:nentries]

make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache

obj = %(class_name)s(%(ctor_args)s)

pos = -1
axis = obj._get_axis(%(axis)r)
label = axis[pos]
arr_pos = np.arange(int(len(axis) / 2))
arr_label = axis[arr_pos].values
mask = tm.np.arange(len(axis)) %% 3 == 0
series_mask = Series(mask)
"""


def generate_index_benchmarks(klass, idx_type, shape):
    """Build a dict of vbench Benchmark objects, keyed by benchmark name,
    covering every single-axis indexer on an instance of ``klass`` with
    the given ``shape`` and per-axis index type(s) ``idx_type``.

    ``shape`` may be a scalar (1-d container) or a tuple; ``idx_type``
    may be a single type code applied to all axes, or a tuple with one
    code per axis.
    """
    shape = shape if isinstance(shape, tuple) else (shape,)
    ndim = len(shape)

    if isinstance(idx_type, tuple):
        assert len(idx_type) == ndim
        idx_types = idx_type
    else:
        idx_types = (idx_type,) * ndim

    # Keyword arguments for the container constructor, one index per axis.
    ctor_args = ',\n    '.join(
        '%s=make_index(idx_type=%r, nentries=%s, nlevels=1)' % spec
        for spec in zip(klass._AXIS_ORDERS, idx_types, shape))

    def get_benchmark_name(indexer, axis):
        # e.g. "10sx100000s" for a (10, 100000) frame of string indices.
        dims = 'x'.join(str(n) + str(t) for n, t in zip(shape, idx_types))
        parts = ['indexing_', klass.__name__.lower(), indexer, dims]
        if axis is not None:
            parts.append("ax%s" % axis)
        return '_'.join(parts)

    def make_suffix(attrname, indexer_str, axis):
        # axis=None means "apply the indexer directly"; otherwise build a
        # full slice expression with the indexer at the requested axis.
        if axis is not None:
            per_axis = [':,'] * ndim
            per_axis[axis] = indexer_str + ','
            indexer_str = ''.join(per_axis)
        return '%s[%s]' % (attrname, indexer_str)

    # (benchmark-name fragment, accessor, indexer expression) triples.
    specs = [('basic_pos', '.iloc', 'pos'),
             ('basic_label', '.loc', 'label'),
             ('slice_pos', '.iloc', ':pos'),
             ('slice_label', '.loc', ':label'),
             ('arr_pos', '.iloc', 'arr_pos'),
             ('arr_label', '.loc', 'arr_label'),
             ('iloc_mask', '.iloc', 'mask'),
             ('loc_mask', '.loc', 'mask')]

    result = {}
    # Only the first and the last axis are benchmarked (they coincide for
    # Series, hence the set), plus the "direct indexing" case (None).
    for axis in set([None, 0, ndim - 1]):
        for indexer, attr, expr in specs:
            bench = Benchmark(
                'obj%s' % make_suffix(attr, expr, axis),
                setup_template % {'class_name': klass.__name__,
                                  'ctor_args': ctor_args,
                                  'axis': axis or 0},
                name=get_benchmark_name(indexer, axis))
            result[bench.name] = bench

    return result

# vbench discovers Benchmark instances by scanning module attributes, so
# inject the generated benchmarks into this module's namespace.  One
# "long" (100000-entry) string-indexed axis is tried in each position of
# each container type.
globals().update(generate_index_benchmarks(tm.Series, 's', 100000))
globals().update(generate_index_benchmarks(tm.DataFrame, 's', (10, 100000)))
globals().update(generate_index_benchmarks(tm.DataFrame, 's', (100000, 10)))
globals().update(generate_index_benchmarks(tm.Panel, 's', (100000, 10, 10)))
globals().update(generate_index_benchmarks(tm.Panel, 's', (10, 10, 100000)))
41 changes: 33 additions & 8 deletions vb_suite/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,42 @@
'timedelta',
'eval']

def discover_benchmarks(mods, return_as='list'):
    """
    Collect available benchmarks from specified modules.

    Arguments
    ---------
    mods: list of str
        List of modules to search in
    return_as: {'both', 'list', 'dict'}
        Specifies result type: dict will group benchmarks by module

    Raises
    ------
    ValueError
        If ``return_as`` is not one of the recognized values.
    """
    by_module = {}
    benchmarks = []

    for modname in mods:
        ref = __import__(modname)
        # A benchmark module exposes its Benchmark instances as plain
        # module attributes; pick them out of the module namespace.
        mod_benchmarks = [v for v in ref.__dict__.values()
                          if isinstance(v, Benchmark)]

        # Unnamed benchmarks cannot be reported on, so fail early.
        for bm in mod_benchmarks:
            assert bm.name is not None

        by_module[modname] = mod_benchmarks
        benchmarks.extend(mod_benchmarks)

    if return_as == 'both':
        return by_module, benchmarks
    elif return_as == 'list':
        return benchmarks
    elif return_as == 'dict':
        return by_module
    else:
        raise ValueError("Incorrect return_as value: %s" % return_as)

by_module, benchmarks = discover_benchmarks(modules, return_as='both')

import getpass
import sys
Expand Down
16 changes: 12 additions & 4 deletions vb_suite/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ def __call__(self, parser, namespace, values, option_string=None):
dest='regex',
default="",
help='Regex pat, only tests whose name matches the regext will be run.')
parser.add_argument('-e', '--extra-benchmarks', metavar='EXTRA',
dest='extras', action='append',
help='Extra modules to collect benchmarks from')
parser.add_argument('-s', '--seed',
metavar="SEED",
dest='seed',
Expand Down Expand Up @@ -442,6 +445,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
if args.stats :
try:
pd.options.display.expand_frame_repr=False
pd.set_option('display.max_rows', None)
except:
pass
stats_footer += str(df.T.describe().T) + "\n\n"
Expand All @@ -461,10 +465,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
args.log_file)



def main():
from suite import benchmarks

if not args.log_file:
args.log_file = os.path.abspath(
os.path.join(REPO_PATH, 'vb_suite.log'))
Expand Down Expand Up @@ -509,7 +510,14 @@ def main():
# surprises
os.chdir(os.path.dirname(os.path.abspath(__file__)))

benchmarks = [x for x in benchmarks if re.search(args.regex,x.name)]
from suite import discover_benchmarks, benchmarks

benchmarks = [b for b in benchmarks]
if args.extras:
benchmarks.extend(discover_benchmarks(args.extras, return_as='list'))

benchmarks = [bm for bm in benchmarks
if re.search(args.regex, bm.name)]

for b in benchmarks:
b.repeat = args.repeats
Expand Down