diff --git a/vb_suite/extras_indexing.py b/vb_suite/extras_indexing.py
new file mode 100644
index 0000000000000..a6cc1284794b4
--- /dev/null
+++ b/vb_suite/extras_indexing.py
@@ -0,0 +1,164 @@
+#----------------------------------------------------------------------
+# Thorough checks of all containers and all indexing types
+
+from vbench.benchmark import Benchmark
+
+SECTION = 'Exhaustive check of indexing and scalar value access'
+
+common_setup = """from pandas_vb_common import *
+"""
+
+
+import pandas.util.testing as tm
+
+MAX_ENTRIES = 100000
+
+# FIXME: makeCustomIndexWithCache reimplements (sort of) tm.makeCustomIndex,
+# because the latter doesn't offer customization of date/period index
+# frequencies or of the integer index offset.
+
+setup_template = common_setup + """
+import sys
+import pandas as pd
+
+try:
+    make_index = tm.makeCustomIndexWithCache
+except AttributeError:
+    MAX_ENTRIES = %(MAX_ENTRIES)s
+    _indices = {}
+
+    def makeCustomIndexWithCache(nentries, idx_type):
+        assert nentries <= MAX_ENTRIES
+
+        key = idx_type
+        try:
+            full_idx = _indices[key]
+        except KeyError:
+            if idx_type == 'mi':
+                full_idx = tm.makeCustomIndex(nentries=MAX_ENTRIES, nlevels=2)
+            elif idx_type == 'dt':
+                full_idx = pd.date_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
+            elif idx_type == 'p':
+                full_idx = pd.period_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
+            elif idx_type == 's':
+                full_idx = tm.makeStringIndex(k=MAX_ENTRIES)
+            elif idx_type == 'u':
+                full_idx = tm.makeUnicodeIndex(k=MAX_ENTRIES)
+            elif idx_type == 'i':
+                full_idx = pd.Index(np.arange(MAX_ENTRIES) + MAX_ENTRIES)
+            elif idx_type == 'f':
+                full_idx = tm.makeFloatIndex(MAX_ENTRIES)
+            else:
+                raise ValueError('Wrong idx type: %%s' %% idx_type)
+
+            _indices[key] = full_idx
+
+        return full_idx[:nentries]
+
+    make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache
+
+obj = %(class_name)s(%(ctor_args)s)
+
+pos = -1
+axis = obj._get_axis(%(axis)r)
+label = axis[pos]
+arr_pos = np.arange(int(len(axis) / 2))
+arr_label = axis[arr_pos].values
+mask = tm.np.arange(len(axis)) %% 3 == 0
+series_mask = Series(mask)
+"""
+
+# generate_index_benchmarks(
+#     klass, long_axis=axis, idx_type=idx_type, is_dup=is_dup)
+
+
+def generate_index_benchmarks(klass, idx_type, long_axis):
+    ndim = klass().ndim
+
+    shape = [10] * ndim
+    shape[long_axis] = MAX_ENTRIES
+    shape = tuple(shape)
+
+    types = ['i'] * ndim
+    types[long_axis] = idx_type
+    types = tuple(types)
+
+    axes = klass._AXIS_ORDERS
+    ctor_args = ',\n '.join([
+        '%s=make_index(nentries=%r, idx_type=%r)' % v
+        for v in zip(axes, shape, types)])
+
+    def get_benchmark_name(indexer, axis):
+        shape_type_str = 'x'.join([str(s) + str(t)
+                                   for s, t in zip(shape, types)])
+
+        components = ['indexing_', klass.__name__.lower(), indexer,
+                      shape_type_str]
+        if axis is not None:
+            components.append("ax%s" % axis)
+
+        return '_'.join(components)
+
+    def make_suffix(attrname, indexer_str, axis):
+        if axis is not None:
+            indexers = [':,'] * ndim
+            indexers[axis] = indexer_str + ','
+            indexer_str = ''.join(indexers)
+        return '%s[%s]' % (attrname, indexer_str)
+
+    benchmarked_axes = set([None, 0, ndim - 1])
+
+    result = {}
+    for axis in benchmarked_axes:
+        for params in [
+                {'indexer': 'basic_pos',
+                 'suffix': make_suffix('.iloc', 'pos', axis)},
+                {'indexer': 'basic_label',
+                 'suffix': make_suffix('.loc', 'label', axis)},
+
+                {'indexer': 'slice_pos',
+                 'suffix': make_suffix('.iloc', ':pos', axis)},
+                {'indexer': 'slice_label',
+                 'suffix': make_suffix('.loc', ':label', axis)},
+
+                {'indexer': 'arr_pos',
+                 'suffix': make_suffix('.iloc', 'arr_pos', axis)},
+                {'indexer': 'arr_label',
+                 'suffix': make_suffix('.loc', 'arr_label', axis)},
+
+                {'indexer': 'iloc_mask',
+                 'suffix': make_suffix('.iloc', 'mask', axis)},
+                {'indexer': 'loc_mask',
+                 'suffix': make_suffix('.loc', 'mask', axis)}, ]:
+
+            b = Benchmark('obj%s' % params['suffix'],
+                          setup_template % {
+                              'class_name': klass.__name__,
+                              'ctor_args': ctor_args, 'axis': axis or 0,
+                              'MAX_ENTRIES': MAX_ENTRIES},
+                          name=get_benchmark_name(params['indexer'], axis))
+            result[b.name] = b
+
+    return result
+
+# Benchmarks are generated as follows: given a container type, create an
+# instance of it with one axis long enough to produce statistically
+# significant timing values, then try different kinds of indexing on it.
+#
+# The generated benchmark set is the Cartesian product of
+# - container types
+# - designated "long" axis (minor or major one)
+# - "long" axis type (string, integer, datetime, period, multiindex)
+# - indexer type (positional, slice, fancy, etc.)
+# - indexer axis (indexing is not limited to the "long" axis)
+# - label/positional indexer
+#
+# FIXME: add multiindex indexers?
+# FIXME: add non-unique axes?
+# FIXME: add non-unique non-monotonic axes?
+for klass in (tm.Series, tm.DataFrame, tm.Panel):
+    for axis in set([0, klass().ndim - 1]):
+        for idx_type in ('s', 'i', 'dt', 'p', 'mi'):
+            bms = generate_index_benchmarks(
+                klass, long_axis=axis, idx_type=idx_type)
+            globals().update(bms)
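A note on the generated names: get_benchmark_name() joins its components with '_', and the first component already ends in an underscore, so every name carries a doubled underscore. A standalone sketch, using an illustrative shape/type combination (a DataFrame whose long axis 1 holds a 100000-entry datetime index) rather than anything taken from the patch:

    # reproduces the naming logic of get_benchmark_name() above
    shape, types = (10, 100000), ('i', 'dt')
    shape_type_str = 'x'.join(str(s) + str(t) for s, t in zip(shape, types))
    name = '_'.join(['indexing_', 'dataframe', 'basic_label', shape_type_str, 'ax1'])
    print(name)  # -> indexing__dataframe_basic_label_10ix100000dt_ax1

The matching benchmark expression for that combination would be obj.loc[:,label,]; make_suffix() emits one indexer per axis, each followed by a comma, and Python's subscript syntax tolerates the trailing one.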
diff --git a/vb_suite/indexing_exhaustive.py b/vb_suite/indexing_exhaustive.py
new file mode 100644
index 0000000000000..358a3d81b69e5
--- /dev/null
+++ b/vb_suite/indexing_exhaustive.py
@@ -0,0 +1,120 @@
+#----------------------------------------------------------------------
+# Thorough checks of all containers and all indexing types
+
+from vbench.benchmark import Benchmark
+
+SECTION = 'Exhaustive check of indexing and scalar value access'
+
+common_setup = """from pandas_vb_common import *
+"""
+
+
+import pandas.util.testing as tm
+
+setup_template = common_setup + """
+import sys
+
+try:
+    make_index = tm.makeCustomIndexWithCache
+except AttributeError:
+    MAX_ENTRIES = 1000000
+    _indices = {}
+
+    def makeCustomIndexWithCache(nentries, **kwargs):
+        assert nentries < MAX_ENTRIES
+
+        key = tuple(kwargs.items())
+        try:
+            full_idx = _indices[key]
+        except KeyError:
+            full_idx = _indices[key] = tm.makeCustomIndex(nentries=MAX_ENTRIES,
+                                                          **kwargs)
+        return full_idx[:nentries]
+
+    make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache
+
+obj = %(class_name)s(%(ctor_args)s)
+
+pos = -1
+axis = obj._get_axis(%(axis)r)
+label = axis[pos]
+arr_pos = np.arange(int(len(axis) / 2))
+arr_label = axis[arr_pos].values
+mask = tm.np.arange(len(axis)) %% 3 == 0
+series_mask = Series(mask)
+"""
+
+
+def generate_index_benchmarks(klass, idx_type, shape):
+    if not isinstance(shape, tuple):
+        shape = (shape,)
+    ndim = len(shape)
+
+    if not isinstance(idx_type, tuple):
+        idx_types = tuple([idx_type] * ndim)
+    else:
+        assert len(idx_type) == ndim
+        idx_types = idx_type
+
+    axes = klass._AXIS_ORDERS
+    ctor_args = ',\n '.join([
+        '%s=make_index(idx_type=%r, nentries=%s, nlevels=1)' % v
+        for v in zip(axes, idx_types, shape)])
+
+    def get_benchmark_name(indexer, axis):
+        shape_type_str = 'x'.join([str(s) + str(t)
+                                   for s, t in zip(shape, idx_types)])
+
+        components = ['indexing_', klass.__name__.lower(), indexer,
+                      shape_type_str]
+        if axis is not None:
+            components.append("ax%s" % axis)
+
+        return '_'.join(components)
+
+    def make_suffix(attrname, indexer_str, axis):
+        if axis is not None:
+            indexers = [':,'] * ndim
+            indexers[axis] = indexer_str + ','
+            indexer_str = ''.join(indexers)
+        return '%s[%s]' % (attrname, indexer_str)
+
+    benchmarked_axes = set([None, 0, ndim - 1])
+
+    result = {}
+    for axis in benchmarked_axes:
+        for params in [
+                {'indexer': 'basic_pos',
+                 'suffix': make_suffix('.iloc', 'pos', axis)},
+                {'indexer': 'basic_label',
+                 'suffix': make_suffix('.loc', 'label', axis)},
+
+                {'indexer': 'slice_pos',
+                 'suffix': make_suffix('.iloc', ':pos', axis)},
+                {'indexer': 'slice_label',
+                 'suffix': make_suffix('.loc', ':label', axis)},
+
+                {'indexer': 'arr_pos',
+                 'suffix': make_suffix('.iloc', 'arr_pos', axis)},
+                {'indexer': 'arr_label',
+                 'suffix': make_suffix('.loc', 'arr_label', axis)},
+
+                {'indexer': 'iloc_mask',
+                 'suffix': make_suffix('.iloc', 'mask', axis)},
+                {'indexer': 'loc_mask',
+                 'suffix': make_suffix('.loc', 'mask', axis)}, ]:
+
+            b = Benchmark('obj%s' % params['suffix'],
+                          setup_template % {
+                              'class_name': klass.__name__,
+                              'ctor_args': ctor_args, 'axis': axis or 0},
+                          name=get_benchmark_name(params['indexer'], axis))
+            result[b.name] = b
+
+    return result
+
+globals().update(generate_index_benchmarks(tm.Series, 's', 100000))
+globals().update(generate_index_benchmarks(tm.DataFrame, 's', (10, 100000)))
+globals().update(generate_index_benchmarks(tm.DataFrame, 's', (100000, 10)))
+globals().update(generate_index_benchmarks(tm.Panel, 's', (100000, 10, 10)))
+globals().update(generate_index_benchmarks(tm.Panel, 's', (10, 10, 100000)))
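Both setup templates lean on the same caching idea: building a full-size index is expensive, so it happens once per key and every benchmark setup merely slices the cached object. A minimal standalone sketch of that contract (plain lists stand in for pandas Index objects; the names are illustrative, not the patched tm helper):

    _indices = {}
    MAX_ENTRIES = 1000

    def make_index_cached(nentries, **kwargs):
        # build the full-size object once per distinct kwargs, then slice
        assert nentries < MAX_ENTRIES
        key = tuple(kwargs.items())
        if key not in _indices:
            _indices[key] = list(range(MAX_ENTRIES))
        return _indices[key][:nentries]

    a = make_index_cached(10, idx_type='i')
    b = make_index_cached(5, idx_type='i')  # cache hit: slices the same object
    assert b == a[:5]

Stashing the helper on the module itself (make_index = tm.makeCustomIndexWithCache = ...) is what lets repeated setup runs within one process find and share the cache via the try/except at the top of the template.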
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index a1b38e8509e4e..12da997068b38 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -30,17 +30,42 @@
            'timedelta', 'eval']
 
-by_module = {}
-benchmarks = []
-for modname in modules:
-    ref = __import__(modname)
-    by_module[modname] = [v for v in ref.__dict__.values()
+def discover_benchmarks(mods, return_as='list'):
+    """
+    Collect available benchmarks from the specified modules.
+
+    Parameters
+    ----------
+    mods : list of str
+        List of module names to search in.
+    return_as : {'both', 'list', 'dict'}
+        Specifies the result type: 'dict' groups benchmarks by module.
+    """
+    by_module = {}
+    benchmarks = []
+
+    for modname in mods:
+        ref = __import__(modname)
+        mod_benchmarks = [v for v in ref.__dict__.values()
                           if isinstance(v, Benchmark)]
-    benchmarks.extend(by_module[modname])
-
-for bm in benchmarks:
-    assert(bm.name is not None)
+
+        for bm in mod_benchmarks:
+            assert bm.name is not None
+
+        by_module[modname] = mod_benchmarks
+        benchmarks.extend(mod_benchmarks)
+
+    if return_as == 'both':
+        return by_module, benchmarks
+    elif return_as == 'list':
+        return benchmarks
+    elif return_as == 'dict':
+        return by_module
+    else:
+        raise ValueError("Incorrect return_as value: %s" % return_as)
+
+by_module, benchmarks = discover_benchmarks(modules, return_as='both')
 
 import getpass
 import sys
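A quick usage sketch of the new helper (the module names are illustrative entries from the `modules` list; this assumes vb_suite is the working directory, as test_perf.py arranges):

    from suite import discover_benchmarks

    flat = discover_benchmarks(['indexing'], return_as='list')
    grouped = discover_benchmarks(['indexing', 'groupby'], return_as='dict')
    grouped2, flat2 = discover_benchmarks(['indexing'], return_as='both')

The module-level by_module/benchmarks names keep their old values via the return_as='both' call at the bottom, so existing importers of suite are unaffected.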
diff --git a/vb_suite/test_perf.py b/vb_suite/test_perf.py
index 66e50269f00c6..005e829b96d46 100755
--- a/vb_suite/test_perf.py
+++ b/vb_suite/test_perf.py
@@ -113,6 +113,9 @@ def __call__(self, parser, namespace, values, option_string=None):
                     dest='regex',
                     default="",
                     help='Regex pat, only tests whose name matches the regext will be run.')
+parser.add_argument('-e', '--extra-benchmarks', metavar='EXTRA',
+                    dest='extras', action='append',
+                    help='Extra modules to collect benchmarks from')
 parser.add_argument('-s', '--seed',
                     metavar="SEED",
                     dest='seed',
@@ -442,6 +445,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
     if args.stats :
         try:
             pd.options.display.expand_frame_repr=False
+            pd.set_option('display.max_rows', None)
         except:
             pass
         stats_footer += str(df.T.describe().T) + "\n\n"
@@ -461,10 +465,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
                             args.log_file)
 
-
 def main():
-    from suite import benchmarks
-
     if not args.log_file:
         args.log_file = os.path.abspath(
             os.path.join(REPO_PATH, 'vb_suite.log'))
@@ -509,7 +510,14 @@ def main():
     # surprises
     os.chdir(os.path.dirname(os.path.abspath(__file__)))
 
-    benchmarks = [x for x in benchmarks if re.search(args.regex,x.name)]
+    from suite import discover_benchmarks, benchmarks
+
+    benchmarks = [b for b in benchmarks]
+    if args.extras:
+        benchmarks.extend(discover_benchmarks(args.extras, return_as='list'))
+
+    benchmarks = [bm for bm in benchmarks
+                  if re.search(args.regex, bm.name)]
 
     for b in benchmarks:
         b.repeat = args.repeats
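Since the new option uses action='append', -e may be given more than once, and the extra modules are merged into the list before the regex filter runs, so the two compose. An illustrative invocation (the base/target commit names are placeholders, and the commit and regex flags are assumed from the existing argparse setup in test_perf.py):

    ./test_perf.py -b base_sha -t head_sha \
        -e extras_indexing -e indexing_exhaustive \
        -r 'indexing__dataframe_.*'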