Skip to content

BLD: add benchmarks for all single-axis indexers #6450

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions vb_suite/extras_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#----------------------------------------------------------------------
# Thorough checks of all containers and all indexing types

from vbench.benchmark import Benchmark

SECTION = 'Exhaustive check of indexing and scalar value access'

common_setup = """from pandas_vb_common import *
"""


import pandas.util.testing as tm

MAX_ENTRIES = 100000

# FIXME: makeCustomIndexWithCache reimplements (sort of) tm.makeCustomIndex,
# because the latter doesn't offer customization of date/period index
# frequencies and integer index offset.

setup_template = common_setup + """
import sys
import pandas as pd

try:
make_index = tm.makeCustomIndexWithCache
except AttributeError:
MAX_ENTRIES = %(MAX_ENTRIES)s
_indices = {}

def makeCustomIndexWithCache(nentries, idx_type):
assert nentries <= MAX_ENTRIES

key = idx_type
try:
full_idx = _indices[key]
except KeyError:
if idx_type == 'mi':
full_idx = tm.makeCustomIndex(nentries=MAX_ENTRIES, nlevels=2)
elif idx_type == 'dt':
full_idx = pd.date_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
elif idx_type == 'p':
full_idx = pd.period_range('2000-01-01', periods=MAX_ENTRIES, freq='T')
elif idx_type == 's':
full_idx = tm.makeStringIndex(k=MAX_ENTRIES)
elif idx_type == 'u':
full_idx = tm.makeUnicodeIndex(k=MAX_ENTRIES)
elif idx_type == 'i':
full_idx = pd.Index(np.arange(MAX_ENTRIES) + MAX_ENTRIES)
elif idx_type == 'f':
full_idx = tm.makeFloatIndex(MAX_ENTRIES)
else:
raise ValueError('Wrong idx type: %%s' %% idx_type)

_indices[key] = full_idx

return full_idx[:nentries]

make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache

obj = %(class_name)s(%(ctor_args)s)

pos = -1
axis = obj._get_axis(%(axis)r)
label = axis[pos]
arr_pos = np.arange(int(len(axis) / 2))
arr_label = axis[arr_pos].values
mask = tm.np.arange(len(axis)) %% 3 == 0
series_mask = Series(mask)
"""

# generate_index_benchmarks(
# klass, long_axis=axis, idx_type=idx_type, is_dup=is_dup)


def generate_index_benchmarks(klass, idx_type, long_axis):
    """Build a dict of vbench Benchmark objects, keyed by benchmark name,
    covering every single-axis indexer on an instance of ``klass``.

    Axis ``long_axis`` is given MAX_ENTRIES entries of index type
    ``idx_type``; every other axis stays a short (10-entry) integer index.
    """
    ndim = klass().ndim

    # Every axis is a 10-entry integer index except the designated one.
    shape = tuple(MAX_ENTRIES if i == long_axis else 10
                  for i in range(ndim))
    types = tuple(idx_type if i == long_axis else 'i'
                  for i in range(ndim))

    # Keyword arguments for the container constructor, one index per axis.
    ctor_args = ',\n    '.join(
        '%s=make_index(nentries=%r, idx_type=%r)' % spec
        for spec in zip(klass._AXIS_ORDERS, shape, types))

    def get_benchmark_name(indexer, axis):
        # e.g. "10ix100000s" for a (10, 100000) frame with a string axis.
        dims = 'x'.join(str(n) + str(t) for n, t in zip(shape, types))
        parts = ['indexing_', klass.__name__.lower(), indexer, dims]
        if axis is not None:
            parts.append("ax%s" % axis)
        return '_'.join(parts)

    def make_suffix(attrname, indexer_str, axis):
        # axis=None means "apply the indexer directly"; otherwise build a
        # full slice expression with the indexer at the requested axis.
        if axis is not None:
            per_axis = [':,'] * ndim
            per_axis[axis] = indexer_str + ','
            indexer_str = ''.join(per_axis)
        return '%s[%s]' % (attrname, indexer_str)

    # (benchmark-name fragment, accessor, indexer expression) triples.
    specs = [('basic_pos', '.iloc', 'pos'),
             ('basic_label', '.loc', 'label'),
             ('slice_pos', '.iloc', ':pos'),
             ('slice_label', '.loc', ':label'),
             ('arr_pos', '.iloc', 'arr_pos'),
             ('arr_label', '.loc', 'arr_label'),
             ('iloc_mask', '.iloc', 'mask'),
             ('loc_mask', '.loc', 'mask')]

    result = {}
    # Only the first and the last axis are benchmarked (they coincide for
    # Series, hence the set), plus the "direct indexing" case (None).
    for axis in set([None, 0, ndim - 1]):
        for indexer, attr, expr in specs:
            bench = Benchmark(
                'obj%s' % make_suffix(attr, expr, axis),
                setup_template % {'class_name': klass.__name__,
                                  'ctor_args': ctor_args,
                                  'axis': axis or 0,
                                  'MAX_ENTRIES': MAX_ENTRIES},
                name=get_benchmark_name(indexer, axis))
            result[bench.name] = bench

    return result

# Benchmarks are generated as follows: given a container type, generate an
# instance of it with one of the axes long enough to produce statistically
# significant timing values and try different kinds of indexing on it.
#
# Generated benchmark set involves a cartesian product of
# - container types
# - designated "long" axis (minor or major one)
# - "long" axis type (string, integer, datetime, period, multiindex)
# - indexer type (positional, slice, fancy, etc.)
# - indexer axis (indexing is not limited to "long" axis)
# - label/positional indexer
#
# FIXME: add multiindex indexers?
# FIXME: add non-unique axes?
# FIXME: add non-unique non-monotonic axes?
for klass in (tm.Series, tm.DataFrame, tm.Panel):
    # Only the first and last axes are interesting; for Series they
    # coincide, hence the set() de-duplication.
    for axis in set([0, klass().ndim - 1]):
        for idx_type in ('s', 'i', 'dt', 'p', 'mi'):
            bms = generate_index_benchmarks(
                klass, long_axis=axis, idx_type=idx_type)
            # vbench discovers benchmarks by scanning module attributes,
            # so inject the generated Benchmark objects into this module.
            globals().update(bms)
120 changes: 120 additions & 0 deletions vb_suite/indexing_exhaustive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#----------------------------------------------------------------------
# Thorough checks of all containers and all indexing types

from vbench.benchmark import Benchmark

SECTION = 'Exhaustive check of indexing and scalar value access'

common_setup = """from pandas_vb_common import *
"""


import pandas.util.testing as tm

setup_template = common_setup + """
import sys

try:
make_index = tm.makeCustomIndexWithCache
except AttributeError:
MAX_ENTRIES = 1000000
_indices = {}

def makeCustomIndexWithCache(nentries, **kwargs):
assert nentries < MAX_ENTRIES

key = tuple(kwargs.items())
try:
full_idx = _indices[key]
except KeyError:
full_idx = _indices[key] = tm.makeCustomIndex(nentries=MAX_ENTRIES,
**kwargs)
return full_idx[:nentries]

make_index = tm.makeCustomIndexWithCache = makeCustomIndexWithCache

obj = %(class_name)s(%(ctor_args)s)

pos = -1
axis = obj._get_axis(%(axis)r)
label = axis[pos]
arr_pos = np.arange(int(len(axis) / 2))
arr_label = axis[arr_pos].values
mask = tm.np.arange(len(axis)) %% 3 == 0
series_mask = Series(mask)
"""


def generate_index_benchmarks(klass, idx_type, shape):
    """Build a dict of vbench Benchmark objects, keyed by benchmark name,
    covering every single-axis indexer on an instance of ``klass`` with
    the given ``shape`` and per-axis index type(s) ``idx_type``.

    ``shape`` may be a scalar (1-d container) or a tuple; ``idx_type``
    may be a single type code applied to all axes, or a tuple with one
    code per axis.
    """
    shape = shape if isinstance(shape, tuple) else (shape,)
    ndim = len(shape)

    if isinstance(idx_type, tuple):
        assert len(idx_type) == ndim
        idx_types = idx_type
    else:
        idx_types = (idx_type,) * ndim

    # Keyword arguments for the container constructor, one index per axis.
    ctor_args = ',\n    '.join(
        '%s=make_index(idx_type=%r, nentries=%s, nlevels=1)' % spec
        for spec in zip(klass._AXIS_ORDERS, idx_types, shape))

    def get_benchmark_name(indexer, axis):
        # e.g. "10sx100000s" for a (10, 100000) frame of string indices.
        dims = 'x'.join(str(n) + str(t) for n, t in zip(shape, idx_types))
        parts = ['indexing_', klass.__name__.lower(), indexer, dims]
        if axis is not None:
            parts.append("ax%s" % axis)
        return '_'.join(parts)

    def make_suffix(attrname, indexer_str, axis):
        # axis=None means "apply the indexer directly"; otherwise build a
        # full slice expression with the indexer at the requested axis.
        if axis is not None:
            per_axis = [':,'] * ndim
            per_axis[axis] = indexer_str + ','
            indexer_str = ''.join(per_axis)
        return '%s[%s]' % (attrname, indexer_str)

    # (benchmark-name fragment, accessor, indexer expression) triples.
    specs = [('basic_pos', '.iloc', 'pos'),
             ('basic_label', '.loc', 'label'),
             ('slice_pos', '.iloc', ':pos'),
             ('slice_label', '.loc', ':label'),
             ('arr_pos', '.iloc', 'arr_pos'),
             ('arr_label', '.loc', 'arr_label'),
             ('iloc_mask', '.iloc', 'mask'),
             ('loc_mask', '.loc', 'mask')]

    result = {}
    # Only the first and the last axis are benchmarked (they coincide for
    # Series, hence the set), plus the "direct indexing" case (None).
    for axis in set([None, 0, ndim - 1]):
        for indexer, attr, expr in specs:
            bench = Benchmark(
                'obj%s' % make_suffix(attr, expr, axis),
                setup_template % {'class_name': klass.__name__,
                                  'ctor_args': ctor_args,
                                  'axis': axis or 0},
                name=get_benchmark_name(indexer, axis))
            result[bench.name] = bench

    return result

# vbench discovers Benchmark instances by scanning module attributes, so
# inject the generated benchmarks into this module's namespace.  One
# "long" (100000-entry) string-indexed axis is tried in each position of
# each container type.
globals().update(generate_index_benchmarks(tm.Series, 's', 100000))
globals().update(generate_index_benchmarks(tm.DataFrame, 's', (10, 100000)))
globals().update(generate_index_benchmarks(tm.DataFrame, 's', (100000, 10)))
globals().update(generate_index_benchmarks(tm.Panel, 's', (100000, 10, 10)))
globals().update(generate_index_benchmarks(tm.Panel, 's', (10, 10, 100000)))
41 changes: 33 additions & 8 deletions vb_suite/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,42 @@
'timedelta',
'eval']

def discover_benchmarks(mods, return_as='list'):
    """
    Collect available benchmarks from specified modules.

    Arguments
    ---------
    mods: list of str
        List of modules to search in
    return_as: {'both', 'list', 'dict'}
        Specifies result type: dict will group benchmarks by module

    Raises
    ------
    ValueError
        If ``return_as`` is not one of the recognized values.
    """
    by_module = {}
    benchmarks = []

    for modname in mods:
        ref = __import__(modname)
        # A benchmark module exposes its Benchmark instances as plain
        # module attributes; pick them out of the module namespace.
        mod_benchmarks = [v for v in ref.__dict__.values()
                          if isinstance(v, Benchmark)]

        # Unnamed benchmarks cannot be reported on, so fail early.
        for bm in mod_benchmarks:
            assert bm.name is not None

        by_module[modname] = mod_benchmarks
        benchmarks.extend(mod_benchmarks)

    if return_as == 'both':
        return by_module, benchmarks
    elif return_as == 'list':
        return benchmarks
    elif return_as == 'dict':
        return by_module
    else:
        raise ValueError("Incorrect return_as value: %s" % return_as)

by_module, benchmarks = discover_benchmarks(modules, return_as='both')

import getpass
import sys
Expand Down
16 changes: 12 additions & 4 deletions vb_suite/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ def __call__(self, parser, namespace, values, option_string=None):
dest='regex',
default="",
help='Regex pat, only tests whose name matches the regext will be run.')
parser.add_argument('-e', '--extra-benchmarks', metavar='EXTRA',
dest='extras', action='append',
help='Extra modules to collect benchmarks from')
parser.add_argument('-s', '--seed',
metavar="SEED",
dest='seed',
Expand Down Expand Up @@ -442,6 +445,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
if args.stats :
try:
pd.options.display.expand_frame_repr=False
pd.set_option('display.max_rows', None)
except:
pass
stats_footer += str(df.T.describe().T) + "\n\n"
Expand All @@ -461,10 +465,7 @@ def print_report(df,h_head=None,h_msg="",h_baseline=None,b_msg=""):
args.log_file)



def main():
from suite import benchmarks

if not args.log_file:
args.log_file = os.path.abspath(
os.path.join(REPO_PATH, 'vb_suite.log'))
Expand Down Expand Up @@ -509,7 +510,14 @@ def main():
# surprises
os.chdir(os.path.dirname(os.path.abspath(__file__)))

benchmarks = [x for x in benchmarks if re.search(args.regex,x.name)]
from suite import discover_benchmarks, benchmarks

benchmarks = [b for b in benchmarks]
if args.extras:
benchmarks.extend(discover_benchmarks(args.extras, return_as='list'))

benchmarks = [bm for bm in benchmarks
if re.search(args.regex, bm.name)]

for b in benchmarks:
b.repeat = args.repeats
Expand Down