diff --git a/test_perf.sh b/test_perf.sh deleted file mode 100755 index 022de25bca8fc..0000000000000 --- a/test_perf.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -CURDIR=$(pwd) -BASEDIR=$(cd "$(dirname "$0")"; pwd) -python "$BASEDIR"/vb_suite/test_perf.py $@ diff --git a/vb_suite/.gitignore b/vb_suite/.gitignore deleted file mode 100644 index cc110f04e1225..0000000000000 --- a/vb_suite/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -benchmarks.db -build/* -source/vbench/* -source/*.rst \ No newline at end of file diff --git a/vb_suite/attrs_caching.py b/vb_suite/attrs_caching.py deleted file mode 100644 index a7e3ed7094ed6..0000000000000 --- a/vb_suite/attrs_caching.py +++ /dev/null @@ -1,20 +0,0 @@ -from vbench.benchmark import Benchmark - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# DataFrame.index / columns property lookup time - -setup = common_setup + """ -df = DataFrame(np.random.randn(10, 6)) -cur_index = df.index -""" -stmt = "foo = df.index" - -getattr_dataframe_index = Benchmark(stmt, setup, - name="getattr_dataframe_index") - -stmt = "df.index = cur_index" -setattr_dataframe_index = Benchmark(stmt, setup, - name="setattr_dataframe_index") diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py deleted file mode 100644 index edc29bf3eec37..0000000000000 --- a/vb_suite/binary_ops.py +++ /dev/null @@ -1,199 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -SECTION = 'Binary ops' - -#---------------------------------------------------------------------- -# binary ops - -#---------------------------------------------------------------------- -# add - -setup = common_setup + """ -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -""" -frame_add = \ - Benchmark("df + df2", setup, name='frame_add', - start_date=datetime(2012, 1, 1)) - -setup = common_setup + """ -import pandas.core.computation.expressions as expr -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -expr.set_numexpr_threads(1) -""" - -frame_add_st = \ - Benchmark("df + df2", setup, name='frame_add_st',cleanup="expr.set_numexpr_threads()", - start_date=datetime(2013, 2, 26)) - -setup = common_setup + """ -import pandas.core.computation.expressions as expr -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -expr.set_use_numexpr(False) -""" -frame_add_no_ne = \ - Benchmark("df + df2", setup, name='frame_add_no_ne',cleanup="expr.set_use_numexpr(True)", - start_date=datetime(2013, 2, 26)) - -#---------------------------------------------------------------------- -# mult - -setup = common_setup + """ -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -""" -frame_mult = \ - Benchmark("df * df2", setup, name='frame_mult', - start_date=datetime(2012, 1, 1)) - -setup = common_setup + """ -import pandas.core.computation.expressions as expr -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -expr.set_numexpr_threads(1) -""" -frame_mult_st = \ - Benchmark("df * df2", setup, name='frame_mult_st',cleanup="expr.set_numexpr_threads()", - start_date=datetime(2013, 2, 26)) - -setup = common_setup + """ -import pandas.core.computation.expressions as expr -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) 
-expr.set_use_numexpr(False) -""" -frame_mult_no_ne = \ - Benchmark("df * df2", setup, name='frame_mult_no_ne',cleanup="expr.set_use_numexpr(True)", - start_date=datetime(2013, 2, 26)) - -#---------------------------------------------------------------------- -# division - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000, 1000)) -""" -frame_float_div_by_zero = \ - Benchmark("df / 0", setup, name='frame_float_div_by_zero') - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000, 1000)) -""" -frame_float_floor_by_zero = \ - Benchmark("df // 0", setup, name='frame_float_floor_by_zero') - -setup = common_setup + """ -df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) -""" -frame_int_div_by_zero = \ - Benchmark("df / 0", setup, name='frame_int_div_by_zero') - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000, 1000)) -df2 = DataFrame(np.random.randn(1000, 1000)) -""" -frame_float_div = \ - Benchmark("df / df2", setup, name='frame_float_div') - -#---------------------------------------------------------------------- -# modulo - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000, 1000)) -df2 = DataFrame(np.random.randn(1000, 1000)) -""" -frame_float_mod = \ - Benchmark("df % df2", setup, name='frame_float_mod') - -setup = common_setup + """ -df = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) -df2 = DataFrame(np.random.random_integers(np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(1000, 1000))) -""" -frame_int_mod = \ - Benchmark("df % df2", setup, name='frame_int_mod') - -#---------------------------------------------------------------------- -# multi and - -setup = common_setup + """ -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -""" -frame_multi_and = \ - Benchmark("df[(df>0) & (df2>0)]", setup, name='frame_multi_and', - start_date=datetime(2012, 1, 1)) - -setup = common_setup + """ -import pandas.core.computation.expressions as expr -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -expr.set_numexpr_threads(1) -""" -frame_multi_and_st = \ - Benchmark("df[(df>0) & (df2>0)]", setup, name='frame_multi_and_st',cleanup="expr.set_numexpr_threads()", - start_date=datetime(2013, 2, 26)) - -setup = common_setup + """ -import pandas.core.computation.expressions as expr -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -expr.set_use_numexpr(False) -""" -frame_multi_and_no_ne = \ - Benchmark("df[(df>0) & (df2>0)]", setup, name='frame_multi_and_no_ne',cleanup="expr.set_use_numexpr(True)", - start_date=datetime(2013, 2, 26)) - -#---------------------------------------------------------------------- -# timeseries - -setup = common_setup + """ -N = 1000000 -halfway = N // 2 - 1 -s = Series(date_range('20010101', periods=N, freq='T')) -ts = s[halfway] -""" - -timestamp_series_compare = Benchmark("ts >= s", setup, - start_date=datetime(2013, 9, 27)) -series_timestamp_compare = Benchmark("s <= ts", setup, - start_date=datetime(2012, 2, 21)) - -setup = common_setup + """ -N = 1000000 -s = Series(date_range('20010101', periods=N, freq='s')) -""" - -timestamp_ops_diff1 = Benchmark("s.diff()", setup, - start_date=datetime(2013, 1, 1)) -timestamp_ops_diff2 = Benchmark("s-s.shift()", setup, - start_date=datetime(2013, 1, 1)) - -#---------------------------------------------------------------------- -# timeseries with
tz - -setup = common_setup + """ -N = 10000 -halfway = N // 2 - 1 -s = Series(date_range('20010101', periods=N, freq='T', tz='US/Eastern')) -ts = s[halfway] -""" - -timestamp_tz_series_compare = Benchmark("ts >= s", setup, - start_date=datetime(2013, 9, 27)) -series_timestamp_tz_compare = Benchmark("s <= ts", setup, - start_date=datetime(2012, 2, 21)) - -setup = common_setup + """ -N = 10000 -s = Series(date_range('20010101', periods=N, freq='s', tz='US/Eastern')) -""" - -timestamp_tz_ops_diff1 = Benchmark("s.diff()", setup, - start_date=datetime(2013, 1, 1)) -timestamp_tz_ops_diff2 = Benchmark("s-s.shift()", setup, - start_date=datetime(2013, 1, 1)) diff --git a/vb_suite/categoricals.py b/vb_suite/categoricals.py deleted file mode 100644 index a08d479df20cb..0000000000000 --- a/vb_suite/categoricals.py +++ /dev/null @@ -1,16 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# Categorical concatenation - -setup = common_setup + """ -s = pd.Series(list('aabbcd') * 1000000).astype('category') -""" - -concat_categorical = \ - Benchmark("concat([s, s])", setup=setup, name='concat_categorical', - start_date=datetime(year=2015, month=7, day=15)) diff --git a/vb_suite/ctors.py b/vb_suite/ctors.py deleted file mode 100644 index 8123322383f0a..0000000000000 --- a/vb_suite/ctors.py +++ /dev/null @@ -1,39 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# Series constructors - -setup = common_setup + """ -data = np.random.randn(100) -index = Index(np.arange(100)) -""" - -ctor_series_ndarray = \ - Benchmark("Series(data, index=index)", setup=setup, - name='series_constructor_ndarray') - -setup = common_setup + """ -arr = np.random.randn(100, 100) -""" - -ctor_frame_ndarray = \ - Benchmark("DataFrame(arr)", setup=setup, - name='frame_constructor_ndarray') - -setup = common_setup + """ -data = np.array(['foo', 'bar', 'baz'], dtype=object) -""" - -ctor_index_array_string = Benchmark('Index(data)', setup=setup) - -# index constructors -setup = common_setup + """ -s = Series([Timestamp('20110101'),Timestamp('20120101'),Timestamp('20130101')]*1000) -""" -index_from_series_ctor = Benchmark('Index(s)', setup=setup) - -dtindex_from_series_ctor = Benchmark('DatetimeIndex(s)', setup=setup) diff --git a/vb_suite/eval.py b/vb_suite/eval.py deleted file mode 100644 index 011669256a9bc..0000000000000 --- a/vb_suite/eval.py +++ /dev/null @@ -1,150 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -import pandas as pd -df = DataFrame(np.random.randn(20000, 100)) -df2 = DataFrame(np.random.randn(20000, 100)) -df3 = DataFrame(np.random.randn(20000, 100)) -df4 = DataFrame(np.random.randn(20000, 100)) -""" - -setup = common_setup + """ -import pandas.core.computation.expressions as expr -expr.set_numexpr_threads(1) -""" - -SECTION = 'Eval' - -#---------------------------------------------------------------------- -# binary ops - -#---------------------------------------------------------------------- -# add -eval_frame_add_all_threads = \ - Benchmark("pd.eval('df + df2 + df3 + df4')", common_setup, - name='eval_frame_add_all_threads', - start_date=datetime(2013, 7, 21)) - - - -eval_frame_add_one_thread = \ -
Benchmark("pd.eval('df + df2 + df3 + df4')", setup, - name='eval_frame_add_one_thread', - start_date=datetime(2013, 7, 26)) - -eval_frame_add_python = \ - Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", common_setup, - name='eval_frame_add_python', start_date=datetime(2013, 7, 21)) - -eval_frame_add_python_one_thread = \ - Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", setup, - name='eval_frame_add_python_one_thread', - start_date=datetime(2013, 7, 26)) -#---------------------------------------------------------------------- -# mult - -eval_frame_mult_all_threads = \ - Benchmark("pd.eval('df * df2 * df3 * df4')", common_setup, - name='eval_frame_mult_all_threads', - start_date=datetime(2013, 7, 21)) - -eval_frame_mult_one_thread = \ - Benchmark("pd.eval('df * df2 * df3 * df4')", setup, - name='eval_frame_mult_one_thread', - start_date=datetime(2013, 7, 26)) - -eval_frame_mult_python = \ - Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", - common_setup, - name='eval_frame_mult_python', start_date=datetime(2013, 7, 21)) - -eval_frame_mult_python_one_thread = \ - Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", setup, - name='eval_frame_mult_python_one_thread', - start_date=datetime(2013, 7, 26)) - -#---------------------------------------------------------------------- -# multi and - -eval_frame_and_all_threads = \ - Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", - common_setup, - name='eval_frame_and_all_threads', - start_date=datetime(2013, 7, 21)) - -eval_frame_and_one_thread = \ - Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", setup, - name='eval_frame_and_one_thread', - start_date=datetime(2013, 7, 26)) - -eval_frame_and_python = \ - Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')", - common_setup, name='eval_frame_and_python', - start_date=datetime(2013, 7, 21)) - -eval_frame_and_python_one_thread = \ - Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')", - setup, - name='eval_frame_and_python_one_thread', - start_date=datetime(2013, 7, 26)) - -#-------------------------------------------------------------------- -# chained comp -eval_frame_chained_cmp_all_threads = \ - Benchmark("pd.eval('df < df2 < df3 < df4')", common_setup, - name='eval_frame_chained_cmp_all_threads', - start_date=datetime(2013, 7, 21)) - -eval_frame_chained_cmp_one_thread = \ - Benchmark("pd.eval('df < df2 < df3 < df4')", setup, - name='eval_frame_chained_cmp_one_thread', - start_date=datetime(2013, 7, 26)) - -eval_frame_chained_cmp_python = \ - Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", - common_setup, name='eval_frame_chained_cmp_python', - start_date=datetime(2013, 7, 26)) - -eval_frame_chained_cmp_python_one_thread = \ - Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", setup, - name='eval_frame_chained_cmp_python_one_thread', - start_date=datetime(2013, 7, 26)) - - -common_setup = """from .pandas_vb_common import * -""" - -setup = common_setup + """ -N = 1000000 -halfway = N // 2 - 1 -index = date_range('20010101', periods=N, freq='T') -s = Series(index) -ts = s.iloc[halfway] -""" - -series_setup = setup + """ -df = DataFrame({'dates': s.values}) -""" - -query_datetime_series = Benchmark("df.query('dates < @ts')", - series_setup, - start_date=datetime(2013, 9, 27)) - -index_setup = setup + """ -df = DataFrame({'a': np.random.randn(N)}, index=index) -""" - -query_datetime_index = Benchmark("df.query('index < @ts')", -
index_setup, start_date=datetime(2013, 9, 27)) - -setup = setup + """ -N = 1000000 -df = DataFrame({'a': np.random.randn(N)}) -min_val = df['a'].min() -max_val = df['a'].max() -""" - -query_with_boolean_selection = Benchmark("df.query('(a >= @min_val) & (a <= @max_val)')", - setup, start_date=datetime(2013, 9, 27)) - diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py deleted file mode 100644 index 0d57da7b88d3b..0000000000000 --- a/vb_suite/frame_ctor.py +++ /dev/null @@ -1,123 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime -try: - import pandas.tseries.offsets as offsets -except: - import pandas.core.datetools as offsets - -common_setup = """from .pandas_vb_common import * -try: - from pandas.tseries.offsets import * -except: - from pandas.core.datetools import * -""" - -#---------------------------------------------------------------------- -# Creation from nested dict - -setup = common_setup + """ -N, K = 5000, 50 -index = tm.makeStringIndex(N) -columns = tm.makeStringIndex(K) -frame = DataFrame(np.random.randn(N, K), index=index, columns=columns) - -try: - data = frame.to_dict() -except: - data = frame.toDict() - -some_dict = data.values()[0] -dict_list = [dict(zip(columns, row)) for row in frame.values] -""" - -frame_ctor_nested_dict = Benchmark("DataFrame(data)", setup) - -# From JSON-like stuff -frame_ctor_list_of_dict = Benchmark("DataFrame(dict_list)", setup, - start_date=datetime(2011, 12, 20)) - -series_ctor_from_dict = Benchmark("Series(some_dict)", setup) - -# nested dict, integer indexes, regression described in #621 -setup = common_setup + """ -data = dict((i,dict((j,float(j)) for j in range(100))) for i in xrange(2000)) -""" -frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup) - -# dynamically generate benchmarks for every offset -# -# get_period_count & get_index_for_offset are there because blindly taking each -# offset times 1000 can easily go out of Timestamp bounds and raise errors. 
-dynamic_benchmarks = {} -n_steps = [1, 2] -offset_kwargs = {'WeekOfMonth': {'weekday': 1, 'week': 1}, - 'LastWeekOfMonth': {'weekday': 1, 'week': 1}, - 'FY5253': {'startingMonth': 1, 'weekday': 1}, - 'FY5253Quarter': {'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 1}} - -offset_extra_cases = {'FY5253': {'variation': ['nearest', 'last']}, - 'FY5253Quarter': {'variation': ['nearest', 'last']}} - -for offset in offsets.__all__: - for n in n_steps: - kwargs = {} - if offset in offset_kwargs: - kwargs = offset_kwargs[offset] - - if offset in offset_extra_cases: - extras = offset_extra_cases[offset] - else: - extras = {'': ['']} - - for extra_arg in extras: - for extra in extras[extra_arg]: - if extra: - kwargs[extra_arg] = extra - setup = common_setup + """ - -def get_period_count(start_date, off): - ten_offsets_in_days = ((start_date + off * 10) - start_date).days - if ten_offsets_in_days == 0: - return 1000 - else: - return min(9 * ((Timestamp.max - start_date).days // - ten_offsets_in_days), - 1000) - -def get_index_for_offset(off): - start_date = Timestamp('1/1/1900') - return date_range(start_date, - periods=min(1000, get_period_count(start_date, off)), - freq=off) - -idx = get_index_for_offset({}({}, **{})) -df = DataFrame(np.random.randn(len(idx),10), index=idx) -d = dict([ (col,df[col]) for col in df.columns ]) -""".format(offset, n, kwargs) - key = 'frame_ctor_dtindex_{}x{}'.format(offset, n) - if extra: - key += '__{}_{}'.format(extra_arg, extra) - dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key) - -# Have to stuff them in globals() so vbench detects them -globals().update(dynamic_benchmarks) - -# from a mi-series -setup = common_setup + """ -mi = MultiIndex.from_tuples([(x,y) for x in range(100) for y in range(100)]) -s = Series(randn(10000), index=mi) -""" -frame_from_series = Benchmark("DataFrame(s)", setup) - -#---------------------------------------------------------------------- -# get_numeric_data - -setup = common_setup + """ -df = DataFrame(randn(10000, 25)) -df['foo'] = 'bar' -df['bar'] = 'baz' -df = df.consolidate() -""" - -frame_get_numeric_data = Benchmark('df._get_numeric_data()', setup, - start_date=datetime(2011, 11, 1)) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py deleted file mode 100644 index 46343e9c607fd..0000000000000 --- a/vb_suite/frame_methods.py +++ /dev/null @@ -1,525 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# lookup - -setup = common_setup + """ -df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) -df['foo'] = 'bar' - -row_labels = list(df.index[::10])[:900] -col_labels = list(df.columns) * 100 -row_labels_all = np.array(list(df.index) * len(df.columns), dtype='object') -col_labels_all = np.array(list(df.columns) * len(df.index), dtype='object') -""" - -frame_fancy_lookup = Benchmark('df.lookup(row_labels, col_labels)', setup, - start_date=datetime(2012, 1, 12)) - -frame_fancy_lookup_all = Benchmark('df.lookup(row_labels_all, col_labels_all)', - setup, - start_date=datetime(2012, 1, 12)) - -#---------------------------------------------------------------------- -# fillna in place - -setup = common_setup + """ -df = DataFrame(randn(10000, 100)) -df.values[::2] = np.nan -""" - -frame_fillna_inplace = Benchmark('df.fillna(0, inplace=True)', setup, - start_date=datetime(2012, 4, 4)) - - 
-#---------------------------------------------------------------------- -# reindex both axes - -setup = common_setup + """ -df = DataFrame(randn(10000, 10000)) -idx = np.arange(4000, 7000) -""" - -frame_reindex_axis0 = Benchmark('df.reindex(idx)', setup) - -frame_reindex_axis1 = Benchmark('df.reindex(columns=idx)', setup) - -frame_reindex_both_axes = Benchmark('df.reindex(index=idx, columns=idx)', - setup, start_date=datetime(2011, 1, 1)) - -frame_reindex_both_axes_ix = Benchmark('df.ix[idx, idx]', setup, - start_date=datetime(2011, 1, 1)) - -#---------------------------------------------------------------------- -# reindex with upcasts -setup = common_setup + """ -df=DataFrame(dict([(c, { - 0: randint(0, 2, 1000).astype(np.bool_), - 1: randint(0, 1000, 1000).astype(np.int16), - 2: randint(0, 1000, 1000).astype(np.int32), - 3: randint(0, 1000, 1000).astype(np.int64) - }[randint(0, 4)]) for c in range(1000)])) -""" - -frame_reindex_upcast = Benchmark('df.reindex(permutation(range(1200)))', setup) - -#---------------------------------------------------------------------- -# boolean indexing - -setup = common_setup + """ -df = DataFrame(randn(10000, 100)) -bool_arr = np.zeros(10000, dtype=bool) -bool_arr[:1000] = True -""" - -frame_boolean_row_select = Benchmark('df[bool_arr]', setup, - start_date=datetime(2011, 1, 1)) - -#---------------------------------------------------------------------- -# iteritems (monitor no-copying behaviour) - -setup = common_setup + """ -df = DataFrame(randn(10000, 1000)) -df2 = DataFrame(randn(3000,1),columns=['A']) -df3 = DataFrame(randn(3000,1)) - -def f(): - if hasattr(df, '_item_cache'): - df._item_cache.clear() - for name, col in df.iteritems(): - pass - -def g(): - for name, col in df.iteritems(): - pass - -def h(): - for i in range(10000): - df2['A'] - -def j(): - for i in range(10000): - df3[0] - -""" - -# as far back as the earliest test currently in the suite -frame_iteritems = Benchmark('f()', setup, - start_date=datetime(2010, 6, 1)) - -frame_iteritems_cached = Benchmark('g()', setup, - start_date=datetime(2010, 6, 1)) - -frame_getitem_single_column = Benchmark('h()', setup, - start_date=datetime(2010, 6, 1)) - -frame_getitem_single_column2 = Benchmark('j()', setup, - start_date=datetime(2010, 6, 1)) - -#---------------------------------------------------------------------- -# assignment - -setup = common_setup + """ -idx = date_range('1/1/2000', periods=100000, freq='D') -df = DataFrame(randn(100000, 1),columns=['A'],index=idx) -def f(df): - x = df.copy() - x['date'] = x.index -""" - -frame_assign_timeseries_index = Benchmark('f(df)', setup, - start_date=datetime(2013, 10, 1)) - - -#---------------------------------------------------------------------- -# to_string - -setup = common_setup + """ -df = DataFrame(randn(100, 10)) -""" - -frame_to_string_floats = Benchmark('df.to_string()', setup, - start_date=datetime(2010, 6, 1)) - -#---------------------------------------------------------------------- -# to_html - -setup = common_setup + """ -nrows=500 -df = DataFrame(randn(nrows, 10)) -df[0]=period_range("2000","2010",nrows) -df[1]=range(nrows) - -""" - -frame_to_html_mixed = Benchmark('df.to_html()', setup, - start_date=datetime(2011, 11, 18)) - - -# truncated repr_html, MultiIndex - -setup = common_setup + """ -nrows=10000 -data=randn(nrows,10) -idx=MultiIndex.from_arrays(np.tile(randn(3,nrows/100),100)) -df=DataFrame(data,index=idx) - -""" - -frame_html_repr_trunc_mi = Benchmark('df._repr_html_()', setup, - start_date=datetime(2013, 11, 25)) -
-# truncated repr_html, single index - -setup = common_setup + """ -nrows=10000 -data=randn(nrows,10) -idx=randn(nrows) -df=DataFrame(data,index=idx) - -""" - -frame_html_repr_trunc_si = Benchmark('df._repr_html_()', setup, - start_date=datetime(2013, 11, 25)) - - -# insert many columns - -setup = common_setup + """ -N = 1000 - -def f(K=500): - df = DataFrame(index=range(N)) - new_col = np.random.randn(N) - for i in range(K): - df[i] = new_col -""" - -frame_insert_500_columns_end = Benchmark('f()', setup, start_date=datetime(2011, 1, 1)) - -setup = common_setup + """ -N = 1000 - -def f(K=100): - df = DataFrame(index=range(N)) - new_col = np.random.randn(N) - for i in range(K): - df.insert(0,i,new_col) -""" - -frame_insert_100_columns_begin = Benchmark('f()', setup, start_date=datetime(2011, 1, 1)) - -#---------------------------------------------------------------------- -# strings methods, #2602 - -setup = common_setup + """ -s = Series(['abcdefg', np.nan]*500000) -""" - -series_string_vector_slice = Benchmark('s.str[:5]', setup, - start_date=datetime(2012, 8, 1)) - -#---------------------------------------------------------------------- -# df.info() and get_dtype_counts() # 2807 - -setup = common_setup + """ -df = pandas.DataFrame(np.random.randn(10,10000)) -""" - -frame_get_dtype_counts = Benchmark('df.get_dtype_counts()', setup, - start_date=datetime(2012, 8, 1)) - -## -setup = common_setup + """ -df = pandas.DataFrame(np.random.randn(10,10000)) -""" - -frame_repr_wide = Benchmark('repr(df)', setup, - start_date=datetime(2012, 8, 1)) - -## -setup = common_setup + """ -df = pandas.DataFrame(np.random.randn(10000, 10)) -""" - -frame_repr_tall = Benchmark('repr(df)', setup, - start_date=datetime(2012, 8, 1)) - -## -setup = common_setup + """ -df = DataFrame(randn(100000, 1)) -""" - -frame_xs_row = Benchmark('df.xs(50000)', setup) - -## -setup = common_setup + """ -df = DataFrame(randn(1,100000)) -""" - -frame_xs_col = Benchmark('df.xs(50000,axis = 1)', setup) - -#---------------------------------------------------------------------- -# nulls/masking - -## masking -setup = common_setup + """ -data = np.random.randn(1000, 500) -df = DataFrame(data) -df = df.where(df > 0) # create nans -bools = df > 0 -mask = isnull(df) -""" - -frame_mask_bools = Benchmark('bools.mask(mask)', setup, - start_date=datetime(2013,1,1)) - -frame_mask_floats = Benchmark('bools.astype(float).mask(mask)', setup, - start_date=datetime(2013,1,1)) - -## isnull -setup = common_setup + """ -data = np.random.randn(1000, 1000) -df = DataFrame(data) -""" -frame_isnull = Benchmark('isnull(df)', setup, - start_date=datetime(2012,1,1)) - -## dropna -dropna_setup = common_setup + """ -data = np.random.randn(10000, 1000) -df = DataFrame(data) -df.ix[50:1000,20:50] = np.nan -df.ix[2000:3000] = np.nan -df.ix[:,60:70] = np.nan -""" -frame_dropna_axis0_any = Benchmark('df.dropna(how="any",axis=0)', dropna_setup, - start_date=datetime(2012,1,1)) -frame_dropna_axis0_all = Benchmark('df.dropna(how="all",axis=0)', dropna_setup, - start_date=datetime(2012,1,1)) - -frame_dropna_axis1_any = Benchmark('df.dropna(how="any",axis=1)', dropna_setup, - start_date=datetime(2012,1,1)) - -frame_dropna_axis1_all = Benchmark('df.dropna(how="all",axis=1)', dropna_setup, - start_date=datetime(2012,1,1)) - -# dropna on mixed dtypes -dropna_mixed_setup = common_setup + """ -data = np.random.randn(10000, 1000) -df = DataFrame(data) -df.ix[50:1000,20:50] = np.nan -df.ix[2000:3000] = np.nan -df.ix[:,60:70] = np.nan -df['foo'] = 'bar' -"""
-frame_dropna_axis0_any_mixed_dtypes = Benchmark('df.dropna(how="any",axis=0)', dropna_mixed_setup, - start_date=datetime(2012,1,1)) -frame_dropna_axis0_all_mixed_dtypes = Benchmark('df.dropna(how="all",axis=0)', dropna_mixed_setup, - start_date=datetime(2012,1,1)) - -frame_dropna_axis1_any_mixed_dtypes = Benchmark('df.dropna(how="any",axis=1)', dropna_mixed_setup, - start_date=datetime(2012,1,1)) - -frame_dropna_axis1_all_mixed_dtypes = Benchmark('df.dropna(how="all",axis=1)', dropna_mixed_setup, - start_date=datetime(2012,1,1)) - -## dropna multi -dropna_setup = common_setup + """ -data = np.random.randn(10000, 1000) -df = DataFrame(data) -df.ix[50:1000,20:50] = np.nan -df.ix[2000:3000] = np.nan -df.ix[:,60:70] = np.nan -df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x))) -df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x))) -""" -frame_count_level_axis0_multi = Benchmark('df.count(axis=0, level=1)', dropna_setup, - start_date=datetime(2012,1,1)) - -frame_count_level_axis1_multi = Benchmark('df.count(axis=1, level=1)', dropna_setup, - start_date=datetime(2012,1,1)) - -# dropna on mixed dtypes -dropna_mixed_setup = common_setup + """ -data = np.random.randn(10000, 1000) -df = DataFrame(data) -df.ix[50:1000,20:50] = np.nan -df.ix[2000:3000] = np.nan -df.ix[:,60:70] = np.nan -df['foo'] = 'bar' -df.index = MultiIndex.from_tuples(df.index.map(lambda x: (x, x))) -df.columns = MultiIndex.from_tuples(df.columns.map(lambda x: (x, x))) -""" -frame_count_level_axis0_mixed_dtypes_multi = Benchmark('df.count(axis=0, level=1)', dropna_mixed_setup, - start_date=datetime(2012,1,1)) - -frame_count_level_axis1_mixed_dtypes_multi = Benchmark('df.count(axis=1, level=1)', dropna_mixed_setup, - start_date=datetime(2012,1,1)) - -#---------------------------------------------------------------------- -# apply - -setup = common_setup + """ -s = Series(np.arange(1028.)) -df = DataFrame({ i:s for i in range(1028) }) -""" -frame_apply_user_func = Benchmark('df.apply(lambda x: np.corrcoef(x,s)[0,1])', setup, - name = 'frame_apply_user_func', - start_date=datetime(2012,1,1)) - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000,100)) -""" -frame_apply_lambda_mean = Benchmark('df.apply(lambda x: x.sum())', setup, - name = 'frame_apply_lambda_mean', - start_date=datetime(2012,1,1)) -setup = common_setup + """ -df = DataFrame(np.random.randn(1000,100)) -""" -frame_apply_np_mean = Benchmark('df.apply(np.mean)', setup, - name = 'frame_apply_np_mean', - start_date=datetime(2012,1,1)) - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000,100)) -""" -frame_apply_pass_thru = Benchmark('df.apply(lambda x: x)', setup, - name = 'frame_apply_pass_thru', - start_date=datetime(2012,1,1)) - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000,100)) -""" -frame_apply_axis_1 = Benchmark('df.apply(lambda x: x+1,axis=1)', setup, - name = 'frame_apply_axis_1', - start_date=datetime(2012,1,1)) - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000,3),columns=list('ABC')) -""" -frame_apply_ref_by_name = Benchmark('df.apply(lambda x: x["A"] + x["B"],axis=1)', setup, - name = 'frame_apply_ref_by_name', - start_date=datetime(2012,1,1)) - -#---------------------------------------------------------------------- -# dtypes - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000,1000)) -""" -frame_dtypes = Benchmark('df.dtypes', setup, - start_date=datetime(2012,1,1)) - -#---------------------------------------------------------------------- -# equals 
-setup = common_setup + """ -def make_pair(frame): - df = frame - df2 = df.copy() - df2.ix[-1,-1] = np.nan - return df, df2 - -def test_equal(name): - df, df2 = pairs[name] - return df.equals(df) - -def test_unequal(name): - df, df2 = pairs[name] - return df.equals(df2) - -float_df = DataFrame(np.random.randn(1000, 1000)) -object_df = DataFrame([['foo']*1000]*1000) -nonunique_cols = object_df.copy() -nonunique_cols.columns = ['A']*len(nonunique_cols.columns) - -pairs = dict([(name, make_pair(frame)) - for name, frame in (('float_df', float_df), ('object_df', object_df), ('nonunique_cols', nonunique_cols))]) -""" -frame_float_equal = Benchmark('test_equal("float_df")', setup) -frame_object_equal = Benchmark('test_equal("object_df")', setup) -frame_nonunique_equal = Benchmark('test_equal("nonunique_cols")', setup) - -frame_float_unequal = Benchmark('test_unequal("float_df")', setup) -frame_object_unequal = Benchmark('test_unequal("object_df")', setup) -frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup) - -#----------------------------------------------------------------------------- -# interpolate -# this is the worst case, where every column has NaNs. -setup = common_setup + """ -df = DataFrame(randn(10000, 100)) -df.values[::2] = np.nan -""" - -frame_interpolate = Benchmark('df.interpolate()', setup, - start_date=datetime(2014, 2, 7)) - -setup = common_setup + """ -df = DataFrame({'A': np.arange(0, 10000), - 'B': np.random.randint(0, 100, 10000), - 'C': randn(10000), - 'D': randn(10000)}) -df.loc[1::5, 'A'] = np.nan -df.loc[1::5, 'C'] = np.nan -""" - -frame_interpolate_some_good = Benchmark('df.interpolate()', setup, - start_date=datetime(2014, 2, 7)) -frame_interpolate_some_good_infer = Benchmark('df.interpolate(downcast="infer")', - setup, - start_date=datetime(2014, 2, 7)) - - -#------------------------------------------------------------------------- -# frame shift speedup issue-5609 - -setup = common_setup + """ -df = DataFrame(np.random.rand(10000,500)) -# note: df._data.blocks are f_contiguous -""" -frame_shift_axis0 = Benchmark('df.shift(1,axis=0)', setup, - start_date=datetime(2014,1,1)) -frame_shift_axis1 = Benchmark('df.shift(1,axis=1)', setup, - name = 'frame_shift_axis_1', - start_date=datetime(2014,1,1)) - - -#----------------------------------------------------------------------------- -# from_records issue-6700 - -setup = common_setup + """ -def get_data(n=100000): - return ((x, x*20, x*100) for x in range(n)) -""" - -frame_from_records_generator = Benchmark('df = DataFrame.from_records(get_data())', - setup, - name='frame_from_records_generator', - start_date=datetime(2013,10,4)) # issue-4911 - -frame_from_records_generator_nrows = Benchmark('df = DataFrame.from_records(get_data(), nrows=1000)', - setup, - name='frame_from_records_generator_nrows', - start_date=datetime(2013, 10, 4)) # issue-4911 - -#----------------------------------------------------------------------------- -# duplicated - -setup = common_setup + ''' -n = 1 << 20 - -t = date_range('2015-01-01', freq='S', periods=n // 64) -xs = np.random.randn(n // 64).round(2) - -df = DataFrame({'a':np.random.randint(- 1 << 8, 1 << 8, n), - 'b':np.random.choice(t, n), - 'c':np.random.choice(xs, n)}) -''' - -frame_duplicated = Benchmark('df.duplicated()', setup, - name='frame_duplicated') diff --git a/vb_suite/generate_rst_files.py b/vb_suite/generate_rst_files.py deleted file mode 100644 index 92e7cd4d59b71..0000000000000 --- a/vb_suite/generate_rst_files.py +++ /dev/null @@ -1,2 +0,0 @@ -from
suite import benchmarks, generate_rst_files -generate_rst_files(benchmarks) diff --git a/vb_suite/gil.py b/vb_suite/gil.py deleted file mode 100644 index df2bd2dcd8db4..0000000000000 --- a/vb_suite/gil.py +++ /dev/null @@ -1,110 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -basic = common_setup + """ -try: - from pandas.util.testing import test_parallel - have_real_test_parallel = True -except ImportError: - have_real_test_parallel = False - def test_parallel(num_threads=1): - def wrapper(fname): - return fname - - return wrapper - -N = 1000000 -ngroups = 1000 -np.random.seed(1234) - -df = DataFrame({'key' : np.random.randint(0,ngroups,size=N), - 'data' : np.random.randn(N) }) - -if not have_real_test_parallel: - raise NotImplementedError -""" - -setup = basic + """ - -def f(): - df.groupby('key')['data'].sum() - -# run consecutively -def g2(): - for i in range(2): - f() -def g4(): - for i in range(4): - f() -def g8(): - for i in range(8): - f() - -# run in parallel -@test_parallel(num_threads=2) -def pg2(): - f() - -@test_parallel(num_threads=4) -def pg4(): - f() - -@test_parallel(num_threads=8) -def pg8(): - f() - -""" - -nogil_groupby_sum_4 = Benchmark( - 'pg4()', setup, - start_date=datetime(2015, 1, 1)) - -nogil_groupby_sum_8 = Benchmark( - 'pg8()', setup, - start_date=datetime(2015, 1, 1)) - - -#### test all groupby funcs #### - -setup = basic + """ - -@test_parallel(num_threads=2) -def pg2(): - df.groupby('key')['data'].func() - -""" - -for f in ['sum','prod','var','count','min','max','mean','last']: - - name = "nogil_groupby_{f}_2".format(f=f) - bmark = Benchmark('pg2()', setup.replace('func',f), start_date=datetime(2015, 1, 1)) - bmark.name = name - globals()[name] = bmark - -del bmark - - -#### test take_1d #### -setup = basic + """ -from pandas.core import common as com - -N = 1e7 -df = DataFrame({'int64' : np.arange(N,dtype='int64'), - 'float64' : np.arange(N,dtype='float64')}) -indexer = np.arange(100,len(df)-100) - -@test_parallel(num_threads=2) -def take_1d_pg2_int64(): - com.take_1d(df.int64.values,indexer) - -@test_parallel(num_threads=2) -def take_1d_pg2_float64(): - com.take_1d(df.float64.values,indexer) - -""" - -nogil_take1d_float64 = Benchmark('take_1d_pg2_float64()', setup, start_date=datetime(2015, 1, 1)) -nogil_take1d_int64 = Benchmark('take_1d_pg2_int64()', setup, start_date=datetime(2015, 1, 1)) diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py deleted file mode 100644 index 268d71f864823..0000000000000 --- a/vb_suite/groupby.py +++ /dev/null @@ -1,620 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -setup = common_setup + """ -N = 100000 -ngroups = 100 - -def get_test_data(ngroups=100, n=100000): - unique_groups = range(ngroups) - arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - - if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], - dtype=object) - - random.shuffle(arr) - return arr - -# aggregate multiple columns -df = DataFrame({'key1' : get_test_data(ngroups=ngroups), - 'key2' : get_test_data(ngroups=ngroups), - 'data1' : np.random.randn(N), - 'data2' : np.random.randn(N)}) -def f(): - df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum()) - -simple_series = Series(np.random.randn(N)) -key1 = df['key1'] -""" - -stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())" -groupby_multi_python = Benchmark(stmt1,
setup, - start_date=datetime(2011, 7, 1)) - -stmt3 = "df.groupby(['key1', 'key2']).sum()" -groupby_multi_cython = Benchmark(stmt3, setup, - start_date=datetime(2011, 7, 1)) - -stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)" -groupby_multi_series_op = Benchmark(stmt, setup, - start_date=datetime(2011, 8, 1)) - -groupby_series_simple_cython = \ - Benchmark('simple_series.groupby(key1).sum()', setup, - start_date=datetime(2011, 3, 1)) - - -stmt4 = "df.groupby('key1').rank(pct=True)" -groupby_frame_rank = Benchmark(stmt4, setup, - start_date=datetime(2014, 1, 16)) - -#---------------------------------------------------------------------- -# 2d grouping, aggregate many columns - -setup = common_setup + """ -labels = np.random.randint(0, 100, size=1000) -df = DataFrame(randn(1000, 1000)) -""" - -groupby_frame_cython_many_columns = Benchmark( - 'df.groupby(labels).sum()', setup, - start_date=datetime(2011, 8, 1), - logy=True) - -#---------------------------------------------------------------------- -# single key, long, integer key - -setup = common_setup + """ -data = np.random.randn(100000, 1) -labels = np.random.randint(0, 1000, size=100000) -df = DataFrame(data) -""" - -groupby_frame_singlekey_integer = \ - Benchmark('df.groupby(labels).sum()', setup, - start_date=datetime(2011, 8, 1), logy=True) - -#---------------------------------------------------------------------- -# group with different functions per column - -setup = common_setup + """ -fac1 = np.array(['A', 'B', 'C'], dtype='O') -fac2 = np.array(['one', 'two'], dtype='O') - -df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=100000)), - 'key2': fac2.take(np.random.randint(0, 2, size=100000)), - 'value1' : np.random.randn(100000), - 'value2' : np.random.randn(100000), - 'value3' : np.random.randn(100000)}) -""" - -groupby_multi_different_functions = \ - Benchmark("""df.groupby(['key1', 'key2']).agg({'value1' : 'mean', - 'value2' : 'var', - 'value3' : 'sum'})""", - setup, start_date=datetime(2011, 9, 1)) - -groupby_multi_different_numpy_functions = \ - Benchmark("""df.groupby(['key1', 'key2']).agg({'value1' : np.mean, - 'value2' : np.var, - 'value3' : np.sum})""", - setup, start_date=datetime(2011, 9, 1)) - -#---------------------------------------------------------------------- -# size() speed - -setup = common_setup + """ -n = 100000 -offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') -dates = np.datetime64('now') + offsets -df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'value1' : np.random.randn(n), - 'value2' : np.random.randn(n), - 'value3' : np.random.randn(n), - 'dates' : dates}) -""" - -groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()", - setup, start_date=datetime(2011, 10, 1)) - -groupby_dt_size = Benchmark("df.groupby(['dates']).size()", - setup, start_date=datetime(2011, 10, 1)) - -groupby_dt_timegrouper_size = Benchmark("df.groupby(TimeGrouper(key='dates', freq='M')).size()", - setup, start_date=datetime(2011, 10, 1)) - -#---------------------------------------------------------------------- -# count() speed - -setup = common_setup + """ -n = 10000 -offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') - -dates = np.datetime64('now') + offsets -dates[np.random.rand(n) > 0.5] = np.datetime64('nat') - -offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat') - -value2 = np.random.randn(n) -value2[np.random.rand(n) > 0.5] = np.nan - -obj = np.random.choice(list('ab'), size=n).astype(object)
-obj[np.random.randn(n) > 0.5] = np.nan - -df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'dates': dates, - 'value2' : value2, - 'value3' : np.random.randn(n), - 'ints': np.random.randint(0, 1000, size=n), - 'obj': obj, - 'offsets': offsets}) -""" - -groupby_multi_count = Benchmark("df.groupby(['key1', 'key2']).count()", - setup, name='groupby_multi_count', - start_date=datetime(2014, 5, 5)) - -setup = common_setup + """ -n = 10000 - -df = DataFrame({'key1': randint(0, 500, size=n), - 'key2': randint(0, 100, size=n), - 'ints': randint(0, 1000, size=n), - 'ints2': randint(0, 1000, size=n)}) -""" - -groupby_int_count = Benchmark("df.groupby(['key1', 'key2']).count()", - setup, name='groupby_int_count', - start_date=datetime(2014, 5, 6)) -#---------------------------------------------------------------------- -# Series.value_counts - -setup = common_setup + """ -s = Series(np.random.randint(0, 1000, size=100000)) -""" - -series_value_counts_int64 = Benchmark('s.value_counts()', setup, - start_date=datetime(2011, 10, 21)) - -# value_counts on lots of strings - -setup = common_setup + """ -K = 1000 -N = 100000 -uniques = tm.makeStringIndex(K).values -s = Series(np.tile(uniques, N // K)) -""" - -series_value_counts_strings = Benchmark('s.value_counts()', setup, - start_date=datetime(2011, 10, 21)) - -#value_counts on float dtype - -setup = common_setup + """ -s = Series(np.random.randint(0, 1000, size=100000)).astype(float) -""" - -series_value_counts_float64 = Benchmark('s.value_counts()', setup, - start_date=datetime(2015, 8, 17)) - -#---------------------------------------------------------------------- -# pivot_table - -setup = common_setup + """ -fac1 = np.array(['A', 'B', 'C'], dtype='O') -fac2 = np.array(['one', 'two'], dtype='O') - -ind1 = np.random.randint(0, 3, size=100000) -ind2 = np.random.randint(0, 2, size=100000) - -df = DataFrame({'key1': fac1.take(ind1), -'key2': fac2.take(ind2), -'key3': fac2.take(ind2), -'value1' : np.random.randn(100000), -'value2' : np.random.randn(100000), -'value3' : np.random.randn(100000)}) -""" - -stmt = "df.pivot_table(index='key1', columns=['key2', 'key3'])" -groupby_pivot_table = Benchmark(stmt, setup, start_date=datetime(2011, 12, 15)) - - -#---------------------------------------------------------------------- -# dict return values - -setup = common_setup + """ -labels = np.arange(1000).repeat(10) -data = Series(randn(len(labels))) -f = lambda x: {'first': x.values[0], 'last': x.values[-1]} -""" - -groupby_apply_dict_return = Benchmark('data.groupby(labels).apply(f)', - setup, start_date=datetime(2011, 12, 15)) - -#---------------------------------------------------------------------- -# First / last functions - -setup = common_setup + """ -labels = np.arange(10000).repeat(10) -data = Series(randn(len(labels))) -data[::3] = np.nan -data[1::3] = np.nan -data2 = Series(randn(len(labels)),dtype='float32') -data2[::3] = np.nan -data2[1::3] = np.nan -labels = labels.take(np.random.permutation(len(labels))) -""" - -groupby_first_float64 = Benchmark('data.groupby(labels).first()', setup, - start_date=datetime(2012, 5, 1)) - -groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup, - start_date=datetime(2013, 1, 1)) - -groupby_last_float64 = Benchmark('data.groupby(labels).last()', setup, - start_date=datetime(2012, 5, 1)) - -groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup, - start_date=datetime(2013, 1, 1)) - -groupby_nth_float64_none = 
Benchmark('data.groupby(labels).nth(0)', setup, - start_date=datetime(2012, 5, 1)) -groupby_nth_float32_none = Benchmark('data2.groupby(labels).nth(0)', setup, - start_date=datetime(2013, 1, 1)) -groupby_nth_float64_any = Benchmark('data.groupby(labels).nth(0,dropna="all")', setup, - start_date=datetime(2012, 5, 1)) -groupby_nth_float32_any = Benchmark('data2.groupby(labels).nth(0,dropna="all")', setup, - start_date=datetime(2013, 1, 1)) - -# with datetimes (GH7555) -setup = common_setup + """ -df = DataFrame({'a' : date_range('1/1/2011',periods=100000,freq='s'),'b' : range(100000)}) -""" - -groupby_first_datetimes = Benchmark('df.groupby("b").first()', setup, - start_date=datetime(2013, 5, 1)) -groupby_last_datetimes = Benchmark('df.groupby("b").last()', setup, - start_date=datetime(2013, 5, 1)) -groupby_nth_datetimes_none = Benchmark('df.groupby("b").nth(0)', setup, - start_date=datetime(2013, 5, 1)) -groupby_nth_datetimes_any = Benchmark('df.groupby("b").nth(0,dropna="all")', setup, - start_date=datetime(2013, 5, 1)) - -# with object -setup = common_setup + """ -df = DataFrame({'a' : ['foo']*100000,'b' : range(100000)}) -""" - -groupby_first_object = Benchmark('df.groupby("b").first()', setup, - start_date=datetime(2013, 5, 1)) -groupby_last_object = Benchmark('df.groupby("b").last()', setup, - start_date=datetime(2013, 5, 1)) -groupby_nth_object_none = Benchmark('df.groupby("b").nth(0)', setup, - start_date=datetime(2013, 5, 1)) -groupby_nth_object_any = Benchmark('df.groupby("b").nth(0,dropna="any")', setup, - start_date=datetime(2013, 5, 1)) - -#---------------------------------------------------------------------- -# groupby_indices replacement, chop up Series - -setup = common_setup + """ -try: - rng = date_range('1/1/2000', '12/31/2005', freq='H') - year, month, day = rng.year, rng.month, rng.day -except: - rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour()) - year = rng.map(lambda x: x.year) - month = rng.map(lambda x: x.month) - day = rng.map(lambda x: x.day) - -ts = Series(np.random.randn(len(rng)), index=rng) -""" - -groupby_indices = Benchmark('len(ts.groupby([year, month, day]))', - setup, start_date=datetime(2012, 1, 1)) - -#---------------------------------------------------------------------- -# median - -#---------------------------------------------------------------------- -# single key, long, integer key - -setup = common_setup + """ -data = np.random.randn(100000, 2) -labels = np.random.randint(0, 1000, size=100000) -df = DataFrame(data) -""" - -groupby_frame_median = \ - Benchmark('df.groupby(labels).median()', setup, - start_date=datetime(2011, 8, 1), logy=True) - - -setup = common_setup + """ -data = np.random.randn(1000000, 2) -labels = np.random.randint(0, 1000, size=1000000) -df = DataFrame(data) -""" - -groupby_simple_compress_timing = \ - Benchmark('df.groupby(labels).mean()', setup, - start_date=datetime(2011, 8, 1)) - - -#---------------------------------------------------------------------- -# DataFrame Apply overhead - -setup = common_setup + """ -N = 10000 -labels = np.random.randint(0, 2000, size=N) -labels2 = np.random.randint(0, 3, size=N) -df = DataFrame({'key': labels, -'key2': labels2, -'value1': randn(N), -'value2': ['foo', 'bar', 'baz', 'qux'] * (N / 4)}) -def f(g): - return 1 -""" - -groupby_frame_apply_overhead = Benchmark("df.groupby('key').apply(f)", setup, - start_date=datetime(2011, 10, 1)) - -groupby_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)", setup, - start_date=datetime(2011, 10, 1)) - - 
-#---------------------------------------------------------------------- -# DataFrame nth - -setup = common_setup + """ -df = DataFrame(np.random.randint(1, 100, (10000, 2))) -""" - -# Not really a fair test as behaviour has changed! -groupby_frame_nth_none = Benchmark("df.groupby(0).nth(0)", setup, - start_date=datetime(2014, 3, 1)) - -groupby_series_nth_none = Benchmark("df[1].groupby(df[0]).nth(0)", setup, - start_date=datetime(2014, 3, 1)) -groupby_frame_nth_any= Benchmark("df.groupby(0).nth(0,dropna='any')", setup, - start_date=datetime(2014, 3, 1)) - -groupby_series_nth_any = Benchmark("df[1].groupby(df[0]).nth(0,dropna='any')", setup, - start_date=datetime(2014, 3, 1)) - - -#---------------------------------------------------------------------- -# Sum booleans #2692 - -setup = common_setup + """ -N = 500 -df = DataFrame({'ii':range(N),'bb':[True for x in range(N)]}) -""" - -groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup) - - -#---------------------------------------------------------------------- -# multi-indexed group sum #9049 - -setup = common_setup + """ -N = 50 -df = DataFrame({'A': range(N) * 2, 'B': range(N*2), 'C': 1}).set_index(["A", "B"]) -""" - -groupby_sum_multiindex = Benchmark("df.groupby(level=[0, 1]).sum()", setup) - - -#---------------------------------------------------------------------- -# Transform testing - -setup = common_setup + """ -n_dates = 400 -n_securities = 250 -n_columns = 3 -share_na = 0.1 - -dates = date_range('1997-12-31', periods=n_dates, freq='B') -dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates)) - -secid_min = int('10000000', 16) -secid_max = int('F0000000', 16) -step = (secid_max - secid_min) // (n_securities - 1) -security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step)) - -data_index = MultiIndex(levels=[dates.values, security_ids], - labels=[[i for i in range(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates], - names=['date', 'security_id']) -n_data = len(data_index) - -columns = Index(['factor{}'.format(i) for i in range(1, n_columns + 1)]) - -data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns) - -step = int(n_data * share_na) -for column_index in range(n_columns): - index = column_index - while index < n_data: - data.set_value(data_index[index], columns[column_index], np.nan) - index += step - -f_fillna = lambda x: x.fillna(method='pad') -""" - -groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup) -groupby_transform_ufunc = Benchmark("data.groupby(level='date').transform(np.max)", setup) - -setup = common_setup + """ -np.random.seed(0) - -N = 120000 -N_TRANSITIONS = 1400 - -# generate groups -transition_points = np.random.permutation(np.arange(N))[:N_TRANSITIONS] -transition_points.sort() -transitions = np.zeros((N,), dtype=np.bool) -transitions[transition_points] = True -g = transitions.cumsum() - -df = DataFrame({ 'signal' : np.random.rand(N)}) -""" -groupby_transform_series = Benchmark("df['signal'].groupby(g).transform(np.mean)", setup) - -setup = common_setup + """ -np.random.seed(0) - -df=DataFrame( { 'id' : np.arange( 100000 ) / 3, - 'val': np.random.randn( 100000) } ) -""" - -groupby_transform_series2 = Benchmark("df.groupby('id')['val'].transform(np.mean)", setup) - -setup = common_setup + ''' -np.random.seed(2718281) -n = 20000 -df = DataFrame(np.random.randint(1, n, (n, 3)), - columns=['jim', 'joe', 'jolie']) -''' - -stmt = "df.groupby(['jim', 
'joe'])['jolie'].transform('max')" -groupby_transform_multi_key1 = Benchmark(stmt, setup) -groupby_transform_multi_key2 = Benchmark(stmt, setup + "df['jim'] = df['joe']") - -setup = common_setup + ''' -np.random.seed(2718281) -n = 200000 -df = DataFrame(np.random.randint(1, n / 10, (n, 3)), - columns=['jim', 'joe', 'jolie']) -''' -groupby_transform_multi_key3 = Benchmark(stmt, setup) -groupby_transform_multi_key4 = Benchmark(stmt, setup + "df['jim'] = df['joe']") - -setup = common_setup + ''' -np.random.seed(27182) -n = 100000 -df = DataFrame(np.random.randint(1, n / 100, (n, 3)), - columns=['jim', 'joe', 'jolie']) -''' - -groupby_agg_builtins1 = Benchmark("df.groupby('jim').agg([sum, min, max])", setup) -groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup) - - -setup = common_setup + ''' -arr = np.random.randint(- 1 << 12, 1 << 12, (1 << 17, 5)) -i = np.random.choice(len(arr), len(arr) * 5) -arr = np.vstack((arr, arr[i])) # add some duplicate rows - -i = np.random.permutation(len(arr)) -arr = arr[i] # shuffle rows - -df = DataFrame(arr, columns=list('abcde')) -df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10 -''' - -groupby_int64_overflow = Benchmark("df.groupby(list('abcde')).max()", setup, - name='groupby_int64_overflow') - - -setup = common_setup + ''' -from itertools import product -from string import ascii_letters, digits - -n = 5 * 7 * 11 * (1 << 9) -alpha = list(map(''.join, product(ascii_letters + digits, repeat=4))) -f = lambda k: np.repeat(np.random.choice(alpha, n // k), k) - -df = DataFrame({'a': f(11), 'b': f(7), 'c': f(5), 'd': f(1)}) -df['joe'] = (np.random.randn(len(df)) * 10).round(3) - -i = np.random.permutation(len(df)) -df = df.iloc[i].reset_index(drop=True).copy() -''' - -groupby_multi_index = Benchmark("df.groupby(list('abcd')).max()", setup, - name='groupby_multi_index') - -#---------------------------------------------------------------------- -# groupby with a variable value for ngroups - - -ngroups_list = [100, 10000] -no_arg_func_list = [ - 'all', - 'any', - 'count', - 'cumcount', - 'cummax', - 'cummin', - 'cumprod', - 'cumsum', - 'describe', - 'diff', - 'first', - 'head', - 'last', - 'mad', - 'max', - 'mean', - 'median', - 'min', - 'nunique', - 'pct_change', - 'prod', - 'rank', - 'sem', - 'size', - 'skew', - 'std', - 'sum', - 'tail', - 'unique', - 'var', - 'value_counts', -] - - -_stmt_template = "df.groupby('value')['timestamp'].%s" -_setup_template = common_setup + """ -np.random.seed(1234) -ngroups = %s -size = ngroups * 2 -rng = np.arange(ngroups) -df = DataFrame(dict( - timestamp=rng.take(np.random.randint(0, ngroups, size=size)), - value=np.random.randint(0, size, size=size) -)) -""" -START_DATE = datetime(2011, 7, 1) - - -def make_large_ngroups_bmark(ngroups, func_name, func_args=''): - bmark_name = 'groupby_ngroups_%s_%s' % (ngroups, func_name) - stmt = _stmt_template % ('%s(%s)' % (func_name, func_args)) - setup = _setup_template % ngroups - bmark = Benchmark(stmt, setup, start_date=START_DATE) - # MUST set name - bmark.name = bmark_name - return bmark - - -def inject_bmark_into_globals(bmark): - if not bmark.name: - raise AssertionError('benchmark must have a name') - globals()[bmark.name] = bmark - - -for ngroups in ngroups_list: - for func_name in no_arg_func_list: - bmark = make_large_ngroups_bmark(ngroups, func_name) - inject_bmark_into_globals(bmark) - -# avoid bmark being collected as a Benchmark object -del bmark diff --git a/vb_suite/hdfstore_bench.py b/vb_suite/hdfstore_bench.py deleted file mode
100644 index 393fd4cc77e66..0000000000000 --- a/vb_suite/hdfstore_bench.py +++ /dev/null @@ -1,278 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -start_date = datetime(2012, 7, 1) - -common_setup = """from .pandas_vb_common import * -import os - -f = '__test__.h5' -def remove(f): - try: - os.remove(f) - except: - pass - -""" - -#---------------------------------------------------------------------- -# get from a store - -setup1 = common_setup + """ -index = tm.makeStringIndex(25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000)}, - index=index) -remove(f) -store = HDFStore(f) -store.put('df1',df) -""" - -read_store = Benchmark("store.get('df1')", setup1, cleanup="store.close()", - start_date=start_date) - - -#---------------------------------------------------------------------- -# write to a store - -setup2 = common_setup + """ -index = tm.makeStringIndex(25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000)}, - index=index) -remove(f) -store = HDFStore(f) -""" - -write_store = Benchmark( - "store.put('df2',df)", setup2, cleanup="store.close()", - start_date=start_date) - -#---------------------------------------------------------------------- -# get from a store (mixed) - -setup3 = common_setup + """ -index = tm.makeStringIndex(25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000), - 'string1' : ['foo'] * 25000, - 'bool1' : [True] * 25000, - 'int1' : np.random.randint(0, 250000, size=25000)}, - index=index) -remove(f) -store = HDFStore(f) -store.put('df3',df) -""" - -read_store_mixed = Benchmark( - "store.get('df3')", setup3, cleanup="store.close()", - start_date=start_date) - - -#---------------------------------------------------------------------- -# write to a store (mixed) - -setup4 = common_setup + """ -index = tm.makeStringIndex(25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000), - 'string1' : ['foo'] * 25000, - 'bool1' : [True] * 25000, - 'int1' : np.random.randint(0, 250000, size=25000)}, - index=index) -remove(f) -store = HDFStore(f) -""" - -write_store_mixed = Benchmark( - "store.put('df4',df)", setup4, cleanup="store.close()", - start_date=start_date) - -#---------------------------------------------------------------------- -# get from a table (mixed) - -setup5 = common_setup + """ -N=10000 -index = tm.makeStringIndex(N) -df = DataFrame({'float1' : randn(N), - 'float2' : randn(N), - 'string1' : ['foo'] * N, - 'bool1' : [True] * N, - 'int1' : np.random.randint(0, N, size=N)}, - index=index) - -remove(f) -store = HDFStore(f) -store.append('df5',df) -""" - -read_store_table_mixed = Benchmark( - "store.select('df5')", setup5, cleanup="store.close()", - start_date=start_date) - - -#---------------------------------------------------------------------- -# write to a table (mixed) - -setup6 = common_setup + """ -index = tm.makeStringIndex(25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000), - 'string1' : ['foo'] * 25000, - 'bool1' : [True] * 25000, - 'int1' : np.random.randint(0, 25000, size=25000)}, - index=index) -remove(f) -store = HDFStore(f) -""" - -write_store_table_mixed = Benchmark( - "store.append('df6',df)", setup6, cleanup="store.close()", - start_date=start_date) - -#---------------------------------------------------------------------- -# select from a table - -setup7 = common_setup + """ -index = tm.makeStringIndex(25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000) }, - index=index) - -remove(f) -store = 
HDFStore(f) -store.append('df7',df) -""" - -read_store_table = Benchmark( - "store.select('df7')", setup7, cleanup="store.close()", - start_date=start_date) - - -#---------------------------------------------------------------------- -# write to a table - -setup8 = common_setup + """ -index = tm.makeStringIndex(25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000) }, - index=index) -remove(f) -store = HDFStore(f) -""" - -write_store_table = Benchmark( - "store.append('df8',df)", setup8, cleanup="store.close()", - start_date=start_date) - -#---------------------------------------------------------------------- -# get from a table (wide) - -setup9 = common_setup + """ -df = DataFrame(np.random.randn(25000,100)) - -remove(f) -store = HDFStore(f) -store.append('df9',df) -""" - -read_store_table_wide = Benchmark( - "store.select('df9')", setup9, cleanup="store.close()", - start_date=start_date) - - -#---------------------------------------------------------------------- -# write to a table (wide) - -setup10 = common_setup + """ -df = DataFrame(np.random.randn(25000,100)) - -remove(f) -store = HDFStore(f) -""" - -write_store_table_wide = Benchmark( - "store.append('df10',df)", setup10, cleanup="store.close()", - start_date=start_date) - -#---------------------------------------------------------------------- -# query from a table (wide) - -setup11 = common_setup + """ -index = date_range('1/1/2000', periods = 25000) -df = DataFrame(np.random.randn(25000,100), index = index) - -remove(f) -store = HDFStore(f) -store.append('df11',df) -""" - -query_store_table_wide = Benchmark( - "store.select('df11', [ ('index', '>', df.index[10000]), ('index', '<', df.index[15000]) ])", setup11, cleanup="store.close()", - start_date=start_date) - - -#---------------------------------------------------------------------- -# query from a table - -setup12 = common_setup + """ -index = date_range('1/1/2000', periods = 25000) -df = DataFrame({'float1' : randn(25000), - 'float2' : randn(25000) }, - index=index) - -remove(f) -store = HDFStore(f) -store.append('df12',df) -""" - -query_store_table = Benchmark( - "store.select('df12', [ ('index', '>', df.index[10000]), ('index', '<', df.index[15000]) ])", setup12, cleanup="store.close()", - start_date=start_date) - -#---------------------------------------------------------------------- -# select from a panel table - -setup13 = common_setup + """ -p = Panel(randn(20, 1000, 25), items= [ 'Item%03d' % i for i in range(20) ], - major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in range(25) ]) - -remove(f) -store = HDFStore(f) -store.append('p1',p) -""" - -read_store_table_panel = Benchmark( - "store.select('p1')", setup13, cleanup="store.close()", - start_date=start_date) - - -#---------------------------------------------------------------------- -# write to a panel table - -setup14 = common_setup + """ -p = Panel(randn(20, 1000, 25), items= [ 'Item%03d' % i for i in range(20) ], - major_axis=date_range('1/1/2000', periods=1000), minor_axis = [ 'E%03d' % i for i in range(25) ]) - -remove(f) -store = HDFStore(f) -""" - -write_store_table_panel = Benchmark( - "store.append('p2',p)", setup14, cleanup="store.close()", - start_date=start_date) - -#---------------------------------------------------------------------- -# write to a table (data_columns) - -setup15 = common_setup + """ -df = DataFrame(np.random.randn(10000,10),columns = [ 'C%03d' % i for i in range(10) ]) - -remove(f) -store = HDFStore(f) -""" - 
-write_store_table_dc = Benchmark( - "store.append('df15',df,data_columns=True)", setup15, cleanup="store.close()", - start_date=start_date) - diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py deleted file mode 100644 index 2ab2bc15f3853..0000000000000 --- a/vb_suite/index_object.py +++ /dev/null @@ -1,173 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -SECTION = "Index / MultiIndex objects" - - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# intersection, union - -setup = common_setup + """ -rng = DatetimeIndex(start='1/1/2000', periods=10000, freq=datetools.Minute()) -if rng.dtype == object: - rng = rng.view(Index) -else: - rng = rng.asobject -rng2 = rng[:-1] -""" - -index_datetime_intersection = Benchmark("rng.intersection(rng2)", setup) -index_datetime_union = Benchmark("rng.union(rng2)", setup) - -setup = common_setup + """ -rng = date_range('1/1/2000', periods=10000, freq='T') -rng2 = rng[:-1] -""" - -datetime_index_intersection = Benchmark("rng.intersection(rng2)", setup, - start_date=datetime(2013, 9, 27)) -datetime_index_union = Benchmark("rng.union(rng2)", setup, - start_date=datetime(2013, 9, 27)) - -# integers -setup = common_setup + """ -N = 1000000 -options = np.arange(N) - -left = Index(options.take(np.random.permutation(N)[:N // 2])) -right = Index(options.take(np.random.permutation(N)[:N // 2])) -""" - -index_int64_union = Benchmark('left.union(right)', setup, - start_date=datetime(2011, 1, 1)) - -index_int64_intersection = Benchmark('left.intersection(right)', setup, - start_date=datetime(2011, 1, 1)) - -#---------------------------------------------------------------------- -# string index slicing -setup = common_setup + """ -idx = tm.makeStringIndex(1000000) - -mask = np.arange(1000000) % 3 == 0 -series_mask = Series(mask) -""" -index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup) -index_str_slice_indexer_even = Benchmark('idx[::2]', setup) -index_str_boolean_indexer = Benchmark('idx[mask]', setup) -index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup) - -#---------------------------------------------------------------------- -# float64 index -#---------------------------------------------------------------------- -# construction -setup = common_setup + """ -baseidx = np.arange(1e6) -""" - -index_float64_construct = Benchmark('Index(baseidx)', setup, - name='index_float64_construct', - start_date=datetime(2014, 4, 13)) - -setup = common_setup + """ -idx = tm.makeFloatIndex(1000000) - -mask = np.arange(idx.size) % 3 == 0 -series_mask = Series(mask) -""" -#---------------------------------------------------------------------- -# getting -index_float64_get = Benchmark('idx[1]', setup, name='index_float64_get', - start_date=datetime(2014, 4, 13)) - - -#---------------------------------------------------------------------- -# slicing -index_float64_slice_indexer_basic = Benchmark('idx[:-1]', setup, - name='index_float64_slice_indexer_basic', - start_date=datetime(2014, 4, 13)) -index_float64_slice_indexer_even = Benchmark('idx[::2]', setup, - name='index_float64_slice_indexer_even', - start_date=datetime(2014, 4, 13)) -index_float64_boolean_indexer = Benchmark('idx[mask]', setup, - name='index_float64_boolean_indexer', - start_date=datetime(2014, 4, 13)) -index_float64_boolean_series_indexer = Benchmark('idx[series_mask]', setup, - name='index_float64_boolean_series_indexer', - start_date=datetime(2014, 4, 13)) - 
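
The float64-index timings above illustrate the one-Benchmark-per-name vbench style that this removal retires; the asv suite replacing it expresses the same measurements as a class of time_* methods. A minimal sketch of the translation, assuming asv's setup/time_* conventions (the class name here is illustrative, not taken from this diff):

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm

    class Float64IndexIndexing(object):
        # asv runs setup() before timing each time_* method below
        def setup(self):
            self.idx = tm.makeFloatIndex(1000000)
            self.mask = np.arange(self.idx.size) % 3 == 0
            self.series_mask = pd.Series(self.mask)

        def time_get(self):
            self.idx[1]

        def time_slice_indexer_basic(self):
            self.idx[:-1]

        def time_boolean_indexer(self):
            self.idx[self.mask]
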
-#---------------------------------------------------------------------- -# arith ops -index_float64_mul = Benchmark('idx * 2', setup, name='index_float64_mul', - start_date=datetime(2014, 4, 13)) -index_float64_div = Benchmark('idx / 2', setup, name='index_float64_div', - start_date=datetime(2014, 4, 13)) - - -# Constructing MultiIndex from cartesian product of iterables -# - -setup = common_setup + """ -iterables = [tm.makeStringIndex(10000), range(20)] -""" - -multiindex_from_product = Benchmark('MultiIndex.from_product(iterables)', - setup, name='multiindex_from_product', - start_date=datetime(2014, 6, 30)) - -#---------------------------------------------------------------------- -# MultiIndex with DatetimeIndex level - -setup = common_setup + """ -level1 = range(1000) -level2 = date_range(start='1/1/2012', periods=100) -mi = MultiIndex.from_product([level1, level2]) -""" - -multiindex_with_datetime_level_full = \ - Benchmark("mi.copy().values", setup, - name='multiindex_with_datetime_level_full', - start_date=datetime(2014, 10, 11)) - - -multiindex_with_datetime_level_sliced = \ - Benchmark("mi[:10].values", setup, - name='multiindex_with_datetime_level_sliced', - start_date=datetime(2014, 10, 11)) - -# multi-index duplicated -setup = common_setup + """ -n, k = 200, 5000 -levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] -labels = [np.random.choice(n, k * n) for lev in levels] -mi = MultiIndex(levels=levels, labels=labels) -""" - -multiindex_duplicated = Benchmark('mi.duplicated()', setup, - name='multiindex_duplicated') - -#---------------------------------------------------------------------- -# repr - -setup = common_setup + """ -dr = pd.date_range('20000101', freq='D', periods=100000) -""" - -datetime_index_repr = \ - Benchmark("dr._is_dates_only", setup, - start_date=datetime(2012, 1, 11)) - -setup = common_setup + """ -n = 3 * 5 * 7 * 11 * (1 << 10) -low, high = - 1 << 12, 1 << 12 -f = lambda k: np.repeat(np.random.randint(low, high, n // k), k) - -i = np.random.permutation(n) -mi = MultiIndex.from_arrays([f(11), f(7), f(5), f(3), f(1)])[i] -""" - -multiindex_sortlevel_int64 = Benchmark('mi.sortlevel()', setup, - name='multiindex_sortlevel_int64') diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py deleted file mode 100644 index ff634bf2a8fc7..0000000000000 --- a/vb_suite/indexing.py +++ /dev/null @@ -1,292 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -SECTION = 'Indexing and scalar value access' - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# Series.__getitem__, get_value, __getitem__(slice) - -setup = common_setup + """ -tm.N = 1000 -ts = tm.makeTimeSeries() -dt = ts.index[500] -""" -statement = "ts[dt]" -bm_getitem = Benchmark(statement, setup, ncalls=100000, - name='time_series_getitem_scalar') - -setup = common_setup + """ -index = tm.makeStringIndex(1000) -s = Series(np.random.rand(1000), index=index) -idx = index[100] -""" -statement = "s.get_value(idx)" -bm_get_value = Benchmark(statement, setup, - name='series_get_value', - start_date=datetime(2011, 11, 12)) - - -setup = common_setup + """ -index = tm.makeStringIndex(1000000) -s = Series(np.random.rand(1000000), index=index) -""" -series_getitem_pos_slice = Benchmark("s[:800000]", setup, - name="series_getitem_pos_slice") - - -setup = common_setup + """ -index = tm.makeStringIndex(1000000) -s = Series(np.random.rand(1000000), index=index) -lbl = s.index[800000] -""" 
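
One note on the setup just defined: the earlier benchmark's `s[:800000]` slices by position, while the `s[:lbl]` statement timed next resolves the string label against the index first and, like all label-based slices, includes its endpoint. A toy sketch of that semantic difference, assuming a small string index:

    import pandas as pd

    s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
    print(s[:2])    # positional slice: rows 'a', 'b' (endpoint excluded)
    print(s[:'c'])  # label slice: rows 'a', 'b', 'c' (endpoint included)
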
-series_getitem_label_slice = Benchmark("s[:lbl]", setup, - name="series_getitem_label_slice") - - -#---------------------------------------------------------------------- -# DataFrame __getitem__ - -setup = common_setup + """ -index = tm.makeStringIndex(1000) -columns = tm.makeStringIndex(30) -df = DataFrame(np.random.rand(1000, 30), index=index, - columns=columns) -idx = index[100] -col = columns[10] -""" -statement = "df[col][idx]" -bm_df_getitem = Benchmark(statement, setup, - name='dataframe_getitem_scalar') - -setup = common_setup + """ -try: - klass = DataMatrix -except: - klass = DataFrame - -index = tm.makeStringIndex(1000) -columns = tm.makeStringIndex(30) -df = klass(np.random.rand(1000, 30), index=index, columns=columns) -idx = index[100] -col = columns[10] -""" -statement = "df[col][idx]" -bm_df_getitem2 = Benchmark(statement, setup, - name='datamatrix_getitem_scalar') - - -#---------------------------------------------------------------------- -# ix get scalar - -setup = common_setup + """ -index = tm.makeStringIndex(1000) -columns = tm.makeStringIndex(30) -df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) -idx = index[100] -col = columns[10] -""" - -indexing_frame_get_value_ix = Benchmark("df.ix[idx,col]", setup, - name='indexing_frame_get_value_ix', - start_date=datetime(2011, 11, 12)) - -indexing_frame_get_value = Benchmark("df.get_value(idx,col)", setup, - name='indexing_frame_get_value', - start_date=datetime(2011, 11, 12)) - -setup = common_setup + """ -mi = MultiIndex.from_tuples([(x,y) for x in range(1000) for y in range(1000)]) -s = Series(np.random.randn(1000000), index=mi) -""" - -series_xs_mi_ix = Benchmark("s.ix[999]", setup, - name='series_xs_mi_ix', - start_date=datetime(2013, 1, 1)) - -setup = common_setup + """ -mi = MultiIndex.from_tuples([(x,y) for x in range(1000) for y in range(1000)]) -s = Series(np.random.randn(1000000), index=mi) -df = DataFrame(s) -""" - -frame_xs_mi_ix = Benchmark("df.ix[999]", setup, - name='frame_xs_mi_ix', - start_date=datetime(2013, 1, 1)) - -#---------------------------------------------------------------------- -# Boolean DataFrame row selection - -setup = common_setup + """ -df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) -indexer = df['B'] > 0 -obj_indexer = indexer.astype('O') -""" -indexing_dataframe_boolean_rows = \ - Benchmark("df[indexer]", setup, name='indexing_dataframe_boolean_rows') - -indexing_dataframe_boolean_rows_object = \ - Benchmark("df[obj_indexer]", setup, - name='indexing_dataframe_boolean_rows_object') - -setup = common_setup + """ -df = DataFrame(np.random.randn(50000, 100)) -df2 = DataFrame(np.random.randn(50000, 100)) -""" -indexing_dataframe_boolean = \ - Benchmark("df > df2", setup, name='indexing_dataframe_boolean', - start_date=datetime(2012, 1, 1)) - -setup = common_setup + """ -try: - import pandas.core.computation.expressions as expr -except: - expr = None - -if expr is None: - raise NotImplementedError -df = DataFrame(np.random.randn(50000, 100)) -df2 = DataFrame(np.random.randn(50000, 100)) -expr.set_numexpr_threads(1) -""" - -indexing_dataframe_boolean_st = \ - Benchmark("df > df2", setup, name='indexing_dataframe_boolean_st',cleanup="expr.set_numexpr_threads()", - start_date=datetime(2013, 2, 26)) - - -setup = common_setup + """ -try: - import pandas.core.computation.expressions as expr -except: - expr = None - -if expr is None: - raise NotImplementedError -df = DataFrame(np.random.randn(50000, 100)) -df2 = DataFrame(np.random.randn(50000, 100)) 
-expr.set_use_numexpr(False) -""" - -indexing_dataframe_boolean_no_ne = \ - Benchmark("df > df2", setup, name='indexing_dataframe_boolean_no_ne',cleanup="expr.set_use_numexpr(True)", - start_date=datetime(2013, 2, 26)) -#---------------------------------------------------------------------- -# MultiIndex sortlevel - -setup = common_setup + """ -a = np.repeat(np.arange(100), 1000) -b = np.tile(np.arange(1000), 100) -midx = MultiIndex.from_arrays([a, b]) -midx = midx.take(np.random.permutation(np.arange(100000))) -""" -sort_level_zero = Benchmark("midx.sortlevel(0)", setup, - start_date=datetime(2012, 1, 1)) -sort_level_one = Benchmark("midx.sortlevel(1)", setup, - start_date=datetime(2012, 1, 1)) - -#---------------------------------------------------------------------- -# Panel subset selection - -setup = common_setup + """ -p = Panel(np.random.randn(100, 100, 100)) -inds = range(0, 100, 10) -""" - -indexing_panel_subset = Benchmark('p.ix[inds, inds, inds]', setup, - start_date=datetime(2012, 1, 1)) - -#---------------------------------------------------------------------- -# Iloc - -setup = common_setup + """ -df = DataFrame({'A' : [0.1] * 3000, 'B' : [1] * 3000}) -idx = np.array(range(30)) * 99 -df2 = DataFrame({'A' : [0.1] * 1000, 'B' : [1] * 1000}) -df2 = concat([df2, 2*df2, 3*df2]) -""" - -frame_iloc_dups = Benchmark('df2.iloc[idx]', setup, - start_date=datetime(2013, 1, 1)) - -frame_loc_dups = Benchmark('df2.loc[idx]', setup, - start_date=datetime(2013, 1, 1)) - -setup = common_setup + """ -df = DataFrame(dict( A = [ 'foo'] * 1000000)) -""" - -frame_iloc_big = Benchmark('df.iloc[:100,0]', setup, - start_date=datetime(2013, 1, 1)) - -#---------------------------------------------------------------------- -# basic tests for [], .loc[], .iloc[] and .ix[] - -setup = common_setup + """ -s = Series(np.random.rand(1000000)) -""" - -series_getitem_scalar = Benchmark("s[800000]", setup) -series_getitem_slice = Benchmark("s[:800000]", setup) -series_getitem_list_like = Benchmark("s[[800000]]", setup) -series_getitem_array = Benchmark("s[np.arange(10000)]", setup) - -series_loc_scalar = Benchmark("s.loc[800000]", setup) -series_loc_slice = Benchmark("s.loc[:800000]", setup) -series_loc_list_like = Benchmark("s.loc[[800000]]", setup) -series_loc_array = Benchmark("s.loc[np.arange(10000)]", setup) - -series_iloc_scalar = Benchmark("s.iloc[800000]", setup) -series_iloc_slice = Benchmark("s.iloc[:800000]", setup) -series_iloc_list_like = Benchmark("s.iloc[[800000]]", setup) -series_iloc_array = Benchmark("s.iloc[np.arange(10000)]", setup) - -series_ix_scalar = Benchmark("s.ix[800000]", setup) -series_ix_slice = Benchmark("s.ix[:800000]", setup) -series_ix_list_like = Benchmark("s.ix[[800000]]", setup) -series_ix_array = Benchmark("s.ix[np.arange(10000)]", setup) - - -# multi-index slicing -setup = common_setup + """ -np.random.seed(1234) -idx=pd.IndexSlice -n=100000 -mdt = pandas.DataFrame() -mdt['A'] = np.random.choice(range(10000,45000,1000), n) -mdt['B'] = np.random.choice(range(10,400), n) -mdt['C'] = np.random.choice(range(1,150), n) -mdt['D'] = np.random.choice(range(10000,45000), n) -mdt['x'] = np.random.choice(range(400), n) -mdt['y'] = np.random.choice(range(25), n) - - -test_A = 25000 -test_B = 25 -test_C = 40 -test_D = 35000 - -eps_A = 5000 -eps_B = 5 -eps_C = 5 -eps_D = 5000 -mdt2 = mdt.set_index(['A','B','C','D']).sortlevel() -""" - -multiindex_slicers = Benchmark('mdt2.loc[idx[test_A-eps_A:test_A+eps_A,test_B-eps_B:test_B+eps_B,test_C-eps_C:test_C+eps_C,test_D-eps_D:test_D+eps_D],:]', 
setup, - start_date=datetime(2015, 1, 1)) - -#---------------------------------------------------------------------- -# take - -setup = common_setup + """ -s = Series(np.random.rand(100000)) -ts = Series(np.random.rand(100000), - index=date_range('2011-01-01', freq='S', periods=100000)) -indexer = [True, False, True, True, False] * 20000 -""" - -series_take_intindex = Benchmark("s.take(indexer)", setup) -series_take_dtindex = Benchmark("ts.take(indexer)", setup) diff --git a/vb_suite/inference.py b/vb_suite/inference.py deleted file mode 100644 index aaa51aa5163ce..0000000000000 --- a/vb_suite/inference.py +++ /dev/null @@ -1,36 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime -import sys - -# from GH 7332 - -setup = """from .pandas_vb_common import * -import pandas as pd -N = 500000 -df_int64 = DataFrame(dict(A = np.arange(N,dtype='int64'), B = np.arange(N,dtype='int64'))) -df_int32 = DataFrame(dict(A = np.arange(N,dtype='int32'), B = np.arange(N,dtype='int32'))) -df_uint32 = DataFrame(dict(A = np.arange(N,dtype='uint32'), B = np.arange(N,dtype='uint32'))) -df_float64 = DataFrame(dict(A = np.arange(N,dtype='float64'), B = np.arange(N,dtype='float64'))) -df_float32 = DataFrame(dict(A = np.arange(N,dtype='float32'), B = np.arange(N,dtype='float32'))) -df_datetime64 = DataFrame(dict(A = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms'), - B = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms'))) -df_timedelta64 = DataFrame(dict(A = df_datetime64['A']-df_datetime64['B'], - B = df_datetime64['B'])) -""" - -dtype_infer_int64 = Benchmark('df_int64["A"] + df_int64["B"]', setup, - start_date=datetime(2014, 1, 1)) -dtype_infer_int32 = Benchmark('df_int32["A"] + df_int32["B"]', setup, - start_date=datetime(2014, 1, 1)) -dtype_infer_uint32 = Benchmark('df_uint32["A"] + df_uint32["B"]', setup, - start_date=datetime(2014, 1, 1)) -dtype_infer_float64 = Benchmark('df_float64["A"] + df_float64["B"]', setup, - start_date=datetime(2014, 1, 1)) -dtype_infer_float32 = Benchmark('df_float32["A"] + df_float32["B"]', setup, - start_date=datetime(2014, 1, 1)) -dtype_infer_datetime64 = Benchmark('df_datetime64["A"] - df_datetime64["B"]', setup, - start_date=datetime(2014, 1, 1)) -dtype_infer_timedelta64_1 = Benchmark('df_timedelta64["A"] + df_timedelta64["B"]', setup, - start_date=datetime(2014, 1, 1)) -dtype_infer_timedelta64_2 = Benchmark('df_timedelta64["A"] + df_timedelta64["A"]', setup, - start_date=datetime(2014, 1, 1)) diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py deleted file mode 100644 index af5f6076515cc..0000000000000 --- a/vb_suite/io_bench.py +++ /dev/null @@ -1,150 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -from io import StringIO -""" - -#---------------------------------------------------------------------- -# read_csv - -setup1 = common_setup + """ -index = tm.makeStringIndex(10000) -df = DataFrame({'float1' : randn(10000), - 'float2' : randn(10000), - 'string1' : ['foo'] * 10000, - 'bool1' : [True] * 10000, - 'int1' : np.random.randint(0, 100000, size=10000)}, - index=index) -df.to_csv('__test__.csv') -""" - -read_csv_standard = Benchmark("read_csv('__test__.csv')", setup1, - start_date=datetime(2011, 9, 15)) - -#---------------------------------- -# skiprows - -setup1 = common_setup + """ -index = tm.makeStringIndex(20000) -df = DataFrame({'float1' : randn(20000), - 'float2' : randn(20000), - 'string1' : ['foo'] * 20000, - 'bool1' : [True] * 20000, - 'int1' : 
np.random.randint(0, 200000, size=20000)}, - index=index) -df.to_csv('__test__.csv') -""" - -read_csv_skiprows = Benchmark("read_csv('__test__.csv', skiprows=10000)", setup1, - start_date=datetime(2011, 9, 15)) - -#---------------------------------------------------------------------- -# write_csv - -setup2 = common_setup + """ -index = tm.makeStringIndex(10000) -df = DataFrame({'float1' : randn(10000), - 'float2' : randn(10000), - 'string1' : ['foo'] * 10000, - 'bool1' : [True] * 10000, - 'int1' : np.random.randint(0, 100000, size=10000)}, - index=index) -""" - -write_csv_standard = Benchmark("df.to_csv('__test__.csv')", setup2, - start_date=datetime(2011, 9, 15)) - -#---------------------------------- -setup = common_setup + """ -df = DataFrame(np.random.randn(3000, 30)) -""" -frame_to_csv = Benchmark("df.to_csv('__test__.csv')", setup, - start_date=datetime(2011, 1, 1)) -#---------------------------------- - -setup = common_setup + """ -df=DataFrame({'A':range(50000)}) -df['B'] = df.A + 1.0 -df['C'] = df.A + 2.0 -df['D'] = df.A + 3.0 -""" -frame_to_csv2 = Benchmark("df.to_csv('__test__.csv')", setup, - start_date=datetime(2011, 1, 1)) - -#---------------------------------- -setup = common_setup + """ -from pandas import concat, Timestamp - -def create_cols(name): - return [ "%s%03d" % (name,i) for i in range(5) ] -df_float = DataFrame(np.random.randn(5000, 5),dtype='float64',columns=create_cols('float')) -df_int = DataFrame(np.random.randn(5000, 5),dtype='int64',columns=create_cols('int')) -df_bool = DataFrame(True,index=df_float.index,columns=create_cols('bool')) -df_object = DataFrame('foo',index=df_float.index,columns=create_cols('object')) -df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=create_cols('date')) - -# add in some nans -df_float.ix[30:500,1:3] = np.nan - -df = concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) - -""" -frame_to_csv_mixed = Benchmark("df.to_csv('__test__.csv')", setup, - start_date=datetime(2012, 6, 1)) - -#---------------------------------------------------------------------- -# parse dates, ISO8601 format - -setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000) -data = '\\n'.join(rng.map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))) -""" - -stmt = ("read_csv(StringIO(data), header=None, names=['foo'], " - " parse_dates=['foo'])") -read_parse_dates_iso8601 = Benchmark(stmt, setup, - start_date=datetime(2012, 3, 1)) - -setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000) -data = DataFrame(rng, index=rng) -""" - -stmt = ("data.to_csv('__test__.csv', date_format='%Y%m%d')") - -frame_to_csv_date_formatting = Benchmark(stmt, setup, - start_date=datetime(2013, 9, 1)) - -#---------------------------------------------------------------------- -# infer datetime format - -setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000) -data = '\\n'.join(rng.map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))) -""" - -stmt = ("read_csv(StringIO(data), header=None, names=['foo'], " - " parse_dates=['foo'], infer_datetime_format=True)") - -read_csv_infer_datetime_format_iso8601 = Benchmark(stmt, setup) - -setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000) -data = '\\n'.join(rng.map(lambda x: x.strftime("%Y%m%d"))) -""" - -stmt = ("read_csv(StringIO(data), header=None, names=['foo'], " - " parse_dates=['foo'], infer_datetime_format=True)") - -read_csv_infer_datetime_format_ymd = Benchmark(stmt, setup) - -setup = common_setup + """ -rng = date_range('1/1/2000', periods=1000) -data = 
'\\n'.join(rng.map(lambda x: x.strftime("%m/%d/%Y %H:%M:%S.%f"))) -""" - -stmt = ("read_csv(StringIO(data), header=None, names=['foo'], " - " parse_dates=['foo'], infer_datetime_format=True)") - -read_csv_infer_datetime_format_custom = Benchmark(stmt, setup) diff --git a/vb_suite/io_sql.py b/vb_suite/io_sql.py deleted file mode 100644 index ba8367e7e356b..0000000000000 --- a/vb_suite/io_sql.py +++ /dev/null @@ -1,126 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -import sqlite3 -import sqlalchemy -from sqlalchemy import create_engine - -engine = create_engine('sqlite:///:memory:') -con = sqlite3.connect(':memory:') -""" - -sdate = datetime(2014, 6, 1) - - -#------------------------------------------------------------------------------- -# to_sql - -setup = common_setup + """ -index = tm.makeStringIndex(10000) -df = DataFrame({'float1' : randn(10000), - 'float2' : randn(10000), - 'string1' : ['foo'] * 10000, - 'bool1' : [True] * 10000, - 'int1' : np.random.randint(0, 100000, size=10000)}, - index=index) -""" - -sql_write_sqlalchemy = Benchmark("df.to_sql('test1', engine, if_exists='replace')", - setup, start_date=sdate) - -sql_write_fallback = Benchmark("df.to_sql('test1', con, if_exists='replace')", - setup, start_date=sdate) - - -#------------------------------------------------------------------------------- -# read_sql - -setup = common_setup + """ -index = tm.makeStringIndex(10000) -df = DataFrame({'float1' : randn(10000), - 'float2' : randn(10000), - 'string1' : ['foo'] * 10000, - 'bool1' : [True] * 10000, - 'int1' : np.random.randint(0, 100000, size=10000)}, - index=index) -df.to_sql('test2', engine, if_exists='replace') -df.to_sql('test2', con, if_exists='replace') -""" - -sql_read_query_sqlalchemy = Benchmark("read_sql_query('SELECT * FROM test2', engine)", - setup, start_date=sdate) - -sql_read_query_fallback = Benchmark("read_sql_query('SELECT * FROM test2', con)", - setup, start_date=sdate) - -sql_read_table_sqlalchemy = Benchmark("read_sql_table('test2', engine)", - setup, start_date=sdate) - - -#------------------------------------------------------------------------------- -# type specific write - -setup = common_setup + """ -df = DataFrame({'float' : randn(10000), - 'string' : ['foo'] * 10000, - 'bool' : [True] * 10000, - 'datetime' : date_range('2000-01-01', periods=10000, freq='s')}) -df.loc[1000:3000, 'float'] = np.nan -""" - -sql_float_write_sqlalchemy = \ - Benchmark("df[['float']].to_sql('test_float', engine, if_exists='replace')", - setup, start_date=sdate) - -sql_float_write_fallback = \ - Benchmark("df[['float']].to_sql('test_float', con, if_exists='replace')", - setup, start_date=sdate) - -sql_string_write_sqlalchemy = \ - Benchmark("df[['string']].to_sql('test_string', engine, if_exists='replace')", - setup, start_date=sdate) - -sql_string_write_fallback = \ - Benchmark("df[['string']].to_sql('test_string', con, if_exists='replace')", - setup, start_date=sdate) - -sql_datetime_write_sqlalchemy = \ - Benchmark("df[['datetime']].to_sql('test_datetime', engine, if_exists='replace')", - setup, start_date=sdate) - -#sql_datetime_write_fallback = \ -# Benchmark("df[['datetime']].to_sql('test_datetime', con, if_exists='replace')", -# setup3, start_date=sdate) - -#------------------------------------------------------------------------------- -# type specific read - -setup = common_setup + """ -df = DataFrame({'float' : randn(10000), - 'datetime' : date_range('2000-01-01', periods=10000, 
freq='s')}) -df['datetime_string'] = df['datetime'].map(str) - -df.to_sql('test_type', engine, if_exists='replace') -df[['float', 'datetime_string']].to_sql('test_type', con, if_exists='replace') -""" - -sql_float_read_query_sqlalchemy = \ - Benchmark("read_sql_query('SELECT float FROM test_type', engine)", - setup, start_date=sdate) - -sql_float_read_table_sqlalchemy = \ - Benchmark("read_sql_table('test_type', engine, columns=['float'])", - setup, start_date=sdate) - -sql_float_read_query_fallback = \ - Benchmark("read_sql_query('SELECT float FROM test_type', con)", - setup, start_date=sdate) - -sql_datetime_read_as_native_sqlalchemy = \ - Benchmark("read_sql_table('test_type', engine, columns=['datetime'])", - setup, start_date=sdate) - -sql_datetime_read_and_parse_sqlalchemy = \ - Benchmark("read_sql_table('test_type', engine, columns=['datetime_string'], parse_dates=['datetime_string'])", - setup, start_date=sdate) diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py deleted file mode 100644 index 238a129552e90..0000000000000 --- a/vb_suite/join_merge.py +++ /dev/null @@ -1,270 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -setup = common_setup + """ -level1 = tm.makeStringIndex(10).values -level2 = tm.makeStringIndex(1000).values -label1 = np.arange(10).repeat(1000) -label2 = np.tile(np.arange(1000), 10) - -key1 = np.tile(level1.take(label1), 10) -key2 = np.tile(level2.take(label2), 10) - -shuf = np.arange(100000) -random.shuffle(shuf) -try: - index2 = MultiIndex(levels=[level1, level2], labels=[label1, label2]) - index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], - labels=[np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)]) - df_multi = DataFrame(np.random.randn(len(index2), 4), index=index2, - columns=['A', 'B', 'C', 'D']) -except: # pre-MultiIndex - pass - -try: - DataFrame = DataMatrix -except: - pass - -df = pd.DataFrame({'data1' : np.random.randn(100000), - 'data2' : np.random.randn(100000), - 'key1' : key1, - 'key2' : key2}) - - -df_key1 = pd.DataFrame(np.random.randn(len(level1), 4), index=level1, - columns=['A', 'B', 'C', 'D']) -df_key2 = pd.DataFrame(np.random.randn(len(level2), 4), index=level2, - columns=['A', 'B', 'C', 'D']) - -df_shuf = df.reindex(df.index[shuf]) -""" - -#---------------------------------------------------------------------- -# DataFrame joins on key - -join_dataframe_index_single_key_small = \ - Benchmark("df.join(df_key1, on='key1')", setup, - name='join_dataframe_index_single_key_small') - -join_dataframe_index_single_key_bigger = \ - Benchmark("df.join(df_key2, on='key2')", setup, - name='join_dataframe_index_single_key_bigger') - -join_dataframe_index_single_key_bigger_sort = \ - Benchmark("df_shuf.join(df_key2, on='key2', sort=True)", setup, - name='join_dataframe_index_single_key_bigger_sort', - start_date=datetime(2012, 2, 5)) - -join_dataframe_index_multi = \ - Benchmark("df.join(df_multi, on=['key1', 'key2'])", setup, - name='join_dataframe_index_multi', - start_date=datetime(2011, 10, 20)) - -#---------------------------------------------------------------------- -# Joins on integer keys -setup = common_setup + """ -df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), - 'key2': np.tile(np.arange(250).repeat(10), 4), - 'value': np.random.randn(10000)}) -df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500)}) -df3 = df[:5000] -""" - 
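
This setup feeds the two integer-key merge benchmarks that follow. For orientation, a toy version of the timed operation (same column names, much smaller frames; a sketch, not part of the suite):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key1': [0, 1, 1, 2], 'value': np.random.randn(4)})
    df2 = pd.DataFrame({'key1': [1, 2, 3], 'value2': np.random.randn(3)})
    # inner join on 'key1'; rows with key1 == 0 or key1 == 3 drop out
    print(pd.merge(df, df2, on='key1'))
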
- -join_dataframe_integer_key = Benchmark("merge(df, df2, on='key1')", setup, - start_date=datetime(2011, 10, 20)) -join_dataframe_integer_2key = Benchmark("merge(df, df3)", setup, - start_date=datetime(2011, 10, 20)) - -#---------------------------------------------------------------------- -# DataFrame joins on index - - -#---------------------------------------------------------------------- -# Merges -setup = common_setup + """ -N = 10000 - -indices = tm.makeStringIndex(N).values -indices2 = tm.makeStringIndex(N).values -key = np.tile(indices[:8000], 10) -key2 = np.tile(indices2[:8000], 10) - -left = pd.DataFrame({'key' : key, 'key2':key2, - 'value' : np.random.randn(80000)}) -right = pd.DataFrame({'key': indices[2000:], 'key2':indices2[2000:], - 'value2' : np.random.randn(8000)}) -""" - -merge_2intkey_nosort = Benchmark('merge(left, right, sort=False)', setup, - start_date=datetime(2011, 10, 20)) - -merge_2intkey_sort = Benchmark('merge(left, right, sort=True)', setup, - start_date=datetime(2011, 10, 20)) - -#---------------------------------------------------------------------- -# Appending DataFrames - -setup = common_setup + """ -df1 = pd.DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) -df2 = df1.copy() -df2.index = np.arange(10000, 20000) -mdf1 = df1.copy() -mdf1['obj1'] = 'bar' -mdf1['obj2'] = 'bar' -mdf1['int1'] = 5 -try: - mdf1.consolidate(inplace=True) -except: - pass -mdf2 = mdf1.copy() -mdf2.index = df2.index -""" - -stmt = "df1.append(df2)" -append_frame_single_homogenous = \ - Benchmark(stmt, setup, name='append_frame_single_homogenous', - ncalls=500, repeat=1) - -stmt = "mdf1.append(mdf2)" -append_frame_single_mixed = Benchmark(stmt, setup, - name='append_frame_single_mixed', - ncalls=500, repeat=1) - -#---------------------------------------------------------------------- -# data alignment - -setup = common_setup + """n = 1000000 -# indices = tm.makeStringIndex(n) -def sample(values, k): - sampler = np.random.permutation(len(values)) - return values.take(sampler[:k]) -sz = 500000 -rng = np.arange(0, 10000000000000, 10000000) -stamps = np.datetime64(datetime.now()).view('i8') + rng -idx1 = np.sort(sample(stamps, sz)) -idx2 = np.sort(sample(stamps, sz)) -ts1 = Series(np.random.randn(sz), idx1) -ts2 = Series(np.random.randn(sz), idx2) -""" -stmt = "ts1 + ts2" -series_align_int64_index = \ - Benchmark(stmt, setup, - name="series_align_int64_index", - start_date=datetime(2010, 6, 1), logy=True) - -stmt = "ts1.align(ts2, join='left')" -series_align_left_monotonic = \ - Benchmark(stmt, setup, - name="series_align_left_monotonic", - start_date=datetime(2011, 12, 1), logy=True) - -#---------------------------------------------------------------------- -# Concat Series axis=1 - -setup = common_setup + """ -n = 1000 -indices = tm.makeStringIndex(1000) -s = Series(n, index=indices) -pieces = [s[i:-i] for i in range(1, 10)] -pieces = pieces * 50 -""" - -concat_series_axis1 = Benchmark('concat(pieces, axis=1)', setup, - start_date=datetime(2012, 2, 27)) - -setup = common_setup + """ -df = pd.DataFrame(randn(5, 4)) -""" - -concat_small_frames = Benchmark('concat([df] * 1000)', setup, - start_date=datetime(2012, 1, 1)) - - -#---------------------------------------------------------------------- -# Concat empty - -setup = common_setup + """ -df = pd.DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) -empty = pd.DataFrame() -""" - -concat_empty_frames1 = Benchmark('concat([df,empty])', setup, - start_date=datetime(2012, 1, 1)) 
-concat_empty_frames2 = Benchmark('concat([empty,df])', setup, - start_date=datetime(2012, 1, 1)) - - -#---------------------------------------------------------------------- -# Ordered merge - -setup = common_setup + """ -groups = tm.makeStringIndex(10).values - -left = pd.DataFrame({'group': groups.repeat(5000), - 'key' : np.tile(np.arange(0, 10000, 2), 10), - 'lvalue': np.random.randn(50000)}) - -right = pd.DataFrame({'key' : np.arange(10000), - 'rvalue' : np.random.randn(10000)}) - -""" - -stmt = "ordered_merge(left, right, on='key', left_by='group')" - -#---------------------------------------------------------------------- -# outer join of non-unique -# GH 6329 - -setup = common_setup + """ -date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') -daily_dates = date_index.to_period('D').to_timestamp('S','S') -fracofday = date_index.view(np.ndarray) - daily_dates.view(np.ndarray) -fracofday = fracofday.astype('timedelta64[ns]').astype(np.float64)/864e11 -fracofday = TimeSeries(fracofday, daily_dates) -index = date_range(date_index.min().to_period('A').to_timestamp('D','S'), - date_index.max().to_period('A').to_timestamp('D','E'), - freq='D') -temp = TimeSeries(1.0, index) -""" - -join_non_unique_equal = Benchmark('fracofday * temp[fracofday.index]', setup, - start_date=datetime(2013, 1, 1)) - - -setup = common_setup + ''' -np.random.seed(2718281) -n = 50000 - -left = pd.DataFrame(np.random.randint(1, n/500, (n, 2)), - columns=['jim', 'joe']) - -right = pd.DataFrame(np.random.randint(1, n/500, (n, 2)), - columns=['jolie', 'jolia']).set_index('jolie') -''' - -left_outer_join_index = Benchmark("left.join(right, on='jim')", setup, - name='left_outer_join_index') - - -setup = common_setup + """ -low, high, n = -1 << 10, 1 << 10, 1 << 20 -left = pd.DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) -left['left'] = left.sum(axis=1) - -i = np.random.permutation(len(left)) -right = left.iloc[i].copy() -right.columns = right.columns[:-1].tolist() + ['right'] -right.index = np.arange(len(right)) -right['right'] *= -1 -""" - -i8merge = Benchmark("merge(left, right, how='outer')", setup, - name='i8merge') diff --git a/vb_suite/make.py b/vb_suite/make.py deleted file mode 100755 index 5a8a8215db9a4..0000000000000 --- a/vb_suite/make.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python - -""" -Python script for building documentation. - -To build the docs you must have all optional dependencies for statsmodels -installed. See the installation instructions for a list of these. - -Note: currently latex builds do not work because of table formats that are not -supported in the latex generation. - -Usage ------ -python make.py clean -python make.py html -""" - -import glob -import os -import shutil -import sys -import sphinx - -os.environ['PYTHONPATH'] = '..' - -SPHINX_BUILD = 'sphinxbuild' - - -def upload(): - 'push a copy to the site' - os.system('cd build/html; rsync -avz . 
pandas@pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/vbench/ -essh') - -def clean(): - if os.path.exists('build'): - shutil.rmtree('build') - - if os.path.exists('source/generated'): - shutil.rmtree('source/generated') - - -def html(): - check_build() - if os.system('sphinx-build -P -b html -d build/doctrees ' - 'source build/html'): - raise SystemExit("Building HTML failed.") - - -def check_build(): - build_dirs = [ - 'build', 'build/doctrees', 'build/html', - 'build/plots', 'build/_static', - 'build/_templates'] - for d in build_dirs: - try: - os.mkdir(d) - except OSError: - pass - - -def all(): - clean() - html() - - -def auto_update(): - msg = '' - try: - clean() - html() - upload() - sendmail() - except (Exception, SystemExit) as inst: - msg += str(inst) + '\n' - sendmail(msg) - - -def sendmail(err_msg=None): - from_name, to_name = _get_config() - - if err_msg is None: - msgstr = 'Daily vbench uploaded successfully' - subject = "VB: daily update successful" - else: - msgstr = err_msg - subject = "VB: daily update failed" - - import smtplib - from email.MIMEText import MIMEText - msg = MIMEText(msgstr) - msg['Subject'] = subject - msg['From'] = from_name - msg['To'] = to_name - - server_str, port, login, pwd = _get_credentials() - server = smtplib.SMTP(server_str, port) - server.ehlo() - server.starttls() - server.ehlo() - - server.login(login, pwd) - try: - server.sendmail(from_name, to_name, msg.as_string()) - finally: - server.close() - - -def _get_dir(subdir=None): - import getpass - USERNAME = getpass.getuser() - if sys.platform == 'darwin': - HOME = '/Users/%s' % USERNAME - else: - HOME = '/home/%s' % USERNAME - - if subdir is None: - subdir = '/code/scripts' - conf_dir = '%s%s' % (HOME, subdir) - return conf_dir - - -def _get_credentials(): - tmp_dir = _get_dir() - cred = '%s/credentials' % tmp_dir - with open(cred, 'r') as fh: - server, port, un, domain = fh.read().split(',') - port = int(port) - login = un + '@' + domain + '.com' - - import base64 - with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: - pwd = base64.b64decode(fh.read()) - - return server, port, login, pwd - - -def _get_config(): - tmp_dir = _get_dir() - with open('%s/addresses' % tmp_dir, 'r') as fh: - from_name, to_name = fh.read().split(',') - return from_name, to_name - -funcd = { - 'html': html, - 'clean': clean, - 'upload': upload, - 'auto_update': auto_update, - 'all': all, -} - -small_docs = False - -# current_dir = os.getcwd() -# os.chdir(os.path.dirname(os.path.join(current_dir, __file__))) - -if len(sys.argv) > 1: - for arg in sys.argv[1:]: - func = funcd.get(arg) - if func is None: - raise SystemExit('Do not know how to handle %s; valid args are %s' % ( - arg, funcd.keys())) - func() -else: - small_docs = False - all() -# os.chdir(current_dir) diff --git a/vb_suite/measure_memory_consumption.py b/vb_suite/measure_memory_consumption.py deleted file mode 100755 index bb73cf5da4302..0000000000000 --- a/vb_suite/measure_memory_consumption.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import print_function - -"""Measure peak memory usage of each vbench benchmark. - -Run every benchmark under memory_profiler and print the peak usage of each. -""" - - -def main(): - import shutil - import tempfile - import warnings - - from pandas import Series - - from vbench.api import BenchmarkRunner - from suite import (REPO_PATH, BUILD, DB_PATH, PREPARE, - dependencies, benchmarks) - - from memory_profiler import memory_usage - - warnings.filterwarnings('ignore', category=FutureWarning) - - try: - TMP_DIR = tempfile.mkdtemp() - runner = 
BenchmarkRunner( - benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH, - TMP_DIR, PREPARE, always_clean=True, - # run_option='eod', start_date=START_DATE, - module_dependencies=dependencies) - results = {} - for b in runner.benchmarks: - k = b.name - try: - vs = memory_usage((b.run,)) - v = max(vs) - # print(k, v) - results[k] = v - except Exception as e: - print("Exception caught in %s\n" % k) - print(str(e)) - - s = Series(results) - s.sort() - print((s)) - - finally: - shutil.rmtree(TMP_DIR) - - -if __name__ == "__main__": - main() diff --git a/vb_suite/miscellaneous.py b/vb_suite/miscellaneous.py deleted file mode 100644 index da2c736e79ea7..0000000000000 --- a/vb_suite/miscellaneous.py +++ /dev/null @@ -1,32 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# cache_readonly - -setup = common_setup + """ -from pandas.util.decorators import cache_readonly - -class Foo: - - @cache_readonly - def prop(self): - return 5 -obj = Foo() -""" -misc_cache_readonly = Benchmark("obj.prop", setup, name="misc_cache_readonly", - ncalls=2000000) - -#---------------------------------------------------------------------- -# match - -setup = common_setup + """ -uniques = tm.makeStringIndex(1000).values -all = uniques.repeat(10) -""" - -match_strings = Benchmark("match(all, uniques)", setup, - start_date=datetime(2012, 5, 12)) diff --git a/vb_suite/packers.py b/vb_suite/packers.py deleted file mode 100644 index 69ec10822b392..0000000000000 --- a/vb_suite/packers.py +++ /dev/null @@ -1,252 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -start_date = datetime(2013, 5, 1) - -common_setup = """from .pandas_vb_common import * -import os -import pandas as pd -from pandas.core import common as com -from pandas.compat import BytesIO -from random import randrange - -f = '__test__.msg' -def remove(f): - try: - os.remove(f) - except: - pass - -N=100000 -C=5 -index = date_range('20000101',periods=N,freq='H') -df = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]), - index=index) - -N=100000 -C=5 -index = date_range('20000101',periods=N,freq='H') -df2 = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]), - index=index) -df2['object'] = ['%08x'%randrange(16**8) for _ in range(N)] -remove(f) -""" - -#---------------------------------------------------------------------- -# msgpack - -setup = common_setup + """ -df2.to_msgpack(f) -""" - -packers_read_pack = Benchmark("pd.read_msgpack(f)", setup, start_date=start_date) - -setup = common_setup + """ -""" - -packers_write_pack = Benchmark("df2.to_msgpack(f)", setup, cleanup="remove(f)", start_date=start_date) - -#---------------------------------------------------------------------- -# pickle - -setup = common_setup + """ -df2.to_pickle(f) -""" - -packers_read_pickle = Benchmark("pd.read_pickle(f)", setup, start_date=start_date) - -setup = common_setup + """ -""" - -packers_write_pickle = Benchmark("df2.to_pickle(f)", setup, cleanup="remove(f)", start_date=start_date) - -#---------------------------------------------------------------------- -# csv - -setup = common_setup + """ -df.to_csv(f) -""" - -packers_read_csv = Benchmark("pd.read_csv(f)", setup, start_date=start_date) - -setup = common_setup + """ -""" - -packers_write_csv = Benchmark("df.to_csv(f)", setup, cleanup="remove(f)", start_date=start_date) - 
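
Every format in this file follows the same shape: the setup string writes the file, the timed statement reads it back (or the reverse), and cleanup="remove(f)" deletes it afterwards. Outside vbench the same round trip can be timed with the standard library; a minimal sketch (the file name and sizes here are illustrative):

    import os
    import timeit

    import numpy as np
    import pandas as pd

    f = '__test__.csv'
    df = pd.DataFrame({'float%d' % i: np.random.randn(100000) for i in range(5)})

    df.to_csv(f)  # plays the role of the vbench setup
    read_time = timeit.timeit(lambda: pd.read_csv(f), number=10)
    write_time = timeit.timeit(lambda: df.to_csv(f), number=10)
    os.remove(f)  # plays the role of cleanup="remove(f)"
    print(read_time, write_time)
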
-#---------------------------------------------------------------------- -# hdf store - -setup = common_setup + """ -df2.to_hdf(f,'df') -""" - -packers_read_hdf_store = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date) - -setup = common_setup + """ -""" - -packers_write_hdf_store = Benchmark("df2.to_hdf(f,'df')", setup, cleanup="remove(f)", start_date=start_date) - -#---------------------------------------------------------------------- -# hdf table - -setup = common_setup + """ -df2.to_hdf(f,'df',format='table') -""" - -packers_read_hdf_table = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date) - -setup = common_setup + """ -""" - -packers_write_hdf_table = Benchmark("df2.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date) - -#---------------------------------------------------------------------- -# sql - -setup = common_setup + """ -import sqlite3 -from sqlalchemy import create_engine -engine = create_engine('sqlite:///:memory:') - -df2.to_sql('table', engine, if_exists='replace') -""" - -packers_read_sql= Benchmark("pd.read_sql_table('table', engine)", setup, start_date=start_date) - -setup = common_setup + """ -import sqlite3 -from sqlalchemy import create_engine -engine = create_engine('sqlite:///:memory:') -""" - -packers_write_sql = Benchmark("df2.to_sql('table', engine, if_exists='replace')", setup, start_date=start_date) - -#---------------------------------------------------------------------- -# json - -setup_int_index = """ -import numpy as np -df.index = np.arange(N) -""" - -setup = common_setup + """ -df.to_json(f,orient='split') -""" -packers_read_json_date_index = Benchmark("pd.read_json(f, orient='split')", setup, start_date=start_date) -setup = setup + setup_int_index -packers_read_json = Benchmark("pd.read_json(f, orient='split')", setup, start_date=start_date) - -setup = common_setup + """ -""" -packers_write_json_date_index = Benchmark("df.to_json(f,orient='split')", setup, cleanup="remove(f)", start_date=start_date) - -setup = setup + setup_int_index -packers_write_json = Benchmark("df.to_json(f,orient='split')", setup, cleanup="remove(f)", start_date=start_date) -packers_write_json_T = Benchmark("df.to_json(f,orient='columns')", setup, cleanup="remove(f)", start_date=start_date) - -setup = common_setup + """ -from numpy.random import randint -from collections import OrderedDict - -cols = [ - lambda i: ("{0}_timedelta".format(i), [pd.Timedelta('%d seconds' % randrange(1e6)) for _ in range(N)]), - lambda i: ("{0}_int".format(i), randint(1e8, size=N)), - lambda i: ("{0}_timestamp".format(i), [pd.Timestamp( 1418842918083256000 + randrange(1e9, 1e18, 200)) for _ in range(N)]) - ] -df_mixed = DataFrame(OrderedDict([cols[i % len(cols)](i) for i in range(C)]), - index=index) -""" -packers_write_json_mixed_delta_int_tstamp = Benchmark("df_mixed.to_json(f,orient='split')", setup, cleanup="remove(f)", start_date=start_date) - -setup = common_setup + """ -from numpy.random import randint -from collections import OrderedDict -cols = [ - lambda i: ("{0}_float".format(i), randn(N)), - lambda i: ("{0}_int".format(i), randint(1e8, size=N)) - ] -df_mixed = DataFrame(OrderedDict([cols[i % len(cols)](i) for i in range(C)]), - index=index) -""" -packers_write_json_mixed_float_int = Benchmark("df_mixed.to_json(f,orient='index')", setup, cleanup="remove(f)", start_date=start_date) -packers_write_json_mixed_float_int_T = Benchmark("df_mixed.to_json(f,orient='columns')", setup, cleanup="remove(f)", start_date=start_date) - -setup = 
common_setup + """ -from numpy.random import randint -from collections import OrderedDict -cols = [ - lambda i: ("{0}_float".format(i), randn(N)), - lambda i: ("{0}_int".format(i), randint(1e8, size=N)), - lambda i: ("{0}_str".format(i), ['%08x'%randrange(16**8) for _ in range(N)]) - ] -df_mixed = DataFrame(OrderedDict([cols[i % len(cols)](i) for i in range(C)]), - index=index) -""" -packers_write_json_mixed_float_int_str = Benchmark("df_mixed.to_json(f,orient='split')", setup, cleanup="remove(f)", start_date=start_date) - -#---------------------------------------------------------------------- -# stata - -setup = common_setup + """ -df.to_stata(f, {'index': 'tc'}) -""" -packers_read_stata = Benchmark("pd.read_stata(f)", setup, start_date=start_date) - -packers_write_stata = Benchmark("df.to_stata(f, {'index': 'tc'})", setup, cleanup="remove(f)", start_date=start_date) - -setup = common_setup + """ -df['int8_'] = [randint(np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27) for _ in range(N)] -df['int16_'] = [randint(np.iinfo(np.int16).min, np.iinfo(np.int16).max - 27) for _ in range(N)] -df['int32_'] = [randint(np.iinfo(np.int32).min, np.iinfo(np.int32).max - 27) for _ in range(N)] -df['float32_'] = np.array(randn(N), dtype=np.float32) -df.to_stata(f, {'index': 'tc'}) -""" - -packers_read_stata_with_validation = Benchmark("pd.read_stata(f)", setup, start_date=start_date) - -packers_write_stata_with_validation = Benchmark("df.to_stata(f, {'index': 'tc'})", setup, cleanup="remove(f)", start_date=start_date) - -#---------------------------------------------------------------------- -# Excel - alternative writers -setup = common_setup + """ -bio = BytesIO() -""" - -excel_writer_bench = """ -bio.seek(0) -writer = pd.io.excel.ExcelWriter(bio, engine='{engine}') -df[:2000].to_excel(writer) -writer.save() -""" - -benchmark_xlsxwriter = excel_writer_bench.format(engine='xlsxwriter') - -packers_write_excel_xlsxwriter = Benchmark(benchmark_xlsxwriter, setup) - -benchmark_openpyxl = excel_writer_bench.format(engine='openpyxl') - -packers_write_excel_openpyxl = Benchmark(benchmark_openpyxl, setup) - -benchmark_xlwt = excel_writer_bench.format(engine='xlwt') - -packers_write_excel_xlwt = Benchmark(benchmark_xlwt, setup) - - -#---------------------------------------------------------------------- -# Excel - reader - -setup = common_setup + """ -bio = BytesIO() -writer = pd.io.excel.ExcelWriter(bio, engine='xlsxwriter') -df[:2000].to_excel(writer) -writer.save() -""" - -benchmark_read_excel=""" -bio.seek(0) -pd.read_excel(bio) -""" - -packers_read_excel = Benchmark(benchmark_read_excel, setup) diff --git a/vb_suite/pandas_vb_common.py b/vb_suite/pandas_vb_common.py deleted file mode 100644 index bd2e8a1c1d504..0000000000000 --- a/vb_suite/pandas_vb_common.py +++ /dev/null @@ -1,30 +0,0 @@ -from pandas import * -import pandas as pd -from datetime import timedelta -from numpy.random import randn -from numpy.random import randint -from numpy.random import permutation -import pandas.util.testing as tm -import random -import numpy as np -try: - from pandas.compat import range -except ImportError: - pass - -np.random.seed(1234) -try: - import pandas._tseries as lib -except: - import pandas._libs.lib as lib - -try: - Panel = WidePanel -except Exception: - pass - -# didn't add to namespace until later -try: - from pandas.core.index import MultiIndex -except ImportError: - pass diff --git a/vb_suite/panel_ctor.py b/vb_suite/panel_ctor.py deleted file mode 100644 index 9f497e7357a61..0000000000000 --- 
a/vb_suite/panel_ctor.py +++ /dev/null @@ -1,76 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# Panel.from_dict homogenization time - -START_DATE = datetime(2011, 6, 1) - -setup_same_index = common_setup + """ -# create 100 dataframes with the same index -dr = np.asarray(DatetimeIndex(start=datetime(1990,1,1), end=datetime(2012,1,1), - freq=datetools.Day(1))) -data_frames = {} -for x in range(100): - df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), - "c": [2]*len(dr)}, index=dr) - data_frames[x] = df -""" - -panel_from_dict_same_index = \ - Benchmark("Panel.from_dict(data_frames)", - setup_same_index, name='panel_from_dict_same_index', - start_date=START_DATE, repeat=1, logy=True) - -setup_equiv_indexes = common_setup + """ -data_frames = {} -for x in range(100): - dr = np.asarray(DatetimeIndex(start=datetime(1990,1,1), end=datetime(2012,1,1), - freq=datetools.Day(1))) - df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), - "c": [2]*len(dr)}, index=dr) - data_frames[x] = df -""" - -panel_from_dict_equiv_indexes = \ - Benchmark("Panel.from_dict(data_frames)", - setup_equiv_indexes, name='panel_from_dict_equiv_indexes', - start_date=START_DATE, repeat=1, logy=True) - -setup_all_different_indexes = common_setup + """ -data_frames = {} -start = datetime(1990,1,1) -end = datetime(2012,1,1) -for x in range(100): - end += timedelta(days=1) - dr = np.asarray(date_range(start, end)) - df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), - "c": [2]*len(dr)}, index=dr) - data_frames[x] = df -""" -panel_from_dict_all_different_indexes = \ - Benchmark("Panel.from_dict(data_frames)", - setup_all_different_indexes, - name='panel_from_dict_all_different_indexes', - start_date=START_DATE, repeat=1, logy=True) - -setup_two_different_indexes = common_setup + """ -data_frames = {} -start = datetime(1990,1,1) -end = datetime(2012,1,1) -for x in range(100): - if x == 50: - end += timedelta(days=1) - dr = np.asarray(date_range(start, end)) - df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), - "c": [2]*len(dr)}, index=dr) - data_frames[x] = df -""" -panel_from_dict_two_different_indexes = \ - Benchmark("Panel.from_dict(data_frames)", - setup_two_different_indexes, - name='panel_from_dict_two_different_indexes', - start_date=START_DATE, repeat=1, logy=True) diff --git a/vb_suite/panel_methods.py b/vb_suite/panel_methods.py deleted file mode 100644 index 28586422a66e3..0000000000000 --- a/vb_suite/panel_methods.py +++ /dev/null @@ -1,28 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# shift - -setup = common_setup + """ -index = date_range(start="2000", freq="D", periods=1000) -panel = Panel(np.random.randn(100, len(index), 1000)) -""" - -panel_shift = Benchmark('panel.shift(1)', setup, - start_date=datetime(2012, 1, 12)) - -panel_shift_minor = Benchmark('panel.shift(1, axis="minor")', setup, - start_date=datetime(2012, 1, 12)) - -panel_pct_change_major = Benchmark('panel.pct_change(1, axis="major")', setup, - start_date=datetime(2014, 4, 19)) - -panel_pct_change_minor = Benchmark('panel.pct_change(1, axis="minor")', setup, - start_date=datetime(2014, 4, 19)) - -panel_pct_change_items = Benchmark('panel.pct_change(1, axis="items")', setup, - start_date=datetime(2014, 4, 19)) diff --git 
a/vb_suite/parser_vb.py b/vb_suite/parser_vb.py deleted file mode 100644 index bb9ccbdb5e854..0000000000000 --- a/vb_suite/parser_vb.py +++ /dev/null @@ -1,112 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -from pandas import read_csv, read_table -""" - -setup = common_setup + """ -import os -N = 10000 -K = 8 -df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))) -df.to_csv('test.csv', sep='|') -""" - -read_csv_vb = Benchmark("read_csv('test.csv', sep='|')", setup, - cleanup="os.remove('test.csv')", - start_date=datetime(2012, 5, 7)) - - -setup = common_setup + """ -import os -N = 10000 -K = 8 -format = lambda x: '{:,}'.format(x) -df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))) -df = df.applymap(format) -df.to_csv('test.csv', sep='|') -""" - -read_csv_thou_vb = Benchmark("read_csv('test.csv', sep='|', thousands=',')", - setup, - cleanup="os.remove('test.csv')", - start_date=datetime(2012, 5, 7)) - -setup = common_setup + """ -data = ['A,B,C'] -data = data + ['1,2,3 # comment'] * 100000 -data = '\\n'.join(data) -""" - -stmt = "read_csv(StringIO(data), comment='#')" -read_csv_comment2 = Benchmark(stmt, setup, - start_date=datetime(2011, 11, 1)) - -setup = common_setup + """ -try: - from cStringIO import StringIO -except ImportError: - from io import StringIO - -import os -N = 10000 -K = 8 -data = '''\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -''' -data = data * 200 -""" -cmd = ("read_table(StringIO(data), sep=',', header=None, " - "parse_dates=[[1,2], [1,3]])") -sdate = datetime(2012, 5, 7) -read_table_multiple_date = Benchmark(cmd, setup, start_date=sdate) - -setup = common_setup + """ -try: - from cStringIO import StringIO -except ImportError: - from io import StringIO - -import os -N = 10000 -K = 8 -data = '''\ -KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -''' -data = data * 200 -""" -cmd = "read_table(StringIO(data), sep=',', header=None, parse_dates=[1])" -sdate = datetime(2012, 5, 7) -read_table_multiple_date_baseline = Benchmark(cmd, setup, start_date=sdate) - -setup = common_setup + """ -try: - from cStringIO import StringIO -except ImportError: - from io import StringIO - -data = '''\ -0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336 -0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285 -0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126 -0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394 -0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020 -''' -data = data * 200 -""" -cmd = "read_csv(StringIO(data), sep=',', header=None, float_precision=None)" -sdate = datetime(2014, 8, 
20) -read_csv_default_converter = Benchmark(cmd, setup, start_date=sdate) -cmd = "read_csv(StringIO(data), sep=',', header=None, float_precision='high')" -read_csv_precise_converter = Benchmark(cmd, setup, start_date=sdate) -cmd = "read_csv(StringIO(data), sep=',', header=None, float_precision='round_trip')" -read_csv_roundtrip_converter = Benchmark(cmd, setup, start_date=sdate) diff --git a/vb_suite/perf_HEAD.py b/vb_suite/perf_HEAD.py deleted file mode 100755 index 143d943b9eadf..0000000000000 --- a/vb_suite/perf_HEAD.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import print_function - -"""Run all the vbenches in `suite`, and post the results as a json blob to gist - -""" - -import urllib2 -from contextlib import closing -from urllib2 import urlopen -import json - -import pandas as pd - -WEB_TIMEOUT = 10 - - -def get_travis_data(): - """figure out what worker we're running on, and the number of jobs it's running - """ - import os - jobid = os.environ.get("TRAVIS_JOB_ID") - if not jobid: - return None, None - - with closing(urlopen("https://api.travis-ci.org/workers/")) as resp: - workers = json.loads(resp.read()) - - host = njobs = None - for item in workers: - host = item.get("host") - id = ((item.get("payload") or {}).get("job") or {}).get("id") - if id and str(id) == str(jobid): - break - if host: - njobs = len( - [x for x in workers if host in x['host'] and x['payload']]) - - return host, njobs - - -def get_utcdatetime(): - try: - from datetime import datetime - return datetime.utcnow().isoformat(" ") - except: - pass - - -def dump_as_gist(data, desc="The Commit", njobs=None): - host, njobs2 = get_travis_data()[:2] - - if njobs: # be slightly more reliable - njobs = max(njobs, njobs2) - - content = dict(version="0.1.1", - timings=data, - datetime=get_utcdatetime(), # added in 0.1.1 - hostname=host, # added in 0.1.1 - njobs=njobs # added in 0.1.1, a measure of load on the travis box - ) - - payload = dict(description=desc, - public=True, - files={'results.json': dict(content=json.dumps(content))}) - try: - with closing(urlopen("https://api.github.com/gists", - json.dumps(payload), timeout=WEB_TIMEOUT)) as r: - if 200 <= r.getcode() < 300: - print("\n\n" + "-" * 80) - - gist = json.loads(r.read()) - file_raw_url = gist['files'].items()[0][1]['raw_url'] - print("[vbench-gist-raw_url] %s" % file_raw_url) - print("[vbench-html-url] %s" % gist['html_url']) - print("[vbench-api-url] %s" % gist['url']) - - print("-" * 80 + "\n\n") - else: - print("api.github.com returned status %d" % r.getcode()) - except: - print("Error occured while dumping to gist") - - -def main(): - import warnings - from suite import benchmarks - - exit_code = 0 - warnings.filterwarnings('ignore', category=FutureWarning) - - host, njobs = get_travis_data()[:2] - results = [] - for b in benchmarks: - try: - d = b.run() - d.update(dict(name=b.name)) - results.append(d) - msg = "{name:<40}: {timing:> 10.4f} [ms]" - print(msg.format(name=results[-1]['name'], - timing=results[-1]['timing'])) - - except Exception as e: - exit_code = 1 - if (type(e) == KeyboardInterrupt or - 'KeyboardInterrupt' in str(d)): - raise KeyboardInterrupt() - - msg = "{name:<40}: ERROR:\n<-------" - print(msg.format(name=b.name)) - if isinstance(d, dict): - if d['succeeded']: - print("\nException:\n%s\n" % str(e)) - else: - for k, v in sorted(d.iteritems()): - print("{k}: {v}".format(k=k, v=v)) - - print("------->\n") - - dump_as_gist(results, "testing", njobs=njobs) - - return exit_code - - 
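
[Editorial aside, not part of the deleted files: the main() loop above times each vbench Benchmark and prints one name/timing line per test. For readers mapping these (statement, setup, cleanup) string triples onto the class-based layout of asv, the framework pandas' benchmark suite subsequently adopted, here is a rough illustrative sketch of the `read_csv_vb` benchmark from parser_vb.py in that style; the class name is made up:]

    # Illustrative sketch only -- not part of the deleted vb_suite files.
    # The `read_csv_vb` benchmark above, restated in asv's class-based style;
    # the class name `ReadCSVPipe` is hypothetical.
    import os
    import numpy as np
    from pandas import DataFrame, read_csv

    class ReadCSVPipe:
        def setup(self):
            # plays the role of the vbench `setup` string
            N, K = 10000, 8
            df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
            df.to_csv('test.csv', sep='|')

        def time_read_csv_pipe(self):
            # the timed statement
            read_csv('test.csv', sep='|')

        def teardown(self):
            # plays the role of the vbench `cleanup` string
            os.remove('test.csv')
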
-if __name__ == "__main__": - import sys - sys.exit(main()) - -##################################################### -# functions for retrieving and processing the results - - -def get_vbench_log(build_url): - with closing(urllib2.urlopen(build_url)) as r: - if not (200 <= r.getcode() < 300): - return - - s = json.loads(r.read()) - s = [x for x in s['matrix'] if "VBENCH" in ((x.get('config', {}) - or {}).get('env', {}) or {})] - # s=[x for x in s['matrix']] - if not s: - return - id = s[0]['id'] # should be just one for now - with closing(urllib2.urlopen("https://api.travis-ci.org/jobs/%s" % id)) as r2: - if not 200 <= r2.getcode() < 300: - return - s2 = json.loads(r2.read()) - return s2.get('log') - - -def get_results_raw_url(build): - "Takes a Travis build number, retrieves the build log and extracts the gist url" - import re - log = get_vbench_log("https://api.travis-ci.org/builds/%s" % build) - if not log: - return - l = [x.strip() for x in log.split("\n") if re.match(".vbench-gist-raw_url", x)] - if l: - s = l[0] - m = re.search("(https://[^\s]+)", s) - if m: - return m.group(0) - - -def convert_json_to_df(results_url): - """retrieve json results file from url and return df - - df contains timings for all successful vbenchmarks - """ - - with closing(urlopen(results_url)) as resp: - res = json.loads(resp.read()) - timings = res.get("timings") - if not timings: - return - res = [x for x in timings if x.get('succeeded')] - df = pd.DataFrame(res) - df = df.set_index("name") - return df - - -def get_build_results(build): - "Returns a df with the results of the VBENCH job associated with the travis build" - r_url = get_results_raw_url(build) - if not r_url: - return - - return convert_json_to_df(r_url) - - -def get_all_results(repo_id=53976): # travis pandas-dev/pandas id - """Fetches the VBENCH results for all travis builds, and returns a list of result df - - unsuccessful individual vbenches are dropped.
- """ - from collections import OrderedDict - - def get_results_from_builds(builds): - dfs = OrderedDict() - for build in builds: - build_id = build['id'] - build_number = build['number'] - print(build_number) - res = get_build_results(build_id) - if res is not None: - dfs[build_number] = res - return dfs - - base_url = 'https://api.travis-ci.org/builds?url=%2Fbuilds&repository_id={repo_id}' - url = base_url.format(repo_id=repo_id) - url_after = url + '&after_number={after}' - dfs = OrderedDict() - - while True: - with closing(urlopen(url)) as r: - if not (200 <= r.getcode() < 300): - break - builds = json.loads(r.read()) - res = get_results_from_builds(builds) - if not res: - break - last_build_number = min(res.keys()) - dfs.update(res) - url = url_after.format(after=last_build_number) - - return dfs - - -def get_all_results_joined(repo_id=53976): - def mk_unique(df): - for dupe in df.index.get_duplicates(): - df = df.ix[df.index != dupe] - return df - dfs = get_all_results(repo_id) - for k in dfs: - dfs[k] = mk_unique(dfs[k]) - ss = [pd.Series(v.timing, name=k) for k, v in dfs.iteritems()] - results = pd.concat(reversed(ss), 1) - return results diff --git a/vb_suite/plotting.py b/vb_suite/plotting.py deleted file mode 100644 index 79e81e9eea8f4..0000000000000 --- a/vb_suite/plotting.py +++ /dev/null @@ -1,25 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * - -try: - from pandas import date_range -except ImportError: - def date_range(start=None, end=None, periods=None, freq=None): - return DatetimeIndex(start, end, periods=periods, offset=freq) - -""" - -#----------------------------------------------------------------------------- -# Timeseries plotting - -setup = common_setup + """ -N = 2000 -M = 5 -df = DataFrame(np.random.randn(N,M), index=date_range('1/1/1975', periods=N)) -""" - -plot_timeseries_period = Benchmark("df.plot()", setup=setup, - name='plot_timeseries_period') - diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py deleted file mode 100644 index 443eb43835745..0000000000000 --- a/vb_suite/reindex.py +++ /dev/null @@ -1,225 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# DataFrame reindex columns - -setup = common_setup + """ -df = DataFrame(index=range(10000), data=np.random.rand(10000,30), - columns=range(30)) -""" -statement = "df.reindex(columns=df.columns[1:5])" - -frame_reindex_columns = Benchmark(statement, setup) - -#---------------------------------------------------------------------- - -setup = common_setup + """ -rng = DatetimeIndex(start='1/1/1970', periods=10000, freq=datetools.Minute()) -df = DataFrame(np.random.rand(10000, 10), index=rng, - columns=range(10)) -df['foo'] = 'bar' -rng2 = Index(rng[::2]) -""" -statement = "df.reindex(rng2)" -dataframe_reindex = Benchmark(statement, setup) - -#---------------------------------------------------------------------- -# multiindex reindexing - -setup = common_setup + """ -N = 1000 -K = 20 - -level1 = tm.makeStringIndex(N).values.repeat(K) -level2 = np.tile(tm.makeStringIndex(K).values, N) -index = MultiIndex.from_arrays([level1, level2]) - -s1 = Series(np.random.randn(N * K), index=index) -s2 = s1[::2] -""" -statement = "s1.reindex(s2.index)" -reindex_multi = Benchmark(statement, setup, - name='reindex_multiindex', - start_date=datetime(2011, 9, 1)) - 
-#---------------------------------------------------------------------- -# Pad / backfill - -def pad(source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - -def backfill(source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - -setup = common_setup + """ -rng = date_range('1/1/2000', periods=100000, freq=datetools.Minute()) - -ts = Series(np.random.randn(len(rng)), index=rng) -ts2 = ts[::2] -ts3 = ts2.reindex(ts.index) -ts4 = ts3.astype('float32') - -def pad(source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') -def backfill(source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') -""" - -statement = "pad(ts2, ts.index)" -reindex_daterange_pad = Benchmark(statement, setup, - name="reindex_daterange_pad") - -statement = "backfill(ts2, ts.index)" -reindex_daterange_backfill = Benchmark(statement, setup, - name="reindex_daterange_backfill") - -reindex_fillna_pad = Benchmark("ts3.fillna(method='pad')", setup, - name="reindex_fillna_pad", - start_date=datetime(2011, 3, 1)) - -reindex_fillna_pad_float32 = Benchmark("ts4.fillna(method='pad')", setup, - name="reindex_fillna_pad_float32", - start_date=datetime(2013, 1, 1)) - -reindex_fillna_backfill = Benchmark("ts3.fillna(method='backfill')", setup, - name="reindex_fillna_backfill", - start_date=datetime(2011, 3, 1)) -reindex_fillna_backfill_float32 = Benchmark("ts4.fillna(method='backfill')", setup, - name="reindex_fillna_backfill_float32", - start_date=datetime(2013, 1, 1)) - -#---------------------------------------------------------------------- -# align on level - -setup = common_setup + """ -index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], - labels=[np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)]) -random.shuffle(index.values) -df = DataFrame(np.random.randn(len(index), 4), index=index) -df_level = DataFrame(np.random.randn(100, 4), index=index.levels[1]) -""" - -reindex_frame_level_align = \ - Benchmark("df.align(df_level, level=1, copy=False)", setup, - name='reindex_frame_level_align', - start_date=datetime(2011, 12, 27)) - -reindex_frame_level_reindex = \ - Benchmark("df_level.reindex(df.index, level=1)", setup, - name='reindex_frame_level_reindex', - start_date=datetime(2011, 12, 27)) - - -#---------------------------------------------------------------------- -# sort_index, drop_duplicates - -# pathological, but realistic -setup = common_setup + """ -N = 10000 -K = 10 - -key1 = tm.makeStringIndex(N).values.repeat(K) -key2 = tm.makeStringIndex(N).values.repeat(K) - -df = DataFrame({'key1' : key1, 'key2' : key2, - 'value' : np.random.randn(N * K)}) -col_array_list = list(df.values.T) -""" -statement = "df.sort_index(by=['key1', 'key2'])" -frame_sort_index_by_columns = Benchmark(statement, setup, - start_date=datetime(2011, 11, 1)) - -# drop_duplicates - -statement = "df.drop_duplicates(['key1', 'key2'])" -frame_drop_duplicates = Benchmark(statement, setup, - start_date=datetime(2011, 11, 15)) - -statement = "df.drop_duplicates(['key1', 'key2'], inplace=True)" -frame_drop_dup_inplace = Benchmark(statement, setup, - 
start_date=datetime(2012, 5, 16)) - -lib_fast_zip = Benchmark('lib.fast_zip(col_array_list)', setup, - name='lib_fast_zip', - start_date=datetime(2012, 1, 1)) - -setup = setup + """ -df.ix[:10000, :] = np.nan -""" -statement2 = "df.drop_duplicates(['key1', 'key2'])" -frame_drop_duplicates_na = Benchmark(statement2, setup, - start_date=datetime(2012, 5, 15)) - -lib_fast_zip_fillna = Benchmark('lib.fast_zip_fillna(col_array_list)', setup, - start_date=datetime(2012, 5, 15)) - -statement2 = "df.drop_duplicates(['key1', 'key2'], inplace=True)" -frame_drop_dup_na_inplace = Benchmark(statement2, setup, - start_date=datetime(2012, 5, 16)) - -setup = common_setup + """ -s = Series(np.random.randint(0, 1000, size=10000)) -s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) -""" - -series_drop_duplicates_int = Benchmark('s.drop_duplicates()', setup, - start_date=datetime(2012, 11, 27)) - -series_drop_duplicates_string = \ - Benchmark('s2.drop_duplicates()', setup, - start_date=datetime(2012, 11, 27)) - -#---------------------------------------------------------------------- -# fillna, many columns - - -setup = common_setup + """ -values = np.random.randn(1000, 1000) -values[::2] = np.nan -df = DataFrame(values) -""" - -frame_fillna_many_columns_pad = Benchmark("df.fillna(method='pad')", - setup, - start_date=datetime(2011, 3, 1)) - -#---------------------------------------------------------------------- -# blog "pandas escaped the zoo" - -setup = common_setup + """ -n = 50000 -indices = tm.makeStringIndex(n) - -def sample(values, k): - from random import shuffle - sampler = np.arange(len(values)) - shuffle(sampler) - return values.take(sampler[:k]) - -subsample_size = 40000 - -x = Series(np.random.randn(50000), indices) -y = Series(np.random.randn(subsample_size), - index=sample(indices, subsample_size)) -""" - -series_align_irregular_string = Benchmark("x + y", setup, - start_date=datetime(2010, 6, 1)) diff --git a/vb_suite/replace.py b/vb_suite/replace.py deleted file mode 100644 index 9326aa5becca9..0000000000000 --- a/vb_suite/replace.py +++ /dev/null @@ -1,36 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -from datetime import timedelta - -N = 1000000 - -try: - rng = date_range('1/1/2000', periods=N, freq='min') -except NameError: - rng = DatetimeIndex('1/1/2000', periods=N, offset=datetools.Minute()) - date_range = DateRange - -ts = Series(np.random.randn(N), index=rng) -""" - -large_dict_setup = """from .pandas_vb_common import * -from pandas.compat import range -n = 10 ** 6 -start_value = 10 ** 5 -to_rep = dict((i, start_value + i) for i in range(n)) -s = Series(np.random.randint(n, size=10 ** 3)) -""" - -replace_fillna = Benchmark('ts.fillna(0., inplace=True)', common_setup, - name='replace_fillna', - start_date=datetime(2012, 4, 4)) -replace_replacena = Benchmark('ts.replace(np.nan, 0., inplace=True)', - common_setup, - name='replace_replacena', - start_date=datetime(2012, 5, 15)) -replace_large_dict = Benchmark('s.replace(to_rep, inplace=True)', - large_dict_setup, - name='replace_large_dict', - start_date=datetime(2014, 4, 6)) diff --git a/vb_suite/reshape.py b/vb_suite/reshape.py deleted file mode 100644 index daab96103f2c5..0000000000000 --- a/vb_suite/reshape.py +++ /dev/null @@ -1,65 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -index = MultiIndex.from_arrays([np.arange(100).repeat(100), - np.roll(np.tile(np.arange(100), 
100), 25)]) -df = DataFrame(np.random.randn(10000, 4), index=index) -""" - -reshape_unstack_simple = Benchmark('df.unstack(1)', common_setup, - start_date=datetime(2011, 10, 1)) - -setup = common_setup + """ -udf = df.unstack(1) -""" - -reshape_stack_simple = Benchmark('udf.stack()', setup, - start_date=datetime(2011, 10, 1)) - -setup = common_setup + """ -def unpivot(frame): - N, K = frame.shape - data = {'value' : frame.values.ravel('F'), - 'variable' : np.asarray(frame.columns).repeat(N), - 'date' : np.tile(np.asarray(frame.index), K)} - return DataFrame(data, columns=['date', 'variable', 'value']) -index = date_range('1/1/2000', periods=10000, freq='h') -df = DataFrame(randn(10000, 50), index=index, columns=range(50)) -pdf = unpivot(df) -f = lambda: pdf.pivot('date', 'variable', 'value') -""" - -reshape_pivot_time_series = Benchmark('f()', setup, - start_date=datetime(2012, 5, 1)) - -# Sparse key space, re: #2278 - -setup = common_setup + """ -NUM_ROWS = 1000 -for iter in range(10): - df = DataFrame({'A' : np.random.randint(50, size=NUM_ROWS), - 'B' : np.random.randint(50, size=NUM_ROWS), - 'C' : np.random.randint(-10,10, size=NUM_ROWS), - 'D' : np.random.randint(-10,10, size=NUM_ROWS), - 'E' : np.random.randint(10, size=NUM_ROWS), - 'F' : np.random.randn(NUM_ROWS)}) - idf = df.set_index(['A', 'B', 'C', 'D', 'E']) - if len(idf.index.unique()) == NUM_ROWS: - break -""" - -unstack_sparse_keyspace = Benchmark('idf.unstack()', setup, - start_date=datetime(2011, 10, 1)) - -# Melt - -setup = common_setup + """ -from pandas.core.reshape import melt -df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C']) -df['id1'] = np.random.randint(0, 10, 10000) -df['id2'] = np.random.randint(100, 1000, 10000) -""" - -melt_dataframe = Benchmark("melt(df, id_vars=['id1', 'id2'])", setup, - start_date=datetime(2012, 8, 1)) diff --git a/vb_suite/run_suite.py b/vb_suite/run_suite.py deleted file mode 100755 index 43bf24faae43a..0000000000000 --- a/vb_suite/run_suite.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python -from vbench.api import BenchmarkRunner -from suite import * - - -def run_process(): - runner = BenchmarkRunner(benchmarks, REPO_PATH, REPO_URL, - BUILD, DB_PATH, TMP_DIR, PREPARE, - always_clean=True, - run_option='eod', start_date=START_DATE, - module_dependencies=dependencies) - runner.run() - -if __name__ == '__main__': - run_process() diff --git a/vb_suite/series_methods.py b/vb_suite/series_methods.py deleted file mode 100644 index c545f419c2dec..0000000000000 --- a/vb_suite/series_methods.py +++ /dev/null @@ -1,39 +0,0 @@ -from vbench.api import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -setup = common_setup + """ -s1 = Series(np.random.randn(10000)) -s2 = Series(np.random.randint(1, 10, 10000)) -s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') -values = [1,2] -s4 = s3.astype('object') -""" - -series_nlargest1 = Benchmark("s1.nlargest(3, keep='last');" - "s1.nlargest(3, keep='first')", - setup, - start_date=datetime(2014, 1, 25)) -series_nlargest2 = Benchmark("s2.nlargest(3, keep='last');" - "s2.nlargest(3, keep='first')", - setup, - start_date=datetime(2014, 1, 25)) - -series_nsmallest2 = Benchmark("s1.nsmallest(3, keep='last');" - "s1.nsmallest(3, keep='first')", - setup, - start_date=datetime(2014, 1, 25)) - -series_nsmallest2 = Benchmark("s2.nsmallest(3, keep='last');" - "s2.nsmallest(3, keep='first')", - setup, - start_date=datetime(2014, 1, 25)) - -series_isin_int64 = Benchmark('s3.isin(values)', - 
setup, - start_date=datetime(2014, 1, 25)) -series_isin_object = Benchmark('s4.isin(values)', - setup, - start_date=datetime(2014, 1, 25)) diff --git a/vb_suite/source/_static/stub b/vb_suite/source/_static/stub deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vb_suite/source/conf.py b/vb_suite/source/conf.py deleted file mode 100644 index d83448fd97d09..0000000000000 --- a/vb_suite/source/conf.py +++ /dev/null @@ -1,225 +0,0 @@ -# -*- coding: utf-8 -*- -# -# pandas documentation build configuration file, created by -# -# This file is execfile()d with the current directory set to its containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# sys.path.append(os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../sphinxext')) - -sys.path.extend([ - - # numpy standard doc extensions - os.path.join(os.path.dirname(__file__), - '..', '../..', - 'sphinxext') - -]) - -# -- General configuration ----------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. sphinxext. - -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.doctest'] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates', '_templates/autosummary'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -# source_encoding = 'utf-8' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'pandas' -copyright = u'2008-2011, the pandas development team' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -import pandas - -# version = '%s r%s' % (pandas.__version__, svn_version()) -version = '%s' % (pandas.__version__) - -# The full version, including alpha/beta/rc tags. -release = version - -# JP: added from sphinxdocs -autosummary_generate = True - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' - -# List of documents that shouldn't be included in the build. -# unused_docs = [] - -# List of directories, relative to source directory, that shouldn't be searched -# for source files. -exclude_trees = [] - -# The reST default role (used for this markup: `text`) to use for all documents. -# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. 
They are ignored by default. -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - - -# -- Options for HTML output --------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'agogo' - -# The style sheet to use for HTML and HTML Help pages. A file of that name -# must exist either in Sphinx' static/ path, or in one of the custom paths -# given in html_static_path. -# html_style = 'statsmodels.css' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ['themes'] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -html_title = 'Vbench performance benchmarks for pandas' - -# A shorter title for the navigation bar. Default is the same as html_title. -# html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -# html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -# html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -# html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -html_use_modindex = True - -# If false, no index is generated. -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = '' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'performance' - - -# -- Options for LaTeX output -------------------------------------------- - -# The paper size ('letter' or 'a4'). -# latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -# latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). 
-latex_documents = [ - ('index', 'performance.tex', - u'pandas vbench Performance Benchmarks', - u'Wes McKinney', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# Additional stuff for the LaTeX preamble. -# latex_preamble = '' - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. -# latex_use_modindex = True - - -# Example configuration for intersphinx: refer to the Python standard library. -# intersphinx_mapping = {'http://docs.scipy.org/': None} -import glob -autosummary_generate = glob.glob("*.rst") diff --git a/vb_suite/source/themes/agogo/layout.html b/vb_suite/source/themes/agogo/layout.html deleted file mode 100644 index cd0f3d7ffc9c7..0000000000000 --- a/vb_suite/source/themes/agogo/layout.html +++ /dev/null @@ -1,95 +0,0 @@ -{# - agogo/layout.html - ~~~~~~~~~~~~~~~~~ - - Sphinx layout template for the agogo theme, originally written - by Andi Albrecht. - - :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. - :license: BSD, see LICENSE for details. -#} -{% extends "basic/layout.html" %} - -{% block header %} -
    <div class="header-wrapper">
-      <div class="header">
-        {%- if logo %}
-        <p class="logo"><a href="{{ pathto(master_doc) }}">
-          <img class="logo" src="{{ pathto('_static/' + logo, 1) }}" alt="Logo"/></a></p>
-        {%- endif %}
-        {%- block headertitle %}
-        <h1><a href="{{ pathto(master_doc) }}">{{ shorttitle|e }}</a></h1>
-        {%- endblock %}
-        <div class="rel">
-          {%- for rellink in rellinks|reverse %}
-          <a href="{{ pathto(rellink[0]) }}" title="{{ rellink[1]|striptags|e }}" {{ accesskey(rellink[2]) }}>{{ rellink[3] }}</a>
-          {%- if not loop.last %}{{ reldelim2 }}{% endif %}
-          {%- endfor %}
-        </div>
-      </div>
-    </div>
-{% endblock %}
-
-{% block content %}
-    <div class="content-wrapper">
-      <div class="content">
-        <div class="document">
-          {%- block document %}
-            {{ super() }}
-          {%- endblock %}
-        </div>
-        <div class="clearer"></div>
-      </div>
-    </div>
-{% endblock %} - -{% block footer %} - -{% endblock %} - -{% block relbar1 %}{% endblock %} -{% block relbar2 %}{% endblock %} diff --git a/vb_suite/source/themes/agogo/static/agogo.css_t b/vb_suite/source/themes/agogo/static/agogo.css_t deleted file mode 100644 index ef909b72e20f6..0000000000000 --- a/vb_suite/source/themes/agogo/static/agogo.css_t +++ /dev/null @@ -1,476 +0,0 @@ -/* - * agogo.css_t - * ~~~~~~~~~~~ - * - * Sphinx stylesheet -- agogo theme. - * - * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * - */ - -* { - margin: 0px; - padding: 0px; -} - -body { - font-family: {{ theme_bodyfont }}; - line-height: 1.4em; - color: black; - background-color: {{ theme_bgcolor }}; -} - - -/* Page layout */ - -div.header, div.content, div.footer { - max-width: {{ theme_pagewidth }}; - margin-left: auto; - margin-right: auto; -} - -div.header-wrapper { - background: {{ theme_headerbg }}; - padding: 1em 1em 0; - border-bottom: 3px solid #2e3436; - min-height: 0px; -} - - -/* Default body styles */ -a { - color: {{ theme_linkcolor }}; -} - -div.bodywrapper a, div.footer a { - text-decoration: underline; -} - -.clearer { - clear: both; -} - -.left { - float: left; -} - -.right { - float: right; -} - -.line-block { - display: block; - margin-top: 1em; - margin-bottom: 1em; -} - -.line-block .line-block { - margin-top: 0; - margin-bottom: 0; - margin-left: 1.5em; -} - -h1, h2, h3, h4 { - font-family: {{ theme_headerfont }}; - font-weight: normal; - color: {{ theme_headercolor2 }}; - margin-bottom: .8em; -} - -h1 { - color: {{ theme_headercolor1 }}; -} - -h2 { - padding-bottom: .5em; - border-bottom: 1px solid {{ theme_headercolor2 }}; -} - -a.headerlink { - visibility: hidden; - color: #dddddd; - padding-left: .3em; -} - -h1:hover > a.headerlink, -h2:hover > a.headerlink, -h3:hover > a.headerlink, -h4:hover > a.headerlink, -h5:hover > a.headerlink, -h6:hover > a.headerlink, -dt:hover > a.headerlink { - visibility: visible; -} - -img { - border: 0; -} - -pre { - background-color: #EEE; - padding: 0.5em; -} - -div.admonition { - margin-top: 10px; - margin-bottom: 10px; - padding: 2px 7px 1px 7px; - border-left: 0.2em solid black; -} - -p.admonition-title { - margin: 0px 10px 5px 0px; - font-weight: bold; -} - -dt:target, .highlighted { - background-color: #fbe54e; -} - -/* Header */ - -/* -div.header { - padding-top: 10px; - padding-bottom: 10px; -} -*/ - -div.header {} - -div.header h1 { - font-family: {{ theme_headerfont }}; - font-weight: normal; - font-size: 180%; - letter-spacing: .08em; -} - -div.header h1 a { - color: white; -} - -div.header div.rel { - text-decoration: none; -} -/* margin-top: 1em; */ - -div.header div.rel a { - margin-top: 1em; - color: {{ theme_headerlinkcolor }}; - letter-spacing: .1em; - text-transform: uppercase; - padding: 3px 1em; -} - -p.logo { - float: right; -} - -img.logo { - border: 0; -} - - -/* Content */ -div.content-wrapper { - background-color: white; - padding: 1em; -} -/* - padding-top: 20px; - padding-bottom: 20px; -*/ - -/* float: left; */ - -div.document { - max-width: {{ theme_documentwidth }}; -} - -div.body { - padding-right: 2em; - text-align: {{ theme_textalign }}; -} - -div.document ul { - margin: 1.5em; - list-style-type: square; -} - -div.document dd { - margin-left: 1.2em; - margin-top: .4em; - margin-bottom: 1em; -} - -div.document .section { - margin-top: 1.7em; -} -div.document .section:first-child { - margin-top: 0px; -} - -div.document div.highlight { - padding: 3px; - 
background-color: #eeeeec; - border-top: 2px solid #dddddd; - border-bottom: 2px solid #dddddd; - margin-top: .8em; - margin-bottom: .8em; -} - -div.document h2 { - margin-top: .7em; -} - -div.document p { - margin-bottom: .5em; -} - -div.document li.toctree-l1 { - margin-bottom: 1em; -} - -div.document .descname { - font-weight: bold; -} - -div.document .docutils.literal { - background-color: #eeeeec; - padding: 1px; -} - -div.document .docutils.xref.literal { - background-color: transparent; - padding: 0px; -} - -div.document blockquote { - margin: 1em; -} - -div.document ol { - margin: 1.5em; -} - - -/* Sidebar */ - - -div.sidebar { - width: {{ theme_sidebarwidth }}; - padding: 0 1em; - float: right; - font-size: .93em; -} - -div.sidebar a, div.header a { - text-decoration: none; -} - -div.sidebar a:hover, div.header a:hover { - text-decoration: underline; -} - -div.sidebar h3 { - color: #2e3436; - text-transform: uppercase; - font-size: 130%; - letter-spacing: .1em; -} - -div.sidebar ul { - list-style-type: none; -} - -div.sidebar li.toctree-l1 a { - display: block; - padding: 1px; - border: 1px solid #dddddd; - background-color: #eeeeec; - margin-bottom: .4em; - padding-left: 3px; - color: #2e3436; -} - -div.sidebar li.toctree-l2 a { - background-color: transparent; - border: none; - margin-left: 1em; - border-bottom: 1px solid #dddddd; -} - -div.sidebar li.toctree-l3 a { - background-color: transparent; - border: none; - margin-left: 2em; - border-bottom: 1px solid #dddddd; -} - -div.sidebar li.toctree-l2:last-child a { - border-bottom: none; -} - -div.sidebar li.toctree-l1.current a { - border-right: 5px solid {{ theme_headerlinkcolor }}; -} - -div.sidebar li.toctree-l1.current li.toctree-l2 a { - border-right: none; -} - - -/* Footer */ - -div.footer-wrapper { - background: {{ theme_footerbg }}; - border-top: 4px solid #babdb6; - padding-top: 10px; - padding-bottom: 10px; - min-height: 80px; -} - -div.footer, div.footer a { - color: #888a85; -} - -div.footer .right { - text-align: right; -} - -div.footer .left { - text-transform: uppercase; -} - - -/* Styles copied from basic theme */ - -img.align-left, .figure.align-left, object.align-left { - clear: left; - float: left; - margin-right: 1em; -} - -img.align-right, .figure.align-right, object.align-right { - clear: right; - float: right; - margin-left: 1em; -} - -img.align-center, .figure.align-center, object.align-center { - display: block; - margin-left: auto; - margin-right: auto; -} - -.align-left { - text-align: left; -} - -.align-center { - clear: both; - text-align: center; -} - -.align-right { - text-align: right; -} - -/* -- search page ----------------------------------------------------------- */ - -ul.search { - margin: 10px 0 0 20px; - padding: 0; -} - -ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; -} - -ul.search li a { - font-weight: bold; -} - -ul.search li div.context { - color: #888; - margin: 2px 0 0 30px; - text-align: left; -} - -ul.keywordmatches li.goodmatch a { - font-weight: bold; -} - -/* -- index page ------------------------------------------------------------ */ - -table.contentstable { - width: 90%; -} - -table.contentstable p.biglink { - line-height: 150%; -} - -a.biglink { - font-size: 1.3em; -} - -span.linkdescr { - font-style: italic; - padding-top: 5px; - font-size: 90%; -} - -/* -- general index --------------------------------------------------------- */ - -table.indextable td { - text-align: left; - 
vertical-align: top; -} - -table.indextable dl, table.indextable dd { - margin-top: 0; - margin-bottom: 0; -} - -table.indextable tr.pcap { - height: 10px; -} - -table.indextable tr.cap { - margin-top: 10px; - background-color: #f2f2f2; -} - -img.toggler { - margin-right: 3px; - margin-top: 3px; - cursor: pointer; -} - -/* -- viewcode extension ---------------------------------------------------- */ - -.viewcode-link { - float: right; -} - -.viewcode-back { - float: right; - font-family:: {{ theme_bodyfont }}; -} - -div.viewcode-block:target { - margin: -1px -3px; - padding: 0 3px; - background-color: #f4debf; - border-top: 1px solid #ac9; - border-bottom: 1px solid #ac9; -} - -th.field-name { - white-space: nowrap; -} diff --git a/vb_suite/source/themes/agogo/static/bgfooter.png b/vb_suite/source/themes/agogo/static/bgfooter.png deleted file mode 100644 index 9ce5bdd902943..0000000000000 Binary files a/vb_suite/source/themes/agogo/static/bgfooter.png and /dev/null differ diff --git a/vb_suite/source/themes/agogo/static/bgtop.png b/vb_suite/source/themes/agogo/static/bgtop.png deleted file mode 100644 index a0d4709bac8f7..0000000000000 Binary files a/vb_suite/source/themes/agogo/static/bgtop.png and /dev/null differ diff --git a/vb_suite/source/themes/agogo/theme.conf b/vb_suite/source/themes/agogo/theme.conf deleted file mode 100644 index 3fc88580f1ab4..0000000000000 --- a/vb_suite/source/themes/agogo/theme.conf +++ /dev/null @@ -1,19 +0,0 @@ -[theme] -inherit = basic -stylesheet = agogo.css -pygments_style = tango - -[options] -bodyfont = "Verdana", Arial, sans-serif -headerfont = "Georgia", "Times New Roman", serif -pagewidth = 70em -documentwidth = 50em -sidebarwidth = 20em -bgcolor = #eeeeec -headerbg = url(bgtop.png) top left repeat-x -footerbg = url(bgfooter.png) top left repeat-x -linkcolor = #ce5c00 -headercolor1 = #204a87 -headercolor2 = #3465a4 -headerlinkcolor = #fcaf3e -textalign = justify \ No newline at end of file diff --git a/vb_suite/sparse.py b/vb_suite/sparse.py deleted file mode 100644 index b1c1a2f24e41d..0000000000000 --- a/vb_suite/sparse.py +++ /dev/null @@ -1,65 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- - -setup = common_setup + """ -from pandas.core.sparse import SparseSeries, SparseDataFrame - -K = 50 -N = 50000 -rng = np.asarray(date_range('1/1/2000', periods=N, - freq='T')) - -# rng2 = np.asarray(rng).astype('M8[ns]').astype('i8') - -series = {} -for i in range(1, K + 1): - data = np.random.randn(N)[:-i] - this_rng = rng[:-i] - data[100:] = np.nan - series[i] = SparseSeries(data, index=this_rng) -""" -stmt = "SparseDataFrame(series)" - -bm_sparse1 = Benchmark(stmt, setup, name="sparse_series_to_frame", - start_date=datetime(2011, 6, 1)) - - -setup = common_setup + """ -from pandas.core.sparse import SparseDataFrame -""" - -stmt = "SparseDataFrame(columns=np.arange(100), index=np.arange(1000))" - -sparse_constructor = Benchmark(stmt, setup, name="sparse_frame_constructor", - start_date=datetime(2012, 6, 1)) - - -setup = common_setup + """ -s = pd.Series([np.nan] * 10000) -s[0] = 3.0 -s[100] = -1.0 -s[999] = 12.1 -s.index = pd.MultiIndex.from_product((range(10), range(10), range(10), range(10))) -ss = s.to_sparse() -""" - -stmt = "ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)" - -sparse_series_to_coo = Benchmark(stmt, setup, name="sparse_series_to_coo", - 
start_date=datetime(2015, 1, 3)) - -setup = common_setup + """ -import scipy.sparse -import pandas.core.sparse.series -A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) -""" - -stmt = "ss = pandas.core.sparse.series.SparseSeries.from_coo(A)" - -sparse_series_from_coo = Benchmark(stmt, setup, name="sparse_series_from_coo", - start_date=datetime(2015, 1, 3)) diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py deleted file mode 100644 index 8d7c30dc9fdcf..0000000000000 --- a/vb_suite/stat_ops.py +++ /dev/null @@ -1,126 +0,0 @@ -from vbench.benchmark import Benchmark -from datetime import datetime - -common_setup = """from .pandas_vb_common import * -""" - -#---------------------------------------------------------------------- -# nanops - -setup = common_setup + """ -s = Series(np.random.randn(100000), index=np.arange(100000)) -s[::2] = np.nan -""" - -stat_ops_series_std = Benchmark("s.std()", setup) - -#---------------------------------------------------------------------- -# ops by level - -setup = common_setup + """ -index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], - labels=[np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)]) -random.shuffle(index.values) -df = DataFrame(np.random.randn(len(index), 4), index=index) -df_level = DataFrame(np.random.randn(100, 4), index=index.levels[1]) -""" - -stat_ops_level_frame_sum = \ - Benchmark("df.sum(level=1)", setup, - start_date=datetime(2011, 11, 15)) - -stat_ops_level_frame_sum_multiple = \ - Benchmark("df.sum(level=[0, 1])", setup, repeat=1, - start_date=datetime(2011, 11, 15)) - -stat_ops_level_series_sum = \ - Benchmark("df[1].sum(level=1)", setup, - start_date=datetime(2011, 11, 15)) - -stat_ops_level_series_sum_multiple = \ - Benchmark("df[1].sum(level=[0, 1])", setup, repeat=1, - start_date=datetime(2011, 11, 15)) - -sum_setup = common_setup + """ -df = DataFrame(np.random.randn(100000, 4)) -dfi = DataFrame(np.random.randint(1000, size=df.shape)) -""" - -stat_ops_frame_sum_int_axis_0 = \ - Benchmark("dfi.sum()", sum_setup, start_date=datetime(2013, 7, 25)) - -stat_ops_frame_sum_float_axis_0 = \ - Benchmark("df.sum()", sum_setup, start_date=datetime(2013, 7, 25)) - -stat_ops_frame_mean_int_axis_0 = \ - Benchmark("dfi.mean()", sum_setup, start_date=datetime(2013, 7, 25)) - -stat_ops_frame_mean_float_axis_0 = \ - Benchmark("df.mean()", sum_setup, start_date=datetime(2013, 7, 25)) - -stat_ops_frame_sum_int_axis_1 = \ - Benchmark("dfi.sum(1)", sum_setup, start_date=datetime(2013, 7, 25)) - -stat_ops_frame_sum_float_axis_1 = \ - Benchmark("df.sum(1)", sum_setup, start_date=datetime(2013, 7, 25)) - -stat_ops_frame_mean_int_axis_1 = \ - Benchmark("dfi.mean(1)", sum_setup, start_date=datetime(2013, 7, 25)) - -stat_ops_frame_mean_float_axis_1 = \ - Benchmark("df.mean(1)", sum_setup, start_date=datetime(2013, 7, 25)) - -#---------------------------------------------------------------------- -# rank - -setup = common_setup + """ -values = np.concatenate([np.arange(100000), - np.random.randn(100000), - np.arange(100000)]) -s = Series(values) -""" - -stats_rank_average = Benchmark('s.rank()', setup, - start_date=datetime(2011, 12, 12)) - -stats_rank_pct_average = Benchmark('s.rank(pct=True)', setup, - start_date=datetime(2014, 1, 16)) -stats_rank_pct_average_old = Benchmark('s.rank() / len(s)', setup, - start_date=datetime(2014, 1, 16)) -setup = common_setup + """ -values = np.random.randint(0, 100000, size=200000) -s = 
Series(values) -""" - -stats_rank_average_int = Benchmark('s.rank()', setup, - start_date=datetime(2011, 12, 12)) - -setup = common_setup + """ -df = DataFrame(np.random.randn(5000, 50)) -""" - -stats_rank2d_axis1_average = Benchmark('df.rank(1)', setup, - start_date=datetime(2011, 12, 12)) - -stats_rank2d_axis0_average = Benchmark('df.rank()', setup, - start_date=datetime(2011, 12, 12)) - -# rolling functions - -setup = common_setup + """ -arr = np.random.randn(100000) -""" - -stats_rolling_mean = Benchmark('rolling_mean(arr, 100)', setup, - start_date=datetime(2011, 6, 1)) - -# spearman correlation - -setup = common_setup + """ -df = DataFrame(np.random.randn(1000, 30)) -""" - -stats_corr_spearman = Benchmark("df.corr(method='spearman')", setup, - start_date=datetime(2011, 12, 4)) diff --git a/vb_suite/strings.py b/vb_suite/strings.py deleted file mode 100644 index 0948df5673a0d..0000000000000 --- a/vb_suite/strings.py +++ /dev/null @@ -1,59 +0,0 @@ -from vbench.api import Benchmark - -common_setup = """from .pandas_vb_common import * -""" - -setup = common_setup + """ -import string -import itertools as IT - -def make_series(letters, strlen, size): - return Series( - [str(x) for x in np.fromiter(IT.cycle(letters), count=size*strlen, dtype='|S1') - .view('|S{}'.format(strlen))]) - -many = make_series('matchthis'+string.ascii_uppercase, strlen=19, size=10000) # 31% matches -few = make_series('matchthis'+string.ascii_uppercase*42, strlen=19, size=10000) # 1% matches -""" - -strings_cat = Benchmark("many.str.cat(sep=',')", setup) -strings_title = Benchmark("many.str.title()", setup) -strings_count = Benchmark("many.str.count('matchthis')", setup) -strings_contains_many = Benchmark("many.str.contains('matchthis')", setup) -strings_contains_few = Benchmark("few.str.contains('matchthis')", setup) -strings_contains_many_noregex = Benchmark( - "many.str.contains('matchthis', regex=False)", setup) -strings_contains_few_noregex = Benchmark( - "few.str.contains('matchthis', regex=False)", setup) -strings_startswith = Benchmark("many.str.startswith('matchthis')", setup) -strings_endswith = Benchmark("many.str.endswith('matchthis')", setup) -strings_lower = Benchmark("many.str.lower()", setup) -strings_upper = Benchmark("many.str.upper()", setup) -strings_replace = Benchmark("many.str.replace(r'(matchthis)', r'\1\1')", setup) -strings_repeat = Benchmark( - "many.str.repeat(list(IT.islice(IT.cycle(range(1,4)),len(many))))", setup) -strings_match = Benchmark("many.str.match(r'mat..this')", setup) -strings_extract = Benchmark("many.str.extract(r'(\w*)matchthis(\w*)')", setup) -strings_join_split = Benchmark("many.str.join(r'--').str.split('--')", setup) -strings_join_split_expand = Benchmark("many.str.join(r'--').str.split('--',expand=True)", setup) -strings_len = Benchmark("many.str.len()", setup) -strings_findall = Benchmark("many.str.findall(r'[A-Z]+')", setup) -strings_pad = Benchmark("many.str.pad(100, side='both')", setup) -strings_center = Benchmark("many.str.center(100)", setup) -strings_slice = Benchmark("many.str.slice(5,15,2)", setup) -strings_strip = Benchmark("many.str.strip('matchthis')", setup) -strings_lstrip = Benchmark("many.str.lstrip('matchthis')", setup) -strings_rstrip = Benchmark("many.str.rstrip('matchthis')", setup) -strings_get = Benchmark("many.str.get(0)", setup) - -setup = setup + """ -s = make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|') -""" -strings_get_dummies = Benchmark("s.str.get_dummies('|')", setup) - -setup = common_setup + """ -import 
pandas.util.testing as testing -ser = Series(testing.makeUnicodeIndex()) -""" - -strings_encode_decode = Benchmark("ser.str.encode('utf-8').str.decode('utf-8')", setup) diff --git a/vb_suite/suite.py b/vb_suite/suite.py deleted file mode 100644 index 45053b6610896..0000000000000 --- a/vb_suite/suite.py +++ /dev/null @@ -1,164 +0,0 @@ -from vbench.api import Benchmark, GitRepo -from datetime import datetime - -import os - -modules = ['attrs_caching', - 'binary_ops', - 'ctors', - 'frame_ctor', - 'frame_methods', - 'groupby', - 'index_object', - 'indexing', - 'io_bench', - 'io_sql', - 'inference', - 'hdfstore_bench', - 'join_merge', - 'gil', - 'miscellaneous', - 'panel_ctor', - 'packers', - 'parser_vb', - 'panel_methods', - 'plotting', - 'reindex', - 'replace', - 'sparse', - 'strings', - 'reshape', - 'stat_ops', - 'timeseries', - 'timedelta', - 'eval'] - -by_module = {} -benchmarks = [] - -for modname in modules: - ref = __import__(modname) - by_module[modname] = [v for v in ref.__dict__.values() - if isinstance(v, Benchmark)] - benchmarks.extend(by_module[modname]) - -for bm in benchmarks: - assert(bm.name is not None) - -import getpass -import sys - -USERNAME = getpass.getuser() - -if sys.platform == 'darwin': - HOME = '/Users/%s' % USERNAME -else: - HOME = '/home/%s' % USERNAME - -try: - import ConfigParser - - config = ConfigParser.ConfigParser() - config.readfp(open(os.path.expanduser('~/.vbenchcfg'))) - - REPO_PATH = config.get('setup', 'repo_path') - REPO_URL = config.get('setup', 'repo_url') - DB_PATH = config.get('setup', 'db_path') - TMP_DIR = config.get('setup', 'tmp_dir') -except: - REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../")) - REPO_URL = 'git@github.com:pandas-dev/pandas.git' - DB_PATH = os.path.join(REPO_PATH, 'vb_suite/benchmarks.db') - TMP_DIR = os.path.join(HOME, 'tmp/vb_pandas') - -PREPARE = """ -python setup.py clean -""" -BUILD = """ -python setup.py build_ext --inplace -""" -dependencies = ['pandas_vb_common.py'] - -START_DATE = datetime(2010, 6, 1) - -# repo = GitRepo(REPO_PATH) - -RST_BASE = 'source' - -# HACK! - -# timespan = [datetime(2011, 1, 1), datetime(2012, 1, 1)] - - -def generate_rst_files(benchmarks): - import matplotlib as mpl - mpl.use('Agg') - import matplotlib.pyplot as plt - - vb_path = os.path.join(RST_BASE, 'vbench') - fig_base_path = os.path.join(vb_path, 'figures') - - if not os.path.exists(vb_path): - print('creating %s' % vb_path) - os.makedirs(vb_path) - - if not os.path.exists(fig_base_path): - print('creating %s' % fig_base_path) - os.makedirs(fig_base_path) - - for bmk in benchmarks: - print('Generating rst file for %s' % bmk.name) - rst_path = os.path.join(RST_BASE, 'vbench/%s.txt' % bmk.name) - - fig_full_path = os.path.join(fig_base_path, '%s.png' % bmk.name) - - # make the figure - plt.figure(figsize=(10, 6)) - ax = plt.gca() - bmk.plot(DB_PATH, ax=ax) - - start, end = ax.get_xlim() - - plt.xlim([start - 30, end + 30]) - plt.savefig(fig_full_path, bbox_inches='tight') - plt.close('all') - - fig_rel_path = 'vbench/figures/%s.png' % bmk.name - rst_text = bmk.to_rst(image_path=fig_rel_path) - with open(rst_path, 'w') as f: - f.write(rst_text) - - with open(os.path.join(RST_BASE, 'index.rst'), 'w') as f: - print >> f, """ -Performance Benchmarks -====================== - -These historical benchmark graphs were produced with `vbench -`__. - -The ``.pandas_vb_common`` setup script can be found here_ - -.. 
_here: https://github.com/pandas-dev/pandas/tree/master/vb_suite - -Produced on a machine with - - - Intel Core i7 950 processor - - (K)ubuntu Linux 12.10 - - Python 2.7.2 64-bit (Enthought Python Distribution 7.1-2) - - NumPy 1.6.1 - -.. toctree:: - :hidden: - :maxdepth: 3 -""" - for modname, mod_bmks in sorted(by_module.items()): - print >> f, ' vb_%s' % modname - modpath = os.path.join(RST_BASE, 'vb_%s.rst' % modname) - with open(modpath, 'w') as mh: - header = '%s\n%s\n\n' % (modname, '=' * len(modname)) - print >> mh, header - - for bmk in mod_bmks: - print >> mh, bmk.name - print >> mh, '-' * len(bmk.name) - print >> mh, '.. include:: vbench/%s.txt\n' % bmk.name diff --git a/vb_suite/test.py b/vb_suite/test.py deleted file mode 100644 index da30c3e1a5f76..0000000000000 --- a/vb_suite/test.py +++ /dev/null @@ -1,67 +0,0 @@ -from pandas import * -import matplotlib.pyplot as plt - -import sqlite3 - -from vbench.git import GitRepo - - -REPO_PATH = '/home/adam/code/pandas' -repo = GitRepo(REPO_PATH) - -con = sqlite3.connect('vb_suite/benchmarks.db') - -bmk = '36900a889961162138c140ce4ae3c205' -# bmk = '9d7b8c04b532df6c2d55ef497039b0ce' -bmk = '4481aa4efa9926683002a673d2ed3dac' -bmk = '00593cd8c03d769669d7b46585161726' -bmk = '3725ab7cd0a0657d7ae70f171c877cea' -bmk = '3cd376d6d6ef802cdea49ac47a67be21' -bmk2 = '459225186023853494bc345fd180f395' -bmk = 'c22ca82e0cfba8dc42595103113c7da3' -bmk = 'e0e651a8e9fbf0270ab68137f8b9df5f' -bmk = '96bda4b9a60e17acf92a243580f2a0c3' - - -def get_results(bmk): - results = con.execute( - "select * from results where checksum='%s'" % bmk).fetchall() - x = Series(dict((t[1], t[3]) for t in results)) - x.index = x.index.map(repo.timestamps.get) - x = x.sort_index() - return x - -x = get_results(bmk) - - -def graph1(): - dm_getitem = get_results('459225186023853494bc345fd180f395') - dm_getvalue = get_results('c22ca82e0cfba8dc42595103113c7da3') - - plt.figure() - ax = plt.gca() - - dm_getitem.plot(label='df[col][idx]', ax=ax) - dm_getvalue.plot(label='df.get_value(idx, col)', ax=ax) - - plt.ylabel('ms') - plt.legend(loc='best') - - -def graph2(): - bm = get_results('96bda4b9a60e17acf92a243580f2a0c3') - plt.figure() - ax = plt.gca() - - bm.plot(ax=ax) - plt.ylabel('ms') - -bm = get_results('36900a889961162138c140ce4ae3c205') -fig = plt.figure() -ax = plt.gca() -bm.plot(ax=ax) -fig.autofmt_xdate() - -plt.xlim([bm.dropna().index[0] - datetools.MonthEnd(), - bm.dropna().index[-1] + datetools.MonthEnd()]) -plt.ylabel('ms') diff --git a/vb_suite/test_perf.py b/vb_suite/test_perf.py deleted file mode 100755 index be546b72f9465..0000000000000 --- a/vb_suite/test_perf.py +++ /dev/null @@ -1,616 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -What ----- -vbench is a library which can be used to benchmark the performance -of a codebase over time. -Although vbench can collect data over many commites, generate plots -and other niceties, for Pull-Requests the important thing is the -performance of the HEAD commit against a known-good baseline. - -This script tries to automate the process of comparing these -two commits, and is meant to run out of the box on a fresh -clone. - -How ---- -These are the steps taken: -1) create a temp directory into which vbench will clone the temporary repo. -2) instantiate a vbench runner, using the local repo as the source repo. -3) perform a vbench run for the baseline commit, then the target commit. -4) pull the results for both commits from the db. 
use pandas to align -everything and calculate a ratio for the timing information. -5) print the results to the log file and to stdout. - -""" - -# IMPORTANT NOTE -# -# This script should run on pandas versions at least as far back as 0.9.1. -# devs should be able to use the latest version of this script with -# any dusty old commit and expect it to "just work". -# One way in which this is useful is when collecting historical data, -# where writing some logic around this script may prove easier -# in some cases than running vbench directly (think perf bisection). -# -# *please*, when you modify this script for whatever reason, -# make sure you do not break its functionality when running under older -# pandas versions. -# Note that deprecation warnings are turned off in main(), so there's -# no need to change the actual code to suppress such warnings. - -import shutil -import os -import sys -import argparse -import tempfile -import time -import re - -import random -import numpy as np - -import pandas as pd -from pandas import DataFrame, Series - -from suite import REPO_PATH -VB_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -DEFAULT_MIN_DURATION = 0.01 -HEAD_COL="head[ms]" -BASE_COL="base[ms]" - -try: - import git # gitpython -except Exception: - print("Error: Please install the `gitpython` package\n") - sys.exit(1) - -class RevParseAction(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - import subprocess - cmd = 'git rev-parse --short --verify {0}^{{commit}}'.format(values) - rev_parse = subprocess.check_output(cmd, shell=True) - setattr(namespace, self.dest, rev_parse.strip()) - - -parser = argparse.ArgumentParser(description='Use vbench to measure and compare the performance of commits.') -parser.add_argument('-H', '--head', - help='Execute vbenches using the currently checked out copy.', - dest='head', - action='store_true', - default=False) -parser.add_argument('-b', '--base-commit', - help='The commit serving as performance baseline ', - type=str, action=RevParseAction) -parser.add_argument('-t', '--target-commit', - help='The commit to compare against the baseline (default: HEAD).', - type=str, action=RevParseAction) -parser.add_argument('--base-pickle', - help='name of pickle file with timings data generated by a former `-H -d FILE` run. '\ - 'filename must be of the form <commit>-*.* or specify --base-commit separately', - type=str) -parser.add_argument('--target-pickle', - help='name of pickle file with timings data generated by a former `-H -d FILE` run '\ - 'filename must be of the form <commit>-*.* or specify --target-commit separately', - type=str) -parser.add_argument('-m', '--min-duration', - help='Minimum duration (in ms) of baseline test for inclusion in report (default: %.3f).' % DEFAULT_MIN_DURATION, - type=float, - default=0.01) -parser.add_argument('-o', '--output', - metavar="", - dest='log_file', - help='Path of file in which to save the textual report (default: vb_suite.log).')
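
[Editorial aside: RevParseAction above resolves whatever the user passes to -b/-t (a branch, tag, or relative ref) into a short commit hash at argument-parsing time. The same idea in isolation, as a minimal sketch; `short_sha` is a made-up helper name, not part of this script:]

    # Minimal standalone sketch of the rev-parse normalization used by
    # RevParseAction above; `short_sha` is a hypothetical helper name.
    import subprocess

    def short_sha(ref):
        # resolve any ref (branch, tag, HEAD~5, ...) to an abbreviated commit hash
        out = subprocess.check_output(
            ['git', 'rev-parse', '--short', '--verify', ref + '^{commit}'])
        return out.strip().decode()

    # e.g. short_sha('HEAD') -> something like 'a1b2c3d'
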
-parser.add_argument('-d', '--outdf', - metavar="FNAME", - dest='outdf', - default=None, - help='Name of file to df.save() the result table into. Will overwrite') -parser.add_argument('-r', '--regex', - metavar="REGEX", - dest='regex', - default="", - help='Regex pat, only tests whose name matches the regex will be run.') -parser.add_argument('-s', '--seed', - metavar="SEED", - dest='seed', - default=1234, - type=int, - help='Integer value to seed PRNG with') -parser.add_argument('-n', '--repeats', - metavar="N", - dest='repeats', - default=3, - type=int, - help='Number of times to run each vbench, result value is the best of') -parser.add_argument('-c', '--ncalls', - metavar="N", - dest='ncalls', - default=3, - type=int, - help='Number of calls in each repetition of a vbench') -parser.add_argument('-N', '--hrepeats', - metavar="N", - dest='hrepeats', - default=1, - type=int, - help='implies -H, number of times to run the vbench suite on the head commit.\n' - 'Each iteration will yield another column in the output' ) -parser.add_argument('-a', '--affinity', - metavar="a", - dest='affinity', - default=1, - type=int, - help='set processor affinity of the process; by default bind to cpu/core #1 only. ' - 'Requires the "affinity" or "psutil" python module, will raise Warning otherwise') -parser.add_argument('-u', '--burnin', - metavar="u", - dest='burnin', - default=1, - type=int, - help='Number of extra iterations per benchmark to perform first, then throw away. ' ) - -parser.add_argument('-S', '--stats', - default=False, - action='store_true', - help='when specified with -N, prints the output of describe() per vbench results. ' ) - -parser.add_argument('--temp-dir', - metavar="PATH", - default=None, - help='Specify temp work dir to use. ccache depends on builds being invoked from a consistent directory.' ) - -parser.add_argument('-q', '--quiet', - default=False, - action='store_true', - help='Suppress report output to stdout. ' ) - -def get_results_df(db, rev): - """Takes a git commit hash and returns a DataFrame of benchmark results - """ - bench = DataFrame(db.get_benchmarks()) - results = DataFrame(map(list,db.get_rev_results(rev).values())) - - # Since vbench.db._reg_rev_results returns an unlabeled dict, - # we have to break encapsulation a bit. - results.columns = db._results.c.keys() - results = results.join(bench['name'], on='checksum').set_index("checksum") - return results - - -def prprint(s): - print("*** %s" % s) - -def pre_hook(): - import gc - gc.disable() - -def post_hook(): - import gc - gc.enable()
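
[Editorial aside: profile_comparative below runs the suite twice, baseline then target, and hands both result sets to prep_totals further down, whose core is a pandas align-and-divide. That step on toy data, as an illustrative sketch with invented timings:]

    # Toy illustration (not part of the deleted files) of the align/ratio
    # step performed by prep_totals on real vbench timings.
    import pandas as pd

    head = pd.DataFrame({'timing': [1.2, 3.0]}, index=pd.Index(['a', 'b'], name='name'))
    base = pd.DataFrame({'timing': [1.0, 6.0]}, index=pd.Index(['a', 'b'], name='name'))

    head, base = head.align(base)            # line the benchmarks up by name
    ratio = head['timing'] / base['timing']  # < 1.0 means the target commit got faster
    print(ratio.sort_values())               # b 0.5 (improved), a 1.2 (regressed)
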
-def profile_comparative(benchmarks):
-
-    from vbench.api import BenchmarkRunner
-    from vbench.db import BenchmarkDB
-    from vbench.git import GitRepo
-    from suite import BUILD, DB_PATH, PREPARE, dependencies
-
-    TMP_DIR = args.temp_dir or tempfile.mkdtemp()
-
-    try:
-
-        prprint("Opening DB at '%s'...\n" % DB_PATH)
-        db = BenchmarkDB(DB_PATH)
-
-        prprint("Initializing Runner...")
-
-        # all in a good cause...
-        GitRepo._parse_commit_log = _parse_wrapper(args.base_commit)
-
-        runner = BenchmarkRunner(
-            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
-            TMP_DIR, PREPARE, always_clean=True,
-            # run_option='eod', start_date=START_DATE,
-            module_dependencies=dependencies)
-
-        repo = runner.repo  # (steal the parsed git repo used by runner)
-        h_head = args.target_commit or repo.shas[-1]
-        h_baseline = args.base_commit
-
-        # ARGH. reparse the repo, without discarding any commits,
-        # then overwrite the previous parse results
-        # prprint("Slaughtering kittens...")
-        (repo.shas, repo.messages,
-         repo.timestamps, repo.authors) = _parse_commit_log(None, REPO_PATH,
-                                                            args.base_commit)
-
-        prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, "")))
-        prprint('Baseline [%s] : %s\n' % (h_baseline,
-                repo.messages.get(h_baseline, "")))
-
-        prprint("Removing any previous measurements for the commits.")
-        db.delete_rev_results(h_baseline)
-        db.delete_rev_results(h_head)
-
-        # TODO: we could skip this, but we need to make sure all
-        # results are in the DB, which is a little tricky with
-        # start dates and so on.
-        prprint("Running benchmarks for baseline [%s]" % h_baseline)
-        runner._run_and_write_results(h_baseline)
-
-        prprint("Running benchmarks for target [%s]" % h_head)
-        runner._run_and_write_results(h_head)
-
-        prprint('Processing results...')
-
-        head_res = get_results_df(db, h_head)
-        baseline_res = get_results_df(db, h_baseline)
-
-        report_comparative(head_res, baseline_res)
-
-    finally:
-        # print("Disposing of TMP_DIR: %s" % TMP_DIR)
-        shutil.rmtree(TMP_DIR)
-
-def prep_pickle_for_total(df, agg_name='median'):
-    """
-    Accepts a dataframe resulting from invocation with -H -d o.pickle.
-    If multiple data columns are present (-N was used), the
-    `agg_name` attr of the dataframe will be used to reduce
-    them to a single value per vbench; df.median is used by default.
-
-    Returns a dataframe of the form expected by prep_totals.
-    """
-    def prep(df):
-        agg = getattr(df, agg_name)
-        df = DataFrame(agg(1))
-        cols = list(df.columns)
-        cols[0] = 'timing'
-        df.columns = cols
-        df['name'] = list(df.index)
-        return df
-
-    return prep(df)
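For orientation, here is a toy version of the reshaping `prep_pickle_for_total` performs, on a hypothetical two-run frame (the column names and timings are invented; `axis=1` is spelled out where the script uses the positional `agg(1)`):

```python
from pandas import DataFrame

# Hypothetical frame from a `-H -N 2 -d out.pickle` run: one column per
# head run, indexed by vbench name.
raw = DataFrame({"#0": [1.2, 3.4], "#1": [1.4, 3.0]},
                index=["frame_add", "frame_mult"])

reduced = DataFrame(raw.median(axis=1))  # collapse runs to one value each
reduced.columns = ["timing"]
reduced["name"] = list(reduced.index)
print(reduced)
# frame_add  -> timing 1.3, name frame_add
# frame_mult -> timing 3.2, name frame_mult
```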
-
-def prep_totals(head_res, baseline_res):
-    """
-    Each argument should be a dataframe with 'timing' and 'name' columns,
-    where 'name' is the name of the vbench.
-
-    Returns a 'totals' dataframe, suitable as input for print_report.
-    """
-    head_res, baseline_res = head_res.align(baseline_res)
-    ratio = head_res['timing'] / baseline_res['timing']
-    totals = DataFrame({HEAD_COL: head_res['timing'],
-                        BASE_COL: baseline_res['timing'],
-                        'ratio': ratio,
-                        'name': baseline_res.name},
-                       columns=[HEAD_COL, BASE_COL, "ratio", "name"])
-    totals = totals.ix[totals[HEAD_COL] > args.min_duration]
-    # ignore below threshold
-    totals = totals.dropna(
-    ).sort("ratio").set_index('name')  # sort in ascending order
-    return totals
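A toy run of the same computation, with made-up timings and the default 0.01 ms threshold. This is written against the modern pandas API; the script itself deliberately sticks to `.ix`/`.sort`/`.save` so it keeps running on very old pandas versions:

```python
from pandas import DataFrame

head = DataFrame({"timing": [1.0, 5.0, 0.001], "name": ["a", "b", "c"]})
base = DataFrame({"timing": [2.0, 4.0, 0.001], "name": ["a", "b", "c"]})

head, base = head.align(base)
totals = DataFrame({"head[ms]": head["timing"],
                    "base[ms]": base["timing"],
                    "ratio": head["timing"] / base["timing"],
                    "name": base["name"]})
totals = totals[totals["head[ms]"] > 0.01]   # drop sub-threshold vbenches
totals = totals.dropna().sort_values("ratio").set_index("name")
print(totals)
# 'a' (ratio 0.5, got faster) sorts before 'b' (ratio 1.25, got slower);
# 'c' is dropped because its head timing is below the threshold.
```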
-def report_comparative(head_res, baseline_res):
-    try:
-        r = git.Repo(VB_DIR)
-    except:
-        import pdb
-        pdb.set_trace()
-
-    totals = prep_totals(head_res, baseline_res)
-
-    h_head = args.target_commit
-    h_baseline = args.base_commit
-    h_msg = b_msg = "Unknown"
-    try:
-        h_msg = r.commit(h_head).message.strip()
-    except git.exc.BadObject:
-        pass
-    try:
-        b_msg = r.commit(h_baseline).message.strip()
-    except git.exc.BadObject:
-        pass
-
-
-    print_report(totals, h_head=h_head, h_msg=h_msg,
-                 h_baseline=h_baseline, b_msg=b_msg)
-
-    if args.outdf:
-        prprint("The results DataFrame was written to '%s'\n" % args.outdf)
-        totals.save(args.outdf)
-
-def profile_head_single(benchmark):
-    import gc
-
-    # just in case
-    gc.collect()
-
-    try:
-        from ctypes import cdll, CDLL
-        cdll.LoadLibrary("libc.so.6")
-        libc = CDLL("libc.so.6")
-        libc.malloc_trim(0)
-    except:
-        pass
-
-
-    N = args.hrepeats + args.burnin
-
-    results = []
-    try:
-        for i in range(N):
-            gc.disable()
-            d = dict()
-
-            try:
-                d = benchmark.run()
-
-            except KeyboardInterrupt:
-                raise
-            except Exception as e:  # if a single vbench bursts into flames, don't die.
-                err = ""
-                try:
-                    err = d.get("traceback")
-                    if err is None:
-                        err = str(e)
-                except:
-                    pass
-                print("%s died with:\n%s\nSkipping...\n" % (benchmark.name, err))
-
-            results.append(d.get('timing', np.nan))
-            gc.enable()
-            gc.collect()
-
-    finally:
-        gc.enable()
-
-    if results:
-        # throw away the burn-in
-        results = results[args.burnin:]
-    sys.stdout.write('.')
-    sys.stdout.flush()
-    return Series(results, name=benchmark.name)
-
-    # df = DataFrame(results)
-    # df.columns = ["name",HEAD_COL]
-    # return df.set_index("name")[HEAD_COL]
-
-def profile_head(benchmarks):
-    print("Performing %d benchmarks (%d runs each)" % (len(benchmarks), args.hrepeats))
-
-    ss = [profile_head_single(b) for b in benchmarks]
-    print("\n")
-
-    results = DataFrame(ss)
-    results.columns = ["#%d" % i for i in range(args.hrepeats)]
-    # results.index = ["#%d" % i for i in range(len(ss))]
-    # results = results.T
-
-    shas, messages, _, _ = _parse_commit_log(None, REPO_PATH, base_commit="HEAD^")
-    print_report(results, h_head=shas[-1], h_msg=messages[-1])
-
-
-    if args.outdf:
-        prprint("The results DataFrame was written to '%s'\n" % args.outdf)
-        DataFrame(results).save(args.outdf)
-
-def print_report(df, h_head=None, h_msg="", h_baseline=None, b_msg=""):
-
-    name_width = 45
-    col_width = 10
-
-    hdr = ("{:%s}" % name_width).format("Test name")
-    hdr += ("|{:^%d}" % col_width) * len(df.columns)
-    hdr += "|"
-    hdr = hdr.format(*df.columns)
-    hdr = "-" * len(hdr) + "\n" + hdr + "\n" + "-" * len(hdr) + "\n"
-    ftr = hdr
-    s = "\n"
-    s += "Invoked with :\n"
-    s += "--ncalls: %s\n" % (args.ncalls or 'Auto')
-    s += "--repeats: %s\n" % (args.repeats)
-    s += "\n\n"
-
-    s += hdr
-    # import ipdb
-    # ipdb.set_trace()
-    for i in range(len(df)):
-        lfmt = ("{:%s}" % name_width)
-        lfmt += ("| {:%d.4f} " % (col_width - 2)) * len(df.columns)
-        lfmt += "|\n"
-        s += lfmt.format(df.index[i], *list(df.iloc[i].values))
-
-    s += ftr + "\n"
-
-    s += "Ratio < 1.0 means the target commit is faster than the baseline.\n"
-    s += "Seed used: %d\n\n" % args.seed
-
-    if h_head:
-        s += 'Target [%s] : %s\n' % (h_head, h_msg)
-    if h_baseline:
-        s += 'Base   [%s] : %s\n\n' % (
-            h_baseline, b_msg)
-
-    stats_footer = "\n"
-    if args.stats:
-        try:
-            pd.options.display.expand_frame_repr = False
-        except:
-            pass
-        stats_footer += str(df.T.describe().T) + "\n\n"
-
-    s += stats_footer
-    logfile = open(args.log_file, 'w')
-    logfile.write(s)
-    logfile.close()
-
-    if not args.quiet:
-        prprint(s)
-
-    if args.stats and args.quiet:
-        prprint(stats_footer)
-
-    prprint("Results were also written to the logfile at '%s'" %
-            args.log_file)
-
-
-
-def main():
-    from suite import benchmarks
-
-    if not args.log_file:
-        args.log_file = os.path.abspath(
-            os.path.join(REPO_PATH, 'vb_suite.log'))
-
-    saved_dir = os.getcwd()
-    if args.outdf:
-        # not bullet-proof but enough for us
-        args.outdf = os.path.realpath(args.outdf)
-
-    if args.log_file:
-        # not bullet-proof but enough for us
-        args.log_file = os.path.realpath(args.log_file)
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-
-    if args.base_pickle and args.target_pickle:
-        baseline_res = prep_pickle_for_total(pd.load(args.base_pickle))
-        target_res = prep_pickle_for_total(pd.load(args.target_pickle))
-
-        report_comparative(target_res, baseline_res)
-        sys.exit(0)
-
-    if args.affinity is not None:
-        try:  # use psutil rather than the stale affinity module. Thanks @yarikoptic
-            import psutil
-            if hasattr(psutil.Process, 'set_cpu_affinity'):
-                psutil.Process(os.getpid()).set_cpu_affinity([args.affinity])
-                print("CPU affinity set to %d" % args.affinity)
-        except ImportError:
-            print("-a/--affinity specified, but the 'psutil' module is not available, aborting.\n")
-            sys.exit(1)
-
-    print("\n")
-    prprint("LOG_FILE = %s" % args.log_file)
-    if args.outdf:
-        prprint("PICKLE_FILE = %s" % args.outdf)
-
-    print("\n")
-
-    # move away from the pandas root dir, to avoid possible import
-    # surprises
-    os.chdir(os.path.dirname(os.path.abspath(__file__)))
-
-    benchmarks = [x for x in benchmarks if re.search(args.regex, x.name)]
-
-    for b in benchmarks:
-        b.repeat = args.repeats
-        if args.ncalls:
-            b.ncalls = args.ncalls
-
-    if benchmarks:
-        if args.head:
-            profile_head(benchmarks)
-        else:
-            profile_comparative(benchmarks)
-    else:
-        print("No matching benchmarks")
-
-    os.chdir(saved_dir)
-
-# hack, vbench.git ignores some commits, but we
-# need to be able to reference any commit.
-# modified from vbench.git
-def _parse_commit_log(this, repo_path, base_commit=None):
-    from vbench.git import _convert_timezones
-    from pandas import Series
-    from dateutil import parser as dparser
-
-    git_cmd = 'git --git-dir=%s/.git --work-tree=%s ' % (repo_path, repo_path)
-    githist = git_cmd + ('log --graph --pretty=format:' +
-                         '\"::%h::%cd::%s::%an\" ' +
-                         ('%s..' % base_commit) +
-                         '> githist.txt')
-    os.system(githist)
-    githist = open('githist.txt').read()
-    os.remove('githist.txt')
-
-    shas = []
-    timestamps = []
-    messages = []
-    authors = []
-    for line in githist.split('\n'):
-        if '*' not in line.split("::")[0]:  # skip non-commit lines
-            continue
-
-        _, sha, stamp, message, author = line.split('::', 4)
-
-        # parse timestamp into datetime object
-        stamp = dparser.parse(stamp)
-
-        shas.append(sha)
-        timestamps.append(stamp)
-        messages.append(message)
-        authors.append(author)
-
-    # to UTC for now
-    timestamps = _convert_timezones(timestamps)
-
-    shas = Series(shas, timestamps)
-    messages = Series(messages, shas)
-    timestamps = Series(timestamps, shas)
-    authors = Series(authors, shas)
-    return shas[::-1], messages[::-1], timestamps[::-1], authors[::-1]
-
-# even worse, monkey patch vbench
-def _parse_wrapper(base_commit):
-    def inner(repo_path):
-        return _parse_commit_log(repo_path, base_commit)
-    return inner
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    if (not args.head
-        and not (args.base_commit and args.target_commit)
-        and not (args.base_pickle and args.target_pickle)):
-        parser.print_help()
-        sys.exit(1)
-    elif ((args.base_pickle or args.target_pickle) and not
-          (args.base_pickle and args.target_pickle)):
-        print("Must specify both --base-pickle and --target-pickle.")
-        sys.exit(1)
-
-    if ((args.base_pickle or args.target_pickle) and not
-        (args.base_commit and args.target_commit)):
-        if not args.base_commit:
-            print("base_commit not specified, assuming base_pickle is named <commit>-foo.*")
-            args.base_commit = args.base_pickle.split('-')[0]
-        if not args.target_commit:
-            print("target_commit not specified, assuming target_pickle is named <commit>-foo.*")
-            args.target_commit = args.target_pickle.split('-')[0]
-
-    import warnings
-    warnings.filterwarnings('ignore', category=FutureWarning)
-    warnings.filterwarnings('ignore', category=DeprecationWarning)
-
-    if args.base_commit and args.target_commit:
-        print("Verifying specified commits exist in repo...")
-        r = git.Repo(VB_DIR)
-        for c in [args.base_commit, args.target_commit]:
-            try:
-                msg = r.commit(c).message.strip()
-            except git.exc.BadObject:
-                print("The commit '%s' was not found, aborting..." % c)
-                sys.exit(1)
-            else:
-                print("%s: %s" % (c, msg))
-
-    main()
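For reference, the `::`-delimited records that `_parse_commit_log` consumes split cleanly with `str.split('::', 4)`: a graph marker, then short SHA, commit date, subject, and author. A quick check with an invented log line:

```python
from dateutil import parser as dparser

# Made-up line in the format produced by the git log command above.
line = '* ::abc1234::Mon Jul 1 10:00:00 2013 -0400::Fix resample bug::Jane Doe'
graph, sha, stamp, message, author = line.split('::', 4)
print(sha, dparser.parse(stamp), message, author)
```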
diff --git a/vb_suite/timedelta.py b/vb_suite/timedelta.py
deleted file mode 100644
index 378968ea1379a..0000000000000
--- a/vb_suite/timedelta.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from vbench.api import Benchmark
-from datetime import datetime
-
-common_setup = """from .pandas_vb_common import *
-from pandas import to_timedelta
-"""
-
-#----------------------------------------------------------------------
-# conversion
-
-setup = common_setup + """
-arr = np.random.randint(0,1000,size=10000)
-"""
-
-stmt = "to_timedelta(arr,unit='s')"
-timedelta_convert_int = Benchmark(stmt, setup, start_date=datetime(2014, 1, 1))
-
-setup = common_setup + """
-arr = np.random.randint(0,1000,size=10000)
-arr = [ '{0} days'.format(i) for i in arr ]
-"""
-
-stmt = "to_timedelta(arr)"
-timedelta_convert_string = Benchmark(stmt, setup, start_date=datetime(2014, 1, 1))
-
-setup = common_setup + """
-arr = np.random.randint(0,60,size=10000)
-arr = [ '00:00:{0:02d}'.format(i) for i in arr ]
-"""
-
-stmt = "to_timedelta(arr)"
-timedelta_convert_string_seconds = Benchmark(stmt, setup, start_date=datetime(2014, 1, 1))
diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py
deleted file mode 100644
index 15bc89d62305f..0000000000000
--- a/vb_suite/timeseries.py
+++ /dev/null
@@ -1,445 +0,0 @@
-from vbench.api import Benchmark
-from datetime import datetime
-from pandas import *
-
-N = 100000
-try:
-    rng = date_range(start='1/1/2000', periods=N, freq='min')
-except NameError:
-    rng = DatetimeIndex(start='1/1/2000', periods=N, freq='T')
-    def date_range(start=None, end=None, periods=None, freq=None):
-        return DatetimeIndex(start=start, end=end, periods=periods, offset=freq)
-
-
-common_setup = """from .pandas_vb_common import *
-from datetime import timedelta
-N = 100000
-
-rng = date_range(start='1/1/2000', periods=N, freq='T')
-
-if hasattr(Series, 'convert'):
-    Series.resample = Series.convert
-
-ts = Series(np.random.randn(N), index=rng)
-"""
-
-#----------------------------------------------------------------------
-# Lookup value in large time series, hash map population
-
-setup = common_setup + """
-rng = date_range(start='1/1/2000', periods=1500000, freq='S')
-ts = Series(1, index=rng)
-"""
-
-stmt = "ts[ts.index[len(ts) // 2]]; ts.index._cleanup()"
-timeseries_large_lookup_value = Benchmark(stmt, setup,
-                                          start_date=datetime(2012, 1, 1))
-
-#----------------------------------------------------------------------
-# Test slice minutely series
-
-timeseries_slice_minutely = Benchmark('ts[:10000]', common_setup)
-
-#----------------------------------------------------------------------
-# Test conversion
-
-setup = common_setup + """
-
-"""
-
-timeseries_1min_5min_ohlc = Benchmark(
-    "ts[:10000].resample('5min', how='ohlc')",
-    common_setup,
-    start_date=datetime(2012, 5, 1))
-
-timeseries_1min_5min_mean = Benchmark(
-    "ts[:10000].resample('5min', how='mean')",
-    common_setup,
-    start_date=datetime(2012, 5, 1))
-
-#----------------------------------------------------------------------
-# Irregular alignment
-
-setup = common_setup + """
-lindex = np.random.permutation(N)[:N // 2]
-rindex = np.random.permutation(N)[:N // 2]
-left = Series(ts.values.take(lindex), index=ts.index.take(lindex))
-right = Series(ts.values.take(rindex), index=ts.index.take(rindex))
-"""
-
-timeseries_add_irregular = Benchmark('left + right', setup)
common_setup + """ -N = 100000 -rng = date_range(start='1/1/2000', periods=N, freq='s') -rng = rng.take(np.random.permutation(N)) -ts = Series(np.random.randn(N), index=rng) -""" - -timeseries_sort_index = Benchmark('ts.sort_index()', setup, - start_date=datetime(2012, 4, 1)) - -#---------------------------------------------------------------------- -# Shifting, add offset - -setup = common_setup + """ -rng = date_range(start='1/1/2000', periods=10000, freq='T') -""" - -datetimeindex_add_offset = Benchmark('rng + timedelta(minutes=2)', setup, - start_date=datetime(2012, 4, 1)) - -setup = common_setup + """ -N = 10000 -rng = date_range(start='1/1/1990', periods=N, freq='53s') -ts = Series(np.random.randn(N), index=rng) -dates = date_range(start='1/1/1990', periods=N * 10, freq='5s') -""" -timeseries_asof_single = Benchmark('ts.asof(dates[0])', setup, - start_date=datetime(2012, 4, 27)) - -timeseries_asof = Benchmark('ts.asof(dates)', setup, - start_date=datetime(2012, 4, 27)) - -setup = setup + 'ts[250:5000] = np.nan' - -timeseries_asof_nan = Benchmark('ts.asof(dates)', setup, - start_date=datetime(2012, 4, 27)) - -#---------------------------------------------------------------------- -# Time zone - -setup = common_setup + """ -rng = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') -""" - -timeseries_timestamp_tzinfo_cons = \ - Benchmark('rng[0]', setup, start_date=datetime(2012, 5, 5)) - -#---------------------------------------------------------------------- -# Resampling period - -setup = common_setup + """ -rng = period_range(start='1/1/2000', end='1/1/2001', freq='T') -ts = Series(np.random.randn(len(rng)), index=rng) -""" - -timeseries_period_downsample_mean = \ - Benchmark("ts.resample('D', how='mean')", setup, - start_date=datetime(2012, 4, 25)) - -setup = common_setup + """ -rng = date_range(start='1/1/2000', end='1/1/2001', freq='T') -ts = Series(np.random.randn(len(rng)), index=rng) -""" - -timeseries_timestamp_downsample_mean = \ - Benchmark("ts.resample('D', how='mean')", setup, - start_date=datetime(2012, 4, 25)) - -# GH 7754 -setup = common_setup + """ -rng = date_range(start='2000-01-01 00:00:00', - end='2000-01-01 10:00:00', freq='555000U') -int_ts = Series(5, rng, dtype='int64') -ts = int_ts.astype('datetime64[ns]') -""" - -timeseries_resample_datetime64 = Benchmark("ts.resample('1S', how='last')", setup) - -#---------------------------------------------------------------------- -# to_datetime - -setup = common_setup + """ -rng = date_range(start='1/1/2000', periods=20000, freq='H') -strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in rng] -""" - -timeseries_to_datetime_iso8601 = \ - Benchmark('to_datetime(strings)', setup, - start_date=datetime(2012, 7, 11)) - -timeseries_to_datetime_iso8601_format = \ - Benchmark("to_datetime(strings, format='%Y-%m-%d %H:%M:%S')", setup, - start_date=datetime(2012, 7, 11)) - -setup = common_setup + """ -rng = date_range(start='1/1/2000', periods=10000, freq='D') -strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str) -""" - -timeseries_to_datetime_YYYYMMDD = \ - Benchmark('to_datetime(strings,format="%Y%m%d")', setup, - start_date=datetime(2012, 7, 1)) - -setup = common_setup + """ -s = Series(['19MAY11','19MAY11:00:00:00']*100000) -""" -timeseries_with_format_no_exact = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \ - setup, start_date=datetime(2014, 11, 26)) -timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \ - setup, 
-
-# ---- infer_freq
-
-setup = common_setup + """
-from pandas.tseries.frequencies import infer_freq
-rng = date_range(start='1/1/1700', freq='D', periods=100000)
-a = rng[:50000].append(rng[50002:])
-"""
-
-timeseries_infer_freq = \
-    Benchmark('infer_freq(a)', setup, start_date=datetime(2012, 7, 1))
-
-# setitem PeriodIndex
-
-setup = common_setup + """
-rng = period_range(start='1/1/1990', freq='S', periods=20000)
-df = DataFrame(index=range(len(rng)))
-"""
-
-period_setitem = \
-    Benchmark("df['col'] = rng", setup,
-              start_date=datetime(2012, 8, 1))
-
-setup = common_setup + """
-rng = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern')
-"""
-
-datetimeindex_normalize = \
-    Benchmark('rng.normalize()', setup,
-              start_date=datetime(2012, 9, 1))
-
-setup = common_setup + """
-from pandas.tseries.offsets import Second
-s1 = date_range(start='1/1/2000', periods=100, freq='S')
-curr = s1[-1]
-slst = []
-for i in range(100):
-    slst.append(date_range(curr + Second(), periods=100, freq='S'))
-    curr = slst[-1][-1]
-"""
-
-# dti_append_tz = \
-#     Benchmark('s1.append(slst)', setup, start_date=datetime(2012, 9, 1))
-
-
-setup = common_setup + """
-rng = date_range(start='1/1/2000', periods=1000, freq='H')
-df = DataFrame(np.random.randn(len(rng), 2), rng)
-"""
-
-dti_reset_index = \
-    Benchmark('df.reset_index()', setup, start_date=datetime(2012, 9, 1))
-
-setup = common_setup + """
-rng = date_range(start='1/1/2000', periods=1000, freq='H',
-                 tz='US/Eastern')
-df = DataFrame(np.random.randn(len(rng), 2), index=rng)
-"""
-
-dti_reset_index_tz = \
-    Benchmark('df.reset_index()', setup, start_date=datetime(2012, 9, 1))
-
-setup = common_setup + """
-rng = date_range(start='1/1/2000', periods=1000, freq='T')
-index = rng.repeat(10)
-"""
-
-datetimeindex_unique = Benchmark('index.unique()', setup,
-                                 start_date=datetime(2012, 7, 1))
-
-# tz_localize with infer argument. This is an attempt to emulate the results
-# of read_csv with duplicated data. Not passing infer_dst will fail.
-setup = common_setup + """
-dst_rng = date_range(start='10/29/2000 1:00:00',
-                     end='10/29/2000 1:59:59', freq='S')
-index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S')
-index = index.append(dst_rng)
-index = index.append(dst_rng)
-index = index.append(date_range(start='10/29/2000 2:00:00',
-                                end='10/29/2000 3:00:00', freq='S'))
-"""
-
-datetimeindex_infer_dst = \
-    Benchmark('index.tz_localize("US/Eastern", infer_dst=True)',
-              setup, start_date=datetime(2013, 9, 30))
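On later pandas versions the `infer_dst` keyword was renamed to `ambiguous='infer'`. A minimal illustration of the ambiguity being resolved, with a repeated fall-back hour (the timestamps are invented; behavior assumed from the documented `ambiguous='infer'` semantics):

```python
import pandas as pd

# 2000-10-29 01:30 occurs twice in US/Eastern (DST fall-back); 'infer'
# disambiguates repeated wall-clock times by their order.
idx = pd.DatetimeIndex(['2000-10-29 01:30:00', '2000-10-29 01:30:00'])
print(idx.tz_localize('US/Eastern', ambiguous='infer'))
# first occurrence localized as EDT, the second as EST
```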
-
-
-#----------------------------------------------------------------------
-# Resampling: fast-path various functions
-
-setup = common_setup + """
-rng = date_range(start='20130101',periods=100000,freq='50L')
-df = DataFrame(np.random.randn(100000,2),index=rng)
-"""
-
-dataframe_resample_mean_string = \
-    Benchmark("df.resample('1s', how='mean')", setup)
-
-dataframe_resample_mean_numpy = \
-    Benchmark("df.resample('1s', how=np.mean)", setup)
-
-dataframe_resample_min_string = \
-    Benchmark("df.resample('1s', how='min')", setup)
-
-dataframe_resample_min_numpy = \
-    Benchmark("df.resample('1s', how=np.min)", setup)
-
-dataframe_resample_max_string = \
-    Benchmark("df.resample('1s', how='max')", setup)
-
-dataframe_resample_max_numpy = \
-    Benchmark("df.resample('1s', how=np.max)", setup)
-
-
-#----------------------------------------------------------------------
-# DatetimeConverter
-
-setup = common_setup + """
-from pandas.tseries.converter import DatetimeConverter
-"""
-
-datetimeindex_converter = \
-    Benchmark('DatetimeConverter.convert(rng, None, None)',
-              setup, start_date=datetime(2013, 1, 1))
-
-# Adding custom business day
-setup = common_setup + """
-import datetime as dt
-import pandas as pd
-try:
-    import pandas.tseries.holiday
-except ImportError:
-    pass
-import numpy as np
-
-date = dt.datetime(2011,1,1)
-dt64 = np.datetime64('2011-01-01 09:00Z')
-hcal = pd.tseries.holiday.USFederalHolidayCalendar()
-
-day = pd.offsets.Day()
-year = pd.offsets.YearBegin()
-cday = pd.offsets.CustomBusinessDay()
-cmb = pd.offsets.CustomBusinessMonthBegin(calendar=hcal)
-cme = pd.offsets.CustomBusinessMonthEnd(calendar=hcal)
-
-cdayh = pd.offsets.CustomBusinessDay(calendar=hcal)
-"""
-timeseries_day_incr = Benchmark("date + day",setup)
-
-timeseries_day_apply = Benchmark("day.apply(date)",setup)
-
-timeseries_year_incr = Benchmark("date + year",setup)
-
-timeseries_year_apply = Benchmark("year.apply(date)",setup)
-
-timeseries_custom_bday_incr = \
-    Benchmark("date + cday",setup)
-
-timeseries_custom_bday_decr = \
-    Benchmark("date - cday",setup)
-
-timeseries_custom_bday_apply = \
-    Benchmark("cday.apply(date)",setup)
-
-timeseries_custom_bday_apply_dt64 = \
-    Benchmark("cday.apply(dt64)",setup)
-
-timeseries_custom_bday_cal_incr = \
-    Benchmark("date + 1 * cdayh",setup)
-
-timeseries_custom_bday_cal_decr = \
-    Benchmark("date - 1 * cdayh",setup)
-
-timeseries_custom_bday_cal_incr_n = \
-    Benchmark("date + 10 * cdayh",setup)
-
-timeseries_custom_bday_cal_incr_neg_n = \
-    Benchmark("date - 10 * cdayh",setup)
-
-# Increment custom business month
-timeseries_custom_bmonthend_incr = \
-    Benchmark("date + cme",setup)
-
-timeseries_custom_bmonthend_incr_n = \
-    Benchmark("date + 10 * cme",setup)
-
-timeseries_custom_bmonthend_decr_n = \
-    Benchmark("date - 10 * cme",setup)
-
-timeseries_custom_bmonthbegin_incr_n = \
-    Benchmark("date + 10 * cmb",setup)
-
-timeseries_custom_bmonthbegin_decr_n = \
-    Benchmark("date - 10 * cmb",setup)
-
-
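For a concrete sense of what the holiday-calendar-aware offsets above do, here is a small check using the modern import path `pandas.tseries.holiday` (the dates are chosen for the example, not taken from the benchmarks):

```python
import datetime as dt
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar

hcal = USFederalHolidayCalendar()
cdayh = pd.offsets.CustomBusinessDay(calendar=hcal)

# 2014-07-03 is a Thursday; 2014-07-04 (Friday) is a federal holiday,
# so the next custom business day lands on Monday 2014-07-07.
print(dt.datetime(2014, 7, 3) + cdayh)
```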
-#---------------------------------------------------------------------- -# month/quarter/year start/end accessors - -setup = common_setup + """ -N = 10000 -rng = date_range(start='1/1/1', periods=N, freq='B') -""" - -timeseries_is_month_start = Benchmark('rng.is_month_start', setup, - start_date=datetime(2014, 4, 1)) - -#---------------------------------------------------------------------- -# iterate over DatetimeIndex/PeriodIndex -setup = common_setup + """ -N = 1000000 -M = 10000 -idx1 = date_range(start='20140101', freq='T', periods=N) -idx2 = period_range(start='20140101', freq='T', periods=N) - -def iter_n(iterable, n=None): - i = 0 - for _ in iterable: - i += 1 - if n is not None and i > n: - break -""" - -timeseries_iter_datetimeindex = Benchmark('iter_n(idx1)', setup) - -timeseries_iter_periodindex = Benchmark('iter_n(idx2)', setup) - -timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup) - -timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup) - - -#---------------------------------------------------------------------- -# apply an Offset to a DatetimeIndex -setup = common_setup + """ -N = 100000 -idx1 = date_range(start='20140101', freq='T', periods=N) -delta_offset = pd.offsets.Day() -fast_offset = pd.offsets.DateOffset(months=2, days=2) -slow_offset = pd.offsets.BusinessDay() - -""" - -timeseries_datetimeindex_offset_delta = Benchmark('idx1 + delta_offset', setup) -timeseries_datetimeindex_offset_fast = Benchmark('idx1 + fast_offset', setup) -timeseries_datetimeindex_offset_slow = Benchmark('idx1 + slow_offset', setup) - -# apply an Offset to a Series containing datetime64 values -setup = common_setup + """ -N = 100000 -s = Series(date_range(start='20140101', freq='T', periods=N)) -delta_offset = pd.offsets.Day() -fast_offset = pd.offsets.DateOffset(months=2, days=2) -slow_offset = pd.offsets.BusinessDay() - -""" - -timeseries_series_offset_delta = Benchmark('s + delta_offset', setup) -timeseries_series_offset_fast = Benchmark('s + fast_offset', setup) -timeseries_series_offset_slow = Benchmark('s + slow_offset', setup)
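The three offsets in the last two setups are chosen to exercise different code paths: `Day` and `DateOffset(months=2, days=2)` apply through vectorized paths, while `BusinessDay` is the slow case applied element by element. A quick look at what each does to a small index (written against current pandas, using the `'min'` frequency alias rather than the older `'T'`):

```python
import pandas as pd

idx = pd.date_range(start='20140101', freq='min', periods=5)
print(idx + pd.offsets.Day())                         # vectorized fast path
print(idx + pd.offsets.DateOffset(months=2, days=2))  # vectorized relativedelta-style path
print(idx + pd.offsets.BusinessDay())                 # element-wise; the slow case above
```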